In [None]:
# Gabriela Aguilar
# Exploratory Data Analysis of Travel Review Ratings from UC Irvine Machine Learning Repository
# Google reviews on attractions from 24 categories across Europe are considered. Google user rating ranges from 1 to 5 and average user rating per category is calculated.


# **1. Acceso e Importación de Datos:**

*   Importación de las librerías necesarias.
*   Decodificación del repositorio de GitHub y acceso a la base de datos en formato CSV.

In [2]:
# Importando todas las librerias necesarias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
sns.set(style="whitegrid")
import plotly.express as px # For dynamic graphics
from getpass import getpass # For GitHub token request
import requests # Access to HTTP from GitHub repository
import base64
from io import StringIO

In [32]:
# Access to the private GitHub repository
# The token is requested interactively with getpass so it is not written into the notebook source.

token = getpass('Enter your GitHub Personal Access Token: ')

Enter your GitHub Personal Access Token: ··········


In [41]:
# Data loading

# The raw CSV lives in a private GitHub repo and is fetched through the GitHub contents API
url = "https://api.github.com/repos/gabrielaaguiv5/ProjectI/contents/data/raw/google_review_ratings.csv"

# Authenticate the request with the personal access token held in `token`
headers = {"Authorization": f"token {token}"}
# An explicit timeout keeps the cell from hanging indefinitely if GitHub does not respond
res = requests.get(url, headers=headers, timeout=30)

# The response JSON carries the file base64-encoded under 'content': decode it and load it with pandas
if res.status_code == 200:
    content = res.json()['content']
    decoded = base64.b64decode(content).decode('utf-8')
    datos = pd.read_csv(StringIO(decoded), delimiter=',')
    print(datos.head(10))
else:
    # `datos` is NOT defined on this path, so the cells below cannot run until the fetch succeeds
    print(f"Failed to fetch file: {res.status_code} {res.reason}")

    userid  churches  resorts  beaches  parks  theatres  museums  malls  zoos  \
0   User 1       0.0      0.0     3.63   3.65       5.0     2.92   5.00  2.35   
1   User 2       0.0      0.0     3.63   3.65       5.0     2.92   5.00  2.64   
2   User 3       0.0      0.0     3.63   3.63       5.0     2.92   5.00  2.64   
3   User 4       0.0      0.5     3.63   3.63       5.0     2.92   5.00  2.35   
4   User 5       0.0      0.0     3.63   3.63       5.0     2.92   5.00  2.64   
5   User 6       0.0      0.0     3.63   3.63       5.0     2.92   5.00  2.63   
6   User 7       0.0      5.0     3.63   3.63       5.0     2.92   3.03  2.35   
7   User 8       0.0      5.0     3.63   3.63       5.0     2.92   5.00  2.63   
8   User 9       0.0      5.0     3.64   3.64       5.0     2.92   3.03  2.62   
9  User 10       0.0      5.0     3.64   3.64       5.0     2.92   5.00  2.35   

   restaurants  ...  art galleries dance clubs  swimming pools  gyms  \
0         2.33  ...           1.74  

# **2. Análisis Descriptivo:**

*   Revisión del tipo de datos y cantidad de registros.
*   Identificación y tratamiento de valores faltantes o duplicados.
*   Estadísticas descriptivas (media, mediana, desviación estándar, percentiles, etc.).


In [42]:
# Review the dtype of each column and the number of non-null records.
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   userid                 5456 non-null   object 
 1   churches               5456 non-null   float64
 2   resorts                5456 non-null   float64
 3   beaches                5456 non-null   float64
 4   parks                  5456 non-null   float64
 5   theatres               5456 non-null   float64
 6   museums                5456 non-null   float64
 7   malls                  5456 non-null   float64
 8   zoos                   5456 non-null   float64
 9   restaurants            5456 non-null   float64
 10  pubs/bars              5456 non-null   float64
 11  local services         5456 non-null   object 
 12  burger/pizza shops     5455 non-null   float64
 13  hotels/other lodgings  5456 non-null   float64
 14  juice bars             5456 non-null   float64
 15  art 

In [43]:
# Missing-value check

# Number of missing values in each column
faltantes_por_columna = datos.isnull().sum()
print(faltantes_por_columna)

print()

print("Valores faltantes en la totalidad de la Database:")

# Consolidated flag: True when at least one column contains a missing value
hay_faltantes = datos.isnull().any().any()
print(hay_faltantes)

userid                   0
churches                 0
resorts                  0
beaches                  0
parks                    0
theatres                 0
museums                  0
malls                    0
zoos                     0
restaurants              0
pubs/bars                0
local services           0
burger/pizza shops       1
hotels/other lodgings    0
juice bars               0
art galleries            0
dance clubs              0
swimming pools           0
gyms                     0
bakeries                 0
beauty & spas            0
cafes                    0
view points              0
monuments                0
gardens                  1
dtype: int64

Valores faltantes en la totalidad de la Database:
True


In [44]:
# Duplicate-row check

# Boolean mask marking every row that repeats an earlier row
filas_repetidas = datos.duplicated()

# Number of duplicated rows
print(filas_repetidas.sum())

print()

# The duplicated rows themselves (an empty DataFrame when there are none)
print("Filas duplicadas en la totalidad de la Database:")
print(datos[filas_repetidas])

0

Filas duplicadas en la totalidad de la Database:
Empty DataFrame
Columns: [userid, churches, resorts, beaches, parks, theatres, museums, malls, zoos, restaurants, pubs/bars, local services, burger/pizza shops, hotels/other lodgings, juice bars, art galleries, dance clubs, swimming pools, gyms, bakeries, beauty & spas, cafes, view points, monuments, gardens]
Index: []

[0 rows x 25 columns]


In [47]:
# Null handling and conversion of each column to the dtype that matches its content

# Column groups, split by the dtype each group should end up with
category_columns = ['userid']
columnas_numericas = [
    'churches', 'resorts', 'beaches', 'parks',
    'theatres', 'museums', 'malls', 'zoos',
    'restaurants', 'pubs/bars', 'local services', 'burger/pizza shops',
    'hotels/other lodgings', 'juice bars', 'art galleries', 'dance clubs',
    'swimming pools', 'gyms', 'bakeries', 'beauty & spas',
    'cafes', 'view points', 'monuments', 'gardens',
]

# Cast the identifier column(s) to the pandas 'category' dtype
columnas_como_categoria = datos[category_columns].astype('category')
datos[category_columns] = columnas_como_categoria
# Aplicando función para corregir datos según error code tras tratar de convertir a columnas_numericas

def limpiezaNull(x):
    """Parse ``x`` as a float after stripping tab and space characters.

    The value is first converted to ``str``; every tab and space is removed
    and the remaining text is passed to ``float``.

    Returns:
        The parsed float, or ``np.nan`` when the cleaned text is not a valid
        float literal (``float`` raises ``ValueError``).
    """
    # Translation table that deletes exactly the two characters the cleaning targets
    caracteres_a_eliminar = {ord('\t'): None, ord(' '): None}
    try:
        texto_limpio = str(x).translate(caracteres_a_eliminar)
        return float(texto_limpio)
    except ValueError:
        # Anything that still cannot be parsed is reported as missing
        return np.nan

# Coerce every rating column to numeric: entries that cannot be parsed become NaN,
# and all NaN (pre-existing or produced by the coercion) are then replaced by 0.
# NOTE(review): after this step a missing rating is indistinguishable from a genuine
# 0.0 rating — confirm that this is the intended treatment.
ratings_convertidos = datos[columnas_numericas].apply(pd.to_numeric, errors='coerce')
datos[columnas_numericas] = ratings_convertidos.fillna(0)


In [48]:
# Review the corrected data: dtypes and non-null counts after the conversion.
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   userid                 5456 non-null   category
 1   churches               5456 non-null   float64 
 2   resorts                5456 non-null   float64 
 3   beaches                5456 non-null   float64 
 4   parks                  5456 non-null   float64 
 5   theatres               5456 non-null   float64 
 6   museums                5456 non-null   float64 
 7   malls                  5456 non-null   float64 
 8   zoos                   5456 non-null   float64 
 9   restaurants            5456 non-null   float64 
 10  pubs/bars              5456 non-null   float64 
 11  local services         5456 non-null   float64 
 12  burger/pizza shops     5456 non-null   float64 
 13  hotels/other lodgings  5456 non-null   float64 
 14  juice bars             5456 non-null   f

In [49]:
# Give each column its descriptive category name

# Short original column name -> descriptive column name
nombres_descriptivos = {
    'userid': 'Unique user id',
    'churches': 'Average ratings on churches',
    'resorts': 'Average ratings on resorts',
    'beaches': 'Average ratings on beaches',
    'parks': 'Average ratings on parks',
    'theatres': 'Average ratings on theatres',
    'museums': 'Average ratings on museums',
    'malls': 'Average ratings on malls',
    'zoos': 'Average ratings on zoo',
    'restaurants': 'Average ratings on restaurants',
    'pubs/bars': 'Average ratings on pubs/bars',
    'local services': 'Average ratings on local services',
    'burger/pizza shops': 'Average ratings on burger/pizza shops',
    'hotels/other lodgings': 'Average ratings on hotels/other lodgings',
    'juice bars': 'Average ratings on juice bars',
    'art galleries': 'Average ratings on art galleries',
    'dance clubs': 'Average ratings on dance clubs',
    'swimming pools': 'Average ratings on swimming pools',
    'gyms': 'Average ratings on gyms',
    'bakeries': 'Average ratings on bakeries',
    'beauty & spas': 'Average ratings on beauty & spas',
    'cafes': 'Average ratings on cafes',
    'view points': 'Average ratings on view points',
    'monuments': 'Average ratings on monuments',
    'gardens': 'Average ratings on gardens',
}

# Rename in place so `datos` keeps referring to the same DataFrame object
datos.rename(columns=nombres_descriptivos, inplace=True)

In [50]:
# Review the updated data: column names, dtypes and non-null counts after the rename.
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 25 columns):
 #   Column                                    Non-Null Count  Dtype   
---  ------                                    --------------  -----   
 0   Unique user id                            5456 non-null   category
 1   Average ratings on churches               5456 non-null   float64 
 2   Average ratings on resorts                5456 non-null   float64 
 3   Average ratings on beaches                5456 non-null   float64 
 4   Average ratings on parks                  5456 non-null   float64 
 5   Average ratings on theatres               5456 non-null   float64 
 6   Average ratings on museums                5456 non-null   float64 
 7   Average ratings on malls                  5456 non-null   float64 
 8   Average ratings on zoo                    5456 non-null   float64 
 9   Average ratings on restaurants            5456 non-null   float64 
 10  Average ratings on pubs/

In [None]:
# 3. Basic Info

# 5. Univariate Analysis
# `datos` is the cleaned DataFrame built in the earlier cells of this notebook.
## Numerical
num_cols = datos.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    # One figure per numeric column: histogram with KDE on the left, boxplot on the right
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(datos[col], kde=True)
    plt.title(f"Histogram of {col}")
    plt.subplot(1, 2, 2)
    sns.boxplot(x=datos[col])
    plt.title(f"Boxplot of {col}")
    # Keep the two panels (and their long column-name titles) from overlapping
    plt.tight_layout()
    plt.show()

## Categorical
# NOTE(review): the id column was cast to 'category' in an earlier cell, so this
# 'object' selection may be empty and the loop below may draw nothing — confirm
# whether that is intended.
cat_cols = datos.select_dtypes(include='object').columns
for col in cat_cols:
    plt.figure(figsize=(8, 4))
    # Bars are ordered from the most to the least frequent value
    sns.countplot(data=datos, x=col, order=datos[col].value_counts().index)
    plt.xticks(rotation=45)
    plt.title(f"Countplot of {col}")
    plt.show()

# 6. Bivariate Analysis
## Correlation Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(datos[num_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

## Pairplot
# sns.pairplot(df[num_cols])
# plt.show()

# 7. Categorical vs Numerical
# Example: Boxplot of numerical vs. categorical feature
# sns.boxplot(x='categorical_col', y='numerical_col', data=df)
# plt.show()

# 8. Outlier Detection (Z-score example)
# from scipy.stats import zscore
# z_scores = np.abs(zscore(df[num_cols]))
# outliers = (z_scores > 3).sum(axis=0)
# print("Outliers per column:", outliers)

# 9. Skewness and Kurtosis
for col in num_cols:
    print(f"{col}: Skewness = {datos[col].skew():.2f}, Kurtosis = {datos[col].kurt():.2f}")

# 10. Summary & Insights
# Add markdown cells or print statements summarizing key findings.

# Example:
# print("Insight: Column X shows high skewness, consider log transformation.")
# print("Insight: Column Y has strong correlation with target.")