In [None]:
# Gabriela Aguilar
# Exploratory Data Analysis of Travel Review Ratings from UC Irvine Machine Learning Repository
# Google reviews on attractions from 24 categories across Europe are considered. Google user rating ranges from 1 to 5 and average user rating per category is calculated.


# **1. Acceso e Importación de Datos:**

*   Importación de las librerías necesarias.
*   Acceso al repositorio privado de GitHub y decodificación del archivo de datos en formato CSV.

In [2]:
# Importando todas las librerias necesarias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
sns.set(style="whitegrid")
import plotly.express as px # For dynamic graphics
from getpass import getpass # For GitHub token request
import requests # Access to HTTP from GitHub repository
import base64
from io import StringIO

In [32]:
# Access to the private GitHub repository.
# The Personal Access Token is requested interactively with getpass so the
# secret is never written into the notebook source.

token = getpass('Enter your GitHub Personal Access Token: ')

Enter your GitHub Personal Access Token: ··········


In [33]:
# Data loading

# Raw data file in the private GitHub repo, addressed through the GitHub contents API
url = "https://api.github.com/repos/gabrielaaguiv5/ProjectI/contents/data/raw/google_review_ratings.csv"

# Authenticate with the personal access token so the private repo can be read
headers = {"Authorization": f"token {token}"}
# Bound the wait so an unreachable API fails this cell instead of hanging the kernel
res = requests.get(url, headers=headers, timeout=30)

# Any status other than 200 means the file was not delivered. Stop here with the
# HTTP status: every later cell needs `datos`, and continuing would either raise an
# unrelated NameError or silently reuse a stale `datos` from a previous run.
if res.status_code != 200:
    raise RuntimeError(f"Failed to fetch file: {res.status_code} {res.reason}")

# The JSON response carries the file base64-encoded in its 'content' field;
# decode it to text and parse it as CSV into a pandas DataFrame
content = res.json()['content']
decoded = base64.b64decode(content).decode('utf-8')
datos = pd.read_csv(StringIO(decoded), delimiter=',')
print(datos.head(10))

      User  Category 1  Category 2  Category 3  Category 4  Category 5  \
0   User 1         0.0         0.0        3.63        3.65         5.0   
1   User 2         0.0         0.0        3.63        3.65         5.0   
2   User 3         0.0         0.0        3.63        3.63         5.0   
3   User 4         0.0         0.5        3.63        3.63         5.0   
4   User 5         0.0         0.0        3.63        3.63         5.0   
5   User 6         0.0         0.0        3.63        3.63         5.0   
6   User 7         0.0         5.0        3.63        3.63         5.0   
7   User 8         0.0         5.0        3.63        3.63         5.0   
8   User 9         0.0         5.0        3.64        3.64         5.0   
9  User 10         0.0         5.0        3.64        3.64         5.0   

   Category 6  Category 7  Category 8  Category 9  ...  Category 16  \
0        2.92        5.00        2.35        2.33  ...         0.59   
1        2.92        5.00        2.64      

# **2. Análisis Descriptivo:**

*   Revisión del tipo de datos y cantidad de registros.
*   Identificación y tratamiento de valores faltantes o duplicados.
*   Estadísticas descriptivas (media, mediana, desviación estándar, percentiles, etc.).


In [34]:
# Review the dtype of each column and the number of records (non-null count per column).
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 26 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   User         5456 non-null   object 
 1   Category 1   5456 non-null   float64
 2   Category 2   5456 non-null   float64
 3   Category 3   5456 non-null   float64
 4   Category 4   5456 non-null   float64
 5   Category 5   5456 non-null   float64
 6   Category 6   5456 non-null   float64
 7   Category 7   5456 non-null   float64
 8   Category 8   5456 non-null   float64
 9   Category 9   5456 non-null   float64
 10  Category 10  5456 non-null   float64
 11  Category 11  5456 non-null   object 
 12  Category 12  5455 non-null   float64
 13  Category 13  5456 non-null   float64
 14  Category 14  5456 non-null   float64
 15  Category 15  5456 non-null   float64
 16  Category 16  5456 non-null   float64
 17  Category 17  5456 non-null   float64
 18  Category 18  5456 non-null   float64
 19  Catego

In [35]:
# Missing-value check

# A single newline-separated print: the missing-value count of each column,
# a blank line, the heading, and one overall flag for the whole DataFrame.
print(
    datos.isnull().sum(),  # number of missing values in each column
    "",
    "Valores faltantes en la totalidad de la Database:",
    datos.isnull().any().any(),  # per-column flags collapsed into one consolidated result
    sep="\n",
)

User              0
Category 1        0
Category 2        0
Category 3        0
Category 4        0
Category 5        0
Category 6        0
Category 7        0
Category 8        0
Category 9        0
Category 10       0
Category 11       0
Category 12       1
Category 13       0
Category 14       0
Category 15       0
Category 16       0
Category 17       0
Category 18       0
Category 19       0
Category 20       0
Category 21       0
Category 22       0
Category 23       0
Category 24       1
Unnamed: 25    5454
dtype: int64

Valores faltantes en la totalidad de la Database:
True


In [36]:
# Duplicate-row check

# Boolean mask flagging the duplicated rows; built once and reused below
mascara_duplicados = datos.duplicated()

print(mascara_duplicados.sum())  # number of duplicated rows

print()

print("Filas duplicadas en la totalidad de la Database:")
# Show the duplicated rows themselves (an empty frame when there are none)
print(datos[mascara_duplicados])

0

Filas duplicadas en la totalidad de la Database:
Empty DataFrame
Columns: [User, Category 1, Category 2, Category 3, Category 4, Category 5, Category 6, Category 7, Category 8, Category 9, Category 10, Category 11, Category 12, Category 13, Category 14, Category 15, Category 16, Category 17, Category 18, Category 19, Category 20, Category 21, Category 22, Category 23, Category 24, Unnamed: 25]
Index: []

[0 rows x 26 columns]


In [29]:
# Set each column's dtype according to what the column holds

# 'User' is the only column handled as categorical
category_columns = ['User']
# 'Category 1' through 'Category 24' in order, followed by the trailing 'Unnamed: 25' column
columnas_numericas = [f'Category {n}' for n in range(1, 25)] + ['Unnamed: 25']

# Cast the categorical column(s) to the pandas 'category' dtype
datos[category_columns] = datos[category_columns].astype('category')
# Helper that cleans a raw cell value and converts it to a float.
# NOTE(review): no visible cell calls this helper; the column conversion that
# follows it uses pd.to_numeric instead.
def clean_and_convert(x):
    """Convert a raw value to ``float``, ignoring any tabs and spaces in it.

    The value is rendered as text, every tab and space character is removed
    (no other characters are touched), and the remainder is parsed with
    ``float``.

    Args:
        x: Any value; it is passed through ``str`` before parsing.

    Returns:
        The parsed float, or ``np.nan`` when the cleaned text is not a valid
        float literal.
    """
    try:
        # Deletion-only translation table: drops '\t' and ' ' and nothing else
        sin_espacios = str(x).translate(str.maketrans('', '', '\t '))
        return float(sin_espacios)
    except ValueError:
        # The text could not be parsed as a number: report it as missing
        return np.nan

# Parse every listed column with pd.to_numeric; errors='coerce' turns values that
# cannot be parsed into NaN, and fillna(0) then replaces every NaN with 0.
# NOTE(review): this also fills the almost entirely empty 'Unnamed: 25' column with
# zeros, and makes a missing or unparseable rating indistinguishable from a genuine
# 0.0 value — confirm this is intended before computing statistics.
datos[columnas_numericas] = datos[columnas_numericas].apply(pd.to_numeric, errors='coerce').fillna(0)

In [None]:
# 3. Basic Info
# This cell works on `datos`, the DataFrame this notebook loads and cleans.
# The pasted template referred to `df`, which is never defined here.
print("\nData Types:")
print(datos.dtypes)
print("\nMissing Values:")
print(datos.isnull().sum())
# Printed explicitly: a bare expression that is not the last line of a cell is not displayed
print(datos.describe())

# 4. Data Cleaning (example placeholders)
# datos.drop_duplicates(inplace=True)
# datos['column'] = datos['column'].fillna(value)
# datos['date_col'] = pd.to_datetime(datos['date_col'])

# 5. Univariate Analysis
## Numerical
# One figure per numeric column: histogram with KDE on the left, boxplot on the right
num_cols = datos.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 4))
    sns.histplot(datos[col], kde=True, ax=ax_hist)
    ax_hist.set_title(f"Histogram of {col}")
    sns.boxplot(x=datos[col], ax=ax_box)
    ax_box.set_title(f"Boxplot of {col}")
    plt.show()
    # Release the figure so a long loop does not accumulate open figures
    plt.close(fig)

## Categorical
# Only 'object' columns are selected; the loop is a no-op when there are none
cat_cols = datos.select_dtypes(include='object').columns
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    # Bars ordered by descending frequency
    sns.countplot(data=datos, x=col, order=datos[col].value_counts().index, ax=ax)
    plt.xticks(rotation=45)
    ax.set_title(f"Countplot of {col}")
    plt.show()
    plt.close(fig)

# 6. Bivariate Analysis
## Correlation Heatmap
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(datos[num_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

## Pairplot
# sns.pairplot(datos[num_cols])
# plt.show()

# 7. Categorical vs Numerical
# Example: Boxplot of numerical vs. categorical feature
# sns.boxplot(x='categorical_col', y='numerical_col', data=datos)
# plt.show()

# 8. Outlier Detection (Z-score example)
# from scipy.stats import zscore
# z_scores = np.abs(zscore(datos[num_cols]))
# outliers = (z_scores > 3).sum(axis=0)
# print("Outliers per column:", outliers)

# 9. Skewness and Kurtosis
for col in num_cols:
    print(f"{col}: Skewness = {datos[col].skew():.2f}, Kurtosis = {datos[col].kurt():.2f}")

# 10. Summary & Insights
# Add markdown cells or print statements summarizing key findings.

# Example:
# print("Insight: Column X shows high skewness, consider log transformation.")
# print("Insight: Column Y has strong correlation with target.")