In [5]:
import pandas as pd

# Load a fixed subset of columns from the listings CSV: host attributes,
# coordinates, capacity, price, minimum stay and the review-score columns.
file_path = "../0_Dataset/Listings.csv"
data = pd.read_csv(
    file_path,
    usecols=[
        'host_since',
        'host_response_rate',
        'host_acceptance_rate',
        'host_total_listings_count',
        'latitude',
        'longitude',
        'accommodates',
        'bedrooms',
        'price',
        'minimum_nights',
        'review_scores_rating',
        'review_scores_accuracy',
        'review_scores_cleanliness',
        'review_scores_checkin',
        'review_scores_communication',
        'review_scores_location',
        'review_scores_value',
    ],
    sep=',',
    encoding='latin-1',
)

In [9]:
from datetime import datetime

# Parse 'host_since' as datetimes; values that cannot be parsed become NaT.
data['host_since'] = data['host_since'].pipe(pd.to_datetime, errors='coerce')

# Host age in whole days, measured from the moment this cell runs.
# Rows whose 'host_since' is NaT end up with a missing age.
hoy = pd.to_datetime("today")
data['host_antiguedad_dias'] = data['host_since'].rsub(hoy).dt.days


In [14]:
# Per-column cleaning switches, keyed by column name:
#   'erroneos' -> drop non-positive values before analysis
#   'outliers' -> drop values outside the IQR fences
# Each tuple below is (column, erroneos, outliers); insertion order is preserved.
config_limpieza = {
    columna: {'erroneos': quitar_erroneos, 'outliers': quitar_outliers}
    for columna, quitar_erroneos, quitar_outliers in [
        # Rates, coordinates and review scores: only missing values are dropped.
        ('host_response_rate', False, False),
        ('host_acceptance_rate', False, False),
        ('latitude', False, False),
        ('longitude', False, False),
        ('review_scores_rating', False, False),
        ('review_scores_accuracy', False, False),
        ('review_scores_cleanliness', False, False),
        ('review_scores_checkin', False, False),
        ('review_scores_communication', False, False),
        ('review_scores_location', False, False),
        ('review_scores_value', False, False),
        # Outlier filtering only.
        ('host_antiguedad_dias', False, True),
        ('host_total_listings_count', False, True),
        # Non-positive values removed and outliers filtered.
        ('accommodates', True, True),
        ('bedrooms', True, True),
        ('minimum_nights', True, True),
    ]
}


In [16]:
def limpiar_variable(
    serie: pd.Series,
    aplicar_erroneos: bool = False,
    aplicar_outliers: bool = False,
    factor_iqr: float = 1.5,
) -> pd.Series:
    """Return a cleaned copy of a numeric series.

    Missing values are always dropped. The remaining steps are optional and
    are applied in this order: non-positive filter first, then the IQR
    outlier filter (so the quartiles are computed on the already filtered
    values).

    Parameters
    ----------
    serie : pd.Series
        Values to clean. The input object is not modified.
    aplicar_erroneos : bool, default False
        If True, keep only strictly positive values (``> 0``).
    aplicar_outliers : bool, default False
        If True, keep only values inside the inclusive range
        ``[Q1 - factor_iqr * IQR, Q3 + factor_iqr * IQR]``.
    factor_iqr : float, default 1.5
        Multiplier applied to the interquartile range when building the
        outlier fences. The default reproduces the previous fixed behaviour.

    Returns
    -------
    pd.Series
        The filtered values, keeping their original index labels.
    """
    serie = serie.dropna()

    if aplicar_erroneos:
        serie = serie[serie > 0]

    if aplicar_outliers:
        q1 = serie.quantile(0.25)
        q3 = serie.quantile(0.75)
        iqr = q3 - q1
        li = q1 - factor_iqr * iqr  # lower fence
        ls = q3 + factor_iqr * iqr  # upper fence
        serie = serie[(serie >= li) & (serie <= ls)]

    return serie


In [17]:
# Cleaned series keyed by column name; columns whose cleaning fails are skipped.
variables_limpias = {}

for col, config in config_limpieza.items():
    try:
        limpia = limpiar_variable(data[col], config['erroneos'], config['outliers'])
    except Exception as e:
        # Best effort: report the failing column and continue with the rest.
        print(f"❌ Error al limpiar {col}: {e}")
    else:
        variables_limpias[col] = limpia


In [18]:
from scipy.stats import kstest, norm
import pandas as pd

# One result row per cleaned variable.
resultados_ks = []

for nombre, serie in variables_limpias.items():
    desviacion = serie.std()

    # A series with fewer than two values (std is NaN) or with a single
    # repeated value (std == 0) cannot be standardized. Record it as not
    # evaluable instead of passing NaN/empty data to the test.
    if pd.isna(desviacion) or desviacion == 0:
        resultados_ks.append({
            'Variable': nombre,
            'n': len(serie),
            'Estadístico D': float('nan'),
            'p-valor': float('nan'),
            'Conclusión': "⚠️ No evaluable"
        })
        continue

    # Standardize (z-score) so the sample can be compared against N(0, 1).
    datos = (serie - serie.mean()) / desviacion

    # One-sample Kolmogorov-Smirnov test against the standard normal.
    # NOTE(review): mean and std are estimated from the same sample, so the
    # plain K-S p-value is only approximate; a Lilliefors-type correction
    # would be the rigorous choice.
    estadistico, p_valor = kstest(datos, 'norm')

    # Normality is not rejected at the 5% significance level.
    conclusion = "✅ Distribución Normal" if p_valor >= 0.05 else "❌ No Normal"

    resultados_ks.append({
        'Variable': nombre,
        'n': len(serie),
        'Estadístico D': round(estadistico, 4),
        'p-valor': round(p_valor, 4),
        'Conclusión': conclusion
    })

# Build the summary table, highest p-value first (rows with NaN go last).
# Passing the column names explicitly keeps the sort valid even when no
# variable was processed.
df_ks = (
    pd.DataFrame(
        resultados_ks,
        columns=['Variable', 'n', 'Estadístico D', 'p-valor', 'Conclusión'],
    )
    .sort_values(by='p-valor', ascending=False)
    .reset_index(drop=True)
)
df_ks


Unnamed: 0,Variable,n,Estadístico D,p-valor,Conclusión
0,host_response_rate,150930,0.3424,0.0,❌ No Normal
1,host_acceptance_rate,166625,0.275,0.0,❌ No Normal
2,latitude,279712,0.2985,0.0,❌ No Normal
3,longitude,279712,0.1939,0.0,❌ No Normal
4,review_scores_rating,188307,0.2563,0.0,❌ No Normal
5,review_scores_accuracy,187999,0.395,0.0,❌ No Normal
6,review_scores_cleanliness,188047,0.3054,0.0,❌ No Normal
7,review_scores_checkin,187941,0.4487,0.0,❌ No Normal
8,review_scores_communication,188025,0.4497,0.0,❌ No Normal
9,review_scores_location,187937,0.4173,0.0,❌ No Normal
