# 02 - Análisis Exploratorio de Datos (EDA)

Análisis profundo de los datos de Airbnb Madrid usando MongoDB y Pandas.

## Contenido
1. Carga de datos
2. Análisis de precios
3. Análisis geográfico
4. Análisis de reviews
5. Análisis de hosts
6. Correlaciones

In [None]:
# Imports
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

from src.crud_operations import AirbnbCRUD
from src.database import MongoDBConnection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration: global plotting style and pandas display limits for the
# rest of the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer
# matplotlib releases — confirm the pinned matplotlib version provides it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# None removes the column cap so wide DataFrames are shown without truncation;
# rows are capped at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("‚úÖ Imports completados")

## 1. Carga de Datos

In [None]:
# Connect and load the data into a DataFrame.
# NOTE(review): AirbnbCRUD and MongoDBConnection come from the project's src
# package; what their constructors do is not visible here. `conn` is not
# referenced again in the visible cells — confirm whether it is still needed.
crud = AirbnbCRUD()
conn = MongoDBConnection()

print("Cargando datos...")
# Materialise the cursor into a list so pandas can build the DataFrame from it
# (presumably one row per MongoDB document — verify against the collection).
listings = list(crud.collection.find())  # Load everything
df = pd.DataFrame(listings)

# Report the resulting shape: rows (listings) and columns.
print(f"‚úÖ Datos cargados: {df.shape[0]:,} listings con {df.shape[1]} columnas")

In [None]:
# General dataset overview.
print("üìä INFORMACI√ìN GENERAL:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() — doing so appends a stray "None" line.
df.info()

In [None]:
# Preview: render the first rows of the loaded DataFrame
# (five rows, which is also the head() default).
display(df.head(n=5))

## 2. Análisis de Precios

In [None]:
# Summary statistics for the price column (skipped when the column is absent).
if 'price' in df.columns:
    price = df['price']

    print("üí∞ ESTAD√çSTICAS DE PRECIO:\n")
    print(price.describe())
    print(f"\nRango: {price.min():.2f}‚Ç¨ - {price.max():.2f}‚Ç¨")
    print(f"Mediana: {price.median():.2f}‚Ç¨")

    # mode() returns an empty Series when the column has no non-null values,
    # so indexing the first element unconditionally would raise. Report the
    # mode only when one exists.
    modes = price.mode()
    if not modes.empty:
        print(f"Moda: {modes.iloc[0]:.2f}‚Ç¨")
    else:
        print("Moda: N/D")

In [None]:
# Price distribution: histogram and boxplot of the prices below 500,
# with the median of the full (unfiltered) price column marked on the histogram.
if 'price' in df.columns:
    precios_bajos = df.loc[df['price'] < 500, 'price']
    mediana = df['price'].median()

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    ax_hist, ax_box = axes

    # Left panel: histogram plus a dashed vertical line at the median
    ax_hist.hist(precios_bajos, bins=50, edgecolor='black')
    ax_hist.set_title('Distribuci√≥n de Precios (< 500‚Ç¨)', fontsize=14, fontweight='bold')
    ax_hist.set_xlabel('Precio (‚Ç¨)')
    ax_hist.set_ylabel('Frecuencia')
    ax_hist.axvline(
        mediana,
        color='red',
        linestyle='--',
        label=f'Mediana: {mediana:.2f}‚Ç¨',
    )
    ax_hist.legend()

    # Right panel: boxplot of the same filtered prices
    ax_box.boxplot(precios_bajos)
    ax_box.set_title('Boxplot de Precios (< 500‚Ç¨)', fontsize=14, fontweight='bold')
    ax_box.set_ylabel('Precio (‚Ç¨)')

    plt.tight_layout()
    plt.show()

In [None]:
# Price by room type: mean, median and listing count per type, shown as a
# table and as a bar chart of the mean.
required_cols = ('price', 'room_type')
if all(col in df.columns for col in required_cols):
    room_stats = df.groupby('room_type')['price'].agg(['mean', 'median', 'count'])
    # Round to two decimals, then order room types from highest to lowest mean
    precio_por_tipo = room_stats.round(2).sort_values('mean', ascending=False)

    print("\nüè† PRECIO POR TIPO DE HABITACI√ìN:\n")
    display(precio_por_tipo)

    # Bar chart of the mean price per room type
    fig, ax = plt.subplots(figsize=(10, 6))
    mean_prices = precio_por_tipo['mean']
    mean_prices.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
    ax.set_title('Precio Promedio por Tipo de Habitaci√≥n', fontsize=14, fontweight='bold')
    ax.set_xlabel('Tipo de Habitaci√≥n')
    ax.set_ylabel('Precio Promedio (‚Ç¨)')
    # Re-apply the existing tick labels rotated so long names do not overlap
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 3. Análisis Geográfico

In [None]:
# Listings per neighbourhood: the 15 most frequent values of
# 'neighbourhood_cleansed', printed with their share of all rows and plotted.
if 'neighbourhood_cleansed' in df.columns:
    conteo_barrios = df['neighbourhood_cleansed'].value_counts()
    barrios = conteo_barrios.head(15)
    total_listings = len(df)

    print("üìç TOP 15 BARRIOS CON M√ÅS LISTINGS:\n")
    for i, (barrio, count) in enumerate(barrios.items(), start=1):
        # Share is relative to every row in df, not only to the top 15
        pct = count / total_listings * 100
        print(f"{i:2d}. {barrio:30s}: {count:5,} ({pct:5.2f}%)")

    # Horizontal bar chart of the same counts
    fig, ax = plt.subplots(figsize=(12, 6))
    barrios.plot(kind='barh', ax=ax, color='coral', edgecolor='black')
    ax.set_title('Top 15 Barrios con M√°s Listings', fontsize=14, fontweight='bold')
    ax.set_xlabel('N√∫mero de Listings')
    ax.set_ylabel('Barrio')
    plt.tight_layout()
    plt.show()

In [None]:
# Mean price per neighbourhood: the 15 neighbourhoods with the highest mean,
# printed as a ranking and plotted.
if 'neighbourhood_cleansed' in df.columns and 'price' in df.columns:
    media_por_barrio = df.groupby('neighbourhood_cleansed')['price'].mean()
    precio_barrio = media_por_barrio.sort_values(ascending=False).head(15)

    print("\nüí∞ TOP 15 BARRIOS M√ÅS CAROS:\n")
    for i, (barrio, precio) in enumerate(precio_barrio.items(), start=1):
        print(f"{i:2d}. {barrio:30s}: {precio:7.2f}‚Ç¨")

    # Horizontal bar chart of the same ranking
    fig, ax = plt.subplots(figsize=(12, 6))
    precio_barrio.plot(kind='barh', ax=ax, color='gold', edgecolor='black')
    ax.set_title('Top 15 Barrios M√°s Caros (Precio Promedio)', fontsize=14, fontweight='bold')
    ax.set_xlabel('Precio Promedio (‚Ç¨)')
    ax.set_ylabel('Barrio')
    plt.tight_layout()
    plt.show()

## 4. Análisis de Reviews

In [None]:
# Review-count statistics, plus histogram and boxplot of the listings with
# fewer than 100 reviews.
if 'number_of_reviews' in df.columns:
    print("‚≠ê ESTAD√çSTICAS DE REVIEWS:\n")
    print(df['number_of_reviews'].describe())

    # Only the rows below 100 reviews are plotted
    pocas_reviews = df.loc[df['number_of_reviews'] < 100, 'number_of_reviews']

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    ax_hist, ax_box = axes

    # Left panel: histogram
    ax_hist.hist(pocas_reviews, bins=50, edgecolor='black', color='lightgreen')
    ax_hist.set_title('Distribuci√≥n de Reviews (< 100)', fontsize=14, fontweight='bold')
    ax_hist.set_xlabel('N√∫mero de Reviews')
    ax_hist.set_ylabel('Frecuencia')

    # Right panel: boxplot of the same filtered values
    ax_box.boxplot(pocas_reviews)
    ax_box.set_title('Boxplot de Reviews (< 100)', fontsize=14, fontweight='bold')
    ax_box.set_ylabel('N√∫mero de Reviews')

    plt.tight_layout()
    plt.show()

In [None]:
# Price vs. number of reviews: scatter plot on a trimmed subset, then the
# correlation coefficient computed on the full (untrimmed) data.
if 'number_of_reviews' in df.columns and 'price' in df.columns:
    # Trim extreme values so the scatter plot stays readable
    dentro_de_rango = (df['price'] < 300) & (df['number_of_reviews'] < 200)
    df_filtered = df[dentro_de_rango]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(
        df_filtered['number_of_reviews'],
        df_filtered['price'],
        alpha=0.5,
    )
    ax.set_title('Relaci√≥n: N√∫mero de Reviews vs Precio', fontsize=14, fontweight='bold')
    ax.set_xlabel('N√∫mero de Reviews')
    ax.set_ylabel('Precio (‚Ç¨)')
    plt.tight_layout()
    plt.show()

    # Off-diagonal cell of the 2x2 correlation matrix = price/reviews correlation
    matriz_corr = df[['price', 'number_of_reviews']].corr()
    corr = matriz_corr.iloc[0, 1]
    print(f"\nüìä Correlaci√≥n precio-reviews: {corr:.3f}")

## 5. Análisis de Hosts

In [None]:
# Hosts with the most listings (top 15): printed ranking plus a bar chart.
if 'host_id' in df.columns and 'host_name' in df.columns:
    # Group on (host_id, host_name) so two hosts sharing a name are counted
    # separately; size() gives the number of rows (listings) per host.
    host_listings = df.groupby(['host_id', 'host_name']).size().sort_values(ascending=False).head(15)

    print("üë§ TOP 15 HOSTS CON M√ÅS LISTINGS:\n")
    for i, ((host_id, host_name), count) in enumerate(host_listings.items(), 1):
        print(f"{i:2d}. {host_name:30s}: {count:3d} listings")

    # Chart
    fig, ax = plt.subplots(figsize=(12, 6))
    # groupby(...).size() returns an *unnamed* Series, so reset_index() labels
    # the counts column 0 rather than 'size' and indexing ['size'] raised
    # KeyError. Dropping the (host_id, host_name) index keeps the counts as a
    # Series positioned 0..n-1, which is what the bar plot needs.
    host_counts = host_listings.reset_index(drop=True)
    host_counts.plot(kind='bar', ax=ax, color='mediumpurple', edgecolor='black')
    ax.set_title('Top 15 Hosts con M√°s Listings', fontsize=14, fontweight='bold')
    ax.set_xlabel('Host')
    ax.set_ylabel('N√∫mero de Listings')
    # Label each bar with the host name (second index level), cut to 20 chars
    ax.set_xticklabels([h[1][:20] for h in host_listings.index], rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 6. Matriz de Correlación

In [None]:
# Select the numeric columns of interest that are actually present in df
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
cols_importantes = [
    'price',
    'number_of_reviews',
    'accommodates',
    'bedrooms',
    'beds',
    'availability_30',
    'availability_365',
]
cols_disponibles = [nombre for nombre in cols_importantes if nombre in numeric_cols]

# A correlation matrix needs at least two columns to be meaningful
if len(cols_disponibles) > 1:
    # Pairwise correlation between the selected columns
    corr_matrix = df[cols_disponibles].corr()

    # Annotated heatmap with the colour scale centred on zero
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        ax=ax,
        cbar_kws={"shrink": 0.8},
    )
    ax.set_title('Matriz de Correlaci√≥n', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 🎉 Conclusiones

En este notebook realizamos:

- ✅ Análisis de distribución de precios
- ✅ Análisis geográfico por barrios
- ✅ Análisis de reviews
- ✅ Análisis de hosts
- ✅ Correlaciones entre variables

**Siguiente:** `03_visualizations.ipynb` - Visualizaciones avanzadas