In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the raw calendar, listings and reviews CSV exports, in that order
cal_df, lis_df, rev_df = map(
    pd.read_csv,
    (
        'data/raw/calendar2024.csv',
        'data/raw/listings2024.csv',
        'data/raw/reviews2024.csv',
    ),
)

## Initial EDA

In [None]:
# Preview the first rows of the listings data
lis_df.head()

In [None]:
# Preview the first rows of the calendar data
cal_df.head()

In [None]:
# Preview the first rows of the reviews data
rev_df.head()

In [12]:
# Initial exploration functions
def explore_dataset(df, name):
    """Print a four-part overview of a DataFrame and return its missing-value table.

    The report covers, in order: the frame's shape, its column dtypes, the
    columns that contain at least one null, and the first rows of the data.

    Args:
        df: DataFrame to summarise.
        name: Label printed in the report header.

    Returns:
        DataFrame indexed by column name with two columns: 'Missing Values'
        (null count) and 'Percentage' (null count as a percentage of the row
        count). Every column of ``df`` is included, not only those with nulls.
    """
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"Dataset: {name}")
    print(f"{rule}")

    print("\n1. Basic Information:")
    print(f"Shape: {df.shape}")

    print("\n2. Data Types:")
    print(df.dtypes)

    print("\n3. Missing Values:")
    null_counts = df.isna().sum()
    missing_info = pd.DataFrame({
        'Missing Values': null_counts,
        'Percentage': null_counts / len(df) * 100,
    })
    # Only columns that actually contain nulls are printed; the full table is returned.
    has_nulls = missing_info['Missing Values'] > 0
    print(missing_info.loc[has_nulls])

    print("\n4. Sample Data:")
    print(df.head())

    return missing_info

In [None]:
# Print the overview for the listings data and keep its missing-value table
listings_missing = explore_dataset(lis_df, 'Listings')

In [None]:
# Print the overview for the calendar data and keep its missing-value table
calendar_missing = explore_dataset(cal_df, 'Calendar')

In [None]:
# Print the overview for the reviews data and keep its missing-value table
reviews_missing = explore_dataset(rev_df, 'Reviews')

In [None]:
# Basic visualizations for initial insights
def plot_missing_values(missing_info, title):
    """Draw a bar chart of the percentage of missing values per column.

    Args:
        missing_info: DataFrame with 'Missing Values' and 'Percentage'
            columns, indexed by column name (the table returned by
            ``explore_dataset``).
        title: Dataset label inserted into the chart title.

    Returns:
        None. The chart is displayed with ``plt.show()``. When no column has
        missing values, a short message is printed instead of a chart.
    """
    pct_missing = missing_info.loc[missing_info['Missing Values'] > 0, 'Percentage']

    # Check before creating a figure: a bar plot of an empty Series fails in
    # pandas, so a dataset without nulls would otherwise abort the cell and
    # leave a blank figure behind.
    if pct_missing.empty:
        print(f"No missing values in {title} dataset - nothing to plot.")
        return

    plt.figure(figsize=(12, 6))
    pct_missing.plot(kind='bar')
    plt.title(f'Missing Values in {title} Dataset')
    plt.xlabel('Columns')
    plt.ylabel('Percentage Missing')
    # Column names can be long; slant them so neighbouring labels do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Draw one missing-value chart per dataset, in the same order as before
for missing_table, dataset_label in (
    (listings_missing, 'Listings'),
    (calendar_missing, 'Calendar'),
    (reviews_missing, 'Reviews'),
):
    plot_missing_values(missing_table, dataset_label)

In [None]:
# Get columns with less than 3 unique values
def few_unique(df, threshold=3):
    """Print and return the columns that have fewer than ``threshold`` unique values.

    Nulls are not counted as a value (pandas' default for ``nunique``), so an
    all-null column has zero unique values.

    Args:
        df: DataFrame to inspect.
        threshold: Exclusive upper bound on the number of unique values.

    Returns:
        List of matching column names, in the order they appear in ``df``.
    """
    # Count once for the whole frame instead of calling nunique() twice per
    # column (once to filter, once to print).
    unique_counts = df.nunique()
    low_cardinality = unique_counts[unique_counts < threshold]

    print(f"Columns with less than {threshold} unique values:")
    for col, n_unique in low_cardinality.items():
        print(f"{col}: {n_unique}")
    return low_cardinality.index.tolist()

# List the listings columns that have fewer than three distinct values
unique_cols = few_unique(lis_df)

## Cleaning

In [6]:
# Remove columns from the three frames. Each drop is a reassignment with
# errors='ignore', so re-running this cell after the columns are already gone
# does not raise a KeyError. Note that a column missing from the raw data is
# therefore skipped silently as well.
cal_df = cal_df.drop(columns=['adjusted_price'], errors='ignore')

# Listings columns in which every value is null
null_cols_lis = lis_df.columns[lis_df.isna().all()].tolist()
lis_df = lis_df.drop(columns=null_cols_lis)

lis_df = lis_df.drop(columns=['scrape_id'], errors='ignore')

rev_df = rev_df.drop(columns=['reviewer_name'], errors='ignore')

In [None]:
# Strip '$' and ',' from the price strings and rename the column to 'price($)'.
# Both steps run only while the original 'price' column still exists, so
# re-running this cell after the rename does not raise a KeyError.
if 'price' in cal_df.columns:
    cal_df['price'] = cal_df['price'].str.replace(r'[\$,]', '', regex=True)
    cal_df = cal_df.rename(columns={'price': 'price($)'})

# Convert to a numeric dtype; values that cannot be parsed become NaN.
cal_df['price($)'] = pd.to_numeric(cal_df['price($)'], errors='coerce')

# Turn the 't' string flags into real booleans. A column that is already
# boolean is left alone: comparing True/False against 't' on a re-run would
# silently set every value to False.
if not pd.api.types.is_bool_dtype(cal_df['available']):
    cal_df['available'] = cal_df['available'] == 't'
if not pd.api.types.is_bool_dtype(lis_df['instant_bookable']):
    lis_df['instant_bookable'] = lis_df['instant_bookable'] == 't'

# Parse the date-like columns of each frame into datetime values
for frame, date_col in (
    (cal_df, 'date'),
    (rev_df, 'date'),
    (lis_df, 'last_scraped'),
    (lis_df, 'host_since'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

# astype returns a new Series rather than modifying the column, so the result
# has to be assigned back for the conversion to the "string" dtype to take effect.
rev_df['comments'] = rev_df['comments'].astype("string")

# Print a summary of the listings frame after the conversions above
lis_df.info()
