# Preliminary Exploratory Data Analysis

### 1. Setup and Data Import

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
# Load the three raw 2024 CSV exports (calendar, listings, reviews), in that
# order, into the DataFrames used throughout the rest of the notebook.
cal_df, lis_df, rev_df = map(
    pd.read_csv,
    (
        'data/raw/calendar2024.csv',
        'data/raw/listings2024.csv',
        'data/raw/reviews2024.csv',
    ),
)

### 2. Initial Data Preview

In [None]:
# Preview the first rows of the listings table.
lis_df.head()

In [None]:
# Preview the first rows of the calendar table.
cal_df.head()

In [None]:
# Preview the first rows of the reviews table.
rev_df.head()

### 3. Data Structure Analysis

In [61]:
def explore_dataset(df, name):
    """Print a structural overview of ``df`` and return its missing-value table.

    The printed report has five numbered sections: shape, column dtypes,
    ``describe()`` statistics, the values of every column that has fewer
    than three distinct non-null values, and the columns that contain nulls.

    Args:
        df: DataFrame to summarise.
        name: Label shown in the report banner.

    Returns:
        DataFrame indexed by column name with a 'Missing Values' count and
        a 'Percentage' column (0-100 scale). It contains every column of
        ``df``, including those without missing values.
    """
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"Dataset: {name}")
    print(rule)

    print("\n1. Basic Information:")
    print(f"Shape: {df.shape}")

    print("\n2. Data Types:")
    print(df.dtypes)

    print("\n3. Summary Statistics:")
    print(df.describe())

    print("\n4. Unique Values:")
    for column in df.columns:
        distinct_count = df[column].nunique()
        if distinct_count >= 3:
            continue
        # Near-constant column: few enough values to list them all.
        print(f"{column}: {distinct_count} unique values")
        print(f"Values: {df[column].unique()}\n")

    print("\n5. Missing Values:")
    # TODO(Elias): add unique values + here (original note; scope unclear)
    null_counts = df.isna().sum()
    missing_info = null_counts.to_frame(name='Missing Values')
    missing_info['Percentage'] = null_counts / len(df) * 100
    # Only columns that actually have gaps are printed; the full table is returned.
    print(missing_info.loc[missing_info['Missing Values'] > 0])

    return missing_info

In [None]:
# Print the exploration report for the listings table and keep its
# per-column missing-value summary for the plots in section 4.
listings_missing = explore_dataset(lis_df, 'Listings')

In [None]:
# Print the exploration report for the calendar table and keep its
# per-column missing-value summary for the plots in section 4.
calendar_missing = explore_dataset(cal_df, 'Calendar')

In [None]:
# Print the exploration report for the reviews table and keep its
# per-column missing-value summary for the plots in section 4.
reviews_missing = explore_dataset(rev_df, 'Reviews')

### 4. Missing Values Visualization

In [None]:
def plot_missing_values(missing_info, title, min_pct=0.001):
    """Draw a bar chart of the percentage of missing values per column.

    Each bar is annotated with the absolute number of missing values.

    Args:
        missing_info: DataFrame indexed by column name with a
            'Missing Values' count column and a 'Percentage' column, in
            the shape returned by ``explore_dataset``.
        title: Dataset name inserted into the chart title.
        min_pct: Only columns whose 'Percentage' is strictly greater than
            this value are plotted. 'Percentage' is on a 0-100 scale, so
            the default of 0.001 means 0.001 %; pass 0.1 for a 0.1 % cut-off.

    Returns:
        None. The figure is shown with ``plt.show()``; when no column passes
        the threshold a short message is printed and nothing is drawn.
    """
    missing_filtered = missing_info[missing_info['Percentage'] > min_pct]

    # A dataset without (enough) missing values leaves nothing to draw;
    # bail out before plotting rather than failing on an empty bar chart.
    if missing_filtered.empty:
        print(f"No columns in {title} have more than {min_pct}% missing values - nothing to plot.")
        return

    plt.figure(figsize=(18, 10))
    ax = missing_filtered['Percentage'].plot(kind='bar')
    plt.title(f'Missing Values in {title} Dataset')
    plt.xlabel('Columns')
    plt.ylabel('Percentage Missing')
    plt.xticks(rotation=45, ha='right')

    # Label each bar with its absolute missing count, anchored at the bar top
    # and rotated 45 degrees so neighbouring labels do not collide.
    bars = zip(missing_filtered['Missing Values'], missing_filtered['Percentage'])
    for position, (count, pct) in enumerate(bars):
        ax.text(position, pct, f'{int(count):,}',
                ha='left', va='bottom', fontsize=8, rotation=45)

    plt.tight_layout()
    plt.show()

# Plot missing values for each dataset, in the order listings, calendar, reviews.
for missing_table, dataset_label in (
    (listings_missing, 'Listings'),
    (calendar_missing, 'Calendar'),
    (reviews_missing, 'Reviews'),
):
    plot_missing_values(missing_table, dataset_label)

### 5. Data Quality Assessment

In [66]:
def identify_data_quality_issues(df, name):
    """Print a data-quality report for ``df``.

    The report covers four checks; each heading is printed only when the
    check has something to report:

    1. Exact duplicate rows.
    2. Zeros and negative values in int64/float64 columns.
    3. Empty strings and whitespace-only strings in object columns.
    4. Values more than three standard deviations from the column mean.

    Args:
        df: DataFrame to inspect.
        name: Label shown in the report banner.

    Returns:
        None. The report is written to stdout.
    """
    n_rows = len(df)
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Data Quality Report for {name}")
    print(banner)

    # 1. Check for duplicates
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        print(f"\nDuplicate rows: {n_duplicates}")

    # 2. Check for unexpected values (zeros / negatives) in numeric columns
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    has_unexpected = False
    for col in numeric_cols:
        n_zeros = (df[col] == 0).sum()
        n_negative = (df[col] < 0).sum()
        if n_zeros == 0 and n_negative == 0:
            continue
        if not has_unexpected:
            print("\nColumns with unexpected values:")
            has_unexpected = True
        print(f"\n{col}:")
        if n_zeros > 0:
            print(f"- Zeros: {n_zeros} ({n_zeros / n_rows * 100:.2f}%)")
        if n_negative > 0:
            print(f"- Negative values: {n_negative} ({n_negative / n_rows * 100:.2f}%)")

    # 3. Check string columns for data inconsistencies
    string_cols = df.select_dtypes(include=['object']).columns
    has_inconsistencies = False
    for col in string_cols:
        n_empty = (df[col] == '').sum()
        # An object column may hold non-string values only (for example
        # True/False/NaN), on which the ``.str`` accessor raises. Testing each
        # value's type keeps the check safe; non-strings never count.
        n_whitespace = df[col].map(
            lambda value: isinstance(value, str) and value.isspace()
        ).sum()
        if n_empty == 0 and n_whitespace == 0:
            continue
        if not has_inconsistencies:
            print("\nColumns with inconsistencies:")
            has_inconsistencies = True
        print(f"\n{col}:")
        if n_empty > 0:
            print(f"- Empty strings: {n_empty}")
        if n_whitespace > 0:
            print(f"- Whitespace only: {n_whitespace}")

    # 4. Check for extreme values in numeric columns
    has_outliers = False
    for col in numeric_cols:
        mean = df[col].mean()
        std = df[col].std()
        outliers = df[col][abs(df[col] - mean) > 3 * std]
        if len(outliers) == 0:
            continue
        if not has_outliers:
            print("\nColumns with outliers (beyond 3 std devs):")
            has_outliers = True
        print(f"\n{col}:")
        print(f"- Number of outliers: {len(outliers)}")
        print(f"- Min outlier: {outliers.min()}")
        print(f"- Max outlier: {outliers.max()}")

In [None]:
# Check Listings dataset: print its data-quality report (duplicates,
# zero/negative values, empty or whitespace-only strings, 3-sigma outliers).
identify_data_quality_issues(lis_df, 'Listings')

In [None]:
# Check Calendar dataset
identify_data_quality_issues(cal_df, 'Calendar')

# Additional calendar-specific checks. The 'date' column is converted to
# datetime in place, so later cells see timestamps rather than raw strings.
print("\nChecking calendar date patterns:")
cal_df['date'] = pd.to_datetime(cal_df['date'])
cal_dates = cal_df['date']
cal_first_day, cal_last_day = cal_dates.min(), cal_dates.max()
print(f"Date range: {cal_first_day} to {cal_last_day}")
print(f"Missing dates: {cal_dates.isnull().sum()}")
print(f"Days between min and max date: {(cal_last_day - cal_first_day).days}")

In [None]:
# Check Reviews dataset: print its data-quality report (duplicates,
# zero/negative values, empty or whitespace-only strings, 3-sigma outliers).
identify_data_quality_issues(rev_df, 'Reviews')

### 6. Data Format Analysis

In [70]:
def check_data_formats(df, n_samples=2):
    """Print a few example values, with their Python types, for every column.

    For each column the first ``n_samples`` distinct non-null values (in
    order of appearance) are printed together with ``type(value)``. This is
    a spot check of how values are stored: only a sample is inspected, so
    inconsistent formats further down a column are not detected.

    Args:
        df: DataFrame to inspect.
        n_samples: Maximum number of distinct values shown per column.

    Returns:
        None. The samples are written to stdout.
    """
    for col in df.columns:
        # Nulls are dropped first so the samples always show real values.
        unique_samples = df[col].dropna().unique()[:n_samples]
        print(f"\n{col}:")
        for sample in unique_samples:
            print(f"Value: {sample}, Type: {type(sample)}")


In [None]:
# Print sample values and their Python types for each listings column.
check_data_formats(lis_df)

In [None]:
# Print sample values and their Python types for each calendar column.
check_data_formats(cal_df)

In [None]:
# Print sample values and their Python types for each reviews column.
check_data_formats(rev_df)

### 7. Special Characters Examination

In [74]:
def check_special_characters(df, pattern=r'[^a-zA-Z0-9\s\-.,:/+&æøåÆØÅ]'):
    """Report object columns containing characters that might need handling.

    With the default pattern, any character other than ASCII letters,
    digits, whitespace, the punctuation ``- . , : / + &`` and the letters
    æ, ø, å (either case) counts as special. Values are converted to text
    before matching, so non-string entries are tested by their string form.

    Args:
        df: DataFrame to inspect; only object-dtype columns are checked.
        pattern: Regular expression that matches a "special" character.

    Returns:
        None. For every affected column a notice and the first matching
        values (``head()``) are written to stdout.
    """
    string_cols = df.select_dtypes(include=['object']).columns
    for col in string_cols:
        # Scan the column once and reuse the mask for both the yes/no test
        # and the example rows; text columns can be large.
        has_special = df[col].astype(str).str.contains(pattern)
        if has_special.any():
            print(f"\n{col} contains special characters")
            # Show examples of rows containing special characters
            print(df.loc[has_special, col].head())

In [None]:
# Report listings text columns that contain special characters.
check_special_characters(lis_df)


In [None]:
# Report calendar text columns that contain special characters.
check_special_characters(cal_df)

In [None]:
# Report reviews text columns that contain special characters.
check_special_characters(rev_df)