In [None]:
# Minimal EDA prototype - with improvements
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. Load the data ---
# Read the CSV into `df`, the frame every later section works on.
# Only the read itself sits inside `try`, so a failure anywhere else can
# never be reported as a missing file.
try:
    df = pd.read_csv("your_dataset.csv")
except FileNotFoundError:
    print("Error: 'your_dataset.csv' not found. Please place the file in the same directory.")
    # Stop here: nothing below can run without `df`. SystemExit is raised
    # directly because `exit()` is a helper the `site` module adds for
    # interactive shells (absent under `python -S`), and a non-zero status
    # tells whoever launched the script that the run failed.
    raise SystemExit(1)
else:
    print("Data loaded successfully! 🎉")

# --- 2. Initial Data Overview ---
# Report the frame's dimensions, then preview its first rows.
row_count, column_count = df.shape
print("\n--- Data Overview ---")
print(f"Dataset has {row_count:,} rows and {column_count} columns.")
display(df.head())

# --- 3. Missing Values Analysis ---
# Count nulls per column, order the counts from most to fewest, and keep only
# the columns that actually have at least one null.
print("\n--- Missing Values ---")
missing_values = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda null_counts: null_counts > 0]
)
if missing_values.empty:
    print("No missing values found. Great! ✅")
else:
    print("Columns with missing values:")
    # Cap the listing at the 20 columns with the most nulls.
    display(missing_values.head(20))

# --- 4. Numeric Column Analysis ---
# Summarise the numeric columns and chart their pairwise correlations.
# `numeric_cols` is reused by the boxplot section further down.
print("\n--- Numeric Column Summary ---")
numeric_cols = df.select_dtypes(include=["number"])
if numeric_cols.empty:
    print("No numeric columns found.")
else:
    # Transposed so that each source column becomes one row of the summary.
    summary_table = numeric_cols.describe().transpose()
    display(summary_table)

    # --- Correlation Heatmap ---
    print("\n--- Correlation Heatmap ---")
    plt.figure(figsize=(10, 8))
    correlation_matrix = numeric_cols.corr()
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        linewidths=0.5,
    )
    plt.title("Correlation Matrix of Numeric Features")
    plt.show()

# --- 5. Categorical Column Analysis ---
# For every non-numeric column: show its 10 most frequent values (missing
# values are counted as their own entry) and, when the column has fewer than
# 50 distinct values, draw a bar chart of all of them.
print("\n--- Categorical Column Analysis ---")
categorical_cols = df.select_dtypes(exclude=['number'])
if not categorical_cols.empty:
    for col in categorical_cols.columns:
        print(f"\n--- Top 10 Value Counts for '{col}' ---")
        value_counts = df[col].value_counts(dropna=False)
        display(value_counts.head(10))

        # Bar chart for categorical data
        if len(value_counts) < 50:  # Only plot if not too many unique values
            plt.figure(figsize=(10, 6))
            # Plot against string labels rather than the raw index. The counts
            # include a NaN entry (dropna=False), and seaborn leaves null
            # category levels off the axis, so passing the raw index would
            # silently omit the missing-value bar that the table above shows.
            bar_labels = value_counts.index.astype(str)
            sns.barplot(x=bar_labels, y=value_counts.values)
            plt.title(f"Distribution of '{col}'")
            plt.xticks(rotation=45, ha='right')
            plt.show()
else:
    print("No categorical columns found.")

# --- 6. Outlier Detection with Boxplots ---
# Draw one wide, short boxplot per numeric column, each in its own figure.
print("\n--- Outlier Detection (Boxplots) ---")
if numeric_cols.empty:
    print("No numeric columns for outlier detection.")
else:
    for feature in numeric_cols.columns:
        plt.figure(figsize=(10, 2))
        sns.boxplot(x=df[feature])
        plt.title(f"Boxplot of '{feature}'")
        plt.show()