# Data Exploration and Analysis

Comprehensive exploratory data analysis (EDA) for your dataset.


In [None]:
# Load the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

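# Load your CSV: uncomment the line below and point it at your file.
# Every cell that follows expects the data in a DataFrame named `df`.
# df = pd.read_csv('your_file.csv')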

# Report the frame's dimensions and its column labels.
# `df` must already exist in the session (see the loading step above).
row_count, column_count = df.shape

print("Dataset Overview")
print(f"Shape: {row_count} rows × {column_count} columns")
print(f"\nColumn Names:\n{df.columns.tolist()}")


In [None]:
# Display first and last few rows
# Each entry pairs a heading with the bound DataFrame method that produces the
# preview; the method is only called right before its result is displayed, so
# the print/display order matches a straight-line version of this cell.
for heading, take_rows in (
    ("=== First 5 Rows ===", df.head),
    ("\n=== Last 5 Rows ===", df.tail),
):
    print(heading)
    display(take_rows())


In [None]:
# Statistical summary
# Prints a heading, then evaluates `df.describe()` as the cell's final
# expression.
# NOTE(review): the result is never printed or displayed explicitly — this
# presumably relies on the notebook rendering a cell's last expression; if the
# cell is ever run as a plain script the summary would be discarded. Confirm.
print("=== Statistical Summary ===")
df.describe()


In [None]:
# Check for missing values
print("=== Missing Values ===")

# Per-column null count, and that count as a percentage of all rows.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

# One row per column; keep only columns that actually contain nulls and put
# the worst offenders first.
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
missing_df = missing_df.loc[missing_df['Missing Count'].gt(0)]
missing_df = missing_df.sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("No missing values found!")
else:
    print(missing_df)

    # Visualize missing values as a bar chart, one bar per affected column.
    plt.figure(figsize=(10, 6))
    missing_df['Missing Count'].plot.bar()
    plt.title('Missing Values by Column')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation analysis
# Only numeric columns take part in the correlation matrix.
numeric_cols = df.select_dtypes(include=[np.number]).columns

if len(numeric_cols) <= 1:
    # A correlation matrix needs at least two numeric columns.
    print("Not enough numeric columns for correlation analysis")
else:
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        df[numeric_cols].corr(),
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
    )
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()
