# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV into a DataFrame. The path is hard-coded; point it at the
# actual file location before running.
file_path = "/content/3b01bcb8-0b14-4abf-b6f2-c1bfd384ba69.csv"
df = pd.read_csv(file_path)

# Force the three pollutant reading columns to a numeric dtype. With
# errors="coerce", any entry that cannot be parsed becomes NaN instead of
# raising.
for _pollutant_col in ("pollutant_min", "pollutant_max", "pollutant_avg"):
    df[_pollutant_col] = pd.to_numeric(df[_pollutant_col], errors="coerce")

# Parse the 'last_update' column as datetimes; unparseable entries become NaT.
# NOTE(review): no explicit format/dayfirst is passed, so ambiguous day/month
# strings follow pandas' default inference — confirm against the data.
df["last_update"] = pd.to_datetime(df["last_update"], errors="coerce")

# Impute missing numeric values with each column's mean. Passing a Series of
# per-column means to DataFrame.fillna fills only the columns named in it.
numeric_means = df.select_dtypes(include=['number']).mean()
df.fillna(numeric_means, inplace=True)

# Impute missing categorical (object-dtype) values with each column's mode.
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():  # Only touch columns that actually have gaps
        col_modes = df[col].mode()
        # mode() returns an empty Series when the column is entirely NaN;
        # there is nothing sensible to fill with, so leave such columns alone
        # rather than raising on the first-element lookup.
        if not col_modes.empty:
            # Assign the result back to the frame. Calling
            # `df[col].fillna(..., inplace=True)` operates on a temporary
            # Series: it raises a FutureWarning in pandas 2.x and silently
            # leaves `df` unchanged once Copy-on-Write is enabled.
            df[col] = df[col].fillna(col_modes.iloc[0])

# Cap outliers in every numeric column: values below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR are pulled back to the nearest of those two bounds (the rows
# themselves are kept).
for col in df.select_dtypes(include=['number']).columns:
    # Both quartiles in one call; unpacking the result yields the two values.
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Normalise place names: underscores in state names become spaces, and both
# state and city names are converted to title case.
df = df.assign(
    state=df['state'].str.replace('_', ' ').str.title(),
    city=df['city'].str.title(),
)

# Remove rows that are exact duplicates of an earlier row. This runs after the
# name clean-up, so rows that differed only in name formatting collapse too.
df = df.drop_duplicates()

### Additional EDA & Visualizations ###


# 2) Top 10 most polluted cities, ranked by the mean of 'pollutant_avg' per city.
top_cities = df.groupby('city')['pollutant_avg'].mean().nlargest(10)
plt.figure(figsize=(12, 6))
# Colour the bars through `hue` instead of a bare `palette`: seaborn >= 0.13
# deprecates `palette` without `hue` (scheduled for removal in 0.14).
# dodge=False keeps each bar full-width and centred on older seaborn releases.
ax = sns.barplot(
    x=top_cities.index,
    y=top_cities.values,
    hue=top_cities.index,
    palette="Reds_r",
    dodge=False,
)
# A hue legend would only repeat the x tick labels, so drop it when one exists
# (whether seaborn draws it depends on the installed version).
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Top 10 Most Polluted Cities (Avg Pollutant Level)")
plt.xlabel("City")
plt.ylabel("Average Pollution Level")
plt.xticks(rotation=45)
plt.show()

# 3) Top 10 least polluted cities, ranked by the mean of 'pollutant_avg' per city.
least_cities = df.groupby('city')['pollutant_avg'].mean().nsmallest(10)
plt.figure(figsize=(12, 6))
# Colour the bars through `hue` instead of a bare `palette`: seaborn >= 0.13
# deprecates `palette` without `hue` (scheduled for removal in 0.14).
# dodge=False keeps each bar full-width and centred on older seaborn releases.
ax = sns.barplot(
    x=least_cities.index,
    y=least_cities.values,
    hue=least_cities.index,
    palette="Blues_r",
    dodge=False,
)
# A hue legend would only repeat the x tick labels, so drop it when one exists
# (whether seaborn draws it depends on the installed version).
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Top 10 Least Polluted Cities (Avg Pollutant Level)")
plt.xlabel("City")
plt.ylabel("Average Pollution Level")
plt.xticks(rotation=45)
plt.show()

# 4) Box plots of 'pollutant_avg' for each pollutant type, with one box per
# state inside every pollutant group.
plt.figure(figsize=(14, 7))
sns.boxplot(data=df, x='pollutant_id', y='pollutant_avg', hue='state')
plt.title("Pollutant Distribution Across States")
plt.xlabel("Pollutant Type")
plt.ylabel("Average Level")
plt.xticks(rotation=45)
# Anchor the legend's upper-left corner just past the right edge of the axes
# so the state list does not cover the boxes.
plt.legend(title="State", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# 5) Pairwise scatter plots of the min / max / avg reading columns, with a
# kernel density estimate of each column on the diagonal.
pollutant_levels = df[['pollutant_min', 'pollutant_max', 'pollutant_avg']]
sns.pairplot(pollutant_levels, diag_kind='kde')
plt.show()

# 6) Violin plot showing how 'pollutant_avg' is spread for each pollutant type.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; revisit this call before moving to seaborn 0.14.
plt.figure(figsize=(12, 6))
sns.violinplot(data=df, x="pollutant_id", y="pollutant_avg", palette="Set2")
plt.title("Distribution of Pollutant Levels")
plt.xlabel("Pollutant Type")
plt.ylabel("Average Pollution Level")
plt.xticks(rotation=45)
plt.show()

# 7) Mean 'pollutant_avg' per state, sorted from highest to lowest.
state_pollution = df.groupby('state')['pollutant_avg'].mean().sort_values(ascending=False)
plt.figure(figsize=(12, 6))
# Colour the bars through `hue` instead of a bare `palette`: seaborn >= 0.13
# deprecates `palette` without `hue` (scheduled for removal in 0.14).
# dodge=False keeps each bar full-width and centred on older seaborn releases.
ax = sns.barplot(
    x=state_pollution.index,
    y=state_pollution.values,
    hue=state_pollution.index,
    palette="coolwarm",
    dodge=False,
)
# A hue legend would only repeat the x tick labels, so drop it when one exists
# (whether seaborn draws it depends on the installed version).
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.xticks(rotation=90)
plt.title("Average Pollution Levels by State")
plt.xlabel("State")
plt.ylabel("Average Pollutant Level")
plt.show()

# 8) Heatmap of the pairwise correlations (DataFrame.corr defaults) between
# the min / max / avg reading columns.
level_columns = ['pollutant_min', 'pollutant_max', 'pollutant_avg']
correlation_matrix = df[level_columns].corr()
plt.figure(figsize=(8, 5))
sns.heatmap(
    correlation_matrix,
    annot=True,       # write the coefficient inside every cell
    fmt=".2f",        # two decimal places for the annotations
    cmap="coolwarm",
    linewidths=0.5,   # thin separator lines between cells
)
plt.title("Correlation Between Pollutant Levels")
plt.show()
