# Air Quality EDA

**Dataset:** `data/raw/air_quality_clean.csv`

Objectifs :
- Inspecter la structure, les plages de valeurs et les données manquantes.
- Comparer les niveaux de polluants entre villes/pays.
- Explorer les corrélations entre polluants et variables météo.
- Observer les tendances temporelles sur 2023.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Global display and plotting defaults: two-decimal floats in pandas output,
# seaborn's white-grid matplotlib style, and the larger "talk" font scaling.
pd.set_option("display.float_format", "{:.2f}".format)
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_context("talk")

# Load the dataset and convert "Date" to datetimes in the same step; values
# that cannot be parsed become NaT (errors="coerce") instead of raising.
RAW_PATH = Path("data/raw/air_quality_clean.csv")
df = pd.read_csv(RAW_PATH).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], errors="coerce")
)
df.head()

In [None]:
# Quick overview of the dataset: its dimensions, summary statistics for every
# column (numeric or not), and the fraction of missing values per column,
# sorted from most to least incomplete.
print("Shape:", df.shape)

summary_stats = df.describe(include="all")
display(summary_stats)

missing_share = df.isna().mean().sort_values(ascending=False)
display(missing_share.to_frame("missing_ratio"))

In [None]:
# Correlation matrix of the numeric columns (DataFrame.corr default method),
# drawn as a heatmap.
num_cols = df.select_dtypes(include=["number"]).columns
plt.figure(figsize=(10, 7))
# Pin the colour scale to the full correlation range [-1, 1]. Left to the
# data's own min/max, the neutral midpoint of the diverging "coolwarm" map
# would not sit at zero correlation (the diagonal of 1.0 skews the range),
# which makes weak positive correlations look negative.
sns.heatmap(
    df[num_cols].corr(),
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    annot=False,
)
plt.title("Corrélations des variables numériques")
plt.tight_layout()
plt.show()

In [None]:
# PM2.5 by country: box plots for the ten countries with the highest median
# PM2.5.
top_countries = df.groupby("Country")['PM2.5'].median().sort_values(ascending=False).head(10).index
plt.figure(figsize=(10, 6))
# Pass the ranking as `order` so the boxes are drawn from highest to lowest
# median, which is what the title announces. Without it seaborn chooses the
# category order itself and the computed ranking is not visible on the x-axis.
sns.boxplot(
    data=df[df['Country'].isin(top_countries)],
    x='Country',
    y='PM2.5',
    order=list(top_countries),
)
plt.xticks(rotation=45, ha='right')
plt.title("PM2.5 par pays (Top 10 par médiane)")
plt.tight_layout()
plt.show()

In [None]:
# Time trends: PM2.5 over time for the five cities with the most rows in the
# dataset, one line colour per city.
city_counts = df['City'].value_counts()
top_cities = city_counts.index[:5]
in_top_cities = df['City'].isin(top_cities)
tmp = df.loc[in_top_cities].sort_values('Date')

plt.figure(figsize=(12, 6))
sns.lineplot(x='Date', y='PM2.5', hue='City', data=tmp)
plt.title("Évolution PM2.5 (Top 5 villes)")
plt.tight_layout()
plt.show()

In [None]:
# Pollutants vs. weather, one example: PM2.5 against humidity. Points are
# coloured by country; the legend is turned off (legend=False).
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x='Humidity',
    y='PM2.5',
    hue='Country',
    alpha=0.6,
    legend=False,
    ax=ax,
)
ax.set_title("PM2.5 vs Humidité")
fig.tight_layout()
plt.show()