# Environmental Monitoring & Pollution Control using AI/ML

## 1. Data Collection

In [1]:
import pandas as pd

# Load the Bangalore air-quality dataset from the CSV next to the notebook
df = pd.read_csv("Bangalore_AQI_Dataset.csv")

# Quick look at the first rows
print(df.head())

# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() emits a stray "None".
df.info()


        City        Date  AQI  PM2.5   PM10    NO2    SO2    CO    O3
0  Bangalore  01-01-2018   68   37.4  73.44  56.44  77.52  0.71  64.6
1  Bangalore  02-01-2018   76   41.8  82.08  63.08  86.64  0.80  72.2
2  Bangalore  03-01-2018   70   38.5  75.60  58.10  79.80  0.74  66.5
3  Bangalore  04-01-2018   78   42.9  84.24  64.74  88.92  0.82  74.1
4  Bangalore  05-01-2018   82   45.1  88.56  68.06  93.48  0.86  77.9
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   City    2556 non-null   object 
 1   Date    2556 non-null   object 
 2   AQI     2556 non-null   int64  
 3   PM2.5   2556 non-null   float64
 4   PM10    2543 non-null   float64
 5   NO2     2536 non-null   float64
 6   SO2     2537 non-null   float64
 7   CO      2550 non-null   float64
 8   O3      2546 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 179.8+ KB
Non

## 2. Handling Missing Values

In [2]:
# Report how many values are missing in each column
missing_counts = df.isnull().sum()
print(missing_counts)

# Imputation strategy:
# - every pollutant column is filled with its own median (robust to outliers)
pollutant_columns = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']
median_by_column = {name: df[name].median() for name in pollutant_columns}
df.fillna(median_by_column, inplace=True)

# - rows lacking a City or a Date are removed
df.dropna(subset=['City', 'Date'], inplace=True)


City      0
Date      0
AQI       0
PM2.5     0
PM10     13
NO2      20
SO2      19
CO        6
O3       10
dtype: int64


## 3. Data Preprocessing

In [4]:
# Convert Date column to datetime (DD-MM-YYYY format);
# values that cannot be parsed become NaT because of errors='coerce'
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

# Drop rows where Date conversion failed *before* deriving any features:
# a NaT would turn Year/Month/Day into float columns holding NaN, and the
# city encoder would otherwise be fit on rows that are discarded afterwards.
df.dropna(subset=['Date'], inplace=True)

# Extract Year, Month, Day from the parsed dates
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Encode City as an integer label
from sklearn.preprocessing import LabelEncoder
df['City_Code'] = LabelEncoder().fit_transform(df['City'])


In [5]:
# Show the preprocessed frame as the cell's output, including the derived
# Year, Month, Day and City_Code columns (long frames are displayed truncated)
df

Unnamed: 0,City,Date,AQI,PM2.5,PM10,NO2,SO2,CO,O3,Year,Month,Day,City_Code
0,Bangalore,2018-01-01,68,37.40,73.44,56.44,77.52,0.71,64.60,2018,1,1,0
1,Bangalore,2018-01-02,76,41.80,82.08,63.08,86.64,0.80,72.20,2018,1,2,0
2,Bangalore,2018-01-03,70,38.50,75.60,58.10,79.80,0.74,66.50,2018,1,3,0
3,Bangalore,2018-01-04,78,42.90,84.24,64.74,88.92,0.82,74.10,2018,1,4,0
4,Bangalore,2018-01-05,82,45.10,88.56,68.06,93.48,0.86,77.90,2018,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,Bangalore,2024-12-27,54,29.70,58.32,57.27,61.56,0.57,51.30,2024,12,27,0
2552,Bangalore,2024-12-28,57,31.35,61.56,47.31,64.98,0.60,54.15,2024,12,28,0
2553,Bangalore,2024-12-29,70,38.50,75.60,58.10,79.80,0.74,66.50,2024,12,29,0
2554,Bangalore,2024-12-30,62,34.10,66.96,51.46,70.68,0.65,58.90,2024,12,30,0
