# Title: Analyzing Air Quality Index (AQI) Trends in a City

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Load the air-quality CSV into a DataFrame; the file is decoded as cp1252.
data = pd.read_csv("datasets/AIQ.csv", encoding="cp1252")
# Bare expression so the notebook displays the loaded frame.
data

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Normalise state names so records for the same state group together.
# Rows recorded as 'Uttaranchal' are relabelled 'Uttarakhand'.
data['state'] = data['state'].replace({'Uttaranchal': 'Uttarakhand'})

# Rows for the location 'Jamshedpur' that are filed under 'Bihar' are moved
# to 'Jharkhand'. A single .loc assignment is used instead of chained
# indexing (data.state[mask] = ...), which may write to a temporary copy and
# silently leave `data` unchanged.
jamshedpur_in_bihar = (data['location'] == 'Jamshedpur') & (data['state'] == 'Bihar')
data.loc[jamshedpur_in_bihar, 'state'] = 'Jharkhand'

In [None]:
# Map every spelling of an area type found in the `type` column to a short
# code. Variants of the same category share one code.
types = {
    "Residential": "R",
    "Residential and others": "RO",
    "Residential, Rural and other Areas": "RRO",
}
types.update(dict.fromkeys(["Industrial Area", "Industrial Areas", "Industrial"], "I"))
types.update(dict.fromkeys(["Sensitive Area", "Sensitive Areas", "Sensitive"], "S"))
# Rows with a missing type are assigned the "RRO" code.
types[np.nan] = "RRO"

data['type'] = data['type'].replace(types)
# Preview the recoded frame.
data.head()

In [None]:
# Measurement columns used by the imputation and plotting cells.
# NOTE(review): the imputation cell re-declares this same list.
VALUE_COLS = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']

In [None]:
# Candidate measurement columns to impute.
VALUE_COLS = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']

# Columns without a single observed value are removed from VALUE_COLS before
# imputing, since no mean can be computed for them.
nan_cols = [col for col in VALUE_COLS if data[col].isnull().all()]
VALUE_COLS = [col for col in VALUE_COLS if col not in nan_cols]

# Fill each remaining NaN with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imputer.fit_transform(data[VALUE_COLS])

# Rebuild a frame on the original index so the values align row-for-row when
# they are written back into `data`.
imputed_df = pd.DataFrame(imputed_data, index=data.index, columns=VALUE_COLS)
data[VALUE_COLS] = imputed_df[VALUE_COLS]

In [None]:
# Print the number of missing values left in each column after imputation.
print(data.isnull().sum())
# Display the last rows of the frame.
data.tail()

In [None]:
def top_and_bottom_10_states(indicator="so2"):
    """Plot the ten highest and ten lowest states by median ``indicator``.

    Reads the notebook-level ``data`` frame, takes the per-state median of
    the chosen column and draws two stacked bar charts: the first ten states
    of the descending ranking on top, the last ten underneath.

    Args:
        indicator: Name of the column in ``data`` to rank states by.
    """
    _, axes = plt.subplots(2, 1, figsize=(20, 12))
    ranked = (
        data[[indicator, 'state']]
        .groupby('state', as_index=False)
        .median()
        .sort_values(by=indicator, ascending=False)
    )
    panels = (
        ("Top", ranked[:10], axes[0]),
        ("Bottom", ranked[-10:], axes[1]),
    )
    for label, subset, axis in panels:
        chart = sns.barplot(x='state', y=indicator, data=subset, ax=axis)
        chart.set_title(f"{label} 10 states by {indicator} (1991-2016)")
        # Label the y-axis with the plotted indicator, not a fixed "so2".
        chart.set_ylabel(indicator)
        chart.set_xlabel("State")

top_and_bottom_10_states("so2")
top_and_bottom_10_states("no2")

In [None]:
def highest_levels_recorded(indicator="so2"):
    """Draw a bar chart of the maximum ``indicator`` value seen in each state.

    Args:
        indicator: Name of the column in the notebook-level ``data`` frame
            whose per-state maximum is plotted.
    """
    plt.figure(figsize=(20, 10))
    subset = data[[indicator, 'state']]
    state_max = subset.groupby('state', as_index=False).max()
    chart = sns.barplot(x='state', y=indicator, data=state_max)
    chart.set_title(f"Highest ever {indicator} levels recorded by state")
    # Rotate the state names so they do not overlap along the x-axis.
    plt.xticks(rotation=90)

highest_levels_recorded("no2")
highest_levels_recorded("rspm")

In [None]:
def type_avg(indicator=""):
    """Plot mean pollutant levels for each area type as a bar chart.

    Averages the ``VALUE_COLS`` columns of the notebook-level ``data`` frame
    per value of the ``type`` column.

    Args:
        indicator: Column to plot. When empty (the default), all columns in
            ``VALUE_COLS`` are plotted side by side.
    """
    averages = data[VALUE_COLS + ['type']].groupby("type").mean()
    # A non-empty indicator selects that single column; the original check
    # was inverted and looked up the empty string instead.
    if indicator:
        averages[indicator].plot(kind='bar')
        plt.title("Pollutant average by type for {}".format(indicator))
    else:
        averages.plot(kind='bar')
        plt.title("Pollutant average by type")
    plt.xticks(rotation=0)

type_avg('so2')

In [None]:
def location_avgs(state, indicator="so2"):
    """Draw a bar chart of the mean ``indicator`` for each location in ``state``.

    Args:
        state: Value of the ``state`` column whose locations are plotted.
        indicator: Name of the averaged column shown on the y-axis.
    """
    selected = [*VALUE_COLS, 'state', 'location']
    means = data[selected].groupby(['state', 'location']).mean()
    # Selecting the state from the (state, location) index leaves one row per
    # location; reset_index turns `location` back into a plottable column.
    per_location = means.loc[state].reset_index()
    sns.barplot(x='location', y=indicator, data=per_location)
    plt.title(f"Location-wise average for {indicator} in {state}")
    plt.xticks(rotation=90)

location_avgs("Bihar", "no2")