In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks


In [None]:
# Put cufflinks into offline mode so DataFrame.iplot() draws inside the notebook.
cufflinks.go_offline()
# Store the cufflinks defaults used by every later .iplot() call.
cufflinks.set_config_file(theme='pearl', world_readable=True)

In [None]:
# NOTE(review): machine-specific absolute path; point this at your local copy of city_day.csv.
CITY_DAY_CSV = 'C://Users/Govardhan M G/Downloads/city_day.csv'
df = pd.read_csv(CITY_DAY_CSV)

In [None]:
df

In [None]:
df.columns

In [None]:
df.info()

In [None]:
print(df.loc[df['City']=='Bengaluru'])

In [None]:
# null counts
df.isnull().sum()

In [None]:
df.describe()  # data statistics

In [None]:
df['Date'] = pd.to_datetime(df['Date'])
df.head()

In [None]:
# Missing values
def missing_values_table(df):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one missing value, indexed
        by column name, with the columns 'Missing Values' (count) and
        '% of Total Values' (percentage of rows, rounded to one decimal).
        Rows are sorted by percentage, largest first.

    Notes
    -----
    Also prints a short summary with the total number of columns and the
    number of columns that have missing values.
    """
    # Count the missing entries once and reuse the result for the percentage.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Both Series are unnamed, so concat labels the columns 0 and 1.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Filter on the count rather than the percentage: for an empty frame the
    # percentage is 0/0 = NaN, and `NaN != 0` would wrongly keep every column.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    return mis_val_table_ren_columns

# Tabulate the columns with gaps, then shade the table so that larger
# values show up in darker red.
missing_values = missing_values_table(df)
missing_values.style.background_gradient(cmap='Reds')

In [None]:
# visualising yearly data
# Derive the plotting frame from `df` in one step so this cell can be re-run
# safely: add BTX (Benzene + Toluene + Xylene; NaN if any component is missing)
# and actually drop the three source columns. The previous drop() call
# discarded its result, so the columns were never removed.
btx_components = ['Benzene', 'Toluene', 'Xylene']
city_day = df.assign(
    BTX=df['Benzene'] + df['Toluene'] + df['Xylene']
).drop(columns=btx_components)

In [None]:
city_day['Particulate_Matter'] = city_day['PM2.5']+city_day['PM10']

In [None]:
city_day['Particulate_Matter'] = city_day['PM2.5']+city_day['PM10']

In [None]:
pollutants = ['PM2.5','PM10','NO2', 'CO', 'SO2','O3', 'BTX']

In [None]:
# Index by date so the x axis is time. The guard keeps the cell re-runnable:
# once 'Date' has become the index, a second set_index('Date') would raise KeyError.
if 'Date' in city_day.columns:
    city_day = city_day.set_index('Date')

# One scatter-style subplot per pollutant.
axes = city_day[pollutants].plot(marker='.', alpha=0.5, linestyle='None',
                                 figsize=(16, 20), subplots=True)
for ax in axes:
    ax.set_xlabel('Years')
    ax.set_ylabel('ug / m3')

In [None]:
# Now let's look at a few city-wise pollution stats
def max_polluted_city(pollutant):
    """Return a styled table of the ten cities with the highest mean `pollutant`.

    Reads the module-level `city_day` frame, averages `pollutant` per city,
    sorts the averages in descending order, rounds them to two decimals and
    returns the first ten rows as a Styler with an 'OrRd' background gradient.
    """
    per_city = city_day[['City', pollutant]].groupby('City').mean()
    ranked = per_city.sort_values(by=pollutant, ascending=False).reset_index()
    ranked[pollutant] = ranked[pollutant].round(2)
    top_ten = ranked.head(10)
    return top_ten.style.background_gradient(cmap='OrRd')

In [None]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Show several styled tables next to each other in the notebook output.

    Parameters
    ----------
    *args
        Objects that can produce an HTML table: pandas Stylers (or anything
        else exposing ``to_html()``, or the legacy ``render()``).
    """
    html_parts = []
    for styler in args:
        # Styler.render() is deprecated and removed in pandas 2.0;
        # prefer to_html() and only fall back to render() on old versions.
        to_html = getattr(styler, 'to_html', None) or styler.render
        html_parts.append(to_html())
    html_str = ''.join(html_parts)
    # Restyle only the opening <table tags. Replacing every 'table' substring
    # also rewrote closing tags and any cell text containing the word.
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

In [None]:
# Build the top-ten table for each pollutant, in the order they are displayed.
pm2_5, pm10, no2, so2, co, btx = (
    max_polluted_city(name)
    for name in ('PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'BTX')
)

display_side_by_side(pm2_5, pm10, no2, so2, co, btx)

In [None]:
df1 = df.copy()

# Numeric columns: fill each missing reading with that column's own median
# (computed over all cities and dates).
median_filled = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
                 'Benzene', 'Toluene', 'Xylene', 'AQI']
for column in median_filled:
    df1[column] = df1[column].fillna(df1[column].median())

# Categorical label: missing buckets default to 'Moderate'.
df1['AQI_Bucket'] = df1['AQI_Bucket'].fillna('Moderate')

In [None]:
df1['City'].unique()

In [None]:
df1['City'].value_counts()

In [None]:
df1['Pollution content'] = df1['PM2.5']+df1['PM10']+df1['NO']+df1['NO2']+df1['NOx']+df1['NH3']+df1['CO']+df1['SO2']+df1['O3']+df1['Benzene']+df1['Toluene']+df1['Xylene']

In [None]:
def ploting(var):
    """Draw an interactive line plot of the `var` column of the module-level `df1`.

    The column name is used both as the chart title and as the y-axis label.
    """
    plot_options = dict(title=var, xTitle='Cities', yTitle=var, linecolor='black')
    df1[var].iplot(**plot_options)
    plt.show()


ploting('Pollution content')

In [None]:
def max_bar_plot(var):
    """Bar-chart the ten cities with the highest per-city median of `var`.

    Reads the module-level `df1`; bars are drawn in ascending order.
    Returns None (the chart is displayed as a side effect).
    """
    # NOTE(review): this ranks by median while min_bar_plot ranks by mean - confirm that is intended.
    city_medians = df1[['City', var]].groupby('City').median()
    top_ten = city_medians.sort_values(by=var, ascending=True).tail(10)
    chart_title = '{2} {1} {0}'.format(")", var, ' Most polluted cities(')
    top_ten.iplot(kind='bar', xTitle='Cities', yTitle=var,
                  linecolor='black', title=chart_title)


p = max_bar_plot('Pollution content')

In [None]:
def min_bar_plot(var):
    """Bar-chart the ten cities with the lowest per-city mean of `var`.

    Reads the module-level `df1`; bars are drawn in ascending order.
    Returns None (the chart is displayed as a side effect).
    """
    city_means = df1[['City', var]].groupby('City').mean()
    bottom_ten = city_means.sort_values(by=var, ascending=True).head(10)
    chart_title = '{2} {1} {0}'.format(")", var, ' Minimum polluted cities(')
    bottom_ten.iplot(kind='bar', xTitle='Cities', yTitle=var,
                     linecolor='black', title=chart_title)


p1 = min_bar_plot('Pollution content')

In [None]:
#EDA using Pandas Profiling
import pandas_profiling

In [None]:
# Fresh, unmodified copy of the CSV for profiling and modelling.
# NOTE(review): machine-specific absolute path; adjust to your local copy.
data = pd.read_csv(filepath_or_buffer='C://Users/Govardhan M G/Downloads/city_day.csv')
data

In [None]:
profile = data.profile_report(title="EDA", explorative = True)
profile.to_file(output_file="EDA.html")

In [None]:
profile

In [None]:
#Prediction of AQI Bucket using PyCaret
from pycaret.classification import *

In [None]:
# Initialise the PyCaret classification experiment with AQI_Bucket as the target.
# session_id fixes the random seed so the train/test split, cross-validation folds
# and the model scores below are reproducible across runs.
# NOTE(review): AQI_Bucket looks like a binning of the AQI column; if so, keeping
# AQI as a feature leaks the target - consider ignore_features=['AQI'].
reg = setup(data = data,
            target = 'AQI_Bucket',
            silent = True,
            session_id = 42)

In [None]:
compare_models()   #compare all models

In [None]:
Adabooster_model = create_model('ada')

In [None]:
evaluate_model(Adabooster_model)

In [None]:
save_model(Adabooster_model, 'Ada_booster_classifier')

In [None]:
Adabooster_model = load_model(model_name='Ada_booster_classifier')

In [None]:
RandomForest_model = create_model('rf')

In [None]:
evaluate_model(RandomForest_model)

In [None]:
save_model(RandomForest_model, 'Random_forest_classifier')

In [None]:
RandomForest_model = load_model(model_name='Random_forest_classifier')