# Air Quality in Istanbul between 2014.01 - 2020.07

 -*- coding: utf-8 -*-


Created on Thursday Aug  2 21:34:41 2020

@author: erolerdogan


**T:** Average temperature (°C)

**TM:** Maximum temperature (°C)

**Tm:** Minimum temperature (°C)

**SLP:** Atmospheric pressure at sea level (hPa)

**H:** Average relative humidity (%)

**PP:** Total rainfall and/or snowmelt (mm)

**VV:** Average visibility (km)

**V:** Average wind speed (km/h)

**VM:** Maximum sustained wind speed (km/h)

**VG:** Maximum speed of wind (km/h)

**RA:** Indicates whether there was rain or drizzle (in the monthly average, total days it rained)

**SN:** Snow indicator (in the monthly average, total days it snowed)

**TS:** Indicates whether there was a thunderstorm (in the monthly average, total days with thunderstorm)

**FG:** Indicates whether there was fog (in the monthly average, total days with fog)

### Data Sources

1. AQI (PM2.5, PM10 etc) => https://aqicn.org/data-platform/register/
2. The data between 2017-2020 from Turkey => https://sim.csb.gov.tr/
3. Details (T, TM, Tm etc.) => https://en.tutiempo.net/istanbul.html
4. http://www.havaizleme.gov.tr/


### Useful Links to understand project items better

1. PM2.5 vs PM10 => https://smartairfilters.com/en/blog/pm10-pm2-5-difference-particle-air-pollution/
2. Conversion from PM10 to PM2.5 => https://www.epd.gov.hk/epd/english/environmentinhk/air/guide_ref/guide_aqa_model_g5.html
3. Equation for calculating the Air Quality Index => https://en.wikipedia.org/wiki/Air_quality_index

## Data Importing and Understanding

In [103]:
import pandas as pd
import numpy as np

#### Air Quality Dataset from 1st source 

In [199]:
# Load the AQI sheet of the first source. Row index 2 of the sheet is taken
# as the header row and its labels are replaced by the explicit names below.
aqi_first = pd.read_excel(
    "istanbul_sirinevler_aqi_2014-2020.xlsx",
    header=2,
    names=["Date", "pm10", "so2", "co", "no2", "o3", "pm2.5"],
)
aqi_first.head(11)

Unnamed: 0,Date,pm10,so2,co,no2,o3,pm2.5
0,2014-01-01 00:00:56,-,352,"1.221,74",5364,-,-
1,2014-01-02 00:00:56,8249,640,"1.473,91",6540,-,-
2,2014-01-03 00:00:56,-,671,"1.613,04",7063,-,-
3,2014-01-04 00:00:56,7203,935,"1.349,42",6409,-,-
4,2014-01-05 00:00:56,7203,1159,"1.399,42",6808,-,-
5,2014-01-06 00:00:56,7203,1143,"1.311,92",7923,-,-
6,2014-01-07 00:00:56,7203,1100,"1.070,25",7054,-,-
7,2014-01-08 00:00:56,7683,489,"1.028,58",5290,-,-
8,2014-01-09 00:00:56,9406,691,"1.324,42",6892,-,-
9,2014-01-10 00:00:56,10608,707,"1.282,75",7348,-,-


#### Air Quality 2017-2020 dataset from Turkey ministry 2nd source

In [105]:
## In the new version, we do not need to use or merge the new data

#aqi_second = pd.read_excel("istanbul_besiktas_aqi_sim_2017-2020.xlsx", header=1,
#                           names=["Date", "pm10", "so2", "co", "no2", "o3", "pm2.5"])
#
#aqi_second.head()
#

#### 2017-11 dataset (due to absence of this month)

In [106]:
## In the new version, we do not need to use or merge the new data

#data_201711 = pd.read_excel("istanbul_mecidiyekoy_aqi_2017-11.xlsx", header=3,
#                            names=["Date", "pm10", "so2", "co", "no2", "o3", "pm2.5"])
#data_201711.head()

#### Detailed Dataset (T, TM, Tm, PP etc.) from 3rd source

In [107]:
# Climate observations (T, TM, Tm, ...) loaded from the pre-combined CSV file.
detailed_data = pd.read_csv("combined_data.csv")

# Show the last rows of the table.
detailed_data.tail()

Unnamed: 0.1,Unnamed: 0,Day,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG,Date
2460,2460,26,25.4,30.0,19.6,,61.0,,10.0,12.6,33.5,44.3,,,,,2020-9
2461,2461,27,23.3,28.0,19.0,,64.0,7.62,10.0,13.9,25.9,48.2,,,,,2020-9
2462,2462,28,,,,,,,,,,,,,,,2020-9
2463,2463,29,,,,,,,,,,,,,,,2020-9
2464,2464,30,,,,,,,,,,,,,,,2020-9


## Data Cleansing and Manipulation

#### Working on Detailed Data

In [108]:
# Extend the year-month stored in `Date` with the day of month, giving a
# "YYYY-M-D" string per row. Done as one column-wise concatenation rather
# than a Python loop that writes a single cell per iteration.
# NOTE: not idempotent - re-running this cell appends the day a second time.
detailed_data["Date"] = (
    detailed_data["Date"].astype(str) + "-" + detailed_data["Day"].astype(str)
)

# Keep the measurement columns located between the first two and the last six
# columns, then re-attach the date, which is the last column of the raw table.
detailed_data_df = detailed_data.iloc[:, 2:-6].copy().join(detailed_data.iloc[:, -1])

# Normalise the date text to zero-padded "YYYY-MM-DD" so it can serve as a
# merge key later on.
detailed_data_df.Date = pd.to_datetime(detailed_data_df.Date).dt.strftime("%Y-%m-%d")
detailed_data_df.head()

Unnamed: 0,T,TM,Tm,SLP,H,PP,VV,V,VM,Date
0,7.7,9.0,6.3,,84.0,0.0,7.1,11.5,18.3,2014-01-01
1,8.8,11.0,7.0,,73.0,0.0,9.8,8.7,16.5,2014-01-02
2,9.0,11.0,7.5,,83.0,0.51,8.4,10.2,14.8,2014-01-03
3,,,,,,,,,,2014-01-04
4,,,,,,,,,,2014-01-05


In [109]:
detailed_data_df.shape

(2465, 10)

In [110]:
detailed_data_df.isna().sum()

T       1212
TM      1212
Tm      1212
SLP     2462
H       1214
PP      1250
VV      1342
V       1212
VM      1212
Date       0
dtype: int64

In [111]:
detailed_data_df.notna().sum()

T       1253
TM      1253
Tm      1253
SLP        3
H       1251
PP      1215
VV      1123
V       1253
VM      1253
Date    2465
dtype: int64

#### Working on Air Quality Data from 1st Source

In [218]:
# Cells that are empty, a lone blank, a lone dash or a lone comma carry no
# reading: mark all of them as NaN.
aqi_first.replace(["", " ", "-", ","], np.nan, inplace=True)

# Parse the timestamps so that sorting is chronological, then renumber rows.
aqi_first["Date"] = pd.to_datetime(aqi_first["Date"])
aqi_first.sort_values(by="Date", inplace=True)
aqi_first.reset_index(drop=True, inplace=True)

# Earlier revisions trimmed the 2020 rows (absent from 'combined_data.csv')
# and kept only dates before "2017-01-02" for combining with other sources.
# Neither step is needed anymore. Modification date: 2020-10-09 by erolerdogan

# Store the date as "YYYY-MM-DD" text (time of day dropped).
aqi_first["Date"] = aqi_first["Date"].dt.strftime("%Y-%m-%d")
aqi_first.head()

Unnamed: 0,Date,pm10,so2,co,no2,o3,pm2.5
0,2014-01-01,,352,"1.221,74",5364,,
1,2014-01-02,8249.0,640,"1.473,91",6540,,
2,2014-01-03,,671,"1.613,04",7063,,
3,2014-01-04,7203.0,935,"1.349,42",6409,,
4,2014-01-05,7203.0,1159,"1.399,42",6808,,


In [142]:
aqi_first.shape

(2465, 7)

In [143]:
aqi_first.isnull().sum()

Date        0
pm10       84
so2        59
co         49
no2        45
o3       2465
pm2.5    2465
dtype: int64

In [144]:
# aqi.drop("pm2.5", axis=1, inplace=True) # Dropping to PM2.5 index due to many NA values

aqi_first.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2465 entries, 0 to 2464
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2465 non-null   object 
 1   pm10    2381 non-null   object 
 2   so2     2406 non-null   object 
 3   co      2416 non-null   object 
 4   no2     2420 non-null   object 
 5   o3      0 non-null      float64
 6   pm2.5   0 non-null      float64
dtypes: float64(2), object(5)
memory usage: 134.9+ KB


In [145]:
aqi_first.tail()

Unnamed: 0,Date,pm10,so2,co,no2,o3,pm2.5
2460,2020-09-26,5514,836,"1.404,99",8671,,
2461,2020-09-27,4048,706,,4292,,
2462,2020-09-28,5845,585,,8767,,
2463,2020-09-29,5398,287,"1.440,46",7168,,
2464,2020-09-30,3908,345,"1.243,07",6861,,


#### Working on Air Quality Dataset from 2nd source

In [8]:
# mask = ((aqi_second.Date > "2017-11") & (aqi_second.Date < "2017-12"))
# #combined_df.loc[mask, "so2"] = combined_df.loc[mask, "so2"].fillna(so2_mean)
# 
# aqi_second.drop(list(aqi_second[mask].index), inplace=True)
# 
# aqi_second.Date = pd.to_datetime(aqi_second.Date).dt.strftime("%Y-%m-%d")
# data_201711.Date = pd.to_datetime(data_201711.Date).dt.strftime("%Y-%m-%d")
# 
# # Concatenating aqi data from the second source and the data of 2017-11
# aqi_second = pd.concat([aqi_second, data_201711])
# 
# 
# #### Combining whole Air Quality Indexes Datasets
# 
# # Combined Istanbul Ministry data and website data (first source and second source datasets)
# aqi = pd.concat([aqi_first, aqi_second])
# 
# # Converting the numeric values to float64 from object
# aqi[["pm2.5", "pm10", "o3", "no2", "so2", "co"]] = aqi[["pm2.5", "pm10", "o3",
#                                                                     "no2", "so2", "co"]].astype(dtype="float64", 
#                                                                                                   errors="ignore")
# aqi.Date = pd.to_datetime(aqi.Date).dt.strftime("%Y-%m-%d")
# 
# aqi.head()

#### Combining Air Quality Index Dataset with Climate Information Dataset

In [146]:
print(aqi_first.shape)
aqi_first.head()

(2465, 7)


Unnamed: 0,Date,pm10,so2,co,no2,o3,pm2.5
0,2014-01-01,,352,"1.221,74",5364,,
1,2014-01-02,8249.0,640,"1.473,91",6540,,
2,2014-01-03,,671,"1.613,04",7063,,
3,2014-01-04,7203.0,935,"1.349,42",6409,,
4,2014-01-05,7203.0,1159,"1.399,42",6808,,


In [130]:
print(detailed_data_df.shape)
detailed_data_df.head()

(2465, 10)


Unnamed: 0,T,TM,Tm,SLP,H,PP,VV,V,VM,Date
0,7.7,9.0,6.3,,84.0,0.0,7.1,11.5,18.3,2014-01-01
1,8.8,11.0,7.0,,73.0,0.0,9.8,8.7,16.5,2014-01-02
2,9.0,11.0,7.5,,83.0,0.51,8.4,10.2,14.8,2014-01-03
3,,,,,,,,,,2014-01-04
4,,,,,,,,,,2014-01-05


#### Merging all data. Air Quality Dataset - Climate Information Dataset

In [378]:
# Right join: keep every row of the climate table and attach the AQI readings
# recorded for the same "YYYY-MM-DD" date.
combined_df = pd.merge(aqi_first, detailed_data_df, on="Date", how="right")


# combined_df.set_index([combined_df.Date], inplace=True)
combined_df.sort_values(by="Date", inplace=True)
# From here on `Date` identifies the month ("YYYY-MM"), not the day.
combined_df.Date = pd.to_datetime(combined_df.Date).dt.strftime('%Y-%m')

# Drop every column in which fewer than half of the rows hold a value.
# `thresh` alone expresses this; combining it with `how` is rejected by newer
# pandas releases. The bound is rounded up so that it is an integer without
# becoming more permissive than len/2.
min_non_na = (len(combined_df) + 1) // 2
combined_df.dropna(axis=1, thresh=min_non_na, inplace=True)

# Drop every column that holds one single distinct value (NaN counts as one).
constant_columns = [name for name in combined_df.columns
                    if len(combined_df[name].unique()) == 1]
combined_df.drop(constant_columns, axis=1, inplace=True)

# Month / Year helper columns taken from the "YYYY-MM" text; they are turned
# into numbers together with the other columns below.
date_parts = combined_df.Date.str.split("-")
combined_df["Month"] = date_parts.str[1]
combined_df["Year"] = date_parts.str[0]


def _dot_decimal(value):
    """Rewrite text such as "1.221,74" as "1221.74"; pass non-text through."""
    if isinstance(value, str):
        # "." groups thousands and "," marks the decimals in the source data.
        return value.replace(".", "").replace(",", ".")
    return value


# Convert every column after `Date` to a numeric dtype.
for column in combined_df.columns[1:]:
    values = combined_df[column]
    if values.dtype == object:
        values = values.map(_dot_decimal)
    combined_df[column] = pd.to_numeric(values)

# pd.to_numeric already yields numeric dtypes, so no further cast is needed
# (the former trailing `astype` loop discarded its result and had no effect).

In [379]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2465 entries, 0 to 2464
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2465 non-null   object 
 1   pm10    2381 non-null   float64
 2   so2     2406 non-null   float64
 3   co      2416 non-null   float64
 4   no2     2420 non-null   float64
 5   T       1253 non-null   float64
 6   TM      1253 non-null   float64
 7   Tm      1253 non-null   float64
 8   H       1251 non-null   float64
 9   V       1253 non-null   float64
 10  VM      1253 non-null   float64
 11  Month   2465 non-null   int64  
 12  Year    2465 non-null   int64  
dtypes: float64(10), int64(2), object(1)
memory usage: 349.6+ KB


In [380]:
combined_df.head()

Unnamed: 0,Date,pm10,so2,co,no2,T,TM,Tm,H,V,VM,Month,Year
0,2014-01,,3.52,1221.74,53.64,7.7,9.0,6.3,84.0,11.5,18.3,1,2014
1,2014-01,82.49,6.4,1473.91,65.4,8.8,11.0,7.0,73.0,8.7,16.5,1,2014
2,2014-01,,6.71,1613.04,70.63,9.0,11.0,7.5,83.0,10.2,14.8,1,2014
3,2014-01,72.03,9.35,1349.42,64.09,,,,,,,1,2014
4,2014-01,72.03,11.59,1399.42,68.08,,,,,,,1,2014


In [331]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2465 entries, 0 to 2464
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2465 non-null   object 
 1   pm10    2381 non-null   object 
 2   so2     2406 non-null   object 
 3   co      2416 non-null   object 
 4   no2     2420 non-null   object 
 5   T       1253 non-null   float64
 6   TM      1253 non-null   float64
 7   Tm      1253 non-null   float64
 8   SLP     3 non-null      float64
 9   H       1251 non-null   float64
 10  PP      1215 non-null   float64
 11  VV      1123 non-null   float64
 12  V       1253 non-null   float64
 13  VM      1253 non-null   float64
 14  Month   2465 non-null   object 
 15  Year    2465 non-null   object 
dtypes: float64(9), object(7)
memory usage: 407.4+ KB


In [155]:
combined_df.isna().sum()

Date        0
pm10       84
so2        59
co         49
no2        45
T        1212
TM       1212
Tm       1212
SLP      2462
H        1214
PP       1250
VV       1342
V        1212
VM       1212
Month       0
Year        0
dtype: int64

In [330]:
combined_df.describe()

Unnamed: 0,T,TM,Tm,SLP,H,PP,VV,V,VM
count,1253.0,1253.0,1253.0,3.0,1251.0,1215.0,1123.0,1253.0,1253.0
mean,16.456744,19.99601,12.903512,1016.5,68.042366,1.45535,9.402671,15.961852,26.892737
std,7.48478,8.191992,7.282131,2.286919,9.973194,4.061654,1.27609,5.981151,8.351272
min,-4.4,-2.9,-6.7,1014.8,34.0,0.0,1.4,0.7,5.4
25%,10.2,13.0,7.0,1015.2,61.0,0.0,9.3,11.5,20.6
50%,16.2,20.0,12.6,1015.6,68.0,0.0,10.0,15.2,25.9
75%,23.8,28.0,19.8,1017.35,75.0,0.51,10.0,19.8,31.7
max,31.5,37.0,26.0,1019.1,97.0,34.04,12.6,47.6,61.1


In [383]:
# Group handles reused by the cells below.
# `Date` holds "YYYY-MM" text at this point, so the first handle groups by
# month; sort=False keeps the groups in order of first appearance.
month_grouped_combined_df = combined_df.groupby(["Date"], sort=False)
# Second handle: one group per value of the `Year` column.
year_grouped_combined_df = combined_df.groupby(["Year"])

In [385]:
year_grouped_combined_df.first()

Unnamed: 0_level_0,Date,pm10,so2,co,no2,T,TM,Tm,H,V,VM,Month
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014,2014-01,82.49,3.52,1221.74,53.64,7.7,9.0,6.3,84.0,11.5,18.3,1
2015,2015-01,29.83,4.36,695.65,40.67,3.2,4.0,2.5,78.0,36.1,42.4,1
2016,2016-01,31.24,10.07,946.54,59.88,-1.7,1.0,-5.0,66.0,15.2,33.5,1
2017,2017-01,33.66,2.72,1227.88,84.54,3.1,5.0,0.7,77.0,12.6,18.3,1
2018,2018-01,40.96,4.39,584.56,56.81,9.2,11.0,7.0,76.0,16.5,25.9,1
2019,2019-01,37.35,8.7,657.53,66.69,7.0,10.2,4.3,64.0,19.4,31.7,1
2020,2020-01,55.78,5.55,1228.69,65.02,6.1,9.0,2.2,69.0,19.4,35.2,1


In [389]:
month_grouped_combined_df.first()

Unnamed: 0_level_0,pm10,so2,co,no2,T,TM,Tm,H,V,VM,Month,Year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-01,82.49,3.52,1221.74,53.64,7.7,9.0,6.3,84.0,11.5,18.3,1,2014
2014-02,53.53,8.82,707.75,53.58,5.7,7.2,3.0,71.0,22.6,31.7,2,2014
2014-03,120.20,21.37,1545.25,99.77,10.6,14.0,8.2,72.0,2.6,14.8,3,2014
2014-04,85.86,22.32,1136.92,107.14,11.4,15.0,5.0,73.0,6.5,20.6,4,2014
2014-05,56.01,11.60,457.75,62.15,15.0,18.0,12.5,79.0,4.1,18.3,5,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05,30.11,10.09,1180.81,33.13,16.7,21.0,14.0,78.0,9.6,16.5,5,2020
2020-06,27.13,5.97,1441.16,50.72,17.3,20.0,15.0,71.0,10.0,18.3,6,2020
2020-07,62.32,6.54,1380.85,62.43,27.3,32.0,21.4,61.0,18.5,25.9,7,2020
2020-08,25.98,32.91,6691.34,39.93,26.5,32.0,21.8,70.0,15.2,29.4,8,2020


#### **Important:** Deciding how to fill the NaN values: either with the mean of the other months of that year, or with the mean of the same month in previous years. Decided to use the mean of that same month (same year and month).

In [None]:
# Fill every missing value with the mean of its column within the same
# "YYYY-MM" group. The group means are computed once and broadcast back to
# row shape with `transform`, instead of recomputing `groupby.mean()` for
# every (month, column) pair.
value_columns = list(combined_df.columns[1:-2])  # skips the first column and the last two
monthly_means = month_grouped_combined_df[value_columns].transform("mean")
combined_df[value_columns] = combined_df[value_columns].fillna(monthly_means)

In [404]:
combined_df.head()

Unnamed: 0,Date,pm10,so2,co,no2,T,TM,Tm,H,V,VM,Month,Year
0,2014-01,86.838966,3.52,1221.74,53.64,7.7,9.0,6.3,84.0,11.5,18.3,1,2014
1,2014-01,82.49,6.4,1473.91,65.4,8.8,11.0,7.0,73.0,8.7,16.5,1,2014
2,2014-01,86.838966,6.71,1613.04,70.63,9.0,11.0,7.5,83.0,10.2,14.8,1,2014
3,2014-01,72.03,9.35,1349.42,64.09,9.075,12.1375,6.04375,77.533333,13.41875,22.8,1,2014
4,2014-01,72.03,11.59,1399.42,68.08,9.075,12.1375,6.04375,77.533333,13.41875,22.8,1,2014


In [405]:
combined_df.isna().sum()

Date     0
pm10     0
so2      0
co       0
no2      0
T        0
TM       0
Tm       0
H        0
V        0
VM       0
Month    0
Year     0
dtype: int64

In [366]:
combined_df.describe()

Unnamed: 0,pm10,so2,co,no2,T,TM,Tm,H,PP,VV,V,VM,Month,Year
count,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0,2465.0
mean,49.576712,6.19246,893.292016,63.336777,16.480171,20.019625,12.926849,68.038851,1.447867,9.409694,15.958532,26.882326,6.355375,2016.888844
std,28.354682,4.923809,762.09636,25.780676,7.233303,7.850442,7.032971,8.001428,2.9773,0.982797,4.584955,6.271512,3.397304,1.950227
min,6.76,0.56,105.71,11.78,-4.4,-2.9,-6.7,34.0,0.0,1.4,0.7,5.4,1.0,2014.0
25%,31.27,2.59,512.5,44.37,10.3125,13.4,7.0,62.933333,0.0,9.2,13.3,23.286667,3.0,2015.0
50%,44.14,5.03,725.03,60.28,16.3,20.0,12.5,68.466667,0.508125,9.8,15.775,26.693333,6.0,2017.0
75%,62.17,8.02,1079.17,79.67,23.8,28.0,19.76,73.0,1.912667,10.0,18.325,29.4,9.0,2019.0
max,292.52,34.46,7383.95,184.84,31.5,37.0,26.0,97.0,34.04,12.6,47.6,61.1,12.0,2020.0


In [406]:
combined_df.head()

Unnamed: 0,Date,pm10,so2,co,no2,T,TM,Tm,H,V,VM,Month,Year
0,2014-01,86.838966,3.52,1221.74,53.64,7.7,9.0,6.3,84.0,11.5,18.3,1,2014
1,2014-01,82.49,6.4,1473.91,65.4,8.8,11.0,7.0,73.0,8.7,16.5,1,2014
2,2014-01,86.838966,6.71,1613.04,70.63,9.0,11.0,7.5,83.0,10.2,14.8,1,2014
3,2014-01,72.03,9.35,1349.42,64.09,9.075,12.1375,6.04375,77.533333,13.41875,22.8,1,2014
4,2014-01,72.03,11.59,1399.42,68.08,9.075,12.1375,6.04375,77.533333,13.41875,22.8,1,2014


In [407]:
# No need to create these dataframes anymore because month_grouped_combined_df.mean() handles all of them
# 10.10.2020 by erol.erdogan

## Adding the Mean of every year and every month to analyze the average of the PM10 monthly/yearly.
#
#df_year = year_grouped_combined_df.first().copy()
#df_year["Mean"] = None
#
#for year in df_year.index:
#    df_year.loc[year, "Mean"] = year_grouped_combined_df.get_group(year).pm10.mean()
#
#    
#df_month = month_grouped_combined_df.first().copy()
#df_month["Mean"] = None
#
#for date in df_month.index:
#    df_month.loc[date, "Mean"] = month_grouped_combined_df.get_group(date).pm10.mean()

In [42]:
#for i in ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]:
#    
#    month_mask = grouped_combined_df.first().Month == i
#    mean = grouped_combined_df.first()[month_mask]["so2"].mean()
#    
#    mask = (combined_df.Month == i) 
#    combined_df.loc[mask, "so2"] = combined_df.loc[mask, "so2"].fillna(mean)
#    

In [43]:
# Time to remove "Month, Year" columns
#combined_df = combined_df.drop(['Month', 'Year'], axis=1).set_index("Date")
#
#combined_df.isna().sum()

### Values Needed to Calculate the Air Quality Index

![AQI Values](AQI_values.png)

# EDA (Exploratory Data Analysis)

combined_df => full dataset

month_grouped_combined_df => dataset grouped by month

year_grouped_combined_df => dataset grouped by year

In [410]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

## Visualization of Dataset filtered

### 1. PM10 Analysis

#### Visualization of Dataset filtered by Months

In [434]:
# One bar chart per year, stacked vertically, showing the monthly mean PM10.
# The monthly means are computed once; the years come from the data itself,
# so a missing year cannot shift the axis/year pairing.
monthly_means = month_grouped_combined_df.mean()
years = sorted(int(value) for value in monthly_means.Year.unique())

# squeeze=False keeps `axs` an array even when only a single year is present.
fig, axs = plt.subplots(len(years), 1, figsize=(10, 12), squeeze=False)
axs = axs.ravel()

for ax, year in zip(axs, years):
    in_year = monthly_means[monthly_means.Year == year]
    ax.bar(in_year.index, in_year.pm10)
    ax.set_title("PM 10 Values in " + str(year))

plt.tight_layout()

<IPython.core.display.Javascript object>

In [416]:
# Line chart of the mean PM10 of every "YYYY-MM" group.
plt.figure(figsize=(8, 6))
monthly_pm10 = month_grouped_combined_df.mean()["pm10"]
monthly_pm10.plot()
plt.title("PM10 between 2014- 2020")
plt.ylabel("PM10")
plt.xticks(rotation=90)
plt.tight_layout()

<IPython.core.display.Javascript object>

In [462]:
# One box plot per column of the yearly means (each box summarises the
# per-year mean values of that column).
# numeric_only=True keeps the text `Date` column out of the aggregation;
# newer pandas raises instead of silently skipping it.
yearly_means = year_grouped_combined_df.mean(numeric_only=True)
# The last column of the yearly means is not plotted
# (presumably the Month helper column - verify against combined_df).
plotted_columns = yearly_means.columns[:-1]

# squeeze=False keeps `axs` an array even for a single plotted column.
fig, axs = plt.subplots(1, len(plotted_columns), figsize=(9, 6), squeeze=False)
axs = axs.ravel()
plt.tight_layout()
plt.subplots_adjust(top=.9)

for ax, column in zip(axs, plotted_columns):
    ax.boxplot(yearly_means[column])
    ax.set_title(column)

<IPython.core.display.Javascript object>

#### Visualization of Dataset filtered by Years

In [447]:
# Bar chart of the mean PM10 per year.
# numeric_only=True keeps the text `Date` column out of the aggregation;
# newer pandas raises instead of silently skipping it.
yearly_means = year_grouped_combined_df.mean(numeric_only=True)

fig, ax = plt.subplots()
ax.bar(yearly_means.index, yearly_means.pm10)
plt.xticks(rotation=45);

<IPython.core.display.Javascript object>

In [452]:
# Line chart of the mean PM10 per year.
fig, axs = plt.subplots(figsize=(10, 6))

# The series already holds one point per year, so it is drawn exactly once
# (looping over the years only re-drew the identical line on top of itself).
# numeric_only=True keeps the text `Date` column out of the aggregation.
yearly_pm10 = year_grouped_combined_df.mean(numeric_only=True).pm10
axs.plot(yearly_pm10, marker="s", ls="--", lw=1, alpha=0.7, color="r", markersize=4, label="Year")
axs.set_title("PM 10 Values between 2014 and 2020")

plt.xticks(rotation=90)
plt.ylim([0, 100]);

<IPython.core.display.Javascript object>

In [457]:
# Plot the monthly mean PM10 for a user-selected span of years between
# 2014 and 2020, one line per year.
while True:

    print("Enter a year between 2014 - 2020")
    x = input("Start Year: ")
    y = input("End Year: ")

    # Validate numerically: comparing the raw strings is lexicographic, so
    # input such as "3" or "abc" would pass the check and fail further down.
    try:
        start_year, end_year = int(x), int(y)
    except ValueError:
        continue

    if 2014 <= start_year <= end_year <= 2020:
        break


# Monthly means computed once; the Year / Month columns of this table are the
# group means of the helper columns in combined_df.
monthly_means = month_grouped_combined_df.mean()
selected_years = list(range(start_year, end_year + 1))

fig, axs = plt.subplots(figsize=(10, 6))

for year in selected_years:
    in_year = monthly_means[monthly_means.Year == year]

    # Index by month so that all years share the same x positions.
    axs.plot(in_year.set_index("Month").pm10,
             marker="s", ls="--", lw=1, alpha=0.7, markersize=4)

# Tick rotation and legend are set once, after all lines exist.
plt.xticks(rotation=90)
plt.legend(selected_years)

Enter a year between 2014 - 2020
Start Year: 2014
End Year: 2020


<IPython.core.display.Javascript object>

In [459]:
# One line chart per year, stacked vertically, showing the monthly mean PM10
# on a common 0-130 y-range.
# The monthly means are computed once and also provide the list of years
# (the `df_month` table this cell used to read is no longer created).
monthly_means = month_grouped_combined_df.mean()
years = sorted(int(value) for value in monthly_means.Year.unique())

# squeeze=False keeps `axs` an array even when only a single year is present.
fig, axs = plt.subplots(len(years), 1, figsize=(10, 12), squeeze=False)
axs = axs.ravel()

for ax, year in zip(axs, years):
    ax.set_ylim(0, 130)

    ax.plot(monthly_means.loc[monthly_means.Year == year, "pm10"],
            marker="s", ls="--", lw=1, alpha=0.7, markersize=4, color="green")
    ax.set_title("PM 10 Values in " + str(year))
plt.tight_layout()


<IPython.core.display.Javascript object>

In [464]:
# One box plot per column of the yearly means (each box summarises the
# per-year mean values of that column).
# numeric_only=True keeps the text `Date` column out of the aggregation;
# newer pandas raises instead of silently skipping it.
yearly_means = year_grouped_combined_df.mean(numeric_only=True)
# The last column of the yearly means is not plotted
# (presumably the Month helper column - verify against combined_df).
plotted_columns = yearly_means.columns[:-1]

# squeeze=False keeps `axs` an array even for a single plotted column.
fig, axs = plt.subplots(1, len(plotted_columns), figsize=(9, 6), squeeze=False)
axs = axs.ravel()
plt.tight_layout()
plt.subplots_adjust(top=.9)

for ax, column in zip(axs, plotted_columns):
    ax.boxplot(yearly_means[column])
    ax.set_title(column)

<IPython.core.display.Javascript object>

#### Visualization of Full Dataset

In [55]:
combined_df.head()

Unnamed: 0,Date,pm10,no2,so2,co,T,TM,Tm,H,PP,VV,V,VM,Month,Year
0,2014-01,18.0,27.0,2.0,7.0,7.7,9.0,6.3,84.0,0.0,7.1,11.5,18.3,1,2014
1,2014-01,27.0,31.0,3.0,9.0,8.8,11.0,7.0,73.0,0.0,9.8,8.7,16.5,1,2014
2,2014-01,31.0,30.0,2.0,8.0,9.0,11.0,7.5,83.0,0.51,8.4,10.2,14.8,1,2014
3,2014-01,27.0,30.0,2.0,10.0,9.075,12.1375,6.04375,77.533333,0.413125,7.25625,13.41875,22.8,1,2014
4,2014-01,46.0,34.0,3.0,8.0,9.075,12.1375,6.04375,77.533333,0.413125,7.25625,13.41875,22.8,1,2014


In [62]:
# One box plot per column of the full dataset, leaving out the first column
# and the last two.
feature_columns = combined_df.columns[1:-2]

fig, axs = plt.subplots(1, len(feature_columns), figsize=(12, 6))
plt.tight_layout()
plt.subplots_adjust(top=.9)

for ax, column in zip(axs, feature_columns):
    ax.boxplot(combined_df[column])
    ax.set_title(column)


<IPython.core.display.Javascript object>

In [57]:
# Pairwise correlation of the numeric columns. The text `Date` column is
# excluded explicitly because newer pandas raises on non-numeric columns
# instead of skipping them.
combined_df.select_dtypes(include="number").corr()

Unnamed: 0,pm10,no2,so2,co,T,TM,Tm,H,PP,VV,V,VM
pm10,1.0,0.247264,0.191747,0.118345,-0.075175,-0.055015,-0.096603,0.051962,-0.029009,-0.218323,-0.218429,-0.141313
no2,0.247264,1.0,0.329715,0.578203,-0.068167,-0.046983,-0.108196,-0.000641,-0.006487,-0.006403,-0.212692,-0.160411
so2,0.191747,0.329715,1.0,0.283589,-0.182223,-0.169355,-0.204338,0.018901,-0.031134,-0.05429,-0.143645,-0.133141
co,0.118345,0.578203,0.283589,1.0,-0.043609,-0.028672,-0.06844,0.115071,-0.000141,0.020548,-0.080843,-0.070487
T,-0.075175,-0.068167,-0.182223,-0.043609,1.0,0.991882,0.987963,-0.488477,-0.190515,0.399631,-0.036641,-0.048389
TM,-0.055015,-0.046983,-0.169355,-0.028672,0.991882,1.0,0.968513,-0.515408,-0.201833,0.383981,-0.070324,-0.066879
Tm,-0.096603,-0.108196,-0.204338,-0.06844,0.987963,0.968513,1.0,-0.428483,-0.169077,0.40313,0.034613,0.007564
H,0.051962,-0.000641,0.018901,0.115071,-0.488477,-0.515408,-0.428483,1.0,0.295998,-0.465378,0.026356,0.083958
PP,-0.029009,-0.006487,-0.031134,-0.000141,-0.190515,-0.201833,-0.169077,0.295998,1.0,-0.111051,0.077345,0.167469
VV,-0.218323,-0.006403,-0.05429,0.020548,0.399631,0.383981,0.40313,-0.465378,-0.111051,1.0,0.155503,0.111956


In [61]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# The text `Date` column is excluded explicitly because newer pandas raises
# on non-numeric columns instead of skipping them.
plt.figure(figsize=(10,8))
sns.heatmap(combined_df.select_dtypes(include="number").corr(), annot=True, lw=1, cmap="YlGnBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x211a4bd62c8>

# Feature Selection

### Before Feature Selection Methods have been applied (using all features)

In [59]:
# Feature matrix and target. `Date` is "YYYY-MM" text, which an estimator
# cannot convert to float (see the ValueError recorded below), so it is left
# out together with the target; the Month and Year columns remain as numbers.
X = combined_df.drop(["pm10", "Date"], axis=1)
y = combined_df.pm10

In [60]:
#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.metrics import classification_report
#
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=101)
#
#rf_model = RandomForestRegressor().fit(X_train, y_train)
#predictions = rf_model.predict(X_test)


ValueError: could not convert string to float: '2016-12'

## Feature Selection w/ Univariate Method

In [None]:
from sklearn.feature_selection import SelectKBest, chi2


## Feature Selection w/ Correlation Matrix

In [None]:
import seaborn as sns


In [None]:
# Five numeric columns with the largest absolute correlation to pm10.
# pm10 correlates perfectly with itself, so it is expected to head the list.
# The text `Date` column is excluded explicitly because newer pandas raises
# on non-numeric columns instead of skipping them.
combined_df.select_dtypes(include="number").corr()["pm10"].abs().nlargest(5)