# Air Quality in Istanbul between 2014.01 - 2020.07

 -*- coding: utf-8 -*-
"""
Created on Thursday Aug  2 21:34:41 2020
@author: erolerdogan
"""

**T:** Average temperature (°C)

**TM:** Maximum temperature (°C)

**Tm:** Minimum temperature (°C)

**SLP:** Atmospheric pressure at sea level (hPa)

**H:** Average relative humidity (%)

**PP:** Total rainfall and / or snowmelt (mm)

**VV:** Average visibility (Km)

**V:** Average wind speed (Km/h)

**VM:** Maximum sustained wind speed (Km/h)

**VG:** Maximum speed of wind (Km/h)

**RA:** Indicates whether there was rain or drizzle (in the monthly average, total days it rained)

**SN:** Snow indicator (in the monthly average, total days it snowed)

**TS:** Indicates whether there was a storm (in the monthly average, total days with thunderstorm)

**FG:** Indicates whether there was fog (in the monthly average, total days with fog)

### Data Sources

1. AQI (PM2.5, PM10 etc) => https://aqicn.org/data-platform/register/
2. Air quality data for 2017-2020 from Turkey => https://sim.csb.gov.tr/
3. Details (T, TM, Tm etc.) => https://en.tutiempo.net/istanbul.html
4. http://www.havaizleme.gov.tr/


### Useful Links to understand project items better

1. PM2.5 vs PM10 => https://smartairfilters.com/en/blog/pm10-pm2-5-difference-particle-air-pollution/
2. Conversion from PM10 to PM2.5 => https://www.epd.gov.hk/epd/english/environmentinhk/air/guide_ref/guide_aqa_model_g5.html
3. Equation for calculating the Air Quality Index => https://en.wikipedia.org/wiki/Air_quality_index

## Data Importing and Understanding

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

#### Air Quality Dataset from 1st source 

In [80]:
# Load the first AQI export. header=0 marks the file's first row as its header,
# which the explicit `names` list then replaces.
aqi_first = pd.read_csv(
    "istanbul_aqi.csv",
    header=0,
    names=["Date", "pm2.5", "pm10", "o3", "no2", "so2", "co"],
)

aqi_first.head()

Unnamed: 0,Date,pm2.5,pm10,o3,no2,so2,co
0,2020/8/1,48,14.0,18.0,16.0,2.0,2.0
1,2020/8/2,42,11.0,17.0,11.0,1.0,1.0
2,2020/8/3,36,10.0,12.0,14.0,1.0,1.0
3,2020/8/4,28,9.0,,,,
4,2020/8/5,35,,,,,


#### Air Quality Dataset (2017-2020) from the Turkish ministry (2nd source)

In [70]:
# Load the 2017-2020 workbook. header=1 selects the sheet's second row as the
# header row; the explicit `names` list supplies the column labels used here.
aqi_second = pd.read_excel(
    "istanbul_besiktas_aqi_sim_2017-2020.xlsx",
    names=["Date", "pm10", "so2", "co", "no2", "o3", "pm2.5"],
    header=1,
)
aqi_second.head()

Unnamed: 0,Date,pm10,so2,co,no2,o3,pm2.5
0,2017-01-02 00:00:56,,13.1,436.68,100.37,20.86,
1,2017-01-03 00:00:56,48.73,15.94,452.92,109.53,26.32,
2,2017-01-04 00:00:56,58.86,12.93,585.75,120.8,16.18,
3,2017-01-05 00:00:56,34.29,6.6,535.62,119.7,19.18,
4,2017-01-06 00:00:56,30.91,6.57,457.47,91.04,24.16,


#### Detailed Dataset (T, TM, Tm, PP etc.) from 3rd source

In [27]:
# Load the combined weather file (T, TM, Tm, PP, ... plus Day and a year-month Date column).
detailed_data = pd.read_csv("combined_data.csv")

# Preview the first rows.
detailed_data.head()

Unnamed: 0.1,Unnamed: 0,Day,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG,Date
0,0,1,7.7,9.0,6.3,,84.0,0.0,7.1,11.5,18.3,,,,,,2014-1
1,1,2,8.8,11.0,7.0,,73.0,0.0,9.8,8.7,16.5,,,,,,2014-1
2,2,3,9.0,11.0,7.5,,83.0,0.51,8.4,10.2,14.8,,,,,,2014-1
3,3,4,,,,,,,,,,,,,,,2014-1
4,4,5,,,,,,,,,,,,,,,2014-1


## Data Cleansing and Manipulation

#### Working on Detailed Data

In [28]:
# Extend each year-month value in Date with the day of month ("2014-1" + "-" + "1"),
# done column-wise in one step instead of assigning row by row through .loc.
# NOTE: this overwrites Date in place, so re-running the cell appends the day again.
detailed_data["Date"] = (
    detailed_data["Date"].astype(str) + "-" + detailed_data["Day"].astype(str)
)

# Keep the columns from position 2 up to (but excluding) the last six,
# then re-attach the final column, which is Date.
detailed_data_df = detailed_data.iloc[:, 2:-6].copy().join(detailed_data.iloc[:, -1])

# Parse the assembled date strings into real datetimes.
detailed_data_df["Date"] = pd.to_datetime(detailed_data_df["Date"])
detailed_data_df.head()

Unnamed: 0,T,TM,Tm,SLP,H,PP,VV,V,VM,Date
0,7.7,9.0,6.3,,84.0,0.0,7.1,11.5,18.3,2014-01-01
1,8.8,11.0,7.0,,73.0,0.0,9.8,8.7,16.5,2014-01-02
2,9.0,11.0,7.5,,83.0,0.51,8.4,10.2,14.8,2014-01-03
3,,,,,,,,,,2014-01-04
4,,,,,,,,,,2014-01-05


In [29]:
# (row count, column count) of the trimmed weather frame.
detailed_data_df.shape

(2404, 10)

In [30]:
# Number of distinct visibility (VV) values, counting NaN as one of them.
detailed_data_df["VV"].nunique(dropna=False)

55

In [31]:
# Missing-value count per column.
detailed_data_df.isna().sum()

T       1181
TM      1181
Tm      1181
SLP     2401
H       1183
PP      1217
VV      1311
V       1181
VM      1181
Date       0
dtype: int64

In [32]:
# Present (non-missing) value count per column.
detailed_data_df.notna().sum()

T       1223
TM      1223
Tm      1223
SLP        3
H       1221
PP      1187
VV      1093
V       1223
VM      1223
Date    2404
dtype: int64

#### Working on Air Quality Data from 1st Source

In [82]:
# Treat empty and single-space cells as missing values.
aqi_first = aqi_first.replace(["", " "], np.nan)

# Parse Date into real datetimes so rows can be ordered and filtered by date.
aqi_first["Date"] = pd.to_datetime(aqi_first["Date"])

# Order chronologically, discard the row that carried label 0 when the file
# was loaded, and renumber the remaining rows from 0.
aqi_first = aqi_first.sort_values(by="Date").drop(0).reset_index(drop=True)

# Trim the last three rows, i.e. the three most recent dates after sorting.
# (Original intent: remove 2020 data that is not in 'combined_data.csv'.)
aqi_first = aqi_first.iloc[:-3, :]

# Keep only rows dated before 2017-01-02 so this source can be combined with
# the second one. The explicit .copy() makes the result an independent frame,
# so the column assignment below is not a write into a slice of the old frame.
mask = aqi_first["Date"] < "2017-01-02"
aqi_first = aqi_first[mask].copy()

# Store Date as "YYYY-MM-DD" text (not datetime) from here on.
aqi_first["Date"] = aqi_first["Date"].dt.strftime("%Y-%m-%d")
aqi_first.head()

Unnamed: 0,Date,pm2.5,pm10,o3,no2,so2,co,sp2
0,2014-01-01,,18.0,,27.0,2,7.0,2.0
1,2014-01-02,,27.0,,31.0,3,9.0,3.0
2,2014-01-03,,31.0,,30.0,2,8.0,2.0
3,2014-01-04,,27.0,,30.0,2,10.0,2.0
4,2014-01-05,,46.0,,34.0,3,8.0,3.0


In [83]:
# (row count, column count) of the cleaned first-source frame.
aqi_first.shape

(998, 8)

In [36]:
# Missing-value count per column.
aqi_first.isnull().sum()

Date        0
pm2.5    1172
pm10       68
o3        878
no2        97
so2       304
co        198
sp2       304
dtype: int64

In [37]:
# Disabled step: drop the PM2.5 column because it has many NA values.
#aqi.drop("pm2.5", axis=1, inplace=True)

# Print column dtypes and non-null counts.
aqi_first.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1712 entries, 0 to 1711
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1712 non-null   datetime64[ns]
 1   pm2.5   540 non-null    float64       
 2   pm10    1644 non-null   float64       
 3   o3      834 non-null    float64       
 4   no2     1615 non-null   float64       
 5   so2     1408 non-null   object        
 6   co      1514 non-null   float64       
 7   sp2     1408 non-null   float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 107.1+ KB


#### Working on Air Quality Dataset from 2nd source

In [112]:
# Reduce the timestamps to plain "YYYY-MM-DD" text, dropping the time of day.
parsed_dates = pd.to_datetime(aqi_second["Date"])
aqi_second["Date"] = parsed_dates.dt.strftime("%Y-%m-%d")

aqi_second.head()

Unnamed: 0,Date,pm10,so2,co,no2,o3,pm2.5,sp2
0,2017-01-02,,13.1,436.68,100.37,20.86,,13.1
1,2017-01-03,48.73,15.94,452.92,109.53,26.32,,15.94
2,2017-01-04,58.86,12.93,585.75,120.8,16.18,,12.93
3,2017-01-05,34.29,6.6,535.62,119.7,19.18,,6.6
4,2017-01-06,30.91,6.57,457.47,91.04,24.16,,6.57


In [113]:
# Print column dtypes and non-null counts.
aqi_second.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1323 entries, 0 to 1322
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1323 non-null   object 
 1   pm10    1193 non-null   float64
 2   so2     1195 non-null   float64
 3   co      1252 non-null   float64
 4   no2     1096 non-null   float64
 5   o3      1159 non-null   float64
 6   pm2.5   0 non-null      float64
 7   sp2     1195 non-null   float64
dtypes: float64(7), object(1)
memory usage: 82.8+ KB


In [53]:
# Preview the first rows of the first source before combining.
aqi_first.head()

Unnamed: 0,Date,pm2.5,pm10,o3,no2,so2,co,sp2
0,2014-01-01,,18.0,,27.0,2,7.0,2.0
1,2014-01-02,,27.0,,31.0,3,9.0,3.0
2,2014-01-03,,31.0,,30.0,2,8.0,2.0
3,2014-01-04,,27.0,,30.0,2,10.0,2.0
4,2014-01-05,,46.0,,34.0,3,8.0,3.0


In [118]:
# Stack the website data (aqi_first) on top of the ministry data (aqi_second).
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own labels and the combined frame has duplicate index values.
aqi = pd.concat([aqi_first, aqi_second], ignore_index=True)

# Force the pollutant columns to float64. Values that cannot be parsed as
# numbers become NaN instead of silently leaving the column as object dtype.
numeric_columns = ["pm2.5", "pm10", "o3", "no2", "so2", "co"]
aqi[numeric_columns] = (
    aqi[numeric_columns].apply(pd.to_numeric, errors="coerce").astype("float64")
)

# Both sources carry Date as "YYYY-MM-DD" text at this point; parse it back.
aqi["Date"] = pd.to_datetime(aqi["Date"])
aqi.head()

Unnamed: 0,Date,pm2.5,pm10,o3,no2,so2,co,sp2
0,2014-01-01,,18.0,,27.0,2.0,7.0,2.0
1,2014-01-02,,27.0,,31.0,3.0,9.0,3.0
2,2014-01-03,,31.0,,30.0,2.0,8.0,2.0
3,2014-01-04,,27.0,,30.0,2.0,10.0,2.0
4,2014-01-05,,46.0,,34.0,3.0,8.0,3.0


In [119]:
# Print column dtypes and non-null counts of the combined AQI frame.
aqi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2321 entries, 0 to 1322
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    2321 non-null   datetime64[ns]
 1   pm2.5   134 non-null    float64       
 2   pm10    2155 non-null   float64       
 3   o3      1306 non-null   float64       
 4   no2     2029 non-null   float64       
 5   so2     2039 non-null   float64       
 6   co      2094 non-null   float64       
 7   sp2     2039 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 163.2 KB


In [89]:
# Missing-value count per column.
aqi.isna().sum()

Date        0
pm2.5    2187
pm10      166
o3       1015
no2       292
so2       282
co        227
sp2      1477
dtype: int64

In [120]:
# Join the air-quality data onto the weather data by Date. how="right" keeps
# every weather row, even when no air-quality row exists for that date.
combined_df = pd.merge(aqi, detailed_data_df, on="Date", how="right")

#combined_df.set_index([combined_df.Date], inplace=True)
combined_df = combined_df.sort_values(by="Date")
combined_df = combined_df.drop(["SLP", "o3", "pm2.5"], axis=1)
#combined_df.dropna(how="all", inplace=True)

# Derive the month-level key and the Month / Year helper columns in one
# vectorised pass. All three are text: Date is "YYYY-MM", Month is zero-padded
# ("01".."12") and Year is four digits, matching what splitting "YYYY-MM" on
# "-" would produce.
merged_dates = pd.to_datetime(combined_df["Date"])
combined_df["Date"] = merged_dates.dt.strftime("%Y-%m")
combined_df["Month"] = merged_dates.dt.strftime("%m")
combined_df["Year"] = merged_dates.dt.strftime("%Y")

In [121]:
# Preview the first rows of the merged frame.
combined_df.head()

Unnamed: 0,Date,pm10,no2,so2,co,sp2,T,TM,Tm,H,PP,VV,V,VM,Month,Year
0,2014-01,18.0,27.0,2.0,7.0,2.0,7.7,9.0,6.3,84.0,0.0,7.1,11.5,18.3,1,2014
1,2014-01,27.0,31.0,3.0,9.0,3.0,8.8,11.0,7.0,73.0,0.0,9.8,8.7,16.5,1,2014
2,2014-01,31.0,30.0,2.0,8.0,2.0,9.0,11.0,7.5,83.0,0.51,8.4,10.2,14.8,1,2014
3,2014-01,27.0,30.0,2.0,10.0,2.0,,,,,,,,,1,2014
4,2014-01,46.0,34.0,3.0,8.0,3.0,,,,,,,,,1,2014


In [122]:
# Fill each missing measurement with the mean of that column over the same
# "YYYY-MM" value of Date. The columns between Date (first) and Month / Year
# (last two) are the ones filled.
value_columns = list(combined_df.columns[1:-2])

# transform("mean") returns one row per original row, holding that row's group
# mean, so it lines up with combined_df for fillna. A group with no values at
# all in a column has a NaN mean, so those cells stay NaN.
monthly_means = combined_df.groupby("Date")[value_columns].transform("mean")
combined_df[value_columns] = combined_df[value_columns].fillna(monthly_means)

**Important:** Checking the NaN values, I observed that they span almost 10 months in the years 2018 and 2019. Although it will cost me almost 500 days of data, I will remove them, because I can't fill in 10 months by looking at only 2 months in a year.

In [124]:
# Missing-value count per column after the monthly-mean fill.
combined_df.isna().sum()

Date      0
pm10     30
no2      30
so2      61
co       30
sp2      61
T         0
TM        0
Tm        0
H         0
PP        0
VV        0
V         0
VM        0
Month     0
Year      0
dtype: int64

In [125]:
# Print column dtypes and non-null counts of the merged frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2404 entries, 0 to 2304
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2404 non-null   object 
 1   pm10    2374 non-null   float64
 2   no2     2374 non-null   float64
 3   so2     2343 non-null   float64
 4   co      2374 non-null   float64
 5   sp2     2343 non-null   float64
 6   T       2404 non-null   float64
 7   TM      2404 non-null   float64
 8   Tm      2404 non-null   float64
 9   H       2404 non-null   float64
 10  PP      2404 non-null   float64
 11  VV      2404 non-null   float64
 12  V       2404 non-null   float64
 13  VM      2404 non-null   float64
 14  Month   2404 non-null   object 
 15  Year    2404 non-null   object 
dtypes: float64(13), object(3)
memory usage: 399.3+ KB


In [126]:
# Collapse to one row per Date value, keeping groups in order of first
# appearance (sort=False).
# NOTE(review): .first() keeps the first non-null entry of each column within
# a group; it does not average the group. Confirm this is the intended
# monthly value.
grouped_combined_df = combined_df.groupby(by=["Date"], sort=False).first()
grouped_combined_df.head(n=15)

Unnamed: 0_level_0,pm10,no2,so2,co,sp2,T,TM,Tm,H,PP,VV,V,VM,Month,Year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01,18.0,27.0,2.0,7.0,2.0,7.7,9.0,6.3,84.0,0.0,7.1,11.5,18.3,1,2014
2014-02,15.0,34.5,1.0,7.0,1.0,5.7,7.2,3.0,71.0,0.0,10.0,22.6,31.7,2,2014
2014-03,41.275862,37.033333,2.366667,4.107143,2.366667,10.6,14.0,8.2,72.0,0.25,9.8,2.6,14.8,3,2014
2014-04,52.0,41.0,3.0,5.0,3.0,11.4,15.0,5.0,73.0,0.0,8.9,6.5,20.6,4,2014
2014-05,25.0,47.0,2.0,1.0,2.0,15.0,18.0,12.5,79.0,3.3,10.0,4.1,18.3,5,2014
2014-06,30.0,37.0,1.0,2.0,1.0,18.3,23.0,15.0,80.0,2.29,9.0,11.1,35.2,6,2014
2014-07,37.0,37.0,1.0,2.0,1.0,24.0,28.0,18.0,61.0,0.0,9.7,12.6,22.2,7,2014
2014-08,26.875,35.0,1.2,3.8,1.2,27.8,31.0,24.5,61.0,0.0,10.0,20.2,31.7,8,2014
2014-09,28.0,39.0,2.0,4.0,2.0,24.6,27.0,22.5,66.0,0.0,10.0,11.9,22.2,9,2014
2014-10,25.0,33.0,1.0,2.0,1.0,17.8,24.0,11.9,66.0,0.0,9.175,11.9,22.2,10,2014


In [140]:
# Boolean row selectors over the grouped table; both compare against text values.
month_mask = grouped_combined_df["Month"] == "01"
# year_mask is built here but not applied in this cell.
year_mask = grouped_combined_df["Year"] == "2017"
grouped_combined_df.loc[month_mask]

Unnamed: 0_level_0,pm10,no2,so2,co,sp2,T,TM,Tm,H,PP,VV,V,VM,Month,Year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01,18.0,27.0,2.0,7.0,2.0,7.7,9.0,6.3,84.0,0.0,7.1,11.5,18.3,1,2014
2015-01,11.0,30.0,1.0,1.0,1.0,3.2,4.0,2.5,78.0,13.72,7.6,36.1,42.4,1,2015
2016-01,43.892857,37.758621,4.275862,10.72,4.275862,-1.7,1.0,-5.0,66.0,19.3,8.5,15.2,33.5,1,2016
2017-01,31.95,83.113333,6.738333,433.512667,6.738333,3.1,5.0,0.7,77.0,6.1,9.5,12.6,18.3,1,2017
2018-01,30.64,40.69,8.18,635.93,8.18,9.2,11.0,7.0,76.0,0.25,10.0,16.5,25.9,1,2018
2019-01,19.03,39.27,6.13,269.23,6.13,7.0,10.2,4.3,64.0,0.0,10.0,19.4,31.7,1,2019
2020-01,29.61,67.49,3.81,449.08,3.81,6.1,9.0,2.2,69.0,1.02,9.8,19.4,35.2,1,2020


In [None]:
# Replace missing values in each of these columns with that column's mean:
#   PP - total rainfall and / or snowmelt (mm)
#   VV - average visibility (Km)
#   H  - average relative humidity (%)
# The filled column is assigned back to `df`, because an in-place fillna on a
# column selection is not guaranteed to modify the parent frame.
for column in ("PP", "VV", "H"):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# PM10 looks like the best choice for calculating the Air Quality Index because it has the fewest missing values.
# The equation of calculating AQI based on chosen concentration. 
# (AQI_high - AQI_low)/(PM10_high - PM10_low) * (PM10 - PM10_low) + AQI_low 



In [None]:
# Scratch arithmetic; the result is displayed but not stored.
1716/4

In [None]:
# Date of the row 196 positions from the end. The column is named "Date"
# (capital D), and .iloc selects by position, so the lookup does not depend
# on the frame's index labels.
aqi["Date"].iloc[len(aqi) - 196]


In [None]:
# Preview the last rows of the combined AQI frame.
aqi.tail()

In [None]:
# Count the rows dated in 2020 or later. Every row is checked once, with no
# i+1 lookup that skips the first row and runs past the last one.
# The column is named "Date" (capital D).
count = int((pd.to_datetime(aqi["Date"]).dt.year >= 2020).sum())
count

### The Values Needed to Calculate the Air Quality Index

![AQI Values](AQI_values.png)

## EDA (Exploratory Data Analysis)

In [None]:
# Preview the first rows of `df` (not defined in the cells visible above).
df.head()