# Beijing Air-Quality Time Series Project
### Data Cleaning Notebook

by Dolci Sanders and Paul Torres


### Import Libraries

In [62]:
import pandas as pd
import numpy as np
import glob

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

%matplotlib inline

### Read in Data

Our Data is provided by the UCI Machine Learning Repository
Beijing Multi-Site Air-Quality Data

Once Read in, we will look at the head and convert our time into data time. 

We have 12 data sets, one from each reporting site, to concatenate together to get the whole picture of Beijing's Air Quality. 


In [2]:
time = pd.read_csv('DATA/PRSA_Data_Tiantan_20130301-20170228.csv', index_col = None)

In [3]:
time.head()

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,6.0,6.0,4.0,8.0,300.0,81.0,-0.5,1024.5,-21.4,0.0,NNW,5.7,Tiantan
1,2,2013,3,1,1,6.0,29.0,5.0,9.0,300.0,80.0,-0.7,1025.1,-22.1,0.0,NW,3.9,Tiantan
2,3,2013,3,1,2,6.0,6.0,4.0,12.0,300.0,75.0,-1.2,1025.3,-24.6,0.0,NNW,5.3,Tiantan
3,4,2013,3,1,3,6.0,6.0,4.0,12.0,300.0,74.0,-1.4,1026.2,-25.5,0.0,N,4.9,Tiantan
4,5,2013,3,1,4,5.0,5.0,7.0,15.0,400.0,70.0,-1.9,1027.1,-24.5,0.0,NNW,3.2,Tiantan


## Date Time Formatting
Crucial to time series. 

In [4]:
time['Date'] = pd.to_datetime(time[['year','month','day','hour']])
time = time.set_index('Date')

In [5]:
time.shape

(35064, 18)

### Dealing with Nan Values
After looking at the data, we wanted to look at null values and figure out what to do with these. 

We first calculated the mean and media for all of the values. These values varied wildly and we were concerned they would have a negative effect on the predictions. 

However, upon further research, other methods have proven more effective in inputing the data for time series such as Interpolating Time, however because we have hourly time this was not the best method. We elected to try imputing using the InterpolateLinear method. 

In [6]:
time.isna().sum()

No            0
year          0
month         0
day           0
hour          0
PM2.5       677
PM10        597
SO2        1118
NO2         744
CO         1126
O3          843
TEMP         20
PRES         20
DEWP         20
RAIN         20
wd           78
WSPM         14
station       0
dtype: int64

In [7]:
time = time.interpolate(method='time')

## PM2.5 (TARGET VARIABLE)

In [8]:
print('total PM2.5 Missing Values: ', time['PM2.5'].isna().sum())
print('mean: ', time['PM2.5'].mean())
print('median: ', time['PM2.5'].median())

total PM2.5 Missing Values:  0
mean:  82.03309662331738
median:  58.0


## PM10

In [9]:
print('total PM10 Missing Values: ', time['PM10'].isna().sum())
print('mean: ', time['PM10'].mean())
print('median: ', time['PM10'].median())

total PM10 Missing Values:  0
mean:  106.53707648870636
median:  85.0


## SO2

In [10]:
print('total SO2 Missing Values: ', time['SO2'].isna().sum())
print('mean: ', time['SO2'].mean())
print('median: ', time['SO2'].median())

total SO2 Missing Values:  0
mean:  14.510017738991555
median:  7.0


## NO2

In [11]:
print('total NO2 Missing Values: ', time['NO2'].isna().sum())
print('mean: ', time['NO2'].mean())
print('median: ', time['NO2'].median())

total NO2 Missing Values:  0
mean:  53.25882833532967
median:  47.0


## CO

In [12]:
print('total CO Missing Values: ', time['CO'].isna().sum())
print('mean: ', time['CO'].mean())
print('median: ', time['CO'].median())

total CO Missing Values:  0
mean:  1305.3332620351357
median:  900.0


## O3

In [13]:
print('total O3 Missing Values: ', time['O3'].isna().sum())
print('mean: ', time['O3'].mean())
print('median: ', time['O3'].median())

total O3 Missing Values:  0
mean:  56.14807717887289
median:  40.0


## TEMP

In [14]:
print('total TEMP Missing Values: ', time['TEMP'].isna().sum())
print('mean: ', time['TEMP'].mean())
print('median: ', time['TEMP'].median())

total TEMP Missing Values:  0
mean:  13.668249517775271
median:  14.6


## PRES

In [15]:
print('total PRES Missing Values: ', time['PRES'].isna().sum())
print('mean: ', time['PRES'].mean())
print('median: ', time['PRES'].median())

total PRES Missing Values:  0
mean:  1012.5518711023906
median:  1012.2


## DEWP

In [16]:
print('total DEWP Missing Values: ', time['DEWP'].isna().sum())
print('mean: ', time['DEWP'].mean())
print('median: ', time['DEWP'].median())

total DEWP Missing Values:  0
mean:  2.4451260552133287
median:  3.0


## RAIN

In [17]:
print('total RAIN Missing Values: ', time['RAIN'].isna().sum())
print('mean: ', time['RAIN'].mean())
print('median: ', time['RAIN'].median())

total RAIN Missing Values:  0
mean:  0.06398300250969628
median:  0.0


## wd

wd is wind direction and is not an integer. 

In [30]:
time.wd.dropna(axis = 0, inplace = True)

In [31]:
print('total wd Missing Values: ', time.wd.isna().sum())

total wd Missing Values:  0


## WSPM

In [28]:
print('total WSPM Missing Values: ', time['WSPM'].isna().sum())
print('mean: ', time['WSPM'].mean())
print('median: ', time['WSPM'].median())

total WSPM Missing Values:  0
mean:  1.8608145106091456
median:  1.5


### Nan Values Filled, check. 

In [32]:
time.isna().sum()

No          0
year        0
month       0
day         0
hour        0
PM2.5       0
PM10        0
SO2         0
NO2         0
CO          0
O3          0
TEMP        0
PRES        0
DEWP        0
RAIN        0
wd         78
WSPM        0
station     0
dtype: int64

In [63]:
path = r'DATA/' 
allFiles = glob.glob(path + '/*.csv')
time = pd.DataFrame()
list_ = []
i = 1

for file_ in allFiles:
    x = pd.read_csv(file_,index_col = None,header = 0)
    y = df['PM2.5']
    X_train, X_test, y_train, y_test = train_test_split(x,y)
    
    df['Date'] = pd.to_datetime(df[['year','month','day','hour']])
    df = df.set_index('Date')
    df = df.interpolate(method='time')
    
    time.concatenate(df)
    list_.append(df)

ValueError: not enough values to unpack (expected 4, got 2)

In [59]:
time

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,3.0,6.0,3.0,8.0,300.0,44.0,-0.9,1025.8,-20.5,0.0,NW,9.3,Shunyi
1,2,2013,3,1,1,12.0,12.0,3.0,7.0,300.0,47.0,-1.1,1026.1,-21.3,0.0,NW,9.4,Shunyi
2,3,2013,3,1,2,14.0,14.0,,7.0,200.0,22.0,-1.7,1026.2,-23.0,0.0,NW,8.6,Shunyi
3,4,2013,3,1,3,12.0,12.0,3.0,5.0,,,-2.1,1027.3,-23.3,0.0,NW,6.6,Shunyi
4,5,2013,3,1,4,12.0,12.0,3.0,,200.0,11.0,-2.4,1027.7,-22.9,0.0,NW,4.5,Shunyi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,35060,2017,2,28,19,27.0,72.0,8.0,92.0,800.0,16.0,10.3,1014.2,-12.4,0.0,W,1.8,Shunyi
35060,35061,2017,2,28,20,47.0,55.0,17.0,86.0,1100.0,19.0,9.8,1014.5,-9.9,0.0,NW,1.5,Shunyi
35061,35062,2017,2,28,21,18.0,28.0,4.0,30.0,500.0,64.0,9.1,1014.6,-12.7,0.0,NE,1.7,Shunyi
35062,35063,2017,2,28,22,18.0,20.0,9.0,33.0,500.0,59.0,7.1,1015.2,-13.2,0.0,WNW,1.8,Shunyi


In [None]:
1. read in each df
2. train test split
3. interpolate train and test
4. interpolate y train and y test
5. concatenate to l