# Beijing Air-Quality Time Series Project
### Data Cleaning Notebook

by Dolci Sanders and Paul Torres


### Import Libraries

In [1]:
import glob
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

%matplotlib inline

### Read in Data

Our Data is provided by the UCI Machine Learning Repository
Beijing Multi-Site Air-Quality Data

Once read in, we will look at the head and convert our time columns into a datetime index. 

We have 12 data sets, one from each reporting site, to concatenate together to get the whole picture of Beijing's Air Quality. 


In [2]:
# Load a single station (Tiantan) first so one table can be inspected
# before all twelve station tables are combined further down.
tiantan_csv_path = 'DATA/PRSA_Data_Tiantan_20130301-20170228.csv'
time = pd.read_csv(tiantan_csv_path, index_col=None)

In [3]:
# Distinct calendar years present in this station's table.
time['year'].unique()

array([2013, 2014, 2015, 2016, 2017])

### Date Time Formatting
Crucial to time series. 

In [4]:
# Assemble one timestamp per row from the year/month/day/hour columns,
# then make that timestamp the index (named 'Date').
row_timestamps = pd.to_datetime(time[['year', 'month', 'day', 'hour']])
time = time.assign(Date=row_timestamps).set_index('Date')

In [5]:
# (rows, columns) of the single-station frame now that 'Date' is the index.
time.shape

(35064, 18)

### Dealing with Nan Values
After looking at the data, we wanted to look at null values and figure out what to do with these. 

We first calculated the mean and median for all of the values. These values varied wildly and we were concerned they would have a negative effect on the predictions. 

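However, upon further research, other methods have proven more effective for imputing time series data, such as time-based interpolation; because we have hourly data we initially judged this not to be the best method and elected to try linear interpolation. Note that the code below calls `interpolate(method='time')`; on an evenly spaced hourly index this produces the same values as linear interpolation. 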

Because we are doing univariate time series, however, we will not need to worry about the other features at this time. 

In [6]:
# Count the missing entries in every column.
time.isnull().sum()

No            0
year          0
month         0
day           0
hour          0
PM2.5       677
PM10        597
SO2        1118
NO2         744
CO         1126
O3          843
TEMP         20
PRES         20
DEWP         20
RAIN         20
wd           78
WSPM         14
station       0
dtype: int64

In [7]:
# Fill gaps by interpolating along the datetime index.
# Only the numeric columns are interpolated: the frame also holds text
# columns ('wd', 'station'), and calling interpolate() on object-dtype
# columns is deprecated in recent pandas. Text columns are left untouched,
# which is also what older pandas versions did.
numeric_cols = time.select_dtypes(include='number').columns
time[numeric_cols] = time[numeric_cols].interpolate(method='time')

### PM2.5 (Target Variable)
You can see this value now has no missing values. 

In [8]:
# Check that the target column has no gaps left and report its mean and median.
pm25 = time['PM2.5']
print('total PM2.5 Missing Values: ', pm25.isna().sum())
print('mean: ', pm25.mean())
print('median: ', pm25.median())

total PM2.5 Missing Values:  0
mean:  82.03309662331738
median:  58.0


## Combine 12 Testing Site Tables
We have 12 testing sites with a table for each. 
So we built a loop to automate the concatenation of all of these. 
We loop through the test sites once to create a whole dataframe for visualizations, and in a separate loop we train-test split each table, interpolate the train and then the test portion using the time method, and finally back fill the stubborn few missing values that remain in the test portion.

Each of these will interpolate each table separately as shown with one table above, and then concatenate them for the most accurate results. 

The first will keep the whole data set, in case we want to do some additional visualizations. 
The second will be our test train split, which will be Date and PM2.5 values only. 


### Time DataFrame, import all as a whole, interpolate by table
Here we have the full data frame, not split with all the columns. This is not resampled to the mean. 

In [14]:
# From the DATA folder, run through all of the individual station csv files
# and build one full-feature frame (`time`) for visualizations.
path = r'DATA/'
# `path` already ends with a separator, so only the file pattern is appended.
allFiles = glob.glob(path + '*.csv')

# One interpolated frame per station; concatenated once after the loop.
times = []

for file_ in allFiles:

    # Read in and index each row by the timestamp built from its date parts.
    df = pd.read_csv(file_, index_col=None, header=0)
    df['Date'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
    df = df.set_index('Date')
    # Limit to readings up to the end of 2016.
    df = df.loc[:'2016-12-31 23:00:00']

    # Only the 'No' column is dropped here; every other feature is kept.
    df = df.drop(columns=['No'])

    # Interpolate each station's table on its own, along the datetime index.
    # Restricted to numeric columns because interpolate() on object-dtype
    # columns ('wd', 'station') is deprecated in recent pandas; those text
    # columns are left as they are.
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].interpolate(method='time')
    times.append(df)

# Combine the per-station frames and order them by station, then timestamp.
time = pd.concat(times)
time = time.sort_values(['station', 'Date'])
time.head()

Unnamed: 0_level_0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-03-01 00:00:00,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
2013-03-01 01:00:00,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2013-03-01 02:00:00,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin
2013-03-01 03:00:00,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin
2013-03-01 04:00:00,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin


### Univariate Train Test Split Interpolation with 80-20 Split

In [10]:
# From the DATA folder, run through all of the individual station csv files
# and build the combined daily-mean PM2.5 train and test frames.
path = r'DATA/'
# `path` already ends with a separator, so only the file pattern is appended.
allFiles = glob.glob(path + '*.csv')

# Share of each station's (chronologically ordered) rows used for training.
TRAIN_FRACTION = 0.8

# Per-station pieces; concatenated once after the loop.
trains = []
tests = []

for file_ in allFiles:

    # Read in and index each row by the timestamp built from its date parts.
    df = pd.read_csv(file_, index_col=None, header=0)
    df['Date'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
    df = df.set_index('Date')
    # Limit to readings up to the end of 2016.
    df = df.loc[:'2016-12-31 23:00:00']

    # Univariate series: keep only the target column. Selecting it explicitly
    # also leaves out the text 'station' column, which mean() cannot
    # aggregate on pandas >= 2.0 (older versions silently dropped it).
    df = df[['PM2.5']]

    # Down-sample the hourly readings to daily means.
    df = df.resample('D').mean()

    # Split before imputing so test values never inform the train portion.
    split_at = int(df.shape[0] * TRAIN_FRACTION)
    station_train = df[:split_at]
    station_test = df[split_at:]

    # Interpolate the train portion along the datetime index.
    station_train = station_train.interpolate(method='time')
    trains.append(station_train)

    # Interpolate the test portion, then back fill whatever is still missing
    # (gaps at the very start of the slice have no earlier value to
    # interpolate from).
    station_test = station_test.interpolate(method='time')
    station_test['PM2.5'] = station_test['PM2.5'].bfill()
    tests.append(station_test)

# Combine the per-station pieces and order each result by date.
train = pd.concat(trains)
train = train.sort_values(['Date'])
test = pd.concat(tests)
test = test.sort_values(['Date'])

### Individual Stations not Concatenated, callable from a list
This will allow us to predict on individual sites (stations) if time allows. 

In [19]:
# From the DATA folder, run through all of the individual station csv files
# and keep an hourly train/test pair per station (not concatenated).
path = r'DATA/'
# `path` already ends with a separator, so only the file pattern is appended.
allFiles = glob.glob(path + '*.csv')

# One entry per station file, in the order glob returned the files.
all_trains = []
all_tests = []

for file_ in allFiles:

    # Read in and index each row by the timestamp built from its date parts.
    df = pd.read_csv(file_, index_col=None, header=0)
    df['Date'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
    df = df.set_index('Date')
    # Limit to readings up to the end of 2016.
    df = df.loc[:'2016-12-31 23:00:00']

    # Keep the target plus the station label that identifies the frame.
    df = df[['PM2.5', 'station']]

    # Split before imputing so test values never inform the train portion.
    # The slices get their own names (not `train` / `test`) so the combined
    # train and test frames built earlier in the notebook are not overwritten
    # before they are pickled.
    split_at = int(df.shape[0] * 0.8)
    station_train = df[:split_at].copy()
    station_test = df[split_at:].copy()

    # Interpolate only the numeric target; 'station' is text and
    # interpolate() on object-dtype columns is deprecated in recent pandas.
    station_train['PM2.5'] = station_train['PM2.5'].interpolate(method='time')
    all_trains.append(station_train)

    # Interpolate the test portion, then back fill whatever is still missing
    # (gaps at the very start of the slice have no earlier value to
    # interpolate from).
    station_test['PM2.5'] = station_test['PM2.5'].interpolate(method='time').bfill()
    all_tests.append(station_test)

In [44]:
# Look each frame up by the station name stored in its own 'station' column
# instead of by list position: glob returns files in arbitrary order, so a
# fixed index -> name mapping can attach the wrong name to a station.
tests_by_station = {frame['station'].iloc[0]: frame for frame in all_tests}
trains_by_station = {frame['station'].iloc[0]: frame for frame in all_trains}

# Test portion for each station.
Gucheng = tests_by_station['Gucheng']
Huairou = tests_by_station['Huairou']
Tiantan = tests_by_station['Tiantan']
Changping = tests_by_station['Changping']
Guanyuan = tests_by_station['Guanyuan']
Nongzhanguan = tests_by_station['Nongzhanguan']
Wanliu = tests_by_station['Wanliu']
Dongsi = tests_by_station['Dongsi']
Wanshouxigong = tests_by_station['Wanshouxigong']
Aotizhongxin = tests_by_station['Aotizhongxin']
Dingling = tests_by_station['Dingling']
Shunyi = tests_by_station['Shunyi']

# Train portion for each station.
trainGucheng = trains_by_station['Gucheng']
trainHuairou = trains_by_station['Huairou']
trainTiantan = trains_by_station['Tiantan']
trainChangping = trains_by_station['Changping']
trainGuanyuan = trains_by_station['Guanyuan']
trainNongzhanguan = trains_by_station['Nongzhanguan']
trainWanliu = trains_by_station['Wanliu']
trainDongsi = trains_by_station['Dongsi']
trainWanshouxigong = trains_by_station['Wanshouxigong']
trainAotizhongxin = trains_by_station['Aotizhongxin']
trainDingling = trains_by_station['Dingling']
trainShunyi = trains_by_station['Shunyi']

### Pickle the Time DF and the Train and Test DFs, then proceed to the EDA Notebook
The Time DF will be used for visualizations
The Train Test Split will be used in Predictions as well as for comparative visuals later on. 

In [45]:

# Make sure the output folder exists so the writes below do not fail on a
# fresh checkout.
os.makedirs('PKL', exist_ok=True)

# Combined frames.
test.to_pickle('PKL/test.pkl')
train.to_pickle('PKL/train.pkl')
time.to_pickle('PKL/time.pkl')

# Station Sites

# Test portion per station, written to PKL/<station>.pkl
station_tests = {
    'Aotizhongxin': Aotizhongxin,
    'Changping': Changping,
    'Dingling': Dingling,
    'Dongsi': Dongsi,
    'Guanyuan': Guanyuan,
    'Gucheng': Gucheng,
    'Huairou': Huairou,
    'Nongzhanguan': Nongzhanguan,
    'Shunyi': Shunyi,
    'Tiantan': Tiantan,
    'Wanliu': Wanliu,
    'Wanshouxigong': Wanshouxigong,
}
for station_name, station_frame in station_tests.items():
    station_frame.to_pickle('PKL/' + station_name + '.pkl')

# Train portion per station, written to PKL/train<station>.pkl
station_trains = {
    'Gucheng': trainGucheng,
    'Huairou': trainHuairou,
    'Tiantan': trainTiantan,
    'Changping': trainChangping,
    'Guanyuan': trainGuanyuan,
    'Nongzhanguan': trainNongzhanguan,
    'Wanliu': trainWanliu,
    'Dongsi': trainDongsi,
    'Wanshouxigong': trainWanshouxigong,
    'Aotizhongxin': trainAotizhongxin,
    'Dingling': trainDingling,
    'Shunyi': trainShunyi,
}
for station_name, station_frame in station_trains.items():
    station_frame.to_pickle('PKL/train' + station_name + '.pkl')