# Beijing Air Quality

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

%matplotlib inline

In [None]:
path = r'PRSA_Data_20130301-20170228/'
allFiles = glob.glob(path + '/*.csv')
from sklearn.model_selection import train_test_split
air = pd.DataFrame()
trains = []
tests = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col = None,header = 0)
    df['Date'] = pd.to_datetime(df[['year','month','day','hour']])
    df = df.set_index('Date')
    X = df.drop(columns=['PM2.5','year','month','day','hour','No'], axis = 1)
    y = df['PM2.5']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    train = pd.concat([X_train,y_train], axis = 1, ignore_index=True)
    train.columns = ['PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN','wd', 'WSPM', 'station','PM2.5']
    test = pd.concat([X_test,y_test], axis = 1, ignore_index=True)
    test.columns = ['PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN','wd', 'WSPM', 'station','PM2.5']
    trains.append(train)
    tests.append(test)
    for train in trains:
        train = train.interpolate(method = 'time')
        train.wd = train.wd.fillna(method = 'ffill')
    for test in tests:
        test = test.interpolate(method = 'time')
        test.wd = test.wd.fillna(method = 'ffill')

df = pd.concat(trains)
df = df.sort_values(['station', 'Date'])

test = pd.concat(tests)

test = test.sort_values(['station', 'Date'])

type(tests)


# Exploration of one data set. With sights set on the rest of them

In [None]:
#time = pd.read_csv('PRSA_Data_20130301-20170228/PRSA_Data_Tiantan_20130301-20170228.csv')

In [5]:
air.head()

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin


In [6]:
air.shape

(420768, 18)

In [8]:
air['Date'] = pd.to_datetime(air[['year','month','day','hour']])


In [9]:
air.head()

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station,Date
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin,2013-03-01 00:00:00
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin,2013-03-01 01:00:00
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin,2013-03-01 02:00:00
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin,2013-03-01 03:00:00
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin,2013-03-01 04:00:00


In [10]:
air.isna().sum()


No             0
year           0
month          0
day            0
hour           0
PM2.5       8739
PM10        6449
SO2         9021
NO2        12116
CO         20701
O3         13277
TEMP         398
PRES         393
DEWP         403
RAIN         390
wd          1822
WSPM         318
station        0
Date           0
dtype: int64

In [11]:
air.dtypes

No                  int64
year                int64
month               int64
day                 int64
hour                int64
PM2.5             float64
PM10              float64
SO2               float64
NO2               float64
CO                float64
O3                float64
TEMP              float64
PRES              float64
DEWP              float64
RAIN              float64
wd                 object
WSPM              float64
station            object
Date       datetime64[ns]
dtype: object

In [24]:
air.describe(include = 'all')

  time.describe(include = 'all')


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station,Date
count,35064.0,35064.0,35064.0,35064.0,35064.0,34387.0,34467.0,33946.0,34320.0,33938.0,34221.0,35044.0,35044.0,35044.0,35044.0,34986,35050.0,35064,35064
unique,,,,,,,,,,,,,,,,16,,1,35064
top,,,,,,,,,,,,,,,,ENE,,Tiantan,2015-09-21 23:00:00
freq,,,,,,,,,,,,,,,,3861,,35064,1
first,,,,,,,,,,,,,,,,,,,2013-03-01 00:00:00
last,,,,,,,,,,,,,,,,,,,2017-02-28 23:00:00
mean,17532.5,2014.66256,6.52293,15.729637,11.5,82.164911,106.363672,14.367615,53.162646,1298.303318,55.984297,13.67149,1012.547419,2.447535,0.06402,,1.860785,,
std,10122.249256,1.177213,3.448752,8.800218,6.922285,80.921384,89.700157,20.144631,31.946224,1170.593297,59.081528,11.458418,10.266059,13.810696,0.786282,,1.280368,,
min,1.0,2013.0,1.0,1.0,0.0,3.0,2.0,0.5712,2.0,100.0,0.4284,-16.8,987.1,-35.3,0.0,,0.0,,
25%,8766.75,2014.0,4.0,8.0,5.75,22.0,41.0,3.0,28.0,500.0,8.0,3.1,1004.0,-8.8,0.0,,1.0,,


In [25]:
from statsmodels.tsa.seasonal import seasonal_decompose

result = seasonal_decompose(air['PM2.5'], model='multiplicative')  # model='mul' also works
result.plot();

ValueError: This function does not handle missing values