In [53]:
from os.path import join
import pandas as pd
import seaborn as sns
import warnings

import pymc3 as pm

from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.tree import DecisionTreeRegressor


from deep_aqi import ROOT


pd.set_option('expand_frame_repr', False)
pd.set_option('max_columns', 50)
pd.set_option('max_rows', 25)
pd.options.mode.chained_assignment = None 

In [3]:
# warnings.filterwarnings("ignore")

In [4]:
def show_result(results):
    print(results.describe())
    results = results.reset_index()
    results = results.melt(id_vars='index', value_vars=['MAE', 'MSE', 'MdAE', 'R2'])
    results = results.loc[results.variable != 'MSE', :]
    
    sns.boxplot(x=results['variable'], y=results['value'])

In [5]:
PROCESSED_DATA = join(ROOT, 'data', 'processed')
INTERIM_DATA = join(ROOT, 'data', 'interim')
RAW_DATA = join(ROOT, 'data', 'raw')

In [50]:
file_path = join(PROCESSED_DATA, '88101.parquet')

data_source = pd.read_parquet(file_path)
target_column = data_source.columns[-1]

In [51]:
focus_stations = data_source.SiteCode.value_counts()[:4].index
data_source = data_source.loc[data_source.SiteCode.isin(focus_stations), :]
data_source.head()

Unnamed: 0_level_0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum,PM2.5 - Local Conditions
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4125,Louisiana_East Baton Rouge_9.0,2014-01-01 04:00:00,116.0,6.804044,7.222222,101700.0,97.0,23.0
4126,Louisiana_East Baton Rouge_9.0,2014-01-02 07:00:00,310.0,11.858476,13.333333,100900.0,99.0,9.8
4127,Louisiana_East Baton Rouge_9.0,2014-01-03 13:00:00,60.0,6.804044,6.666667,102100.0,32.0,5.7
4128,Louisiana_East Baton Rouge_9.0,2014-01-04 02:00:00,294.0,1.749611,0.0,101600.0,74.0,6.6
4129,Louisiana_East Baton Rouge_9.0,2014-01-04 04:00:00,276.0,13.413686,-1.111111,101600.0,74.0,10.1


In [8]:
def simple_linear_1(data_source):
    to_concat = []
    for site_code, df in data_source.groupby(by='SiteCode'):

        X = df.loc[:, ['WindDir', 'WindSpeed', 'Temperature', 'Pressure', 'RelHum']]
        Y = df.loc[:, target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=23)

        model = LinearRegression()

        model.fit(X_train, y_train)


        y_pred = model.predict(X_test)


        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        mdae = median_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        mean = np.mean(df[df.columns[-1]])

        result = pd.DataFrame(index = [site_code],
                              data={'MAE': mae,
                                    'MSE': mse,
                                    'MdAE': mdae,
                                    'R2': r2,
                                    'MEAN': mean,
                                   })
        to_concat.append(result)

    return pd.concat(to_concat)


In [52]:
data_source

Unnamed: 0_level_0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum,PM2.5 - Local Conditions
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4125,Louisiana_East Baton Rouge_9.0,2014-01-01 04:00:00,116.0,6.804044,7.222222,101700.0,97.0,23.0
4126,Louisiana_East Baton Rouge_9.0,2014-01-02 07:00:00,310.0,11.858476,13.333333,100900.0,99.0,9.8
4127,Louisiana_East Baton Rouge_9.0,2014-01-03 13:00:00,60.0,6.804044,6.666667,102100.0,32.0,5.7
4128,Louisiana_East Baton Rouge_9.0,2014-01-04 02:00:00,294.0,1.749611,0.000000,101600.0,74.0,6.6
4129,Louisiana_East Baton Rouge_9.0,2014-01-04 04:00:00,276.0,13.413686,-1.111111,101600.0,74.0,10.1
4130,Louisiana_East Baton Rouge_9.0,2014-01-04 21:00:00,129.0,10.108865,10.000000,101000.0,94.0,16.4
4131,Louisiana_East Baton Rouge_9.0,2014-01-05 15:00:00,293.0,16.912908,16.666667,100400.0,91.0,0.5
4132,Louisiana_East Baton Rouge_9.0,2014-01-06 07:00:00,355.0,20.217729,-2.777778,102500.0,67.0,5.3
4133,Louisiana_East Baton Rouge_9.0,2014-01-06 09:00:00,353.0,16.912908,-2.222222,102800.0,58.0,5.2
4134,Louisiana_East Baton Rouge_9.0,2014-01-06 12:00:00,323.0,15.163297,0.555556,102600.0,36.0,5.7


In [60]:
data_source.columns

Index(['SiteCode', 'LocalDate', 'WindDir', 'WindSpeed', 'Temperature',
       'Pressure', 'RelHum', 'PM2.5 - Local Conditions'],
      dtype='object')

In [61]:
df = data_source

In [66]:
with pm.Model() as unpooled_model:


    a = pm.Normal('a', 0, sd=100, shape=1)
    b = pm.Normal('b', 0, sd=100, shape=1)
    c = pm.Normal('c', 0, sd=100, shape=1)
#     d = pm.Normal('d', 0, sd=100, shape=1)
#     e = pm.Normal('e', 0, sd=100, shape=1)
#     f = pm.Normal('f', 0, sd=100, shape=1)


    eps = pm.HalfCauchy('eps', 5)

    y_hat = a + b*df.WindDir + c*df.WindSpeed #+ d*df.Temperature + e*df.Pressure + f*df.RelHum

    # Data likelihood
    y = pm.Normal('y', y_hat, sd=eps, observed=data_source[target_column])

In [None]:
with unpooled_model:
    unpooled_trace = pm.sample(100, threads=4)


Sampling 2 chains:   0%|          | 0/1200 [00:00<?, ?draws/s][A
Sampling 2 chains:   1%|          | 13/1200 [00:00<00:09, 122.99draws/s][A
Sampling 2 chains:   2%|▏         | 23/1200 [00:00<00:10, 114.93draws/s][A
Sampling 2 chains:   3%|▎         | 31/1200 [00:00<00:11, 99.14draws/s] [A
Sampling 2 chains:   3%|▎         | 38/1200 [00:00<00:14, 82.00draws/s][A
Sampling 2 chains:   4%|▍         | 46/1200 [00:00<00:14, 77.71draws/s][A
Sampling 2 chains:   5%|▍         | 55/1200 [00:00<00:14, 79.42draws/s][A
Sampling 2 chains:   6%|▌         | 67/1200 [00:00<00:12, 87.98draws/s][A
Sampling 2 chains:   6%|▋         | 77/1200 [00:00<00:13, 85.75draws/s][A
Sampling 2 chains:   7%|▋         | 87/1200 [00:00<00:12, 88.37draws/s][A
Sampling 2 chains:   8%|▊         | 96/1200 [00:01<00:13, 83.98draws/s][A
Sampling 2 chains:   9%|▉         | 105/1200 [00:01<00:13, 81.23draws/s][A
Sampling 2 chains:  10%|▉         | 117/1200 [00:01<00:12, 88.21draws/s][A
Sampling 2 chains:  10%|█   