In [15]:
import pandas as pd # Data handling 
import numpy as np

import tensorflow as tf # Neural networks  
from tensorflow import keras
from tensorflow.keras import layers

import plotly.graph_objects as go # Visualization
import plotly.express as px

import sklearn as sk # Stats / ML
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV, TimeSeriesSplit
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import impute

from xgboost import XGBRegressor # Efficient Gradient Boosting

In [16]:
# Load the imputed train/test tables, using CSV column 3 as the row index
# (for df_test it shows up as `week_start_date` in the displayed output).
df_train, df_test = (
    pd.read_csv(csv_path, index_col=[3])
    for csv_path in (
        "../data/processed/train_imputed.csv",
        "../data/processed/test_imputed.csv",
    )
)

# Template frame that the predictions are later written into.
subm = pd.read_csv("../data/raw/submission_format.csv")

display(df_test)

Unnamed: 0_level_0,year,weekofyear,city,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2008-04-29,2008,18,1,-0.018900,-0.018900,0.102729,0.091200,298.492857,298.550000,294.527143,301.1,25.37,78.781429,78.60,3.128571,26.528571,7.057143,33.3,21.7,75.2
2008-05-06,2008,19,1,-0.018000,-0.012400,0.082043,0.072314,298.475714,298.557143,294.395714,300.8,21.83,78.230000,12.56,2.571429,26.071429,5.557143,30.0,22.2,34.3
2008-05-13,2008,20,1,-0.001500,-0.004324,0.151083,0.091529,299.455714,299.357143,295.308571,302.2,4.12,78.270000,3.66,4.428571,27.928571,7.785714,32.8,22.8,3.0
2008-05-20,2008,21,1,0.202833,-0.019867,0.124329,0.125686,299.690000,299.728571,294.402857,303.0,2.20,73.015714,0.00,4.342857,28.057143,6.271429,33.3,24.4,0.3
2008-05-27,2008,22,1,0.056800,0.039833,0.062267,0.075914,299.780000,299.671429,294.760000,302.3,4.36,74.084286,0.76,3.542857,27.614286,7.085714,33.3,23.3,84.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-05-28,2013,22,0,0.301471,0.380029,0.280629,0.383186,297.774286,298.964286,295.638571,305.5,67.60,89.990000,41.12,10.100000,27.400000,9.050000,32.6,21.8,33.0
2013-06-04,2013,23,0,0.247600,0.296343,0.285371,0.350357,297.167143,298.328571,295.845714,306.3,45.70,93.891429,71.52,9.657143,27.520000,10.720000,33.8,21.4,68.0
2013-06-11,2013,24,0,0.238729,0.251029,0.252586,0.249771,295.831429,296.607143,294.894286,304.6,45.22,94.967143,78.96,7.385714,27.200000,10.075000,32.6,21.6,93.2
2013-06-18,2013,25,0,0.310429,0.302700,0.406614,0.403943,295.778571,297.400000,293.648571,305.9,4.70,89.057143,39.54,8.228571,26.700000,8.480000,32.2,21.8,34.1


## Gradient Boosting

In [17]:
# Feature matrix (everything except the target) and a one-column target frame.
X = df_train.drop(columns='total_cases')
y = df_train[['total_cases']]

# Candidate hyper-parameters; the keys are XGBoost's native parameter names.
param = {
    'max_depth': [3, 4, 5],  # maximum depth of each tree
    'eta': [.01, 1],         # step-size shrinkage applied after each boosting round
    'gamma': [0, 1],         # minimum loss reduction required to split a leaf further
    'n_estimators': [100],   # number of boosted trees
    'lambda': [2, 4],        # L2 regularization coefficient
    'alpha': [1, 3],         # L1 regularization coefficient
}

xgb = XGBRegressor(objective='reg:squarederror')

# NOTE(review): TimeSeriesSplit builds folds from row order, so it presumes
# df_train is sorted chronologically. If the rows are grouped by city first,
# the folds are not purely temporal. Confirm the ordering of the input file.
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search scored by (negated) mean absolute error; fit() returns the
# search object itself, so construction and fitting can be chained.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
).fit(X, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   23.8s finished


In [18]:
# Show the hyper-parameter combination that scored best in the grid search above.
print(grid_search.best_params_)

{'alpha': 1, 'eta': 0.01, 'gamma': 0, 'lambda': 2, 'max_depth': 5, 'n_estimators': 100}


In [19]:
# In-sample sanity check: predict on the training features and overlay the
# predictions on the observed case counts.
pred = grid_search.predict(df_train.drop('total_cases', axis=1))

# NOTE: `plt` here is a plotly Figure, not matplotlib.pyplot; the name is kept
# in case other cells refer to it.
plt = go.Figure(data=[
    go.Scatter(x=df_train.index, y=df_train['total_cases'], name='observed'),
    go.Scatter(x=df_train.index, y=pred, name='predicted (in-sample)'),
])

# Label the figure so the two traces can be told apart when skimming.
plt.update_layout(
    title='total_cases: observed vs. in-sample prediction (all cities)',
    xaxis_title='week_start_date',
    yaxis_title='total_cases',
)

plt.show()

In [20]:
# Predict total cases for the test features with the fitted grid search.
test_pred = grid_search.predict(df_test)

# Disabled experiment: re-indexing both frames by (city, year, weekofyear).
# NOTE(review): with these lines off, the predictions are written into `subm`
# by row position, which presumes df_test and subm share the same row order.
# Confirm against the raw files.
# df_test.set_index(['city', 'year', 'weekofyear'], inplace=True)
# subm.set_index(['city', 'year', 'weekofyear'], inplace=True)

In [21]:
# Write the predictions into the submission frame. `test_pred` is a plain
# array, so the assignment is positional (row i of subm gets prediction i).
subm['total_cases'] = test_pred

# Round to whole case counts with a vectorized call.
# No reset_index() here: `subm` still carries the default RangeIndex from
# read_csv, so resetting it would only insert a stray 'index' column that ends
# up in the submission CSV (and makes the cell fail when re-run).
subm['total_cases'] = subm['total_cases'].round().astype(int)

In [22]:
# Persist the submission; index=False keeps the row index out of the file.
submission_path = '../data/subms/total_test.csv'
subm.to_csv(submission_path, index=False)

Now let's try splitting the data by city, training a separate model on each city, and then putting the results back together.

In [23]:
display(df_test)

# Split train and test by the encoded `city` column so each city can get its
# own model. Encoding follows the original split: 1 -> `sj`, 0 -> `iq`.
# Each frame is masked on its own `city` column, so this cell does not depend
# on `X` from the grid-search cell.
sj_train = df_train.loc[df_train['city'] == 1]
sj_train_labels = sj_train['total_cases']
# .copy() because a later cell adds a `total_cases` column to the test slices.
sj_test = df_test.loc[df_test['city'] == 1].copy()

iq_train = df_train.loc[df_train['city'] == 0]
iq_train_labels = iq_train['total_cases']
iq_test = df_test.loc[df_test['city'] == 0].copy()

Unnamed: 0_level_0,year,weekofyear,city,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2008-04-29,2008,18,1,-0.018900,-0.018900,0.102729,0.091200,298.492857,298.550000,294.527143,301.1,25.37,78.781429,78.60,3.128571,26.528571,7.057143,33.3,21.7,75.2
2008-05-06,2008,19,1,-0.018000,-0.012400,0.082043,0.072314,298.475714,298.557143,294.395714,300.8,21.83,78.230000,12.56,2.571429,26.071429,5.557143,30.0,22.2,34.3
2008-05-13,2008,20,1,-0.001500,-0.004324,0.151083,0.091529,299.455714,299.357143,295.308571,302.2,4.12,78.270000,3.66,4.428571,27.928571,7.785714,32.8,22.8,3.0
2008-05-20,2008,21,1,0.202833,-0.019867,0.124329,0.125686,299.690000,299.728571,294.402857,303.0,2.20,73.015714,0.00,4.342857,28.057143,6.271429,33.3,24.4,0.3
2008-05-27,2008,22,1,0.056800,0.039833,0.062267,0.075914,299.780000,299.671429,294.760000,302.3,4.36,74.084286,0.76,3.542857,27.614286,7.085714,33.3,23.3,84.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-05-28,2013,22,0,0.301471,0.380029,0.280629,0.383186,297.774286,298.964286,295.638571,305.5,67.60,89.990000,41.12,10.100000,27.400000,9.050000,32.6,21.8,33.0
2013-06-04,2013,23,0,0.247600,0.296343,0.285371,0.350357,297.167143,298.328571,295.845714,306.3,45.70,93.891429,71.52,9.657143,27.520000,10.720000,33.8,21.4,68.0
2013-06-11,2013,24,0,0.238729,0.251029,0.252586,0.249771,295.831429,296.607143,294.894286,304.6,45.22,94.967143,78.96,7.385714,27.200000,10.075000,32.6,21.6,93.2
2013-06-18,2013,25,0,0.310429,0.302700,0.406614,0.403943,295.778571,297.400000,293.648571,305.9,4.70,89.057143,39.54,8.228571,26.700000,8.480000,32.2,21.8,34.1


KeyError: "None of ['week_start_date'] are in the columns"

In [None]:
# GridSearchCV.fit() returns the search object itself, so fitting the shared
# `grid_search` twice would bind both names to the SAME object, fitted on
# whichever data came last. Clone the (unfitted) search configuration so each
# city gets its own independently fitted estimator.
sj_est = sk.base.clone(grid_search).fit(sj_train.drop('total_cases', axis=1), sj_train_labels)
iq_est = sk.base.clone(grid_search).fit(iq_train.drop('total_cases', axis=1), iq_train_labels)

In [None]:
# In-sample fit check per city.
# NOTE(review): `sj` is predicted with `tst`, a default (untuned) XGBRegressor,
# while `iq` uses the grid-searched `iq_est`. Confirm this asymmetry is
# intentional.
tst = XGBRegressor()
tst.fit(sj_train.drop('total_cases', axis=1), sj_train_labels)
sj_pred = tst.predict(sj_train.drop('total_cases', axis=1))
iq_pred = iq_est.predict(iq_train.drop('total_cases', axis=1))

plt_sj = go.Figure(data=[
    go.Scatter(x=sj_train.index, y=sj_train['total_cases'], name='observed'),
    go.Scatter(x=sj_train.index, y=sj_pred, name='predicted (in-sample)'),
])
plt_sj.update_layout(
    title='sj: total_cases observed vs. in-sample prediction',
    xaxis_title='week_start_date',
    yaxis_title='total_cases',
)

plt_iq = go.Figure(data=[
    go.Scatter(x=iq_train.index, y=iq_train['total_cases'], name='observed'),
    go.Scatter(x=iq_train.index, y=iq_pred, name='predicted (in-sample)'),
])
plt_iq.update_layout(
    title='iq: total_cases observed vs. in-sample prediction',
    xaxis_title='week_start_date',
    yaxis_title='total_cases',
)

plt_sj.show()
plt_iq.show()

In [None]:
# Predict on each city's test features. Dropping `total_cases` when it is
# already present (i.e. this cell ran before) keeps the feature set identical
# to training and makes the cell safe to re-run.
sj_test_pred = tst.predict(sj_test.drop(columns='total_cases', errors='ignore'))
iq_test_pred = iq_est.predict(iq_test.drop(columns='total_cases', errors='ignore'))

# assign() returns a new frame instead of writing into a slice of df_test,
# which avoids SettingWithCopyWarning; the names end up holding the same
# "features + total_cases" frames as before.
sj_test = sj_test.assign(total_cases=sj_test_pred)
iq_test = iq_test.assign(total_cases=iq_test_pred)