In [136]:
import scipy as sp
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels as stats
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
%matplotlib inline
import myutil_regr as myutil

In [144]:
import importlib
_ = importlib.reload(myutil)

## Get training datasets

In [138]:
#
# iq training data
#
dfx_train_iq = pd.read_csv('data/dengue_features_train_no_outliers_iq.csv')
dfx_train_iq.set_index('yearweekofyear', inplace=True)
dfy_train_iq = pd.read_csv('data/dengue_labels_train_iq.csv')
dfy_train_iq.set_index('yearweekofyear', inplace=True)
#dftrain_iq = pd.merge(dfx_train_iq, dfy_train_iq)
#dftrain_iq.set_index('yearweekofyear', inplace=True)
#
# sj training data
# 
dfx_train_sj = pd.read_csv('data/dengue_features_train_no_outliers_sj.csv')
dfx_train_sj.set_index('yearweekofyear', inplace=True)
dfy_train_sj = pd.read_csv('data/dengue_labels_train_sj.csv')
dfy_train_sj.set_index('yearweekofyear', inplace=True)
#dftrain_sj = pd.merge(dfx_train_sj, dfy_train_sj)
#dftrain_sj.set_index('yearweekofyear', inplace=True)

In [139]:
def regr_run(X, y, poly_degree=1, exploring=False):
        
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.33, random_state=42)
        
    if poly_degree > 1:
        poly = PolynomialFeatures(poly_degree, interaction_only=True)
        X_train = poly.fit_transform(X_train)
        X_valid = poly.fit_transform(X_valid)
        
    if exploring: print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
    
    # Create linear regression object
    regr = linear_model.LinearRegression()
    #regr = linear_model.Ridge(alpha = .5)
    #regr = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
    #regr = linear_model.Lasso(alpha = .1)
    #regr = linear_model.LassoLars(alpha = .1)
    #regr = linear_model.BayesianRidge()

    # Train the model using the training sets
    regr.fit(X_train, y_train.ravel())

    # Make predictions using the testing set
    y_pred = regr.predict(X_valid)
    #y_pred[ y_pred < 0] = 0
    y_pred = np.around(y_pred).astype('int')
    
    #print(np.hstack((y_valid, y_pred.reshape(y_pred.shape[0],1))))
    
    # The coefficients
    #print('Coefficients: \n', regr.coef_)
    # The mean squared error
    print("Mean absolute error: %.2f" % mean_absolute_error(y_valid, y_pred))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(y_valid, y_pred))
    
    return regr


## First make predictions without feature total_cases

In [143]:
# total cases is rightmost column
dfx_iq = dfx_train_iq.iloc[:,:-1].copy()

periods_iq = 2    # best 1
degree_iq = 1     # best 1
# scaler 1 for entire feature set
# scaler 2 for total_cases only
X_iq, scaler_iq, scaler_tc_iq = myutil.preprocess(dfx_iq, periods_iq)
y_iq = dfy_train_iq.values[periods_iq:,:]
regr_iq= regr_run(X_iq, y_iq, degree_iq, exploring=True)

(347, 36) (171, 36) (347, 1) (171, 1)
Mean absolute error: 7.25
Variance score: 0.02


In [135]:
# total cases is rightmost column
dfx_sj = dfx_train_sj.iloc[:,:-1].copy()

periods_sj = 1    # best 2
degree_sj = 1     # best 1
# scaler 1 for entire feature set
# scaler 2 for total_cases only
X_sj, scaler_sj, scaler_tc_sj = myutil.preprocess(dfx_sj, periods_sj)
y_sj = dfy_train_sj.values[periods_sj:,:]
regr_sj = regr_run(X_sj, y_sj, degree_sj, exploring=True)

(626, 18) (309, 18) (626, 1) (309, 1)
Mean absolute error: 27.98
Variance score: 0.05


## Then make predictions with feature total_cases

In [107]:
periods_iq = 1    # best 1
degree_iq = 1     # best 1
# scaler 1 for entire feature set
# scaler 2 for total_cases only
X_iq, scaler_iq, scaler_tc_iq = myutil.preprocess(dfx_train_iq.copy(), periods_iq)
y_iq = dfy_train_iq.values[periods_iq:,:]
regr_iq= regr_run(X_iq, y_iq, degree_iq, exploring=True)

(347, 19) (172, 19) (347, 1) (172, 1)
Mean absolute error: 4.16
Variance score: 0.49


In [108]:
periods_sj = 1    # best 2
degree_sj = 1     # best 1
# scaler 1 for entire feature set
# scaler 2 for total_cases only
X_sj, scaler_sj, scaler_tc_sj = myutil.preprocess(dfx_train_sj.copy(), periods_sj)
y_sj = dfy_train_sj.values[periods_sj:,:]
regr_sj = regr_run(X_sj, y_sj, degree_sj, exploring=True)

(626, 19) (309, 19) (626, 1) (309, 1)
Mean absolute error: 13.48
Variance score: 0.66


## Get test dataset and create predictions

In [105]:
dfx_test_iq = pd.read_csv('data/dengue_features_test_no_outliers_iq.csv')
dfx_test_iq.set_index('yearweekofyear', inplace=True)
dfx_test_sj = pd.read_csv('data/dengue_features_test_no_outliers_sj.csv')
dfx_test_sj.set_index('yearweekofyear', inplace=True)

### Predict San Juan first

In [106]:
# first prediction will require periods_sj data from training dataset along with training labels
feature_count_sj = len(dfx_train_sj.columns)
df_last_train_sj = dfx_train_sj.iloc[-periods_sj:,:].values.reshape(1,periods_sj*feature_count_sj)
X_last_train_sj = scaler_sj.transform(df_last_train_sj)
# predict, scale, set to zero if needed
y_pred = max([[0]],scaler_tc_sj.transform(y_sj[-1:,:]))
print(scaler_tc_sj.transform(regr_sj.predict(X_last_train_sj).reshape(1,1)))

# now, for every row in test dataset, append prior prediction to get a new one
predictions_sj = list()
sick_sj = y_pred
nptest_sj = dfx_test_sj.values
for i in range(0, nptest_sj.shape[0]):
    #print(sick_sj)
    X_sj = scaler_sj.transform(np.hstack((nptest_sj[i:i+1], sick_sj)))
    sick_sj = max([[0]],scaler_tc_sj.transform(regr_sj.predict(X_sj).reshape(1,1)))
    predictions_sj.append(int(round(sick_sj[0][0])))
np_predictions_sj = np.array(predictions_sj).reshape(len(predictions_sj),1)
#np_predictions_sj

[[-0.0231936]]


### Predict Iquito

In [54]:
# first prediction will require periods_sj data from training dataset along with training labels
feature_count_iq = len(dfx_train_iq.columns)
df_last_train_iq = dfx_train_iq.iloc[-periods_iq:,:].values.reshape(1,periods_iq*feature_count_iq)
X_last_train_iq = scaler_iq.transform(df_last_train_iq)
# predict, scale, set to zero if needed
y_pred = max([[0]],scaler_tc_iq.transform(regr_iq.predict(X_last_train_iq).reshape(1,1)))

# now, for every row in test dataset, append prior prediction to get a new one
predictions_iq = list()
sick_iq = y_pred
nptest_iq = dfx_test_iq.values
for i in range(0, nptest_iq.shape[0]):
    #print(sick_iq)
    sick_iq = np.array(y_pred).reshape(1,1)
    X_iq = scaler_iq.transform(np.hstack((nptest_iq[i:i+1], sick_iq)))
    sick_iq = max([[0]],scaler_tc_iq.transform(regr_iq.predict(X_iq).reshape(1,1)))
    predictions_iq.append(int(round(sick_iq[0][0])))
np_predictions_iq = np.array(predictions_iq).reshape(len(predictions_iq),1)
np_predictions_iq

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [None]:
dfsubm = pd.read_csv('data/submission_format.csv')
npsubm_sj = np.concatenate((dfsubm[dfsubm['city']=='sj'][['city','year','weekofyear']].values, \
                            np_predictions_sj), axis=1)
npsubm_iq = np.concatenate((dfsubm[dfsubm['city']=='iq'][['city','year','weekofyear']].values, \
                            np_predictions_iq), axis=1)
dfresults = pd.DataFrame(np.concatenate((npsubm_sj, npsubm_iq), axis=0), columns=dfsubm.columns)
dfresults.to_csv("data/submission_20171119_regr_1.csv", index=False)

----------------------------------------------------------------------------------------------------------------------

In [None]:
regr_predict_and_save(dftrain_iq, regr_iq, periods_iq, dftrain_sj, regr_sj, periods_sj, dftest_iq, dftest_sj,\
                      "data/submission_20171116_regr_1.csv")