### Data Preprocessing

In [96]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [97]:
dlt = pd.read_csv(r"C:\Users\Mustafa Kaan\Desktop\dengue_labels_train.csv")
dft = pd.read_csv(r"C:\Users\Mustafa Kaan\Desktop\dengue_features_train.csv")

In [98]:
df = dlt.merge(dft)
df = df.sample(frac=1)
print(df)

     city  year  weekofyear  total_cases week_start_date   ndvi_ne   ndvi_nw  \
818    sj  2006           3           19      2006-01-22 -0.194233 -0.111740   
172    sj  1993          33           23      1993-08-20  0.096900  0.049667   
401    sj  1998           3           49      1998-01-15 -0.016150 -0.008225   
924    sj  2008           6            2      2008-02-05 -0.111700 -0.003200   
1450   iq  2010          20            6      2010-05-21  0.263071  0.272500   
...   ...   ...         ...          ...             ...       ...       ...   
1169   iq  2004          52            7      2004-12-23  0.364862  0.326600   
849    sj  2006          34           18      2006-08-27  0.064050  0.003450   
602    sj  2001          48           47      2001-11-26  0.215150  0.226150   
676    sj  2003          18            5      2003-04-30  0.068100  0.035500   
97     sj  1992          11           40      1992-03-11  0.086850  0.090150   

       ndvi_se   ndvi_sw  precipitation

In [99]:
df = df.fillna(df.mean())

In [100]:
df = df.drop("week_start_date", axis=1)

In [101]:
y = df["total_cases"]
X = df.drop("total_cases", axis=1)

In [102]:
X = pd.get_dummies(X, columns = ["city", "year", "weekofyear"])

### Modelling

In [103]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV


In [104]:
kf = KFold(n_splits=5, shuffle = True, random_state = 8)

In [105]:
def hparam_tuning(model, X_val, y_val, param_grid):
    return GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error').fit(X_val,y_val)

In [106]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

from sklearn.feature_selection import f_regression
pvalues = f_regression(X,y)
array = list(pvalues[1])

X = pd.DataFrame(X)

In [107]:
def removeHighPValueFeature(array,X,threshold):
    col = 0
    for v in array:
        if v > threshold:
            col = int(array.index(v,col))
            X = X.drop(col, axis=1)
    return X

X = removeHighPValueFeature(array,X,0.01)
X.head()

Unnamed: 0,0,1,2,3,5,6,7,8,9,11,...,34,35,36,38,41,82,83,84,86,87
0,-2.57318,-2.056904,-1.395704,-1.215695,-0.221345,-0.554526,-0.544808,-0.784242,0.383793,-0.482958,...,-0.27735,-0.27735,-0.27735,3.605551,-0.19245,-0.140028,-0.140028,-0.140028,-0.140028,-0.140028
1,-0.347092,-0.686668,-0.091598,-0.468777,0.713309,0.434269,0.832114,-0.349778,0.814211,0.128214,...,-0.27735,-0.27735,-0.27735,-0.27735,-0.19245,-0.140028,-0.140028,-0.140028,-0.140028,-0.140028
2,-1.211505,-1.17813,0.012707,-0.044861,-0.802346,-1.225089,-0.258536,-1.001474,0.266407,0.355323,...,-0.27735,-0.27735,-0.27735,-0.27735,-0.19245,-0.140028,-0.140028,-0.140028,-0.140028,-0.140028
3,-1.942107,-1.135471,0.396589,0.827344,-1.079164,-1.549005,-2.437021,-1.280773,-0.124882,-1.616303,...,-0.27735,-0.27735,-0.27735,-0.27735,-0.19245,-0.140028,-0.140028,-0.140028,-0.140028,-0.140028
4,0.923502,1.20504,0.743622,0.506918,-0.776033,-0.412458,-1.32666,1.636345,-2.198715,-0.443871,...,-0.27735,-0.27735,-0.27735,-0.27735,-0.19245,-0.140028,-0.140028,-0.140028,-0.140028,-0.140028


In [108]:
from sklearn.model_selection import train_test_split
X_tt, X_val, y_tt, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [109]:
def run(model, X, y):
    errors = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [abs(elem) for elem in y_pred]
        MAE = mean_absolute_error(y_test, y_pred)
        errors.append(MAE)

    return errors

In [110]:
models = [LinearRegression(), Lasso(), Ridge(), KNeighborsRegressor(), DecisionTreeRegressor(), SVR(), 
          GradientBoostingRegressor(), RandomForestRegressor(), BaggingRegressor(),
          AdaBoostRegressor(), MLPRegressor()]#, MultinomialNB()]
              
for model in models:
    errors = run(model, X_tt ,y_tt)
    print(type(model).__name__)
    print("%.2f\n" %np.mean(errors))

LinearRegression
18.27

Lasso
17.96

Ridge
18.22

KNeighborsRegressor
15.11

DecisionTreeRegressor
19.45

SVR
16.86

GradientBoostingRegressor
14.79

RandomForestRegressor
15.09

BaggingRegressor
15.70

AdaBoostRegressor
36.28

MLPRegressor
15.81



### Hyperparameter Tuning for Regression Models

In [111]:
grid_lasso = {'alpha': [1, 3, 10]}
grid_ridge = {'alpha': [1, 3, 10]}
grid_dtr = {'max_depth': [2, 4, 8, 16, 32, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]}
grid_knnr = {'n_neighbors' : [1, 3, 10 , 30, 100]}
grid_svr = {'C' : [1, 3 , 10, 30], 'epsilon': [0.1, 0.3, 1.0]}
grid_mlpr = {'hidden_layer_sizes': [(100, 10),(100, 30),(100,100), (100,200)], 'activation': ['relu', 'logistic', 'tanh'], 'solver': ['sgd', 'adam']}

model = [Lasso(), Ridge(), DecisionTreeRegressor(), KNeighborsRegressor(), SVR(), MLPRegressor()]
grid = [grid_lasso, grid_ridge, grid_dtr, grid_knnr, grid_svr, grid_mlpr]

pairs = [[a,b] for (a,b) in list(zip(model,grid))]
print(pairs)

[[Lasso(), {'alpha': [1, 3, 10]}], [Ridge(), {'alpha': [1, 3, 10]}], [DecisionTreeRegressor(), {'max_depth': [2, 4, 8, 16, 32, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]}], [KNeighborsRegressor(), {'n_neighbors': [1, 3, 10, 30, 100]}], [SVR(), {'C': [1, 3, 10, 30], 'epsilon': [0.1, 0.3, 1.0]}], [MLPRegressor(), {'hidden_layer_sizes': [(100, 10), (100, 30), (100, 100), (100, 200)], 'activation': ['relu', 'logistic', 'tanh'], 'solver': ['sgd', 'adam']}]]


In [113]:
os.chdir(r'C:\Users\Mustafa Kaan\Desktop\Results3')
for pair in pairs:
    grid = hparam_tuning(pair[0], X_val, y_val, pair[1])
    
    print("Model %s: " % type(pair[0]).__name__ )
    print(grid.best_params_)
    errors = run(pair[0].set_params(**grid.best_params_), X_tt, y_tt)
    print("%.2f\n" %np.mean(errors))
    
    params = pair[1].keys()
    params = ['param_' + str(param) for param in params]
    params.append('mean_test_score')
    grid_mean_scores = pd.DataFrame(grid.cv_results_)[params]
    grid_mean_scores.to_excel(type(pair[0]).__name__ + ".xlsx")

Model Lasso: 
{'alpha': 3}
18.67

Model Ridge: 
{'alpha': 10}
18.22

Model DecisionTreeRegressor: 
{'max_depth': 4, 'max_features': 'sqrt', 'min_samples_split': 2}
19.76

Model KNeighborsRegressor: 
{'n_neighbors': 10}
16.08

Model SVR: 
{'C': 30, 'epsilon': 1.0}
14.84

Model MLPRegressor: 
{'activation': 'relu', 'hidden_layer_sizes': (100, 30), 'solver': 'sgd'}
15.46



In [114]:
grid_gbr = {'n_estimators': [5, 10, 50, 100, 120]}
grid_rfr = {'n_estimators': [5, 10, 50, 100, 120]}
grid_bgr = {'n_estimators': [5, 10, 50, 100, 120]}
grid_abr = {'n_estimators': [5, 10, 50, 100, 120]}

model = [GradientBoostingRegressor(), RandomForestRegressor(), BaggingRegressor(),
          AdaBoostRegressor()]
grid= [grid_gbr, grid_rfr, grid_bgr, grid_abr]
pairs = [[a,b] for (a,b) in list(zip(model,grid))]

In [115]:
os.chdir(r'C:\Users\Mustafa Kaan\Desktop\Results3')
for pair in pairs:
    grid = hparam_tuning(pair[0], X_val, y_val, pair[1])
    
    print("Model %s: " % type(pair[0]).__name__ )
    print(grid.best_params_)
    errors = run(pair[0].set_params(**grid.best_params_), X_tt, y_tt)
    print("%.2f\n" %np.mean(errors))
    
    params = pair[1].keys()
    params = ['param_' + str(param) for param in params]
    params.append('mean_test_score')
    grid_mean_scores = pd.DataFrame(grid.cv_results_)[params]
    grid_mean_scores.to_excel(type(pair[0]).__name__ + ".xlsx")

Model GradientBoostingRegressor: 
{'n_estimators': 50}
15.34

Model RandomForestRegressor: 
{'n_estimators': 50}
15.10

Model BaggingRegressor: 
{'n_estimators': 50}
14.97

Model AdaBoostRegressor: 
{'n_estimators': 10}
21.35



### Voting Regressor

In [121]:
reg1 = SVR().set_params(**{'C': 30, 'epsilon': 1.0})
reg2 = MLPRegressor().set_params(**{'activation': 'relu', 'hidden_layer_sizes': (100, 30), 'solver': 'sgd'})
reg3 = BaggingRegressor().set_params(**{'n_estimators': 50})
estimators=[('svr', reg1), ('mlp', reg2), ('bgr', reg3)]

In [122]:
from sklearn.ensemble import VotingRegressor
vereg = VotingRegressor(estimators=estimators)
errors = run(vereg, X ,y)
print(type(vereg).__name__)
print("%.2f\n" %np.mean(errors))

VotingRegressor
12.77



### Stacking Regressor

In [123]:
from sklearn.ensemble import StackingRegressor

sereg = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression())
errors = run(sereg, X ,y)
print(type(sereg).__name__)
print("%.2f\n" %np.mean(errors))

StackingRegressor
12.57

