In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
from matplotlib import pyplot as plt
plt.figure(figsize=(10,10))

import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split

<Figure size 720x720 with 0 Axes>

In [2]:
def handle_outliers(data, features):
    df = data.copy(deep=True)
    for c in features:
        q1 = df[c].quantile(0.25)
        q3 = df[c].quantile(0.75)
        iqr = q3-q1 #Interquartile range
            
        fence_low  = q1-1.5*iqr
        fence_high = q3+1.5*iqr
        df[c][df[c] < fence_low] = fence_low
        df[c][df[c] > fence_high] = fence_high
        
        return df

In [127]:
def handle_missing(data, features):
    df = data.copy(deep=True)
    for c in features:
        mean = df[c].mean()
        df[c].fillna(mean, inplace=True)
    return df

In [128]:
def normalize(data):
    df = data.copy(deep=True)
    features = data.columns.values
    for c in features:
        c_max = max(df[c])
        c_min = min(df[c])
        df[c] = (df[c] - c_min)/(c_max-c_min)
    return df

In [129]:
def preprocess_data(data_path, labels_path=None):
#     x_features = get_best_features(limit)
#     print(x_features)
    
    df = pd.read_csv(data_path)
   
    df_1 = df.shift(1)
    df_1 = df_1.add_suffix('_SHIFT_1')
    df = pd.concat([df, df_1], axis=1)

    df.fillna(method='bfill', inplace=True)
    # df.dropna(inplace=True)
    # df.interpolate(inplace=True)
    
    sj_x = df[df['city']=='sj']
    iq_x = df[df['city']=='iq']
    
    sj_y = None
    iq_y = None
    if labels_path:
        y = pd.read_csv(labels_path)
        df = pd.concat([df, y['total_cases']], axis=1)
        sj_y = y[y['city']=='sj'][['total_cases']]
        iq_y = y[y['city']=='iq'][['total_cases']]

    return sj_x, iq_x, sj_y, iq_y

In [130]:
learning_rates = [ 0.05, 0.01, 0.1, 0.2, 0.25]
colsample_bytree = [0.6, 0.7, 0.8, 0.9, 0.95]
n_estimators = [5, 10, 15, 30]
max_depth = [3, 4, 5, 9, 10, 11]

def hyper_param_optimize(X, Y):
    
    lr_best = learning_rates[0]
    cs_best = colsample_bytree[0]
    es_best = n_estimators[0]
    md_best = max_depth[0]
    
    best_mae = float("inf")
    counter = 0
    
    for lr in learning_rates:
        for cs in colsample_bytree:
            for es in n_estimators:
                for md in max_depth:
                    
                    counter += 1
                    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

                    model = xgb.XGBRegressor(
                        objective ='reg:linear',
                        colsample_bytree = cs, 
                        learning_rate = lr, 
                        max_depth = md, 
                        n_estimators = es
                    )
                    
                    model.fit(x_train, y_train)
                    y_predict = model.predict(x_test)

                    mae = mean_absolute_error(y_test, y_predict)**0.5

                    if (mae < best_mae):
                        best_mae = mae
                        lr_best = lr
                        cs_best = cs
                        es_best = es
                        md_best = md
                            
    print(counter, '** best model - lr', lr_best, 'cs', cs_best, 'es', es_best, 'md', md_best)
    print('** best MAE - ', best_mae)
    
    return {'learning_rate': lr_best, 'colsample_bytree': cs_best, 'max_depth': md_best, 
            'n_estimators': es_best}

In [131]:
def get_best_features(limit, x, y):
    df = pd.concat([x, y], axis=1)
    df = df._get_numeric_data().drop(['year'], axis=1)
    corr = df.corr()
    columns = abs(corr['total_cases']).sort_values().index[-1-limit:-1]
    return columns

# CODE

In [187]:
train_feature_path = 'dengue_features_train_f.csv'
train_label_path = 'dengue_labels_train.csv'
test_feature_path = 'dengue_features_test_f.csv'

sj_limit = 7
iq_limit = 14

In [188]:
sj_x, iq_x, sj_y, iq_y = preprocess_data(train_feature_path, train_label_path)

In [189]:
print('sj_x - ', sj_x.shape)
print('sj_y - ', sj_y.shape)
print('iq_x - ', iq_x.shape)
print('iq_y - ', iq_y.shape)

sj_x -  (936, 50)
sj_y -  (936, 1)
iq_x -  (520, 50)
iq_y -  (520, 1)


In [190]:
sj_features = get_best_features(sj_limit, sj_x, sj_y)
iq_features = get_best_features(iq_limit, iq_x, iq_y)

In [191]:
sj_x = sj_x[sj_features]
iq_x = iq_x[iq_features]

# sj_x = normalize(sj_x[iq_features])
# iq_x = normalize(iq_x[iq_features])

# sj_y = normalize(sj_y)
# iq_y  = normalize(iq_y)

In [192]:
print('sj_x - ', sj_x.shape)
print('sj_y - ', sj_y.shape)
print('iq_x - ', iq_x.shape)
print('iq_y - ', iq_y.shape)

sj_x -  (936, 7)
sj_y -  (936, 1)
iq_x -  (520, 14)
iq_y -  (520, 1)


In [193]:
sj_param = hyper_param_optimize(sj_x, sj_y)

600 ** best model - lr 0.25 cs 0.9 es 15 md 11
** best MAE -  2.8120275317026504


In [194]:
iq_param = hyper_param_optimize(iq_x, iq_y)

600 ** best model - lr 0.1 cs 0.95 es 15 md 4
** best MAE -  1.9315631564158207


In [195]:
# best model - lr 0.1 cs 0.6 es 5 md 3 al 8
# best model - lr 0.05 cs 0.9 es 15 md 5 al 8
# MAE = 4.23
# best model - lr 0.1 cs 0.6 es 15 md 4 al 8
# best model - lr 0.2 cs 0.8 es 5 md 10 al 4
# MAE = 2.10


#interpolate, _f, without alpha 4.083 2.129
#interpolate, _f, without alpha feature_selection 2.724 1.684

#interpolate, _f, without alpha feature_selection 2.810 1.901

In [196]:
sj_test, iq_test, _, _ = preprocess_data(test_feature_path)

In [197]:
# sj_test = normalize(sj_test[sj_features])
# iq_test = normalize(iq_test[iq_features])

sj_test = sj_test[sj_features]
iq_test = iq_test[iq_features]

In [198]:
sj_model = xgb.XGBRegressor(
    objective ='reg:linear',
    colsample_bytree = sj_param['colsample_bytree'], 
    learning_rate = sj_param['learning_rate'], 
    max_depth = sj_param['max_depth'], 
    n_estimators = sj_param['n_estimators']
)
iq_model = xgb.XGBRegressor(
    objective ='reg:linear',
    colsample_bytree = iq_param['colsample_bytree'], 
    learning_rate = iq_param['learning_rate'], 
    max_depth = iq_param['max_depth'], 
    n_estimators = iq_param['n_estimators']
)
sj_model.fit(sj_x, sj_y)
iq_model.fit(iq_x, iq_y)

sj_predict = sj_model.predict(sj_test)
iq_predict = iq_model.predict(iq_test)

In [199]:
test_df = pd.read_csv('./dengue_features_test.csv')[['city', 'year', 'weekofyear']]

total_cases = pd.concat([pd.Series(sj_predict), pd.Series(iq_predict)])

total_cases.index = test_df.index
test_df['total_cases'] = total_cases.astype(int)

test_df.to_csv('./submission.csv', index=False)

In [200]:
test_df['total_cases'].unique()

array([  3,   4,   6,   8,  10,  13,  15,  26,  27,  28,  41,  36,  69,
       148,  63,  75,  53,  44,  81, 101, 115,  59,  50,  35,  12,  14,
        16,   7,   2,   1,  29,  31,  67,  65,  68,  78,  33,  88,  99,
        84, 102,  58,  42,  32,  60,  66,  70,  54,  37,  24,  21,  11,
         9,   0,  25,  30,  34,  52,  39,  40,  62,  85,  55,  82,  46,
        23,  20,   5,  51,  47,  45,  90,  80,  19,  17,  18,  74,  38,
       110, 104, 100,  94, 106,  48,  22], dtype=int64)

In [201]:
test_df['total_cases'].shape

(416,)

In [123]:
sj_y.head()

Unnamed: 0,total_cases
0,4
1,5
2,4
3,3
4,6


In [125]:
sj_y.columns.values

array(['total_cases'], dtype=object)

In [122]:
sj_x.shape

(930, 15)

In [125]:
sj = pd.concat([sj_x, sj_y], axis=1)

In [128]:
sj.head()

Unnamed: 0,station_min_temp_c,months,weekofyear,months_SHIFT_1,weekofyear_SHIFT_1,reanalysis_air_temp_k_SHIFT_1,reanalysis_max_air_temp_k_SHIFT_1,station_min_temp_c_SHIFT_1,reanalysis_max_air_temp_k,station_diur_temp_rng_c,station_diur_temp_rng_c_SHIFT_1,reanalysis_tdtr_k_SHIFT_1,reanalysis_tdtr_k,reanalysis_min_air_temp_k,reanalysis_min_air_temp_k_SHIFT_1,total_cases
0,20.0,4.0,18.0,4.0,18.0,297.572857,299.8,20.0,299.8,6.9,6.9,2.628571,2.628571,295.9,295.9,4
1,22.2,5.0,19.0,4.0,18.0,297.572857,299.8,20.0,300.9,6.371429,6.9,2.628571,2.371429,296.4,295.9,5
2,22.8,5.0,20.0,5.0,19.0,298.211429,300.9,22.2,300.5,6.485714,6.371429,2.371429,2.3,297.3,296.4,4
3,23.3,5.0,21.0,5.0,20.0,298.781429,300.5,22.8,301.4,6.771429,6.485714,2.3,2.428571,297.0,297.3,3
4,23.9,5.0,22.0,5.0,21.0,298.987143,301.4,23.3,301.9,9.371429,6.771429,2.428571,3.014286,297.5,297.0,6


In [131]:
corr = sj.corr()
columns = abs(corr['total_cases']).sort_values()

In [132]:
columns

station_diur_temp_rng_c_SHIFT_1      0.035465
station_diur_temp_rng_c              0.039291
station_min_temp_c                   0.076224
reanalysis_max_air_temp_k            0.082645
reanalysis_min_air_temp_k            0.097787
reanalysis_tdtr_k_SHIFT_1            0.107028
station_min_temp_c_SHIFT_1           0.115101
reanalysis_air_temp_k_SHIFT_1        0.115133
reanalysis_tdtr_k                    0.123134
reanalysis_min_air_temp_k_SHIFT_1    0.129229
reanalysis_max_air_temp_k_SHIFT_1    0.129532
months                               0.288334
months_SHIFT_1                       0.295878
weekofyear                           0.296069
weekofyear_SHIFT_1                   0.297619
total_cases                          1.000000
Name: total_cases, dtype: float64

In [153]:
df = pd.read_csv(train_feature_path)
df = df._get_numeric_data().drop(['year', 'ndvi_se', 'ndvi_sw', 'ndvi_ne', 'ndvi_nw'], axis=1)
df_1 = df.shift(1)
df_1 = df_1.add_suffix('_SHIFT_1')
df = pd.concat([df, df_1], axis=1)
df['total_cases'] = pd.read_csv(train_label_path).total_cases

corr = df.corr()
print
columns = abs(corr['total_cases']).sort_values()

In [156]:
type(abs(corr['total_cases']))

pandas.core.series.Series

In [None]:
def preprocess_data(data_path, labels_path=None):

#     x_features = get_best_features(limit)
#     print(x_features)
    y_features = ['total_cases']
    
    df = pd.read_csv(data_path)
   
    df_1 = df.shift(1)
    df_1 = df_1.add_suffix('_SHIFT_1')
    df = pd.concat([df, df_1], axis=1)
    
    if labels_path:
        y = pd.read_csv(labels_path)
        df = pd.concat([df, y[y_features]], axis=1)
#         df = handle_missing(df, x_features + y_features)
#         df = handle_outliers(df, x_features + y_features)
#         df = normalize(df, x_features)
#     else:
#         df = handle_missing(df, x_features)
#         df = normalize(df, x_features)
    
    df.fillna(method='bfill', inplace=True)
#     df.interpolate(inplace=True)
    
    sj_x = df[df['city']=='sj'][x_features]
    iq_x = df[df['city']=='iq'][x_features]
    
    sj_y = None
    iq_y = None
    if labels_path:
        sj_y = df[df['city']=='sj'][y_features]
        iq_y = df[df['city']=='iq'][y_features]

    return sj_x, iq_x, sj_y, iq_y