# Integrate Data

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

# get input dataset
features_train = pd.read_csv('data/dengue_features_train.csv')
labels_train = pd.read_csv('data/dengue_labels_train.csv')
features_test = pd.read_csv('data/dengue_features_test.csv')

# Normalize the week_start_date feature value
features_train['week_start_date'] = pd.to_datetime(features_train['week_start_date'])
features_test['week_start_date'] = pd.to_datetime(features_test['week_start_date'])

# Divide training data into two parts w.r.t 'city'
train_sj = features_train.loc[features_train['city'] == 'sj']
train_iq = features_train.loc[features_train['city'] == 'iq']
label_sj = labels_train.loc[labels_train['city'] == 'sj']
label_iq = labels_train.loc[labels_train['city'] == 'iq']

# Preprocess the Training Data set

### Feature Selection 

Select features from the features in the training data.

In [2]:
KEYS = ['city', 'year', 'weekofyear']

ALL_FEATURES= ['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw', 
                      'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
                      'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
                      'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
                      'reanalysis_precip_amt_kg_per_m2',
                      'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
                      'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
                      'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
                      'station_min_temp_c', 'station_precip_mm']

RF_TRAINING_FEATURES = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']

TIMELY_TRAINING_FEATURES = ['year', 'weekofyear']

COMMON_TRAINING_FEATURES = ['reanalysis_dew_point_temp_k', 'reanalysis_precip_amt_kg_per_m2', 
                   'reanalysis_specific_humidity_g_per_kg', 'station_avg_temp_c',  'station_max_temp_c', 
                   'station_min_temp_c']

SJ_FEATURES = ['reanalysis_dew_point_temp_k', 'reanalysis_precip_amt_kg_per_m2', 'reanalysis_specific_humidity_g_per_kg',
               'station_avg_temp_c',  'station_max_temp_c', 'station_min_temp_c']

IQ_FEATURES = ['reanalysis_dew_point_temp_k', 'reanalysis_specific_humidity_g_per_kg',
               'station_avg_temp_c', 'station_min_temp_c']

NEW_FEATURES = ['recent_mean_dew_point', 'recent_mean_spec_humid', 'recent_sum_precip']

TIME_SERIES_FEATURES = ['week_start_date']

DROP_FEATURES = list(set(ALL_FEATURES)-set(COMMON_TRAINING_FEATURES)-set(KEYS)-set(TIME_SERIES_FEATURES))

DROP_SJ_FEATURES = list(set(ALL_FEATURES)-set(SJ_FEATURES)-set(KEYS)-set(TIME_SERIES_FEATURES))
DROP_IQ_FEATURES = list(set(ALL_FEATURES)-set(IQ_FEATURES)-set(KEYS)-set(TIME_SERIES_FEATURES))

# Specific features for the cities
FEATURES_SJ = COMMON_TRAINING_FEATURES
FEATURES_IQ = COMMON_TRAINING_FEATURES 

In [3]:
# Drop DROP_FEATURES from a given data set
def drop_unnecessary_features(df,drop_features=DROP_FEATURES):
    df.drop(drop_features, axis=1, inplace=True)
    df.drop(TIME_SERIES_FEATURES, axis=1, inplace=True)
    return df

### Filing Null values

In [4]:
from sklearn.preprocessing import Imputer

imputer = Imputer(strategy='mean')

def fill_null_values_with_mean(df,features_list):
    imputer.fit(df[features_list])
    df[features_list] = imputer.transform(df[features_list])
    return df



In [5]:
# Fill null values of RF_ Features of the training dataset
train_sj = fill_null_values_with_mean(train_sj,RF_TRAINING_FEATURES)
train_iq = fill_null_values_with_mean(train_iq,RF_TRAINING_FEATURES)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.

### Normalization

In [6]:
# Add Time Series features

def add_time_series_features(df, window):
    df.set_index('week_start_date', inplace=True)

    roll_df = df.rolling(window=window, min_periods=1)
    df['recent_mean_dew_point'] = roll_df.reanalysis_dew_point_temp_k.mean()
    df['recent_mean_spec_humid'] = roll_df.reanalysis_specific_humidity_g_per_kg.mean()
    df['recent_sum_precip'] = roll_df.reanalysis_precip_amt_kg_per_m2.sum()
    
    df.reset_index(inplace=True)    
    return df

# normalize data
def normalize(feature):
    return (feature - feature.mean()) / feature.std()

In [7]:
# train_sj = add_time_series_features(train_sj, window=10)
# train_iq = add_time_series_features(train_iq, window=10)

Drop some unnecessory Features

In [8]:
rf_drop_features = list(set(ALL_FEATURES)-set(RF_TRAINING_FEATURES)-set(KEYS)-set(TIME_SERIES_FEATURES))

train_sj = drop_unnecessary_features(train_sj,drop_features=rf_drop_features)
train_iq = drop_unnecessary_features(train_iq,drop_features=rf_drop_features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Normalize the Data

In [9]:
FEATURES_TO_NRMLZE = RF_TRAINING_FEATURES

train_sj[FEATURES_TO_NRMLZE] = train_sj[FEATURES_TO_NRMLZE].apply(normalize, axis=0)
train_iq[FEATURES_TO_NRMLZE] = train_iq[FEATURES_TO_NRMLZE].apply(normalize, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


### Spliting the training data set into train and test data sets

Here we divide the tarining data set into two parts: for training and for testing.
**X_cross_sj** is the testing data set extracted from the training data of city 'sj'
**y_cross_sj** is the testing label values extracted from the training data of city 'sj'

In [10]:
from sklearn.model_selection import train_test_split

# train_sj.set_index('index', inplace=True)
# train_iq.set_index('index', inplace=True)

y_sj = labels_train.loc[labels_train['city'] == 'sj',:]
y_iq = labels_train.loc[labels_train['city'] == 'iq',:]

In [11]:
X_train_sj, X_cross_sj, y_train_sj, y_cross_sj = train_test_split(train_sj, 
                                                                  y_sj,
                                                                  test_size=0.2,
                                                                  stratify=train_sj.weekofyear)

print(f'X_train_sj: {X_train_sj.shape}')
print(f'y_train_sj: {y_train_sj.shape}')
print(f'X_cross_sj: {X_cross_sj.shape}')
print(f'y_cross_sj: {y_cross_sj.shape}')

X_train_sj: (748, 7)
y_train_sj: (748, 4)
X_cross_sj: (188, 7)
y_cross_sj: (188, 4)


In [12]:
X_train_iq, X_cross_iq, y_train_iq, y_cross_iq = train_test_split(train_iq, 
                                                                  y_iq, 
                                                                  test_size=0.2,
                                                                  stratify=train_iq.weekofyear)

print(f'X_train_iq: {X_train_iq.shape}')
print(f'y_train_iq: {y_train_iq.shape}')
print(f'X_cross_iq: {X_cross_iq.shape}')
print(f'y_cross_iq: {y_cross_iq.shape}')

X_train_iq: (416, 7)
y_train_iq: (416, 4)
X_cross_iq: (104, 7)
y_cross_iq: (104, 4)


### Prepare for Training

In [13]:
def drop_unnecessary_columns(df,features):
    return df[features]

In [14]:
X_FEATURES = RF_TRAINING_FEATURES + TIMELY_TRAINING_FEATURES
Y_FEATURES = ['total_cases']

X_train_sj = drop_unnecessary_columns(X_train_sj,X_FEATURES)
X_train_iq = drop_unnecessary_columns(X_train_iq,X_FEATURES)
X_cross_sj = drop_unnecessary_columns(X_cross_sj,X_FEATURES)
X_cross_iq = drop_unnecessary_columns(X_cross_iq,X_FEATURES)

y_train_iq = drop_unnecessary_columns(y_train_iq,Y_FEATURES)
y_cross_iq = drop_unnecessary_columns(y_cross_iq,Y_FEATURES)

# X_train_sj.drop(['station_min_temp_c'], axis=1, inplace=True)
# X_train_iq.drop(['station_min_temp_c'], axis=1, inplace=True)
# X_cross_sj.drop(['station_min_temp_c'], axis=1, inplace=True)
# X_cross_iq.drop(['station_min_temp_c'], axis=1, inplace=True)

print(f'X_train_iq: {X_train_iq.shape}')
print(f'y_train_iq: {y_train_iq.shape}')
print(f'X_cross_iq: {X_cross_iq.shape}')
print(f'y_cross_iq: {y_cross_iq.shape}')
X_train_sj.head()

X_train_iq: (416, 6)
y_train_iq: (416, 1)
X_cross_iq: (104, 6)
y_cross_iq: (104, 1)


Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,year,weekofyear
271,7.259491e-17,1.168379,1.145138,0.967838,1995,28
929,-3.968507,-2.317955,-1.184328,-0.44939,2008,11
267,0.3308645,0.135865,0.071738,-0.179638,1995,24
806,-0.9196096,-0.179981,-0.125447,-0.065611,2005,43
454,-1.33626,-1.171025,0.055579,-0.902923,1999,3


# Train the Model with Data set

In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

# find the ccurancy of the model with the mean absolute value
def cross_validate_out_of_sample(reg, X_train, y_train, X_cross, y_cross):
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_cross)
    return mean_absolute_error(y_true=y_cross, y_pred=y_pred)

In [16]:
# Grid search to get the best score hyper parameters
def grid_search_cross_val(reg, X, y, param_grid, scoring='neg_mean_absolute_error'):
    grid = GridSearchCV(reg, param_grid=param_grid, scoring=scoring)
    grid.fit(X, y)
    print("Best score: {}".format(np.abs(grid.best_score_)))
    print("Best params: {}".format(grid.best_params_))

###### Check the best hyper param values for GradientBoostingRegression

In [17]:
from sklearn.model_selection import GridSearchCV

# reg = GradientBoostingRegressor(random_state=67)

# param_grid = [
#     {'learning_rate': [0.1, 0.3, 1.0, 3.0], 'n_estimators': [10, 30, 100, 300, 500], 
#      'max_depth': [3, 5, 7, 9]}
# ]

# grid_search_cross_val(reg, X_train_sj, y_train_sj.total_cases, param_grid)
# grid_search_cross_val(reg, X_train_iq, y_train_iq.total_cases, param_grid)

###### Check the best hyper param values for RandomForestRegressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

# reg = RandomForestRegressor(random_state=67)

# param_grid = [
#     {
#       'n_estimators': [10, 30, 100, 300, 500], 
#       'max_depth': [3, 5, 7, None]
#     } 
# ]

# grid_search_cross_val(reg, X_train_sj, y_train_sj.total_cases, param_grid)
# grid_search_cross_val(reg, X_train_iq, y_train_iq.total_cases, param_grid)

In [19]:
X_train_iq.head()

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,year,weekofyear
1099,1.988958,1.616379,1.380631,1.379286,2003,34
1291,-0.85235,-1.106541,0.591055,-1.088251,2007,18
1422,1.538207,1.751404,-0.002934,1.233931,2009,45
1217,0.230505,-0.237594,0.493816,-0.310042,2005,47
1411,-0.532952,-0.580507,0.368795,-0.073427,2009,34


In [20]:
# Train the model with SJ data set
reg_sj = RandomForestRegressor(max_depth=None, n_estimators=500, random_state=67)
cross_validate_out_of_sample(reg_sj, X_train_sj, y_train_sj.total_cases, X_cross_sj, y_cross_sj.total_cases)

12.238670212765955

In [21]:
# Train the model with IQ data set
reg_iq = GradientBoostingRegressor(max_depth=None, n_estimators=500, random_state=67)
cross_validate_out_of_sample(reg_iq, X_train_iq, y_train_iq.total_cases, X_cross_iq, y_cross_iq.total_cases)

4.657457008761076

In [22]:
# Submission the results
def submission(pred):
    submission = pd.read_csv("data/submission_format.csv", index_col=[0, 1, 2])
    submission['total_cases'] = pred['total_cases']
    submission.to_csv("./submissions/sub_DengAI.csv")

# Preprocess the Test Data set

In [23]:
train_sj = drop_unnecessary_columns(train_sj,X_FEATURES)
train_iq = drop_unnecessary_columns(train_iq,X_FEATURES)

# Impute the Values
X_test_sj = features_test.loc[features_test['city']=='sj',:]
X_test_iq = features_test.loc[features_test['city']=='iq',:]

predict_sj = X_test_sj[KEYS].copy()
predict_iq = X_test_iq[KEYS].copy()

# FIll values
X_test_sj = fill_null_values_with_mean(X_test_sj,RF_TRAINING_FEATURES)
X_test_iq = fill_null_values_with_mean(X_test_iq,RF_TRAINING_FEATURES)


# Normalization
# X_test_sj = add_time_series_features(X_test_sj, window=10)
# X_test_iq = add_time_series_features(X_test_iq, window=10)

X_test_sj[FEATURES_TO_NRMLZE] = X_test_sj[FEATURES_TO_NRMLZE].apply(normalize, axis=0)
X_test_iq[FEATURES_TO_NRMLZE] = X_test_iq[FEATURES_TO_NRMLZE].apply(normalize, axis=0)


columns = RF_TRAINING_FEATURES + TIMELY_TRAINING_FEATURES
# Drop columns 
X_test_sj = drop_unnecessary_columns(X_test_sj,columns)
X_test_iq = drop_unnecessary_columns(X_test_iq,columns)

# train_sj.drop(['station_min_temp_c'], axis=1, inplace=True)
# train_iq.drop(['station_min_temp_c'], axis=1, inplace=True)
# X_test_sj.drop(['station_min_temp_c'], axis=1, inplace=True)
# X_test_iq.drop(['station_min_temp_c'], axis=1, inplace=True)

print(f'train_sj: {train_sj.shape}')
print(f'train_iq: {train_iq.shape}')
print(f'X_test_sj: {X_test_sj.shape}')
print(f'X_test_iq: {X_test_iq.shape}')
X_test_sj.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.

train_sj: (936, 6)
train_iq: (520, 6)
X_test_sj: (260, 6)
X_test_iq: (156, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,year,weekofyear
0,-0.3577464,-0.653177,-1.113613,-1.127334,2008,18
1,-0.3503788,-0.576857,-1.423542,-1.470585,2008,19
2,-0.2153074,0.0,-0.389128,-1.121362,2008,20
3,-2.840141e-17,-0.664527,-0.789986,-0.50055,2008,21
4,0.261945,0.036443,-1.719843,-1.405155,2008,22


In [24]:
# Train models with full dataset
reg_sj = RandomForestRegressor(max_depth=None, n_estimators=500, random_state=67)
reg_sj.fit(train_sj, label_sj.total_cases)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500,
                      n_jobs=None, oob_score=False, random_state=67, verbose=0,
                      warm_start=False)

In [25]:
reg_iq = RandomForestRegressor(max_depth=None, n_estimators=500, random_state=67)
reg_iq.fit(train_iq, label_iq.total_cases)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500,
                      n_jobs=None, oob_score=False, random_state=67, verbose=0,
                      warm_start=False)

In [26]:
# Predict SJ
y_sj_pred = reg_sj.predict(X_test_sj)
predict_sj['total_cases'] = y_sj_pred.round().astype(int)
predict_sj.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,5
1,sj,2008,19,5
2,sj,2008,20,6
3,sj,2008,21,7
4,sj,2008,22,13


In [27]:
# Predict IQ
y_iq_pred = reg_iq.predict(X_test_iq)
predict_iq['total_cases'] = y_iq_pred.round().astype(int)
predict_iq.head()

Unnamed: 0,city,year,weekofyear,total_cases
260,iq,2010,26,3
261,iq,2010,27,3
262,iq,2010,28,3
263,iq,2010,29,5
264,iq,2010,30,3


In [28]:
predict_df = pd.concat([predict_sj, predict_iq], axis=0)
predict_df[predict_df.total_cases < 0]

Unnamed: 0,city,year,weekofyear,total_cases


In [29]:
predict_df.loc[predict_df.total_cases < 0, 'total_cases'] = 0
predict_df[predict_df.total_cases < 0]

Unnamed: 0,city,year,weekofyear,total_cases


In [30]:
predict_df.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,5
1,sj,2008,19,5
2,sj,2008,20,6
3,sj,2008,21,7
4,sj,2008,22,13


In [31]:
submission = pd.read_csv("./data/submission_format.csv")

submission['total_cases'] = predict_df['total_cases']
submission.set_index('city', inplace=True)
submission.to_csv("./submissions/sub_DengAIRF.csv")