## Data Prep

In [106]:
#Importing all the libraries
import pandas as pd
pd.options.display.max_rows = 999
import numpy as np
import seaborn as sns
from plotnine import *
import re
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [2]:
#Import the datasets
X_train_data = pd.read_csv('dengue_features_train.csv', infer_datetime_format=True, parse_dates=['week_start_date'])
X_test_data = pd.read_csv('dengue_features_test.csv', infer_datetime_format=True, parse_dates=['week_start_date'])
y_train_data = pd.read_csv('dengue_labels_train.csv')

In [3]:
#Combining the training label and predictors
train_data = pd.merge(left=X_train_data , right=y_train_data, left_on=['city', 'year', 'weekofyear'],
                     right_on=['city', 'year', 'weekofyear'],)

In [4]:
#Split the  training datasets based on the cities
sj_train_data = train_data[train_data['city']=='sj']
iq_train_data = train_data[train_data['city']=='iq']

#Splitting the test data
sj_test_data = X_test_data[X_test_data['city']=='sj']
iq_test_data = X_test_data[X_test_data['city']=='iq']


In [26]:
#Missing data stats
sj_train_data.isnull().sum()

city                                     0
year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_pre

In [27]:
iq_train_data.isnull().sum()

city                                     0
year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_pre

In [24]:
sj_train_data.fillna(method='ffill', inplace=True)
iq_train_data.fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [206]:
#The List of non-lagged vars.
#Highly correlated variables are removed
# Variables removed
# 1. reanalysis_sat_precip_amt_mm
# 2. reanalysis_dew_point_temp_k
predictors = [
    'ndvi_ne',
    'ndvi_nw',
    'ndvi_se',
    'ndvi_sw',
    'precipitation_amt_mm',
    'reanalysis_air_temp_k',
    'reanalysis_avg_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_tdtr_k',
    'station_avg_temp_c',
    'station_diur_temp_rng_c',
    'station_max_temp_c',
    'station_min_temp_c',
    'station_precip_mm',
]

### Creating Lagged variables

In [144]:
sj_lag1 = sj_train_data[predictors].shift(1)
sj_lag2 = sj_train_data[predictors].shift(2)
sj_lag3 = sj_train_data[predictors].shift(3)
sj_lag4 = sj_train_data[predictors].shift(4)

iq_lag1 = iq_train_data[predictors].shift(1)
iq_lag2 = iq_train_data[predictors].shift(2)
iq_lag3 = iq_train_data[predictors].shift(3)
iq_lag4 = iq_train_data[predictors].shift(4)

#Adding prefixes to the lagged values so it's easy to identify
sj_lag1 = sj_lag1.add_prefix(prefix='lag1_')
sj_lag2 = sj_lag2.add_prefix(prefix='lag2_')
sj_lag3 = sj_lag3.add_prefix(prefix='lag3_')
sj_lag4 = sj_lag4.add_prefix(prefix='lag4_')

iq_lag1 = iq_lag1.add_prefix(prefix='lag1_')
iq_lag2 = iq_lag2.add_prefix(prefix='lag2_')
iq_lag3 = iq_lag3.add_prefix(prefix='lag3_')
iq_lag4 = iq_lag4.add_prefix(prefix='lag4_')

In [196]:
#Modifying the test data too
sj_test_lag1 = sj_test_data[predictors].shift(1)
sj_test_lag2 = sj_test_data[predictors].shift(2)
sj_test_lag3 = sj_test_data[predictors].shift(3)
sj_test_lag4 = sj_test_data[predictors].shift(4)

iq_test_lag1 = iq_test_data[predictors].shift(1)
iq_test_lag2 = iq_test_data[predictors].shift(2)
iq_test_lag3 = iq_test_data[predictors].shift(3)
iq_test_lag4 = iq_test_data[predictors].shift(4)

#Adding prefixes to the lagged values so it's easy to identify
sj_test_lag1 = sj_test_lag1.add_prefix(prefix='lag1_')
sj_test_lag2 = sj_test_lag2.add_prefix(prefix='lag2_')
sj_test_lag3 = sj_test_lag3.add_prefix(prefix='lag3_')
sj_test_lag4 = sj_test_lag4.add_prefix(prefix='lag4_')

iq_test_lag1 = iq_test_lag1.add_prefix(prefix='lag1_')
iq_test_lag2 = iq_test_lag2.add_prefix(prefix='lag2_')
iq_test_lag3 = iq_test_lag3.add_prefix(prefix='lag3_')
iq_test_lag4 = iq_test_lag4.add_prefix(prefix='lag4_') 

In [289]:
#Combining the datasets along with the lags
sj_combo = pd.concat([sj_train_data, sj_lag1, sj_lag2, sj_lag3, sj_lag4], axis=1)
iq_combo = pd.concat([iq_train_data, iq_lag1, iq_lag2, iq_lag3, iq_lag4], axis=1)

#test set
#Combining the datasets along with the lags
sj_test_combo = pd.concat([sj_test_data, sj_test_lag1, sj_test_lag2, sj_test_lag3, sj_test_lag4], axis=1)
iq_test_combo = pd.concat([iq_test_data, iq_test_lag1, iq_test_lag2, iq_test_lag3, iq_test_lag4], axis=1)

sj_test_combo.fillna(method='backfill', inplace=True)
iq_test_combo.fillna(method='backfill', inplace=True)

In [198]:
# Columns to be used in correlation analysis

corr_list = [
    'ndvi_ne', 'ndvi_nw',
       'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm', 'total_cases',
       'lag1_ndvi_ne', 'lag1_ndvi_nw', 'lag1_ndvi_se', 'lag1_ndvi_sw',
       'lag1_precipitation_amt_mm', 'lag1_reanalysis_air_temp_k',
       'lag1_reanalysis_avg_temp_k', 'lag1_reanalysis_max_air_temp_k',
       'lag1_reanalysis_min_air_temp_k',
       'lag1_reanalysis_precip_amt_kg_per_m2',
       'lag1_reanalysis_relative_humidity_percent',
       'lag1_reanalysis_specific_humidity_g_per_kg', 'lag1_reanalysis_tdtr_k',
       'lag1_station_avg_temp_c', 'lag1_station_diur_temp_rng_c',
       'lag1_station_max_temp_c', 'lag1_station_min_temp_c',
       'lag1_station_precip_mm', 'lag2_ndvi_ne', 'lag2_ndvi_nw',
       'lag2_ndvi_se', 'lag2_ndvi_sw', 'lag2_precipitation_amt_mm',
       'lag2_reanalysis_air_temp_k', 'lag2_reanalysis_avg_temp_k',
       'lag2_reanalysis_max_air_temp_k', 'lag2_reanalysis_min_air_temp_k',
       'lag2_reanalysis_precip_amt_kg_per_m2',
       'lag2_reanalysis_relative_humidity_percent',
       'lag2_reanalysis_specific_humidity_g_per_kg', 'lag2_reanalysis_tdtr_k',
       'lag2_station_avg_temp_c', 'lag2_station_diur_temp_rng_c',
       'lag2_station_max_temp_c', 'lag2_station_min_temp_c',
       'lag2_station_precip_mm', 'lag3_ndvi_ne', 'lag3_ndvi_nw',
       'lag3_ndvi_se', 'lag3_ndvi_sw', 'lag3_precipitation_amt_mm',
       'lag3_reanalysis_air_temp_k', 'lag3_reanalysis_avg_temp_k',
       'lag3_reanalysis_max_air_temp_k', 'lag3_reanalysis_min_air_temp_k',
       'lag3_reanalysis_precip_amt_kg_per_m2',
       'lag3_reanalysis_relative_humidity_percent',
       'lag3_reanalysis_specific_humidity_g_per_kg', 'lag3_reanalysis_tdtr_k',
       'lag3_station_avg_temp_c', 'lag3_station_diur_temp_rng_c',
       'lag3_station_max_temp_c', 'lag3_station_min_temp_c',
       'lag3_station_precip_mm', 'lag4_ndvi_ne', 'lag4_ndvi_nw',
       'lag4_ndvi_se', 'lag4_ndvi_sw', 'lag4_precipitation_amt_mm',
       'lag4_reanalysis_air_temp_k', 'lag4_reanalysis_avg_temp_k',
       'lag4_reanalysis_max_air_temp_k', 'lag4_reanalysis_min_air_temp_k',
       'lag4_reanalysis_precip_amt_kg_per_m2',
       'lag4_reanalysis_relative_humidity_percent',
       'lag4_reanalysis_specific_humidity_g_per_kg', 'lag4_reanalysis_tdtr_k',
       'lag4_station_avg_temp_c', 'lag4_station_diur_temp_rng_c',
       'lag4_station_max_temp_c', 'lag4_station_min_temp_c',
       'lag4_station_precip_mm'
]

In [164]:
sj_corr_list = sj_combo[corr_list].corr().loc[:,'total_cases'].sort_values()
iq_corr_list = iq_combo[corr_list].corr().loc[:,'total_cases'].sort_values()

In [170]:
sj_corr_list

lag2_ndvi_se                                 -0.122935
ndvi_se                                      -0.120024
lag1_ndvi_se                                 -0.118539
lag3_ndvi_se                                 -0.116370
lag4_ndvi_se                                 -0.101915
lag2_reanalysis_tdtr_k                       -0.088606
lag3_reanalysis_tdtr_k                       -0.078190
lag1_reanalysis_tdtr_k                       -0.075172
lag4_reanalysis_tdtr_k                       -0.072100
reanalysis_tdtr_k                            -0.067920
lag4_station_diur_temp_rng_c                  0.001846
ndvi_ne                                       0.004144
lag3_station_diur_temp_rng_c                  0.005993
lag1_ndvi_ne                                  0.007216
lag2_ndvi_ne                                  0.008213
lag3_ndvi_ne                                  0.008820
lag2_station_diur_temp_rng_c                  0.011236
lag4_ndvi_ne                                  0.014674
lag1_stati

In [174]:
# Lagged variables are less correlated with the target for IQ
iq_corr_list

reanalysis_tdtr_k                            -0.130535
lag1_reanalysis_tdtr_k                       -0.105279
lag2_reanalysis_tdtr_k                       -0.103033
lag1_ndvi_se                                 -0.081766
lag3_reanalysis_tdtr_k                       -0.080405
lag2_ndvi_se                                 -0.072932
lag4_reanalysis_tdtr_k                       -0.060670
lag3_ndvi_se                                 -0.055632
reanalysis_max_air_temp_k                    -0.053157
lag2_ndvi_nw                                 -0.042423
ndvi_se                                      -0.041153
lag1_ndvi_nw                                 -0.041053
lag1_reanalysis_max_air_temp_k               -0.034494
lag2_station_diur_temp_rng_c                 -0.029566
lag2_reanalysis_max_air_temp_k               -0.029052
lag2_ndvi_ne                                 -0.028895
station_diur_temp_rng_c                      -0.021254
lag2_ndvi_sw                                 -0.018125
lag1_ndvi_

In [99]:
sj_combo.shape, iq_combo.shape

((936, 79), (520, 79))

In [175]:
# Further splitting the data to train and validation datasets
from sklearn.model_selection import train_test_split
sj_train, sj_valid = train_test_split(sj_combo, test_size=0.25, random_state=1)
iq_train, iq_valid = train_test_split(iq_combo, test_size=0.25, random_state=1)

In [176]:
train_data.shape, sj_train.shape, iq_train.shape

((1456, 25), (702, 97), (390, 97))

In [240]:
#Filling the validation datasets
sj_valid.fillna(method='ffill', inplace=True)
iq_valid.fillna(method='ffill', inplace=True)

## List of predictors

In [207]:
sj_predictors_v1 = [
    'lag3_reanalysis_specific_humidity_g_per_kg', 'lag4_reanalysis_air_temp_k',
    'lag4_reanalysis_min_air_temp_k', 'lag3_station_min_temp_c',
    'lag3_station_avg_temp_c', 'lag4_reanalysis_specific_humidity_g_per_kg',
    'lag4_station_avg_temp_c'
]

iq_predictors_v1 = [
        'lag3_reanalysis_specific_humidity_g_per_kg', 'lag1_station_min_temp_c',
    'reanalysis_min_air_temp_k', 'lag2_reanalysis_specific_humidity_g_per_kg',
    'lag1_reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k', 'reanalysis_specific_humidity_g_per_kg'
]

## Let's build models!

### XGBoost: The mother of all!
* v1) Leaderboard MAE of 27.8726
    - Simple XGBoost
* v2) Leaderboard MAE of 28.8726
    - XGBoost w RandomizedGridCV and replacing negative values
* v3) Leaderboard MAE of 25.6322
    - This beat the benchmark of 25.8! 
    - Used lagged features upto 4 weeks
    - RandomizedGridCV
    - Subset for relatively high correlations with total cases.
     
    

In [208]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

In [209]:
import scipy.stats as st

one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 50)

params = {
    "n_estimators": st.randint(3, 400),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

In [210]:
# Model framework
sj_xgb = xgb.XGBRegressor()
iq_xgb = xgb.XGBRegressor()

In [211]:
from sklearn.model_selection import RandomizedSearchCV

sj = RandomizedSearchCV(sj_xgb, params, n_jobs=1, scoring='neg_mean_absolute_error' ,n_iter=20)  
sj.fit(sj_train[sj_predictors_v1], sj_train['total_cases'])  
sj.best_score_

-24.287892719958922

In [212]:
sj.best_params_

{'colsample_bytree': 0.91068195738766333,
 'gamma': 4.3206932884910554,
 'learning_rate': 0.12995249393476094,
 'max_depth': 16,
 'min_child_weight': 24.216007236111171,
 'n_estimators': 5,
 'reg_alpha': 1.062585929136203,
 'subsample': 0.91196135347851759}

In [213]:
sj_xgb

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [214]:
iq = RandomizedSearchCV(iq_xgb, params, n_jobs=1, scoring='neg_mean_absolute_error' ,n_iter=20)  
iq.fit(iq_train[iq_predictors_v1], iq_train['total_cases'])  
iq.best_score_

-6.8974762525313942

In [215]:
iq.best_params_

{'colsample_bytree': 0.8548464351446422,
 'gamma': 9.3802959191278177,
 'learning_rate': 0.077443266471102279,
 'max_depth': 33,
 'min_child_weight': 207.11571395958458,
 'n_estimators': 26,
 'reg_alpha': 90.745632904891366,
 'subsample': 0.77306430255970426}

In [216]:
#Validation MAE score

In [217]:
sj_valid_pred = sj.predict(sj_valid[sj_predictors_v1]).astype(int)
iq_valid_pred = iq.predict(iq_valid[iq_predictors_v1]).astype(int)

In [218]:
#SJ validation MAE
mean_absolute_error(y_pred=sj_valid_pred, y_true=sj_valid['total_cases'])

24.337606837606838

In [219]:
#IQ validation MAE
mean_absolute_error(y_pred=iq_valid_pred, y_true=iq_valid['total_cases'])

4.6692307692307695

In [220]:
# Predicting
sj_pred = sj.predict(sj_test_combo[sj_predictors_v1]).astype(int)
iq_pred = iq.predict(iq_test_combo[iq_predictors_v1]).astype(int)

In [221]:
# Replacing any negative predictions with zero.
sj_pred[sj_pred<0]=0
iq_pred[iq_pred<0]=0

#Train the model on the entire datasets

##Sj training
sj_xgb.fit(X=sj_train[predictors].values, y=sj_train['total_cases'].values, verbose=3, eval_metric=mean_absolute_error)
##IQ training
iq_xgb.fit(X=iq_train[predictors].values, y=iq_train['total_cases'].values, verbose=3, eval_metric=mean_absolute_error)

##Predicting
sj_pred = sj_xgb.predict(sj_test_data[predictors].values)
iq_pred = iq_xgb.predict(iq_test_data[predictors].values)

In [222]:
# Make submission
sj_part = sj_test_data[['city', 'year', 'weekofyear']]
iq_part = iq_test_data[['city', 'year', 'weekofyear']]

In [223]:
#Adding the predictions
sj_part['total_cases']=sj_pred
iq_part['total_cases']=iq_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [224]:
#Exporting the submissions
submission = sj_part.append(iq_part)

In [225]:
submission.to_csv('xgb_v3- RandomizedGridScore- Lagged- subset features.csv', index=False)

### Negative Binomial Regression

In [273]:
sj_corr_list

lag2_ndvi_se                                 -0.122935
ndvi_se                                      -0.120024
lag1_ndvi_se                                 -0.118539
lag3_ndvi_se                                 -0.116370
lag4_ndvi_se                                 -0.101915
lag2_reanalysis_tdtr_k                       -0.088606
lag3_reanalysis_tdtr_k                       -0.078190
lag1_reanalysis_tdtr_k                       -0.075172
lag4_reanalysis_tdtr_k                       -0.072100
reanalysis_tdtr_k                            -0.067920
lag4_station_diur_temp_rng_c                  0.001846
ndvi_ne                                       0.004144
lag3_station_diur_temp_rng_c                  0.005993
lag1_ndvi_ne                                  0.007216
lag2_ndvi_ne                                  0.008213
lag3_ndvi_ne                                  0.008820
lag2_station_diur_temp_rng_c                  0.011236
lag4_ndvi_ne                                  0.014674
lag1_stati

In [298]:
import statsmodels.api as sm
from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf


#Model for SJ
def sj_best_model(train, test):
    # Step 1: specify the form of the model
    model_formula = 'total_cases ~ 1 + reanalysis_specific_humidity_g_per_kg + reanalysis_dew_point_temp_k + \
    station_min_temp_c + station_avg_temp_c'
    
    grid = 10**np.arange(-8, -3, dtype=np.float64)

    best_alpha = []
    best_score = 1000

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(
            formula=model_formula,
            data=train,
            family=sm.families.NegativeBinomial(alpha=alpha),
            missing='drop')

        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(
        formula=model_formula,
        data=full_dataset,
        family=sm.families.NegativeBinomial(alpha=best_alpha),
        missing='drop')

    fitted_model = model.fit()
    return fitted_model


#Model for IQ
def iq_best_model(train, test):
    # Step 1: specify the form of the model
    model_formula = 'total_cases ~ 1+ lag2_reanalysis_specific_humidity_g_per_kg + lag1_reanalysis_specific_humidity_g_per_kg \
     + reanalysis_specific_humidity_g_per_kg + reanalysis_dew_point_temp_k + station_min_temp_c + station_avg_temp_c'

    grid = 10**np.arange(-8, -3, dtype=np.float64)

    best_alpha = []
    best_score = 1000

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(
            formula=model_formula,
            data=train,
            family=sm.families.NegativeBinomial(alpha=alpha),
            missing='drop')

        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(
        formula=model_formula,
        data=full_dataset,
        family=sm.families.NegativeBinomial(alpha=best_alpha),
        missing='drop')

    fitted_model = model.fit()
    return fitted_model

In [299]:
sj_best_model = sj_best_model(sj_train, sj_valid)

best alpha =  1e-08
best score =  26.7222222222


In [300]:
iq_best_model = iq_best_model(iq_train, iq_valid)

best alpha =  1e-08
best score =  4.72307692308


In [301]:
# Make the predictions
sj_pred = sj_best_model.predict(sj_test_combo).astype(int)
iq_pred = iq_best_model.predict(iq_test_combo).astype(int)

In [302]:
# Make submission
sj_part = sj_test_data[['city', 'year', 'weekofyear']]
iq_part = iq_test_data[['city', 'year', 'weekofyear']]

#Adding the predictions
sj_part['total_cases']=sj_pred
iq_part['total_cases']=iq_pred

#Exporting the submissions
submission = sj_part.append(iq_part)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [303]:
submission.to_csv('NBR_v1- Lagged- subset features.csv', index=False)