# Base Model using all Train Data

In [1]:
import pandas as pd
import numpy as np
import os 

import plotly.plotly as py
import plotly.graph_objs as go

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Private DataSet path: ../input/kddbr2018dataset/kddbr-2018-dataset/dataset. This dataset is the same of competitions
#
path = '../input/'
print(os.listdir(path))

['field-22.csv', 'field-20.csv', 'sample-submission.csv', 'field-3.csv', 'soil_data.csv', 'field-18.csv', 'field-23.csv', 'field-17.csv', 'field-2.csv', 'field-27.csv', 'field-24.csv', 'field-10.csv', 'field-11.csv', 'field-26.csv', 'field-21.csv', 'test.csv', 'field-25.csv', 'field-19.csv', 'field-7.csv', 'field-15.csv', 'field-8.csv', 'field-12.csv', 'field-0.csv', 'field-1.csv', 'field-14.csv', 'train.csv', 'field-5.csv', 'field-16.csv', 'field-13.csv', 'field-9.csv', 'field-6.csv', 'field-4.csv']


## Train and test datasets

Basic data containing palm tree information

In [3]:
df_train = pd.read_csv(os.path.join(path, 'train.csv'))
df_test  = pd.read_csv(os.path.join(path, 'test.csv'))
df_all = pd.concat([df_train, df_test])

# Remove noisy data.
df_all = df_all[df_all.harvest_year >= 2006]

print(df_train.shape, df_test.shape, df_all.shape)

(5243, 7) (4110, 6) (8186, 7)


In [4]:
def to_date(df):
    return pd.to_datetime((df.harvest_year*10000+df.harvest_month*100+1)\
                                  .apply(str),format='%Y%m%d')
# Add date variable 
for d in [df_train, df_test, df_all]:
    d['date'] = to_date(d)

## Field data (field_*.csv)

These files hold atmospheric data from January 2002 to December 2017, and can be used to estimate the weather conditions during the development of the plant. Notice that weather does influence the production. Using only a single month prior to harvest is probably too little data. Participants should decide how far back in the past they want to look when training models.



In [5]:
# read
df_field = pd.read_csv(path+'field-0.csv')
df_field['field'] = 0
df_field.head()

Unnamed: 0,month,year,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,Soilwater_L3,Soilwater_L4,Precipitation,field
0,1,2002,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55,0
1,2,2002,25.774,24.734,1.9875,0.35884,0.35812,0.35536,0.34368,289.28,0
2,3,2002,25.777,24.609,1.7504,0.35886,0.35896,0.35898,0.36199,492.05,0
3,4,2002,25.89,24.904,1.495,0.36013,0.35991,0.35997,0.36043,461.84,0
4,5,2002,26.182,24.826,1.8062,0.35567,0.35541,0.35536,0.35894,282.69,0


In [6]:
for i in range(1, 28):
    _df_field = pd.read_csv(path+'field-{}.csv'.format(i))
    _df_field['field'] = i
    df_field = pd.concat([df_field, _df_field])

# remove duplicates
df_field = df_field.drop_duplicates()

# Group 
df_field = df_field.groupby(['month', 'year', 'field']).mean().reset_index()
print(df_field.shape)
df_field.head()

(5376, 11)


Unnamed: 0,month,year,field,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,Soilwater_L3,Soilwater_L4,Precipitation
0,1,2002,0,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
1,1,2002,1,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
2,1,2002,2,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
3,1,2002,3,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
4,1,2002,4,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55


In [7]:
# df_all
df_all = pd.merge(df_all, df_field, left_on=['harvest_year', 'harvest_month','field'], 
                  right_on=['year', 'month', 'field'], how='inner').reset_index()

print(df_all.shape)
df_all.head()

(8186, 19)


Unnamed: 0,index,Id,age,field,harvest_month,harvest_year,production,type,date,month,year,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,Soilwater_L3,Soilwater_L4,Precipitation
0,0,24,21,0,1,2006,0.121454,5,2006-01-01,1,2006,26.595,24.74,1.9897,0.34587,0.34462,0.33994,0.32083,341.51
1,1,1556,15,0,1,2006,0.212025,2,2006-01-01,1,2006,26.595,24.74,1.9897,0.34587,0.34462,0.33994,0.32083,341.51
2,2,2351,4,0,1,2006,0.018644,5,2006-01-01,1,2006,26.595,24.74,1.9897,0.34587,0.34462,0.33994,0.32083,341.51
3,3,2352,5,0,1,2006,0.087408,3,2006-01-01,1,2006,26.595,24.74,1.9897,0.34587,0.34462,0.33994,0.32083,341.51
4,4,4226,6,0,1,2006,0.100588,5,2006-01-01,1,2006,26.595,24.74,1.9897,0.34587,0.34462,0.33994,0.32083,341.51


In [8]:
df_all.columns

Index(['index', 'Id', 'age', 'field', 'harvest_month', 'harvest_year',
       'production', 'type', 'date', 'month', 'year', 'temperature',
       'dewpoint', 'windspeed', 'Soilwater_L1', 'Soilwater_L2', 'Soilwater_L3',
       'Soilwater_L4', 'Precipitation'],
      dtype='object')

In [9]:
# Features i will duplicate with the past months
features  = ['temperature', 'dewpoint', 'windspeed', 'Precipitation', 'Soilwater_L1']

df_all    = df_all.drop(columns=['Soilwater_L2', 'Soilwater_L3','Soilwater_L4'])

In [10]:
df_group = df_all.groupby(['field', 'date']).mean().reset_index()[['field', 'date', 'production'] + features ]
df_group = df_group.sort_values(['field', 'date'])
print(df_group.shape)
df_group.head()

(3753, 8)


Unnamed: 0,field,date,production,temperature,dewpoint,windspeed,Precipitation,Soilwater_L1
0,0,2006-01-01,0.108024,26.595,24.74,1.9897,341.51,0.34587
1,0,2006-02-01,0.110836,26.204,24.626,2.1281,309.01,0.35104
2,0,2006-03-01,0.125208,26.019,24.734,2.176,452.42,0.361
3,0,2006-04-01,0.088623,25.978,24.882,1.5105,516.34,0.36575
4,0,2006-05-01,0.091603,26.171,24.843,1.6832,349.06,0.36109


In [11]:
# Collect shift values of variables in all features time
period = 2

new_features = {}
for f in features:
    new_features[f] = []
    for i in range(1, period):
        new_features[f].append('{}_{}'.format(f, i))
        df_group['{}_{}'.format(f, i)] = df_group[f].shift(i).fillna(df_group[f].mean())
        #df_group['{}_{}'.format(f, i)] = df_group[f].rolling(i, min_periods=1).mean().fillna(df_group.temperature.mean())

In [12]:
df_group= df_group.drop(features+['production'], axis=1)
df_group.head()

Unnamed: 0,field,date,temperature_1,dewpoint_1,windspeed_1,Precipitation_1,Soilwater_L1_1
0,0,2006-01-01,27.482751,23.767951,2.1562,252.957335,0.308304
1,0,2006-02-01,26.595,24.74,1.9897,341.51,0.34587
2,0,2006-03-01,26.204,24.626,2.1281,309.01,0.35104
3,0,2006-04-01,26.019,24.734,2.176,452.42,0.361
4,0,2006-05-01,25.978,24.882,1.5105,516.34,0.36575


In [13]:
df_all = df_all.drop(['index', 'month', 'year'], axis=1)
df_all = pd.merge(df_all, df_group, left_on=['field', 'date'], right_on=['field','date'], how='inner').reset_index()

print(df_all.shape)
df_all.head()

(8186, 19)


Unnamed: 0,index,Id,age,field,harvest_month,harvest_year,production,type,date,temperature,dewpoint,windspeed,Soilwater_L1,Precipitation,temperature_1,dewpoint_1,windspeed_1,Precipitation_1,Soilwater_L1_1
0,0,24,21,0,1,2006,0.121454,5,2006-01-01,26.595,24.74,1.9897,0.34587,341.51,27.482751,23.767951,2.1562,252.957335,0.308304
1,1,1556,15,0,1,2006,0.212025,2,2006-01-01,26.595,24.74,1.9897,0.34587,341.51,27.482751,23.767951,2.1562,252.957335,0.308304
2,2,2351,4,0,1,2006,0.018644,5,2006-01-01,26.595,24.74,1.9897,0.34587,341.51,27.482751,23.767951,2.1562,252.957335,0.308304
3,3,2352,5,0,1,2006,0.087408,3,2006-01-01,26.595,24.74,1.9897,0.34587,341.51,27.482751,23.767951,2.1562,252.957335,0.308304
4,4,4226,6,0,1,2006,0.100588,5,2006-01-01,26.595,24.74,1.9897,0.34587,341.51,27.482751,23.767951,2.1562,252.957335,0.308304


### soil_data.csv

Information on the soil on which each field is

In [14]:
df_soil = pd.read_csv(path+'soil_data.csv')
print(df_soil.shape)
df_soil.head()

(28, 73)


Unnamed: 0,field,BDRICM_BDRICM_M,BDRLOG_BDRLOG_M,BDTICM_BDTICM_M,BLDFIE_sl1,BLDFIE_sl2,BLDFIE_sl3,BLDFIE_sl4,BLDFIE_sl5,BLDFIE_sl6,...,SLTPPT_sl5,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7
0,4,200,7,6973,1345,1308,1361,1413,1486,1503,...,21,19,20,47,48,47,42,40,40,39
1,3,200,9,7272,1297,1287,1323,1428,1492,1508,...,23,22,22,44,45,43,40,36,37,36
2,2,200,7,7281,1266,1249,1310,1387,1463,1491,...,21,21,22,46,46,45,40,39,39,39
3,1,200,6,7457,1297,1277,1345,1409,1480,1506,...,21,21,21,46,47,46,42,40,40,40
4,7,200,8,6771,1305,1289,1333,1438,1497,1510,...,22,21,22,44,45,44,40,38,38,37


In [15]:
# Join datasets
df_all_soil = pd.merge(df_all, df_soil, on='field', how='inner')
print(df_all_soil.shape)
df_all_soil.head()

(8186, 91)


Unnamed: 0,index,Id,age,field,harvest_month,harvest_year,production,type,date,temperature,...,SLTPPT_sl5,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7
0,0,24,21,0,1,2006,0.121454,5,2006-01-01,26.595,...,22,22,23,44,45,44,39,38,37,36
1,1,1556,15,0,1,2006,0.212025,2,2006-01-01,26.595,...,22,22,23,44,45,44,39,38,37,36
2,2,2351,4,0,1,2006,0.018644,5,2006-01-01,26.595,...,22,22,23,44,45,44,39,38,37,36
3,3,2352,5,0,1,2006,0.087408,3,2006-01-01,26.595,...,22,22,23,44,45,44,39,38,37,36
4,4,4226,6,0,1,2006,0.100588,5,2006-01-01,26.595,...,22,22,23,44,45,44,39,38,37,36


### Feature Importance Measures

Find the main features for the production target. Uses a RandomRorest to identify features


In [16]:
df_all.columns

Index(['index', 'Id', 'age', 'field', 'harvest_month', 'harvest_year',
       'production', 'type', 'date', 'temperature', 'dewpoint', 'windspeed',
       'Soilwater_L1', 'Precipitation', 'temperature_1', 'dewpoint_1',
       'windspeed_1', 'Precipitation_1', 'Soilwater_L1_1'],
      dtype='object')

In [17]:
## Import the random forest model.
from sklearn.ensemble import RandomForestRegressor

## This line instantiates the model. 
rf = RandomForestRegressor() 

# data
df      = df_all_soil[~df_all_soil.production.isna()]
X_train = df.drop(['production', 'date', 'Id', 'index'], axis=1)
y_train = df.production.values

## Fit the model on your training data.
rf.fit(X_train, y_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
# feature_importances
feature_importances = pd.DataFrame(rf.feature_importances_, 
                                   index = X_train.columns, 
                                   columns=['importance']).sort_values('importance', ascending=False).reset_index()
feature_importances

Unnamed: 0,index,importance
0,age,0.149661
1,dewpoint,0.122151
2,temperature,0.103900
3,Precipitation_1,0.085690
4,Soilwater_L1,0.062395
5,type,0.054311
6,Precipitation,0.048206
7,dewpoint_1,0.030016
8,windspeed,0.028902
9,field,0.028889


## Base Model 

Creation of a baseline model to finalize the competition submission pipeline. The idea is to create the most basic for future improvements.

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection

### Prepare Dataset

In [23]:
# Exclude test data.
df_train = df_all_soil[~df_all_soil.production.isna()]

# Most important features.
features = list(feature_importances['index'].values)[:15]

scores = []
num_runs = 10

# Validate using sliding window.
for val_year in range(2007, 2012):

    # Split train/val.
    df_train_train = df_train[df_train.harvest_year < val_year]
    df_train_test = df_train[df_train.harvest_year >= val_year]

    X_train = df_train_train[features]
    y_train = df_train_train.production.values

    X_test = df_train_test[features]
    y_test = df_train_test.production.values

    df_train_train = df_train_train.drop(['production', 'date', 'Id'], axis=1)
    df_train_test = df_train_test.drop(['production', 'date', 'Id'], axis=1)

    # Normalize features.
    scaler = StandardScaler()
    scaler.fit_transform(np.concatenate([X_train, X_test]))
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    maes = []
    r2s = []
    for run in range(num_runs):
        #
        # Train
        #
        base_model = RandomForestRegressor()
        base_model.fit(X_train, y_train)

        #
        # Predict validation data.
        #
        y_hat = base_model.predict(X_test)

        #
        # Score model.
        #
        mae = sklearn.metrics.mean_absolute_error(y_test, y_hat)
        r2 = sklearn.metrics.r2_score(y_test, y_hat)
        
        maes.append(mae)
        r2s.append(r2)

    mae = np.mean(maes)
    mae_sd = np.std(maes)
    
    r2 = np.mean(r2s)
    r2_sd = np.std(r2s)
    
    print('year: {}, train: {}, test: {}, mae: {} ({}), r2: {} ({})'.format(val_year, 
                                                                            X_train.shape[0], 
                                                                            X_test.shape[0], 
                                                                            mae, mae_sd, 
                                                                            r2, r2_sd))

    scores.append(mae)
    
print('mae: {} ({})\n{}'.format(np.mean(scores), np.std(scores), scores))

year: 2007, train: 685, test: 3391, mae: 0.08944112134260565 (0.0012208715035818714), r2: 0.1879619763271525 (0.019009896125644764)
year: 2008, train: 1363, test: 2713, mae: 0.09942356496142815 (0.002260982816641234), r2: 0.061197965591389056 (0.03044264725077673)
year: 2009, train: 2006, test: 2070, mae: 0.09606628855654566 (0.0010530500620043429), r2: 0.1761039502153112 (0.017828713694979823)
year: 2010, train: 2689, test: 1387, mae: 0.09898833015910785 (0.0028542784411172036), r2: 0.15836320314742508 (0.036993889148839144)
year: 2011, train: 3374, test: 702, mae: 0.09889443574748866 (0.003236620912511005), r2: 0.3135360783646738 (0.037432125696875714)
mae: 0.0965627481534352 (0.003754158453647654)
[0.08944112134260565, 0.09942356496142815, 0.09606628855654566, 0.09898833015910785, 0.09889443574748866]


## Train on all data

In [21]:
# Exclude test data.
df_train = df_all_soil[~df_all_soil.production.isna()]

# Most important features.
features = list(feature_importances['index'].values)[:NUM_FTRS]

X_train = df_train[features]
y_train = df_train.production.values

# Normalize features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

#
# Train
#
base_model = RandomForestRegressor(random_state=1)
base_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

### Submission

It makes the predict of the basic model and creates the sample to substrate in kaggle, finishing the complete pipeline

In [22]:
# Get test examples.
df_test = df_all_soil[df_all_soil.production.isna()]

# Important features.
X = df_test[features]

# Normalize input features.
X = scaler.transform(X)

# Make prediction.
pred = base_model.predict(X)

# Create a submission file.
with open('../submissions/submission_all_data-ignore_lt2016.csv', 'w') as f:
    f.write("Id,production\n")
    for _id, _pred in zip(df_test.Id.values, pred):
        f.write("{},{}\n".format(_id, _pred))