# Model
- Ignoring data before 2006.
- Using only type=5 examples.

In [1]:
import pandas as pd
import numpy as np
import os 

import plotly.plotly as py
import plotly.graph_objs as go

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

In [2]:
# Private DataSet path: ../input/kddbr2018dataset/kddbr-2018-dataset/dataset. This dataset is the same of competitions
#
path = '../input/'
print(os.listdir(path))

['field-22.csv', 'field-20.csv', 'sample-submission.csv', 'field-3.csv', 'soil_data.csv', 'field-18.csv', 'field-23.csv', 'field-17.csv', 'field-2.csv', 'field-27.csv', 'field-24.csv', 'field-10.csv', 'field-11.csv', 'field-26.csv', 'field-21.csv', 'test.csv', 'field-25.csv', 'field-19.csv', 'field-7.csv', 'field-15.csv', 'field-8.csv', 'field-12.csv', 'field-0.csv', 'field-1.csv', 'field-14.csv', 'train.csv', 'field-5.csv', 'field-16.csv', 'field-13.csv', 'field-9.csv', 'field-6.csv', 'field-4.csv']


## Train and test datasets

Basic data containing palm tree information

In [3]:
df_train = pd.read_csv(os.path.join(path, 'train.csv'))

# Ignoring date before 2006
# df_train = df_train[df_train.harvest_year >= 2006]

# Using only examples with type=5
df_train = df_train[df_train.type == 5]

df_test  = pd.read_csv(os.path.join(path, 'test.csv'))
df_all = pd.concat([df_train, df_test], sort=False)

print(df_train.shape, df_test.shape, df_all.shape)
df_all.head()

(4253, 7) (4110, 6) (8363, 7)


Unnamed: 0,Id,field,age,type,harvest_year,harvest_month,production
0,0,0,19,5,2004,1,0.064071
1,1,0,19,5,2004,2,0.047658
2,2,0,19,5,2004,3,0.016866
3,3,0,19,5,2004,4,0.025525
4,4,0,19,5,2004,5,0.04769


In [4]:
def to_date(df):
    return pd.to_datetime((df.harvest_year*10000+df.harvest_month*100+1)\
                                  .apply(str),format='%Y%m%d')
# Add date variable 
for d in [df_train, df_test, df_all]:
    d['date'] = to_date(d)

## Field data (field_*.csv)

These files hold atmospheric data from January 2002 to December 2017, and can be used to estimate the weather conditions during the development of the plant. Notice that weather does influence the production. Using only a single month prior to harvest is probably too little data. Participants should decide how far back in the past they want to look when training models.



In [5]:
# read
df_field = pd.read_csv(path+'field-0.csv')
df_field['field'] = 0
for i in range(1, 28):
    _df_field = pd.read_csv(path+'field-{}.csv'.format(i))
    _df_field['field'] = i
    df_field = pd.concat([df_field, _df_field])

# remove duplicates
df_field = df_field.drop_duplicates()

# Group 
df_field = df_field.groupby(['month', 'year', 'field']).mean().reset_index()
print(df_field.shape)
df_field.head()

(5376, 11)


Unnamed: 0,month,year,field,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,Soilwater_L3,Soilwater_L4,Precipitation
0,1,2002,0,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
1,1,2002,1,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
2,1,2002,2,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
3,1,2002,3,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55
4,1,2002,4,26.008,24.434,1.8453,0.32984,0.32597,0.31477,0.29513,361.55


In [6]:
# df_all
df_all = pd.merge(df_all, df_field, left_on=['harvest_year', 'harvest_month','field'], 
                  right_on=['year', 'month', 'field'], how='inner').reset_index()
print(df_all.shape)
df_all.head()

(8363, 19)


Unnamed: 0,index,Id,field,age,type,harvest_year,harvest_month,production,date,month,year,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,Soilwater_L3,Soilwater_L4,Precipitation
0,0,0,0,19,5,2004,1,0.064071,2004-01-01,1,2004,26.132,24.661,1.8766,0.35274,0.35192,0.34844,0.33385,360.91
1,1,4204,0,4,5,2004,1,0.106263,2004-01-01,1,2004,26.132,24.661,1.8766,0.35274,0.35192,0.34844,0.33385,360.91
2,2,1,0,19,5,2004,2,0.047658,2004-02-01,2,2004,25.295,24.401,1.9206,0.36361,0.36376,0.36411,0.36357,484.67
3,3,4205,0,4,5,2004,2,0.040194,2004-02-01,2,2004,25.295,24.401,1.9206,0.36361,0.36376,0.36411,0.36357,484.67
4,4,2,0,19,5,2004,3,0.016866,2004-03-01,3,2004,25.61,24.651,1.9948,0.36399,0.36383,0.36351,0.36575,460.76


In [7]:
df_all.columns

Index(['index', 'Id', 'field', 'age', 'type', 'harvest_year', 'harvest_month',
       'production', 'date', 'month', 'year', 'temperature', 'dewpoint',
       'windspeed', 'Soilwater_L1', 'Soilwater_L2', 'Soilwater_L3',
       'Soilwater_L4', 'Precipitation'],
      dtype='object')

In [8]:
# Features i will duplicate with the past months
features = ['temperature', 'dewpoint', 'windspeed', 'Precipitation', 'Soilwater_L1']

# Remove redundant features.
df_all = df_all.drop(columns=['Soilwater_L2', 'Soilwater_L3', 'Soilwater_L4'])

df_group = df_all.groupby(['field', 'date']).mean().reset_index()[['field', 'date', 'production'] + features]
df_group = df_group.sort_values(['field', 'date'])
print(df_group.shape)
df_group.head()

(4317, 8)


Unnamed: 0,field,date,production,temperature,dewpoint,windspeed,Precipitation,Soilwater_L1
0,0,2004-01-01,0.085167,26.132,24.661,1.8766,360.91,0.35274
1,0,2004-02-01,0.043926,25.295,24.401,1.9206,484.67,0.36361
2,0,2004-03-01,0.016866,25.61,24.651,1.9948,460.76,0.36399
3,0,2004-04-01,0.019378,26.328,24.753,1.9063,350.02,0.35677
4,0,2004-05-01,0.04769,26.566,24.94,1.8569,186.72,0.36002


In [9]:
# Collect shift values of variables in all features time
period = 2

new_features = {}
for f in features:
    new_features[f] = []
    for i in range(1, period):
        new_feature = '{}_{}'.format(f, i)
        new_features[f].append(new_feature)
        df_group[new_feature] = df_group[f].shift(i).fillna(df_group[f].mean())

df_group = df_group.drop(features+['production'], axis=1)
df_group.head()

Unnamed: 0,field,date,temperature_1,dewpoint_1,windspeed_1,Precipitation_1,Soilwater_L1_1
0,0,2004-01-01,27.423209,23.810793,2.137359,247.8296,0.309522
1,0,2004-02-01,26.132,24.661,1.8766,360.91,0.35274
2,0,2004-03-01,25.295,24.401,1.9206,484.67,0.36361
3,0,2004-04-01,25.61,24.651,1.9948,460.76,0.36399
4,0,2004-05-01,26.328,24.753,1.9063,350.02,0.35677


In [10]:
df_all = df_all.drop(['index', 'month', 'year'], axis=1)
df_all = pd.merge(df_all, df_group, left_on=['field', 'date'], right_on=['field', 'date'], how='inner').reset_index()

print(df_all.shape)
df_all.head()

(8363, 19)


Unnamed: 0,index,Id,field,age,type,harvest_year,harvest_month,production,date,temperature,dewpoint,windspeed,Soilwater_L1,Precipitation,temperature_1,dewpoint_1,windspeed_1,Precipitation_1,Soilwater_L1_1
0,0,0,0,19,5,2004,1,0.064071,2004-01-01,26.132,24.661,1.8766,0.35274,360.91,27.423209,23.810793,2.137359,247.8296,0.309522
1,1,4204,0,4,5,2004,1,0.106263,2004-01-01,26.132,24.661,1.8766,0.35274,360.91,27.423209,23.810793,2.137359,247.8296,0.309522
2,2,1,0,19,5,2004,2,0.047658,2004-02-01,25.295,24.401,1.9206,0.36361,484.67,26.132,24.661,1.8766,360.91,0.35274
3,3,4205,0,4,5,2004,2,0.040194,2004-02-01,25.295,24.401,1.9206,0.36361,484.67,26.132,24.661,1.8766,360.91,0.35274
4,4,2,0,19,5,2004,3,0.016866,2004-03-01,25.61,24.651,1.9948,0.36399,460.76,25.295,24.401,1.9206,484.67,0.36361


## Soil Data (soil_data.csv)

Information about the soil on each field.

In [11]:
df_soil = pd.read_csv(path+'soil_data.csv')
print(df_soil.shape)
df_soil.head()

(28, 73)


Unnamed: 0,field,BDRICM_BDRICM_M,BDRLOG_BDRLOG_M,BDTICM_BDTICM_M,BLDFIE_sl1,BLDFIE_sl2,BLDFIE_sl3,BLDFIE_sl4,BLDFIE_sl5,BLDFIE_sl6,...,SLTPPT_sl5,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7
0,4,200,7,6973,1345,1308,1361,1413,1486,1503,...,21,19,20,47,48,47,42,40,40,39
1,3,200,9,7272,1297,1287,1323,1428,1492,1508,...,23,22,22,44,45,43,40,36,37,36
2,2,200,7,7281,1266,1249,1310,1387,1463,1491,...,21,21,22,46,46,45,40,39,39,39
3,1,200,6,7457,1297,1277,1345,1409,1480,1506,...,21,21,21,46,47,46,42,40,40,40
4,7,200,8,6771,1305,1289,1333,1438,1497,1510,...,22,21,22,44,45,44,40,38,38,37


In [12]:
# Join datasets
df_all_soil = pd.merge(df_all, df_soil, on='field', how='inner')
print(df_all_soil.shape)
df_all_soil.head()

(8363, 91)


Unnamed: 0,index,Id,field,age,type,harvest_year,harvest_month,production,date,temperature,...,SLTPPT_sl5,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7
0,0,0,0,19,5,2004,1,0.064071,2004-01-01,26.132,...,22,22,23,44,45,44,39,38,37,36
1,1,4204,0,4,5,2004,1,0.106263,2004-01-01,26.132,...,22,22,23,44,45,44,39,38,37,36
2,2,1,0,19,5,2004,2,0.047658,2004-02-01,25.295,...,22,22,23,44,45,44,39,38,37,36
3,3,4205,0,4,5,2004,2,0.040194,2004-02-01,25.295,...,22,22,23,44,45,44,39,38,37,36
4,4,2,0,19,5,2004,3,0.016866,2004-03-01,25.61,...,22,22,23,44,45,44,39,38,37,36


## Feature Importance Measures

Find the main features for the production target. Uses a RandomRorest to identify features


In [13]:
df_all_soil.columns

Index(['index', 'Id', 'field', 'age', 'type', 'harvest_year', 'harvest_month',
       'production', 'date', 'temperature', 'dewpoint', 'windspeed',
       'Soilwater_L1', 'Precipitation', 'temperature_1', 'dewpoint_1',
       'windspeed_1', 'Precipitation_1', 'Soilwater_L1_1', 'BDRICM_BDRICM_M',
       'BDRLOG_BDRLOG_M', 'BDTICM_BDTICM_M', 'BLDFIE_sl1', 'BLDFIE_sl2',
       'BLDFIE_sl3', 'BLDFIE_sl4', 'BLDFIE_sl5', 'BLDFIE_sl6', 'BLDFIE_sl7',
       'CECSOL_sl1', 'CECSOL_sl2', 'CECSOL_sl3', 'CECSOL_sl4', 'CECSOL_sl5',
       'CECSOL_sl6', 'CECSOL_sl7', 'CLYPPT_sl1', 'CLYPPT_sl2', 'CLYPPT_sl3',
       'CLYPPT_sl4', 'CLYPPT_sl5', 'CLYPPT_sl6', 'CLYPPT_sl7', 'CRFVOL_sl1',
       'CRFVOL_sl2', 'CRFVOL_sl3', 'CRFVOL_sl4', 'CRFVOL_sl5', 'CRFVOL_sl6',
       'CRFVOL_sl7', 'OCSTHA_sd1', 'OCSTHA_sd2', 'OCSTHA_sd3', 'OCSTHA_sd4',
       'OCSTHA_sd5', 'OCSTHA_sd6', 'ORCDRC_sl1', 'ORCDRC_sl2', 'ORCDRC_sl3',
       'ORCDRC_sl4', 'ORCDRC_sl5', 'ORCDRC_sl6', 'ORCDRC_sl7', 'PHIHOX_sl1',
       'PHIHOX

In [14]:
## Import the random forest model.
from sklearn.ensemble import RandomForestRegressor

## This line instantiates the model. 
rf = RandomForestRegressor(random_state=1) 

# data
df      = df_all_soil[~df_all_soil.production.isna()]
X_train = df.drop(['production', 'date', 'Id', 'index'], axis=1)
y_train = df.production.values

## Fit the model on your training data.
rf.fit(X_train, y_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [15]:
# feature_importances
feature_importances = pd.DataFrame(rf.feature_importances_, 
                                   index = X_train.columns, 
                                   columns=['importance']).sort_values('importance', ascending=False).reset_index()
feature_importances.head()

Unnamed: 0,index,importance
0,age,0.15092
1,dewpoint,0.131477
2,temperature,0.118324
3,Soilwater_L1,0.1178
4,Precipitation_1,0.078825


## Base Model 

Creation of a baseline model to finalize the competition submission pipeline. The idea is to create the most basic for future improvements.

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection

### Prepare Dataset

In [17]:
# Exclude test data.
df_train = df_all_soil[~df_all_soil.production.isna()]

# Most important features.
features = list(feature_importances['index'].values)[:15]

scores = []

# Validate using sliding window.
for val_year in range(2007, 2012):

    # Split train/val.
    df_train_train = df_train[df_train.harvest_year < val_year]
    df_train_test = df_train[df_train.harvest_year >= val_year]

    X_train = df_train_train[features]
    y_train = df_train_train.production.values

    X_test = df_train_test[features]
    y_test = df_train_test.production.values

    df_train_train = df_train_train.drop(['production', 'date', 'Id'], axis=1)
    df_train_test = df_train_test.drop(['production', 'date', 'Id'], axis=1)

    # Normalize features.
    scaler = StandardScaler()
    scaler.fit_transform(np.concatenate([X_train, X_test]))
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    #
    # Train
    #
    base_model = RandomForestRegressor(random_state=1)
    base_model.fit(X_train, y_train)

    #
    # Predict validation data.
    #
    y_hat = base_model.predict(X_test)

    #
    # Score model.
    #
    mae = sklearn.metrics.mean_absolute_error(y_test, y_hat)
    r2 = sklearn.metrics.r2_score(y_test, y_hat)
    
    print(val_year, X_train.shape[0], X_test.shape[0], mae, r2)

    scores.append(mae)
    
print(scores)
print(np.mean(scores))
print(np.std(scores))

2007 1484 2769 0.09217958114666573 0.19642816329701818
2008 2042 2211 0.10713621370419907 -0.03369412618890322
2009 2565 1688 0.09504148132910901 0.20620435002630988
2010 3127 1126 0.08617394144254771 0.33595167449862784
2011 3689 564 0.12405466409656923 -0.2587712408875298
[0.09217958114666573, 0.10713621370419907, 0.09504148132910901, 0.08617394144254771, 0.12405466409656923]
0.10091717634381814
0.013433183556311122


### Train on all data

In [18]:
# Exclude test data.
df_train = df_all_soil[~df_all_soil.production.isna()]

# Most important features.
features = list(feature_importances['index'].values)[:15]

X_train = df_train[features]
y_train = df_train.production.values

# Normalize features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

#
# Train
#
base_model = RandomForestRegressor(random_state=1)
base_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

### Submission

It makes the predict of the basic model and creates the sample to substrate in kaggle, finishing the complete pipeline

In [19]:
# Get test examples.
df_test = df_all_soil[df_all_soil.production.isna()]

# Important features.
X = df_test[features]

# Normalize input features.
X = scaler.transform(X)

# Make prediction.
pred = base_model.predict(X)

# Create a submission file.
with open('../submissions/submission-all_data-only_type5.csv', 'w') as f:
    for _id, _pred in zip(df_test.Id.values, pred):
        f.write("{},{}\n".format(_id, _pred))