# DL Modeling - Fine Tuning
Load a pre-trained model and fine tune it.

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [2]:
import pandas as pd
import numpy as np
import os 

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from kdd_utils import addFieldDataFtrs, addSoilFtrs, cvPerYear, load_model_ignoring, save_model_ignoring

sns.set(style="whitegrid")

  return f(*args, **kwds)


In [3]:
from fastai.structured import *
from fastai.column_data import *

  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


## Train and test datasets

Basic data containing palm tree information

In [4]:
path = '../input/'

# Train data.
df_all = pd.read_csv(os.path.join(path, 'all_clean.csv'))

# Configure number of jobs.
N_JOBS=7

print('Shape:', df_all.shape)

Shape: (9353, 7)


## Field and Soil Features

These files hold atmospheric data from January 2002 to December 2017, and can be used to estimate the weather conditions during the development of the plant. Notice that weather does influence the production. Using only a single month prior to harvest is probably too little data. Participants should decide how far back in the past they want to look when training models.



In [5]:
df_all_ftrs = addFieldDataFtrs(df_all, shiftPeriod=12, shiftFtrs=['temperature', 'dewpoint', 'windspeed', 
                                                                 'Precipitation', 'Soilwater_L1', 
                                                                 'Soilwater_L2', 'Soilwater_L3', 
                                                                 'Soilwater_L4'])
df_all_ftrs = addSoilFtrs(df_all_ftrs)
print(list(df_all_ftrs.columns))

['index', 'Id', 'age', 'field', 'harvest_month', 'harvest_year', 'production', 'type', 'temperature', 'dewpoint', 'windspeed', 'Soilwater_L1', 'Soilwater_L2', 'Soilwater_L3', 'Soilwater_L4', 'Precipitation', 'temperature_1', 'temperature_2', 'temperature_3', 'temperature_4', 'temperature_5', 'temperature_6', 'temperature_7', 'temperature_8', 'temperature_9', 'temperature_10', 'temperature_11', 'dewpoint_1', 'dewpoint_2', 'dewpoint_3', 'dewpoint_4', 'dewpoint_5', 'dewpoint_6', 'dewpoint_7', 'dewpoint_8', 'dewpoint_9', 'dewpoint_10', 'dewpoint_11', 'windspeed_1', 'windspeed_2', 'windspeed_3', 'windspeed_4', 'windspeed_5', 'windspeed_6', 'windspeed_7', 'windspeed_8', 'windspeed_9', 'windspeed_10', 'windspeed_11', 'Precipitation_1', 'Precipitation_2', 'Precipitation_3', 'Precipitation_4', 'Precipitation_5', 'Precipitation_6', 'Precipitation_7', 'Precipitation_8', 'Precipitation_9', 'Precipitation_10', 'Precipitation_11', 'Soilwater_L1_1', 'Soilwater_L1_2', 'Soilwater_L1_3', 'Soilwater_L1_4

In [6]:
df_all_ftrs.head()

Unnamed: 0,index,Id,age,field,harvest_month,harvest_year,production,type,temperature,dewpoint,...,SLTPPT_sl5,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7
0,0,0,19,0,1,2004,0.064071,5,26.132,24.661,...,22,22,23,44,45,44,39,38,37,36
1,1,1532,13,0,1,2004,0.283228,2,26.132,24.661,...,22,22,23,44,45,44,39,38,37,36
2,2,4204,4,0,1,2004,0.106263,5,26.132,24.661,...,22,22,23,44,45,44,39,38,37,36
3,3,1,19,0,2,2004,0.047658,5,25.295,24.401,...,22,22,23,44,45,44,39,38,37,36
4,4,1533,13,0,2,2004,0.182068,2,25.295,24.401,...,22,22,23,44,45,44,39,38,37,36


In [7]:
# Categorical features.
cat_ftrs = ['field', 'age', 'type', 'harvest_month']
# Continuous features.
contin_ftrs = [f for f in df_all_ftrs.columns if f not in (['production', 'Id', 'index'] + cat_ftrs)]
print(contin_ftrs)

['harvest_year', 'temperature', 'dewpoint', 'windspeed', 'Soilwater_L1', 'Soilwater_L2', 'Soilwater_L3', 'Soilwater_L4', 'Precipitation', 'temperature_1', 'temperature_2', 'temperature_3', 'temperature_4', 'temperature_5', 'temperature_6', 'temperature_7', 'temperature_8', 'temperature_9', 'temperature_10', 'temperature_11', 'dewpoint_1', 'dewpoint_2', 'dewpoint_3', 'dewpoint_4', 'dewpoint_5', 'dewpoint_6', 'dewpoint_7', 'dewpoint_8', 'dewpoint_9', 'dewpoint_10', 'dewpoint_11', 'windspeed_1', 'windspeed_2', 'windspeed_3', 'windspeed_4', 'windspeed_5', 'windspeed_6', 'windspeed_7', 'windspeed_8', 'windspeed_9', 'windspeed_10', 'windspeed_11', 'Precipitation_1', 'Precipitation_2', 'Precipitation_3', 'Precipitation_4', 'Precipitation_5', 'Precipitation_6', 'Precipitation_7', 'Precipitation_8', 'Precipitation_9', 'Precipitation_10', 'Precipitation_11', 'Soilwater_L1_1', 'Soilwater_L1_2', 'Soilwater_L1_3', 'Soilwater_L1_4', 'Soilwater_L1_5', 'Soilwater_L1_6', 'Soilwater_L1_7', 'Soilwater_L1

In [8]:
# Remove unused features.
df_all_ftrs = df_all_ftrs[cat_ftrs + contin_ftrs + ['production']]
df_all_ftrs.head(2)

Unnamed: 0,field,age,type,harvest_month,harvest_year,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,...,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7,production
0,0,19,5,1,2004,26.132,24.661,1.8766,0.35274,0.35192,...,22,23,44,45,44,39,38,37,36,0.064071
1,0,13,2,1,2004,26.132,24.661,1.8766,0.35274,0.35192,...,22,23,44,45,44,39,38,37,36,0.283228


In [9]:
# Inform pandas which features are categorical ...
for v in cat_ftrs:
    df_all_ftrs[v] = df_all_ftrs[v].astype('category').cat.as_ordered()
# ... and which are continuous.
for v in contin_ftrs:
    df_all_ftrs[v] = df_all_ftrs[v].astype('float32')

df_all_ftrs.head(2)

Unnamed: 0,field,age,type,harvest_month,harvest_year,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,...,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7,production
0,0,19,5,1,2004.0,26.132,24.660999,1.8766,0.35274,0.35192,...,22.0,23.0,44.0,45.0,44.0,39.0,38.0,37.0,36.0,0.064071
1,0,13,2,1,2004.0,26.132,24.660999,1.8766,0.35274,0.35192,...,22.0,23.0,44.0,45.0,44.0,39.0,38.0,37.0,36.0,0.283228


In [10]:
# Split test and train data.
df_test = df_all_ftrs[df_all_ftrs.production.isna()]
df_all_ftrs = df_all_ftrs[~df_all_ftrs.production.isna()]

In [11]:
val_year = 2010

# Ignore training examples with type != 5
# df_all_ftrs = df_all_ftrs[(df_all_ftrs.type == 5) | (df_all_ftrs.harvest_year >= val_year)]

# Ignore training examples before 2006
# df_all_ftrs = df_all_ftrs[df_all_ftrs.harvest_year >= 2006]

val_idx = np.flatnonzero(df_all_ftrs.harvest_year >= val_year)
print(len(val_idx))
print(len(df_all_ftrs))

1387
5243


In [12]:
df, y, nas, mapper = proc_df(df_all_ftrs, y_fld='production', do_scale=True)
df.head(2)

Unnamed: 0,field,age,type,harvest_month,harvest_year,temperature,dewpoint,windspeed,Soilwater_L1,Soilwater_L2,...,SLTPPT_sl5,SLTPPT_sl6,SLTPPT_sl7,SNDPPT_sl1,SNDPPT_sl2,SNDPPT_sl3,SNDPPT_sl4,SNDPPT_sl5,SNDPPT_sl6,SNDPPT_sl7
0,1,17,7,1,-1.626713,-1.014467,0.894661,-0.616689,0.894475,0.896105,...,0.475658,0.773142,0.975656,-0.839105,-0.879232,-0.594899,-1.054052,-0.639341,-1.113264,-1.217346
1,1,11,4,1,-1.626713,-1.014467,0.894661,-0.616689,0.894475,0.896105,...,0.475658,0.773142,0.975656,-0.839105,-0.879232,-0.594899,-1.054052,-0.639341,-1.113264,-1.217346


In [13]:
df_test, _, nas, mapper = proc_df(df_test, y_fld='production', do_scale=True, mapper=mapper, na_dict=nas)

In [14]:
cat_sz = [(c, len(df_all_ftrs[c].cat.categories)+1) for c in cat_ftrs]
print(cat_sz)
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
print(emb_szs)

[('field', 29), ('age', 29), ('type', 10), ('harvest_month', 13)]
[(29, 15), (29, 15), (10, 5), (13, 7)]


In [15]:
md = ColumnarModelData.from_data_frame(path, # path for data saving
                                       val_idx, # indexes of validation examples
                                       df, # training data
                                       y.astype(np.float32), # output variable
                                       cat_flds=cat_ftrs, # categorical features
                                       bs=16, # batch size
                                       test_df=df_test) # test dataframe

In [16]:
# dropout rate
dr = 0.3

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

load_model_ignoring(m, 'pretrain-clean-trn0389-dev0037')

m.freeze_to(-1)

m.lr_find()
m.sched.plot()

In [22]:
# dropout rate
dr = 0.3

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

load_model_ignoring(m, 'pretrain-clean-trn0315-dev0020')

m.freeze_to(-1)

lr = 1e-3
m.fit(lr, 10, cycle_len=3, metrics=[metrics.mean_absolute_error])

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   mean_absolute_error         
    0      13.50628   5.014788   1.772549  
    1      7.011073   2.667631   1.291674                    
    2      6.014333   2.430422   1.220316                    
    3      3.1166     0.742483   0.652579                    
    4      1.765097   0.401666   0.465987                    
    5      1.554124   0.340301   0.433611                    
    6      0.634038   0.115753   0.261305                     
    7      0.36854    0.071756   0.196798                     
    8      0.280733   0.062055   0.183757                     
    9      0.107088   0.028441   0.118141                     
    10     0.050823   0.020998   0.099714                      
    11     0.041197   0.02218    0.100354                      
    12     0.02077    0.019547   0.09502                       
    13     0.016088   0.020785   0.095349                      
    14     0.016119   0.020111   0.094144                      
    15     0

[array([0.01934]), 0.09303242305499147]

In [23]:
m.save('finetune-clean-trn0152-dev0930')

In [27]:
# dropout rate
dr = 0.3

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

m.load('finetune-clean-trn0152-dev0930')

m.unfreeze()

lr = 1e-3

lrs=[lr/100, lr/10, lr]
m.fit(lrs, 10, cycle_len=3, metrics=[metrics.mean_absolute_error])

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   mean_absolute_error           
    0      0.017382   0.019809   0.096507  
    1      0.016685   0.019975   0.092957                      
    2      0.015354   0.018319   0.090697                      
    3      0.018502   0.022251   0.099516                      
    4      0.01442    0.018259   0.091177                      
    5      0.013849   0.018118   0.089523                      
    6      0.017778   0.020616   0.096007                      
    7      0.015667   0.017801   0.092203                      
    8      0.015952   0.01806    0.089046                      
    9      0.017599   0.018265   0.089595                      
    10     0.013822   0.017977   0.089365                      
    11     0.014834   0.017312   0.087477                      
    12     0.01455    0.017517   0.091359                      
    13     0.013569   0.017497   0.089373                      
    14     0.013954   0.016852   0.087335                   

[array([0.0169]), 0.08685708390824126]

In [26]:
m.save('finetune-clean-trn0125-dev0857')

In [30]:
# dropout rate
dr = 0.3

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

m.load('finetune-clean-trn0142-dev0946')

m.unfreeze()

lr = 1e-3

lrs=[lr/100, lr/10, lr]
m.fit(lrs, 10, cycle_len=3, metrics=[metrics.mean_absolute_error])

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   mean_absolute_error           
    0      0.016922   0.019205   0.100577  
    1      0.014377   0.020181   0.094987                      
    2      0.014631   0.018151   0.090602                      
    3      0.017568   0.020215   0.098974                      
    4      0.014055   0.017772   0.09038                       
    5      0.015251   0.01717    0.089289                      
    6      0.017858   0.018943   0.090282                      
    7      0.014024   0.017741   0.090204                      
    8      0.012132   0.017209   0.089316                      
    9      0.015562   0.01732    0.08889                       
    10     0.014191   0.017801   0.090075                      
    11     0.01137    0.016485   0.086968                      
    12     0.014212   0.016952   0.087721                      
    13     0.01374    0.017408   0.088424                      
    14     0.012851   0.016206   0.085923                   

[array([0.01686]), 0.08816640951120776]

In [26]:
m.save('finetune-clean-trn0125-dev0857')

## Using Model Pre-Trained for less epochs (5x3)

In [20]:
# dropout rate
dr = 0.1

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

load_model_ignoring(m, 'pretrain-clean-trn049-dev047')

m.freeze_to(-1)

lr = 1e-3
m.fit(lr, 10, cycle_len=3, metrics=[metrics.mean_absolute_error])

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   mean_absolute_error         
    0      1.902526   1.233181   0.897498  
    1      0.953671   0.872316   0.74217                      
    2      0.874915   0.817793   0.718718                     
    3      0.539209   0.519601   0.571068                     
    4      0.374212   0.334625   0.455582                     
    5      0.301172   0.30468    0.435296                     
    6      0.196652   0.145041   0.300086                     
    7      0.117809   0.096699   0.245884                     
    8      0.10284    0.087297   0.234347                     
    9      0.060866   0.041069   0.156782                      
    10     0.039081   0.0307     0.135263                      
    11     0.03588    0.028034   0.126531                      
    12     0.022152   0.019766   0.101731                      
    13     0.017795   0.018559   0.096509                      
    14     0.016493   0.018218   0.094569                      
    15

[array([0.01718]), 0.08936285330674248]

In [22]:
m.save('finetune-clean-freeze_to_last-trn0127-dev0893')

In [32]:
# dropout rate
dr = 0.3

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

m.load('finetune-clean-freeze_to_last-trn0127-dev0893')

m.unfreeze()

lr = 1e-3

lrs=[lr/100, lr/10, lr]
m.fit(lrs, 10, cycle_len=3, metrics=[metrics.mean_absolute_error])

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   mean_absolute_error           
    0      0.018171   0.017799   0.09524   
    1      0.016885   0.019063   0.094244                      
    2      0.013732   0.017664   0.090963                      
    3      0.016908   0.022043   0.099545                      
    4      0.014145   0.017864   0.088524                      
    5      0.01627    0.01731    0.088509                      
    6      0.015891   0.022868   0.10173                       
    7      0.012868   0.016437   0.086291                      
    8      0.012917   0.017024   0.087149                      
    9      0.017379   0.018984   0.099455                      
    10     0.013458   0.016059   0.086858                      
    11     0.011435   0.016477   0.085299                      
    12     0.015423   0.015969   0.086595                      
    13     0.012674   0.016642   0.086349                      
    14     0.011618   0.016327   0.085963                   

[array([0.0164]), 0.08521869992696302]

In [24]:
m.save('finetune-clean-trn0132-dev0839')

## Submission file

In [73]:
# dropout rate
dr = 0.3

learner_params = {
    "emb_szs": emb_szs, # embedding sizes
    "n_cont": len(df.columns) - len(cat_ftrs), # num continuous inputs
    "emb_drop": dr, # embeddings dropout probability
    "out_sz": 1, # output size
    "szs": [300, 100], # sizes of fully-connected layers
    "drops": [dr, dr], # dropout probabilities after each FC layer
    "use_bn": False # batch normalization
}

m = md.get_learner(**learner_params)

m.load('finetuned-all-082')

In [84]:
from datetime import datetime

# Make prediction.
pred = m.predict(is_test=True).squeeze()

now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')

df_test_id = df_test # pd.read_csv(os.path.join(path, 'test.csv'))

submission_file  = '../submissions/{}.submission.csv'.format(now)
model_file       = '../submissions/{}.model'.format(now)

# Create a submission file.
with open(submission_file, 'w') as f:
    f.write("Id,production\n")
    for _id, _pred in zip(df_test_id.Id.values, pred):
        f.write("{},{}\n".format(_id, min(1.0, max(0.0, _pred))))

# Save model.
save_model(m.model, model_file)

[autoreload of urllib3.packages.six failed: Traceback (most recent call last):
  File "/home/eraldo/miniconda2/envs/fastai/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/eraldo/miniconda2/envs/fastai/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 384, in superreload
    update_generic(old_obj, new_obj)
  File "/home/eraldo/miniconda2/envs/fastai/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 323, in update_generic
    update(a, b)
  File "/home/eraldo/miniconda2/envs/fastai/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 275, in update_class
    old_obj = getattr(old, key)
  File "/home/eraldo/miniconda2/envs/fastai/lib/python3.6/site-packages/urllib3/packages/six.py", line 93, in __get__
    setattr(obj, self.name, result)  # Invokes __set__.
AttributeError: 'NoneType' object has no attribute 'cStringIO'
]
[autoreload of urllib3.co

AttributeError: 'DataFrame' object has no attribute 'Id'

In [82]:
pred = m.predict(is_test=True)
pred.squeeze()

array([0.08109, 0.24406, 0.08957, ..., 0.06683, 0.24095, 0.26506], dtype=float32)