# Training the base classifier

Create a base line, how good can we predict without deep-learning? Les create a base classifier using tpot out-of-the-box.

In [1]:
import sys
sys.path.append("../src") # go to parent dir
# from customFunctions import *

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from tpot import TPOTClassifier
import pandas as pd

from data import read_crop_list, load_structured_sample
from eval import eval_model
from training import create_training_folder
from random import randrange




In [3]:

df_crops, vocab = read_crop_list()
sample = load_structured_sample()

print("Sample size: {}".format(sample.shape))

Sample size: (30941, 12)


In [4]:
y = sample[:, 11]
X = sample[:, 3:11]


sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print(X_train.shape)

print(X_test.shape)


(21658, 8)
(9283, 8)


# Selection of Model Performance Indicator
As a performance metric, the f1-score on the test set has been used, this metric balances between precision and recall and makes it more robust in unbalanced datasets. This is the case in our project, we have crops with extreamly high frequencies comprared to less represented cultives and we want to balance the precision and recall of the classifier.



# A stats based model
We can make a very basic model just using the population distrubition of our data the prior knowledge that we have about it.
1. Some categories have extreamly high frequence, so if we have to guess a category, we can just use the main category as a base prediction.
2. Some crops are very static and usually don't change over the years, so we can assume that next year prediction will be the previous year crop code.


In [5]:
last_year = X[:,-1] # last year usage
y_pred = np.zeros(shape = y.shape)


# create a CDF a cummulative function of the crop frequences
cumulative_sum = pd.value_counts(last_year).cumsum()
# maps each crop code to its frequence
freq_map =cumulative_sum.to_dict()
print("Crop codes and frequencies")
print(pd.Series(freq_map))

# lest get the cummulative distrubition of data, so we randomly choose the crop code based on its frequence.
max_value = pd.value_counts(last_year).sum()
def get_random_crop(t=None):
    """
    This function returns a random crop code using the crop distribution of the last year.
    """
    value = randrange(max_value)
    for k, crop_value in freq_map.items():
        if value < crop_value:
            return k
    return k

# Apply the function for all non static codes
y_pred = np.array(list(map(get_random_crop, y_pred)))
print("\nshapes: {} == {}".format(y.shape, y_pred.shape))

Crop codes and frequencies
1       6330
5      10717
20     13461
200    15855
203    17458
33     18995
8      20270
4      21382
201    22468
3      23464
60     24453
100    25425
204    26392
101    27351
183    28126
202    28895
181    29493
61     29873
110    30196
40     30377
9      30552
39     30712
82     30842
35     30931
80     30941
dtype: int64

shapes: (30941,) == (30941,)


In [6]:
# For static crop or usages, use the value from the last year
static_crop_codes = [204,203,202,183,181,101,100,9,3]

static_mask = np.isin(y, static_crop_codes)
# same as last year
y_pred[static_mask] = last_year[static_mask]
y_pred[~static_mask] = 1

from sklearn.metrics import f1_score
f1_score(y, y_pred, average="macro")
# lest evaluate how good is this predictor using f1 socre

print("F1-score if we just take the last year crop: {}".format(f1_score(y, last_year, average="macro")))
print("F1-score if we use static and prob codes: {}".format(f1_score(y, y_pred, average="macro")))

F1-score if we just take the last year crop: 0.3505316223774698
F1-score if we use static and prob codes: 0.23371528564154648


So using the last year crop, we have a **0.23 f1-score**, not enough, it seems there's more variability than expected.

## Using TPOT to create a base model

TPOT is a library that automates the phases of searching models and feature extraction, lets see what's the best model we can get using TOP out of the box.

In [None]:
tpot = TPOTClassifier(generations=5, population_size=30, verbosity=2, random_state=42, scoring="f1_macro")

tpot.fit(X_train, y_train)

print("Final score: {}".format(tpot.score(X_test, y_test)))





HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=180.0, style=ProgressStyle(de…

In [None]:
folder = create_training_folder("tpot")
eval_model(folder, y_test, y_hat, crop_list, crop_names)
# save model
# model_folder = '{}/model'.format(folder)
# model.save(model_folder)
tpot.export('{}/model_tpot.py'.format(folder))
