# Training the base classifier

Create a base line, how good can we predict without deep-learning? Les create a base classifier using tpot out-of-the-box.

In [1]:
import sys
sys.path.append("../src") # go to parent dir
# from customFunctions import *

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from tpot import TPOTClassifier
import pandas as pd

from data import read_crop_list, load_structured_sample
from eval import eval_model
from training import create_training_folder
from random import randrange




In [3]:

df_crops, vocab = read_crop_list()
sample = load_structured_sample()


In [4]:
y = sample[:, 11]
X = sample[:, 3:11]


sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print(X_train.shape)

print(X_test.shape)


(21961, 8)
(9412, 8)


# Selection of Model Performance Indicator
As a performance metric, the f1-score on the test set has been used, this metric balances between precision and recall and makes it more robust in unbalanced datasets. This is the case in our project, we have crops with extreamly high frequencies comprared to less represented cultives and we want to balance the precision and recall of the classifier.



# A stats based model
We can make a very basic model just using the crop code population distribution and the the prior knowledge that we have about the data and the context:
1. Some categories have extreamly high frequence, so if we have to guess a category, we can just use the main category as a base prediction.
2. Some crops are very static and usually don't change over the years, so we can assume that next year prediction will be the previous year crop code.


In [5]:
print(df_crops)
# For static crop or usages, use the value from the last year
static_crop_codes = [1,5,16,17,20,21,25,26,27]

    code                  description  is_crop  idx
0      1                        TRIGO        1    0
1      3                URBANO-VIALES        0    1
2      4                         MAIZ        1    2
3      5                       CEBADA        1    3
4      8               OTROS CEREALES        1    4
5      9                     ROQUEDOS        0    5
6     20                SUELO DESNUDO        1    6
7     33                      GIRASOL        1    7
8     35                        COLZA        1    8
9     39      OTRAS LEGUMINOSAS GRANO        1    9
10    40                    GUISANTES        1   10
11    60                      ALFALFA        1   11
12    61                   FORRAJERAS        1   12
13    80  OTROS CULTIVOS INDUSTRIALES        1   13
14    82                    REMOLACHA        1   14
16   100                       VIÑEDO        1   16
17   101                       OLIVAR        1   17
18   110                    HORTICOLA        1   18
19   177    

In [6]:
data_file = "/media/data/projects/crophisto/data.npy"
data_orig = np.load(data_file)
y_orig = data_orig[:, 11]
X_orig = data_orig[:, :11]

last_year = X_orig[:,-1] # last year usage
y_pred = np.zeros(shape = y_orig.shape)


# create a CDF a cummulative function of the crop frequences
cumulative_sum = pd.value_counts(last_year).cumsum()
# maps each crop code to its frequence
freq_map =cumulative_sum.to_dict()
print("Crop codes and frequencies")
print(pd.Series(freq_map))

# lest get the cummulative distrubition of data, so we randomly choose the crop code based on its frequence.
max_value = pd.value_counts(last_year).sum()
def get_random_crop(t=None):
    """
    This function returns a random crop code using the crop distribution of the last year.
    """
    value = randrange(max_value)
    for k, crop_value in freq_map.items():
        if value < crop_value:
            return k
    return k

# Apply the function for all non static codes
y_pred = np.array(list(map(get_random_crop, y_pred)))
print("\nshapes: {} == {}".format(y_orig.shape, y_pred.shape))

Crop codes and frequencies
0      474535
3      794460
21    1019903
6     1243298
7     1378289
4     1492494
22    1581247
24    1667043
25    1730292
11    1778045
2     1819273
1     1856963
12    1883033
15    1908758
23    1929418
10    1940491
9     1951309
16    1958146
19    1964575
14    1969813
8     1974185
17    1977818
20    1981128
26    1983104
5     1984153
13    1984471
dtype: int64

shapes: (1984471,) == (1984471,)


In [7]:


static_mask = np.isin(y_orig, static_crop_codes)
# same as last year
y_pred[static_mask] = last_year[static_mask]

from sklearn.metrics import f1_score
# lest evaluate how good is this predictor using f1 socre

print("Percentage of crops that vary from one year to the next: {} %".format(round(100*np.count_nonzero(last_year-y_orig)/len(y_orig),2)))
print("F1-score if we just take the last year crop: {}".format(f1_score(y_orig, last_year, average="macro")))
print("F1-score if we use static and prob codes: {}".format(f1_score(y_orig, y_pred, average="macro")))
most_freq_crop_code = 0
y_pred[~static_mask] = most_freq_crop_code
print("F1-score if we use the main category for non static crops : {}".format(f1_score(y_orig, y_pred, average="macro")))

Percentage of crops that vary from one year to the next: 62.66 %
F1-score if we just take the last year crop: 0.3393251651296892
F1-score if we use static and prob codes: 0.1298599394722416
F1-score if we use the main category for non static crops : 0.154172731814438


So using the last year crop, we have a **0.34 f1-score**, not enough, it seems there's more variability than expected but it remarks the need of finding a proper classifier.

## Using TPOT to create a base model

TPOT is a library that automates the phases of searching models and feature extraction, lets see what's the best model we can get using TOP out of the box.

In [8]:
tpot = TPOTClassifier(generations=5, population_size=30, verbosity=2, random_state=42, scoring="f1_macro")

tpot.fit(X_train, y_train)

print("Final score: {}".format(tpot.score(X_test, y_test)))





HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=180.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.4706572541797421
Generation 2 - Current best internal CV score: 0.4771235737178454
Generation 3 - Current best internal CV score: 0.48062829150066666
Generation 4 - Current best internal CV score: 0.48278584018315823
Generation 5 - Current best internal CV score: 0.48278584018315823
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.8, min_samples_leaf=2, min_samples_split=12, n_estimators=100)
Final score: 0.49752538168292254


In [10]:
folder = create_training_folder("tpot","base")

df_crops, vocab = read_crop_list()
crop_names = df_crops["description"].values.tolist()
crop_list = df_crops["code"].values.tolist()

tpot.export('{}/model_tpot.py'.format(folder))
    
# run prediction
y_pred = tpot.predict(X_test)

from sklearn.metrics import f1_score


print("Manually measure the f1-score: {}".format(f1_score(y_test, y_pred, average="macro")))


Manually measure the f1-score: 0.49752538168292254


So TPOT has found a **ExtraTreesClassifier that could give us a 0.5 f1-score**, lets see if we can improve this using deep-learning.