# Training the base classifier

Create a base line, how good can we predict without deep-learning? Les create a base classifier using tpot out-of-the-box.

In [1]:
import sys
sys.path.append("../src") # go to parent dir
# from customFunctions import *

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from tpot import TPOTClassifier

from data import read_crop_list, load_structured_sample
from eval import eval_model
from training import create_training_folder
import pandas as pd
from random import randrange



In [3]:

def read_crop_list():
    df_crops = pd.read_pickle("/media/data/projects/crophisto/crop_codes.pckl")
    # transform data to get a line per each
    crop_list = df_crops["code"].to_numpy()
    vocab = {val: idx for idx, val in enumerate(crop_list)}
    return df_crops, vocab


In [4]:

df_crops, vocab = read_crop_list()
sample = load_structured_sample()

print("Sample size: {}".format(sample.shape))

df_crops, vocab = read_crop_list()
df_crops

Sample size: (30944, 12)


Unnamed: 0,code,description,is_crop,idx
0,1,TRIGO,1,0
1,3,URBANO-VIALES,0,1
2,4,MAIZ,1,2
3,5,CEBADA,1,3
4,8,OTROS CEREALES,1,4
5,9,ROQUEDOS,0,5
6,20,SUELO DESNUDO,1,6
7,33,GIRASOL,1,7
8,35,COLZA,1,8
9,39,OTRAS LEGUMINOSAS GRANO,1,9


In [5]:

print()





In [6]:
y = sample[:, 11]
X = sample[:, 3:11]
last_year = X[:,-1] # last year usage
y_pred = np.zeros(shape = y.shape)


# create a CDF a cummulative function of the crop frequences
cumulative_sum = pd.value_counts(last_year).cumsum()
# maps each crop code to its frequence
freq_map =cumulative_sum.to_dict()
print("Crop codes and frequencies")
print(pd.Series(freq_map))

# lest get the cummulative distrubition of data, so we randomly choose the crop code based on its frequence.
max_value = pd.value_counts(last_year).sum()
def get_random_crop(t=None):
    """
    This function returns a random crop code using the crop distribution of the last year.
    """
    value = randrange(max_value)
    for k, crop_value in freq_map.items():
        if value < crop_value:
            return k
    return k

# Apply the function for all non static codes
y_pred = np.array(list(map(get_random_crop, y_pred)))
print("\nshapes: {} == {}".format(y.shape, y_pred.shape))

Crop codes and frequencies
1       6330
5      10717
20     13461
200    15856
203    17460
33     18997
8      20272
4      21384
201    22470
3      23466
60     24455
100    25427
204    26395
101    27354
183    28129
202    28898
181    29496
61     29876
110    30199
40     30380
9      30555
39     30715
82     30845
35     30934
80     30944
dtype: int64

shapes: (30944,) == (30944,)


In [7]:
# For static crop or usages, use the value from the last year
static_crop_codes = [204,203,202,183,181,101,100,9,3]

static_mask = np.isin(y, static_crop_codes)
# same as last year
y_pred[static_mask] = last_year[static_mask]

from sklearn.metrics import f1_score
f1_score(y, y_pred, average="macro")
# lest evaluate how good is this predictor using f1 socre

print("Percentage of crops that vary from one year to the next: {} %".format(round(100*np.count_nonzero(last_year-y)/np.count_nonzero(y),2)))
print("F1-score if we just take the last year crop: {}".format(f1_score(y, last_year, average="macro")))
print("F1-score if we use static and prob codes: {}".format(f1_score(y, y_pred, average="macro")))
most_freq_crop_code = 1
y_pred[~static_mask] = most_freq_crop_code
print("F1-score if we use the main category for non static crops : {}".format(f1_score(y, y_pred, average="macro")))

Percentage of crops that vary from one year to the next: 64.53 %
F1-score if we just take the last year crop: 0.3505081951564627
F1-score if we use static and prob codes: 0.1950476095455231
F1-score if we use the main category for non static crops : 0.2336911295967801


So using the last year crop, we have a **0.35 f1-score**. 

# Selection of Model Performance Indicator
We can make a very basic model just using the population distrubition of our data the prior knowledge that we have about it.
1. Some categories have extreamly high frequence, so if we have to guess a category, we can just use the main category as a base prediction.
2. Some crops are very static and usually don't change over the years, so we can assume that next year prediction will be the previous year crop code.


# A stats based model
We can make a very basic model just using the population distrubition of our data the prior knowledge that we have about it.
1. Some categories have extreamly high frequence, so if we have to guess a category, we can just use the main category as a base prediction.
2. Some crops are very static and usually don't change over the years, so we can assume that next year prediction will be the previous year crop code.


## Using TPOT to create a base model

TPOT is a library that automates the phases of searching models and feature extraction, lets see what's the best model we can get using TOP out of the box.

In [8]:
# tpot = TPOTClassifier(generations=5, population_size=50, max_time_mins=30, verbosity=2, random_state=42, scoring="f1")
# tpot.fit(X_train, y_train)

# print("Final score: {}".format(tpot.score(X_test, y_test)))


NameError: name 'X_train' is not defined

In [None]:
folder = create_training_folder("tpot")
eval_model(folder, y_test, y_hat, crop_list, crop_names)
# save model
# model_folder = '{}/model'.format(folder)
# model.save(model_folder)
tpot.export('{}/model.py'.format(folder))
