Notebook to use logistic regression on the Animal Future survey data to predict farmers adoption.

In [1]:
import pandas as pd
import numpy as np

# Data preparation

## Data upload

In [2]:
path_to_data = "./survey_data/AF_survey_data_30.xlsx"

In [3]:
dataset_original = pd.read_excel(path_to_data, index_col=0)
dataset_original.head()

Unnamed: 0_level_0,AdoptedSBP,PastureSurface,CattlePercentage,Distrito,Concelho,FarmerSince,PercentRentedLand,LegalForm,HighestEducationalDegree,HighestAgriculturalEducationalDegree,ExpectationFamilySuccession
FARM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PT02,0,364.0,0.0,Setúbal,Grândola,29,0.0,Individual,Undergraduate,Undergraduate,Yes
PT13,1,542.58,1.0,Portalegre,Avis,11,0.0,Associated,Undergraduate,Undergraduate,Yes
PT15,1,262.7,1.0,Portalegre,Monforte,11,1.0,Associated,Undergraduate,Undergraduate,Yes
PT16,0,23.0,1.0,Évora,Évora,3,1.0,Individual,Undergraduate,Undergraduate,Yes
PT17,1,250.0,1.0,Évora,Montemor,10,1.0,Associated,Undergraduate,,Yes


## Features reduction

In [4]:
features_to_drop = ['Concelho', 'HighestAgriculturalEducationalDegree']

In [5]:
dataset = dataset_original.drop(features_to_drop, axis=1)

In [6]:
dataset.head()

Unnamed: 0_level_0,AdoptedSBP,PastureSurface,CattlePercentage,Distrito,FarmerSince,PercentRentedLand,LegalForm,HighestEducationalDegree,ExpectationFamilySuccession
FARM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PT02,0,364.0,0.0,Setúbal,29,0.0,Individual,Undergraduate,Yes
PT13,1,542.58,1.0,Portalegre,11,0.0,Associated,Undergraduate,Yes
PT15,1,262.7,1.0,Portalegre,11,1.0,Associated,Undergraduate,Yes
PT16,0,23.0,1.0,Évora,3,1.0,Individual,Undergraduate,Yes
PT17,1,250.0,1.0,Évora,10,1.0,Associated,Undergraduate,Yes


In [137]:
from scipy.stats import spearmanr
spearmanr(dataset['PastureSurface'], dataset['PercentRentedLand'])

SpearmanrResult(correlation=-0.25513786732043997, pvalue=0.17360450664016885)

## Data preparation

In [7]:
dataset_att = dataset.drop('AdoptedSBP', axis=1)
labels = dataset['AdoptedSBP'].copy()

### Categorical attributes

Ordinal encoding

In [8]:
ordinal_attributes = ['HighestEducationalDegree']

In [9]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder_education = OrdinalEncoder(categories=[['Primary', 'Secondary', 'Undergraduate', 'Graduate']])

One-hot encoding

In [10]:
onehot_attributes = ['Distrito']

In [11]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()
onehot_encoder.fit_transform(dataset_att[onehot_attributes])

<30x5 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in Compressed Sparse Row format>

Binary attributes

In [12]:
binary_attributes = ['ExpectationFamilySuccession', 'LegalForm']

### Numerical attributes

In [13]:
dataset_att.columns

Index(['PastureSurface', 'CattlePercentage', 'Distrito', 'FarmerSince',
       'PercentRentedLand', 'LegalForm', 'HighestEducationalDegree',
       'ExpectationFamilySuccession'],
      dtype='object')

In [14]:
numerical_attributes = [feat for feat in dataset_att.columns if (
    (feat not in ordinal_attributes) and (feat not in onehot_attributes) and (feat not in binary_attributes)
)]

In [15]:
numerical_attributes

['PastureSurface', 'CattlePercentage', 'FarmerSince', 'PercentRentedLand']

### Preparation pipeline

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [17]:
preparation_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_attributes),
    ('ord_cat_edu', ordinal_encoder_education, ['HighestEducationalDegree']),
    ('other_ord_cat', OrdinalEncoder(), binary_attributes),
    ('onehot_cat', onehot_encoder, onehot_attributes)
])

In [18]:
dataset_prep = preparation_pipeline.fit_transform(dataset_att)

### Extract all attributes

In [19]:
attributes = (numerical_attributes
              + ['HighestEducationalDegree']
              + binary_attributes)
for cat_name in onehot_encoder.categories_:
    attributes += cat_name.tolist()

In [20]:
# Print prepared data as a DataFrame
pd.DataFrame(dataset_prep, columns=attributes, index=dataset_att.index).head()

Unnamed: 0_level_0,PastureSurface,CattlePercentage,FarmerSince,PercentRentedLand,HighestEducationalDegree,ExpectationFamilySuccession,LegalForm,Beja,Portalegre,Santarém,Setúbal,Évora
FARM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PT02,-0.23919,-2.378406,1.146256,-0.615057,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
PT13,0.054486,0.556318,-0.444948,-0.615057,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
PT15,-0.405778,0.556318,-0.444948,1.85887,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
PT16,-0.799967,0.556318,-1.15215,1.85887,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
PT17,-0.426663,0.556318,-0.533348,1.85887,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# Logistic regression

## Functions for model analysis

In [75]:
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import f1_score, precision_score, recall_score, log_loss

In [22]:
def display_scores(scores, score_name):
    #print(score_name + ' scores:', scores)
    print(score_name + ' mean:', scores.mean())
    #print(score_name + ' stdv:', scores.std())

In [82]:
def cross_val_scores(clsf, dataset_prepared, labels, seed):
    metrics = ['f1', 'precision', 'recall', 'neg_log_loss']
    metric_names = ['F1 score', 'precision', 'recall', 'neg_log loss']
    scores = cross_validate(clsf, dataset_prepared, labels,
                            scoring=metrics, 
                            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=seed))
    
    cv_scores = {}
    for (name, metric)  in zip(metric_names, metrics):
        cv_scores[name] = scores['test_' + metric]
    return cv_scores

In [76]:
def predict_train_set(clsf, dataset_prepared, labels):
    clsf.fit(dataset_prepared, labels)
    pred_probs = clsf.predict_proba(dataset_prepared)[:, 1]
    pred_classes = pred_probs >= 0.5
    f1 = f1_score(labels, pred_classes)
    prec = precision_score(labels, pred_classes)
    rec = recall_score(labels, pred_classes)
    ll = log_loss(labels, pred_probs)

    print('Prediction on training set results')
    print('F1 score:', f1)
    print('Precision:', prec)
    print('Recall:', rec)
    print('Logistic loss:', ll)

In [78]:
from sklearn.base import clone

def test_classifier(clsf, dataset_prepared, labels, seed=None):  
    """
    Function to:
    - train a classifier using cross validation, reporting performance measures
    - get classifier's predictions on the training set, to check it for overfiting
    """
    clsf_copy = clone(clsf) 
    cv_scores = cross_val_scores(clsf_copy, dataset_prepared, labels, seed)
    print("Cross validation scores")
    for score_name, score_value in cv_scores.items():
        display_scores(score_value, score_name)
    print("")
    
    train_scores = predict_train_set(clsf_copy, dataset_prepared, labels)

## All features

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import reciprocal, uniform

In [27]:
cross_val_split = StratifiedKFold(n_splits=3, shuffle=True)

In [28]:
# Nested cross-validation to try different split into train and test set
log_reg = LogisticRegression(random_state=42)

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    "C": uniform(0.001, 1)
    }

NUM_TRIALS = 30
rnd_srch_results = {}
for i in range(NUM_TRIALS):
     cv_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
     rnd_srch = RandomizedSearchCV(log_reg, param_grid, cv=cv_split, n_iter=1000,
                                  scoring='f1', return_train_score=True, verbose=0)
     rnd_srch.fit(dataset_prep, labels)
     rnd_srch_results[rnd_srch.best_estimator_] = rnd_srch.best_score_
     print("Iteration number", str(i), "completed.")

Iteration number 0 completed.
Iteration number 1 completed.
Iteration number 2 completed.
Iteration number 3 completed.
Iteration number 4 completed.
Iteration number 5 completed.
Iteration number 6 completed.
Iteration number 7 completed.
Iteration number 8 completed.
Iteration number 9 completed.
Iteration number 10 completed.
Iteration number 11 completed.
Iteration number 12 completed.
Iteration number 13 completed.
Iteration number 14 completed.
Iteration number 15 completed.
Iteration number 16 completed.
Iteration number 17 completed.
Iteration number 18 completed.
Iteration number 19 completed.
Iteration number 20 completed.
Iteration number 21 completed.
Iteration number 22 completed.
Iteration number 23 completed.
Iteration number 24 completed.
Iteration number 25 completed.
Iteration number 26 completed.
Iteration number 27 completed.
Iteration number 28 completed.
Iteration number 29 completed.


In [29]:
rnd_srch_results.values()

dict_values([0.7222222222222222, 0.7222222222222222, 0.738095238095238, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7388888888888889, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7388888888888889, 0.7269841269841271, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7388888888888889, 0.7222222222222222, 0.7222222222222222, 0.7269841269841271, 0.7222222222222222, 0.7388888888888889, 0.7388888888888889, 0.7388888888888889, 0.7388888888888889, 0.7222222222222222, 0.7388888888888889, 0.7222222222222222, 0.7222222222222222])

In [30]:
best_score = max(rnd_srch_results.values())
best_score

0.7388888888888889

In [31]:
best_models = [k for k, v in rnd_srch_results.items() if v == best_score]
best_models

[LogisticRegression(C=0.0773548283999207, l1_ratio=0.03927935318811604,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga'),
 LogisticRegression(C=0.23870015747619622, l1_ratio=0.6452907729170746,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga'),
 LogisticRegression(C=0.23663681586017382, l1_ratio=0.3207446109379246,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga'),
 LogisticRegression(C=0.04370596992621867, l1_ratio=0.02531116496180963,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga'),
 LogisticRegression(C=0.02568877353481258, l1_ratio=0.028208223047712533,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga'),
 LogisticRegression(C=0.42140919847985403, l1_ratio=0.6953715607735839,
               

In [32]:
best_log_reg = best_models[0]

In [33]:
# Get index of the best model to obtain same split when evaluating
best_indexes = [i for i, v in enumerate(rnd_srch_results.values()) if v == best_score]
best_index = best_indexes[0]

In [34]:
pred = best_log_reg.predict(dataset_prep)
pred

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [35]:
#Percentage of adopters
sum(pred) / len(pred)

0.9

In [36]:
for attr, coef in zip(attributes, best_log_reg.coef_.tolist()[0]):
    print(coef, attr)

0.08142825568407668 PastureSurface
-0.0032380144533242228 CattlePercentage
0.0 FarmerSince
-0.10657444170017065 PercentRentedLand
0.05936055123342599 HighestEducationalDegree
0.004790344854862259 ExpectationFamilySuccession
-0.059757772281346336 LegalForm
-0.07065577348863503 Beja
0.0897083149782675 Portalegre
0.0 Santarém
0.0 Setúbal
0.0 Évora


In [38]:
best_log_reg.intercept_

array([0.15329])

In [83]:
test_classifier(best_log_reg, dataset_prep, labels, seed=best_index)

Cross validation scores
F1 score mean: 0.6944444444444443
precision mean: 0.5518518518518519
recall mean: 0.9444444444444445
neg_log loss mean: -0.6812902367894554

Prediction on training set results
F1 score: 0.7272727272727272
Precision: 0.5925925925925926
Recall: 0.9411764705882353
Logistic loss: 0.6561420576702656


## Threshold tuning

In [130]:
pred_prob_1 =cross_val_predict(best_log_reg, dataset_prep, labels, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=24),
                               method = 'predict_proba')[:, 1]

In [131]:
pred_prob_1 

array([0.57594829, 0.52833968, 0.48274274, 0.47315948, 0.48231502,
       0.59010533, 0.63589455, 0.52397454, 0.54735296, 0.65311381,
       0.51202777, 0.54612088, 0.5110641 , 0.56860186, 0.55121636,
       0.52934042, 0.52102879, 0.67107906, 0.59209048, 0.57820247,
       0.65593718, 0.54242627, 0.58547169, 0.51787197, 0.66553751,
       0.68291945, 0.61671156, 0.5072688 , 0.58815997, 0.62745645])

In [132]:
thresholds = np.arange(0.4, 0.7, 0.001)
f1s = {}
for th in thresholds:
    cls_pred = [1 if el > th else 0 for el in pred_prob_1]
    f1 = f1_score(labels, cls_pred)
    f1s[th] = f1

In [133]:
 m = max(f1s.values())
for k, v in f1s.items():
    if v == m:
        print(k, v)

0.4740000000000001 0.7391304347826086
0.4750000000000001 0.7391304347826086
0.4760000000000001 0.7391304347826086
0.4770000000000001 0.7391304347826086
0.4780000000000001 0.7391304347826086
0.4790000000000001 0.7391304347826086
0.4800000000000001 0.7391304347826086
0.4810000000000001 0.7391304347826086
0.4820000000000001 0.7391304347826086


## Reduced number of features

In [40]:
numerical_attributes_red = ['PastureSurface', 'PercentRentedLand']
binary_attributes_red = ['LegalForm']

In [41]:
preparation_pipeline_red = ColumnTransformer([
    ('num', StandardScaler(), numerical_attributes_red),
    ('ord_cat_edu', ordinal_encoder_education, ['HighestEducationalDegree']),
    ('other_ord_cat', OrdinalEncoder(), binary_attributes_red),
])

In [42]:
dataset_prep_red = preparation_pipeline_red.fit_transform(dataset_att)

In [43]:
attributes_red = (numerical_attributes_red + ['HighestEducationalDegree'] + binary_attributes_red)

In [44]:
# Nested cross-validation to try different split into train and test set
log_reg = LogisticRegression(random_state=42)

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    "C": reciprocal(0.001, 1)
    }

NUM_TRIALS = 30
rnd_srch_results_red = {}
for i in range(NUM_TRIALS):
     cv_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
     rnd_srch = RandomizedSearchCV(log_reg, param_grid, cv=cv_split, n_iter=1000,
                                  scoring='f1', return_train_score=True, verbose=0)
     rnd_srch.fit(dataset_prep_red, labels)
     rnd_srch_results_red[rnd_srch.best_estimator_] = rnd_srch.best_score_
     print("Iteration number", str(i), "completed.")

Iteration number 0 completed.
Iteration number 1 completed.
Iteration number 2 completed.
Iteration number 3 completed.
Iteration number 4 completed.
Iteration number 5 completed.
Iteration number 6 completed.
Iteration number 7 completed.
Iteration number 8 completed.
Iteration number 9 completed.
Iteration number 10 completed.
Iteration number 11 completed.
Iteration number 12 completed.
Iteration number 13 completed.
Iteration number 14 completed.
Iteration number 15 completed.
Iteration number 16 completed.
Iteration number 17 completed.
Iteration number 18 completed.
Iteration number 19 completed.
Iteration number 20 completed.
Iteration number 21 completed.
Iteration number 22 completed.
Iteration number 23 completed.
Iteration number 24 completed.
Iteration number 25 completed.
Iteration number 26 completed.
Iteration number 27 completed.
Iteration number 28 completed.
Iteration number 29 completed.


In [45]:
rnd_srch_results_red.values()

dict_values([0.7222222222222222, 0.7261904761904762, 0.738095238095238, 0.7222222222222222, 0.7428571428571429, 0.7222222222222222, 0.7388888888888889, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7428571428571429, 0.7222222222222222, 0.7388888888888889, 0.7472527472527473, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7428571428571429, 0.7509157509157509, 0.7222222222222222, 0.738095238095238, 0.7222222222222222, 0.7222222222222222, 0.7388888888888889, 0.7388888888888889, 0.7388888888888889, 0.7269841269841271, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222])

In [46]:
best_score_red = max(rnd_srch_results_red.values())
best_score_red

0.7509157509157509

In [47]:
best_models_red = [k for k, v in rnd_srch_results_red.items() if v == best_score_red]
best_models_red

[LogisticRegression(C=0.678203657458604, l1_ratio=0.16876441160206257,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga')]

In [48]:
best_log_reg_red = best_models_red[0]

In [49]:
# Get index of the best model to obtain same split when evaluating
best_indexes = [i for i, v in enumerate(rnd_srch_results_red.values()) if v == best_score_red]
best_index = best_indexes[0]

In [50]:
pred = best_log_reg_red.predict(dataset_prep_red)
pred

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [51]:
#Percentage of adopters
sum(pred) / len(pred)

0.7666666666666667

In [52]:
for attr, coef in zip(attributes_red, best_log_reg_red.coef_.tolist()[0]):
    print(coef, attr)

0.15402075007263527 PastureSurface
-0.23522605758243706 PercentRentedLand
0.15500652648009802 HighestEducationalDegree
-0.3481608538176351 LegalForm


In [53]:
best_log_reg_red.intercept_

array([0.01299797])

In [84]:
test_classifier(best_log_reg_red, dataset_prep_red, labels, seed=best_index)

Cross validation scores
F1 score mean: 0.7509157509157509
precision mean: 0.6547619047619048
recall mean: 0.888888888888889
neg_log loss mean: -0.6686828252065444

Prediction on training set results
F1 score: 0.7
Precision: 0.6086956521739131
Recall: 0.8235294117647058
Logistic loss: 0.6452347771118475


## Threshold tuning

In [123]:
pred_prob_1 =cross_val_predict(best_log_reg_red, dataset_prep_red, labels, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=18),
                               method = 'predict_proba')[:, 1]

In [124]:
pred_prob_1 

array([0.54334971, 0.63174689, 0.52176036, 0.39795735, 0.44016472,
       0.60584269, 0.63613619, 0.50368466, 0.47838745, 0.83136972,
       0.6260317 , 0.49372539, 0.35557051, 0.67191359, 0.63505908,
       0.56657806, 0.64766028, 0.62789926, 0.77080106, 0.44733171,
       0.6456887 , 0.63677534, 0.64295149, 0.57130401, 0.61406742,
       0.60022103, 0.67986846, 0.38624202, 0.56285679, 0.64031271])

In [125]:
thresholds = np.arange(0.4, 0.7, 0.001)
f1s = {}
for th in thresholds:
    cls_pred = [1 if el > th else 0 for el in pred_prob_1]
    f1 = f1_score(labels, cls_pred)
    f1s[th] = f1

In [129]:
 m = max(f1s.values())
for k, v in f1s.items():
    if v == m:
        print(k, v)

0.4940000000000001 0.75
0.4950000000000001 0.75
0.4960000000000001 0.75
0.4970000000000001 0.75
0.4980000000000001 0.75
0.4990000000000001 0.75
0.5000000000000001 0.75
0.5010000000000001 0.75
0.5020000000000001 0.75
0.5030000000000001 0.75


# Logistic regression selected on whole training set with 4 variables (same as FL Calibrated ABM)

In [55]:
from sklearn.model_selection import ParameterSampler
from sklearn.base import clone

def RandomizedSearch(estimator, param_distribution, n_iter, X, y):
    best_f1 = 0
    best_model = None
    for g in ParameterSampler(param_distribution, n_iter):
        estimator.set_params(**g)
        estimator.fit(X, y)
        # get score and save if best
        preds = estimator.predict(X)
        f1 = f1_score(y, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_model = clone(estimator)
    best_model.fit(X, y)
    return (best_model, best_f1)

In [56]:
log_reg = LogisticRegression(random_state=42)

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    "C": uniform(0.001, 1)
    }

best_log_reg_nocv, best_f1 = RandomizedSearch(log_reg, param_grid, 10000, dataset_prep_red, labels)

In [57]:
best_log_reg_nocv

LogisticRegression(C=0.20842716122500904, l1_ratio=0.31834214527077564,
                   max_iter=5000, penalty='elasticnet', random_state=42,
                   solver='saga')

In [58]:
best_f1

0.7555555555555554

In [59]:
pred = best_log_reg_nocv.predict(dataset_prep_red)
pred

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [60]:
#Percentage of adopters
sum(pred) / len(pred)

0.9333333333333333

In [61]:
for attr, coef in zip(attributes_red, best_log_reg_nocv.coef_.tolist()[0]):
    print(coef, attr)

0.0638844792982836 PastureSurface
-0.1180846618362677 PercentRentedLand
0.006803049896753338 HighestEducationalDegree
0.0 LegalForm


In [62]:
best_log_reg_nocv.intercept_

array([0.25380651])

In [85]:
test_classifier(best_log_reg_nocv, dataset_prep_red, labels)

Cross validation scores
F1 score mean: 0.6495726495726495
precision mean: 0.5793650793650793
recall mean: 0.7777777777777777
neg_log loss mean: -0.70958709293508

Prediction on training set results
F1 score: 0.7555555555555554
Precision: 0.6071428571428571
Recall: 1.0
Logistic loss: 0.670075736753839


## Threshold tuning

In [101]:
pred_prob_1 = best_log_reg_nocv.predict_proba(dataset_prep_red)[:, 1]

In [102]:
pred_prob_1

array([0.58048409, 0.58504587, 0.50549619, 0.49920079, 0.50516267,
       0.5833068 , 0.58127699, 0.57246372, 0.54513158, 0.63394203,
       0.57263573, 0.55490388, 0.49992281, 0.61708058, 0.58469941,
       0.57422272, 0.58104057, 0.57422901, 0.56892564, 0.51001403,
       0.58063132, 0.5807848 , 0.58817095, 0.5765328 , 0.58148154,
       0.5805034 , 0.61832104, 0.50057298, 0.579636  , 0.58382559])

In [108]:
thresholds = np.arange(0.5, 0.6, 0.001)
f1s = {}
for th in thresholds:
    cls_pred = [1 if el > th else 0 for el in pred_prob_1]
    f1 = f1_score(labels, cls_pred)
    f1s[th] = f1

In [115]:
max(f1s.values())

0.7555555555555554

## All features - log_loss

In [89]:
%%time
# Nested cross-validation to try different split into train and test set
log_reg = LogisticRegression(random_state=42)

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    "C": uniform(0.001, 1)
    }

NUM_TRIALS = 30
rnd_srch_ll_results = {}
for i in range(NUM_TRIALS):
     cv_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
     rnd_srch_ll = RandomizedSearchCV(log_reg, param_grid, cv=cv_split, n_iter=2000,
                                  scoring='neg_log_loss', return_train_score=True, verbose=0)
     rnd_srch_ll.fit(dataset_prep, labels)
     rnd_srch_ll_results[rnd_srch_ll.best_estimator_] = rnd_srch_ll.best_score_
     print("Iteration number", str(i), "completed.")

Iteration number 0 completed.
Iteration number 1 completed.
Iteration number 2 completed.
Iteration number 3 completed.
Iteration number 4 completed.
Iteration number 5 completed.
Iteration number 6 completed.
Iteration number 7 completed.
Iteration number 8 completed.
Iteration number 9 completed.
Iteration number 10 completed.
Iteration number 11 completed.
Iteration number 12 completed.
Iteration number 13 completed.
Iteration number 14 completed.
Iteration number 15 completed.
Iteration number 16 completed.
Iteration number 17 completed.
Iteration number 18 completed.
Iteration number 19 completed.
Iteration number 20 completed.
Iteration number 21 completed.
Iteration number 22 completed.
Iteration number 23 completed.
Iteration number 24 completed.
Iteration number 25 completed.
Iteration number 26 completed.
Iteration number 27 completed.
Iteration number 28 completed.
Iteration number 29 completed.
Wall time: 22min 12s


In [90]:
rnd_srch_ll_results.values()

dict_values([-0.6855196421856896, -0.686051527642654, -0.6850303102487357, -0.6841065169208478, -0.6838195739715492, -0.6822886467942922, -0.689197079430035, -0.6898192779978762, -0.6835294947121963, -0.6913445700414026, -0.6929421190218629, -0.6854436348766083, -0.6835637191587741, -0.688456251643366, -0.6924285232170422, -0.6814639833232045, -0.6876838501157397, -0.6909200479432278, -0.6768436279901934, -0.6931851296537079, -0.6819083770782562, -0.6834081544138556, -0.6821634144641834, -0.685758766566683, -0.6851098781005799, -0.6845993839619737, -0.6855769152181965, -0.6891270424427431, -0.6854876308970912, -0.6875967243639582])

In [91]:
best_score_ll = max(rnd_srch_ll_results.values())
best_score_ll

-0.6768436279901934

In [92]:
best_models_ll = [k for k, v in rnd_srch_ll_results.items() if v == best_score_ll]
best_models_ll

[LogisticRegression(C=0.13522968292915205, l1_ratio=0.006503344219503826,
                    max_iter=5000, penalty='elasticnet', random_state=42,
                    solver='saga')]

In [93]:
best_log_reg_ll = best_models[0]

In [94]:
# Get index of the best model to obtain same split when evaluating
best_indexes = [i for i, v in enumerate(rnd_srch_ll_results.values()) if v == best_score_ll]
best_index = best_indexes[0]

In [95]:
pred = best_log_reg_ll.predict(dataset_prep)
pred

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [96]:
#Percentage of adopters
sum(pred) / len(pred)

0.9

In [97]:
for attr, coef in zip(attributes, best_log_reg_ll.coef_.tolist()[0]):
    print(coef, attr)

0.08142825568407668 PastureSurface
-0.0032380144533242228 CattlePercentage
0.0 FarmerSince
-0.10657444170017065 PercentRentedLand
0.05936055123342599 HighestEducationalDegree
0.004790344854862259 ExpectationFamilySuccession
-0.059757772281346336 LegalForm
-0.07065577348863503 Beja
0.0897083149782675 Portalegre
0.0 Santarém
0.0 Setúbal
0.0 Évora


In [87]:
best_log_reg_ll.intercept_

array([0.15329])

In [88]:
test_classifier(best_log_reg_ll, dataset_prep, labels, seed=best_index)

Cross validation scores
F1 score mean: 0.6944444444444443
precision mean: 0.5518518518518519
recall mean: 0.9444444444444445
neg_log loss mean: -0.6812902367894554

Prediction on training set results
F1 score: 0.7272727272727272
Precision: 0.5925925925925926
Recall: 0.9411764705882353
Logistic loss: 0.6561420576702656


# Trial with random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Nested cross-validation to try different split into train and test set
rnd_clsf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    "n_estimators": np.arange(2, 100, 1),
    "max_depth": np.arange(1, 5, 1),
    "min_samples_leaf": np.arange(2, 30, 1),
    "min_samples_split": np.arange(2, 30, 1),
    }

NUM_TRIALS = 15
rnd_srch_results_red = {}
for i in range(NUM_TRIALS):
     cv_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
     rnd_srch = RandomizedSearchCV(rnd_clsf, param_grid, cv=cv_split, n_iter=30,
                                  scoring='f1', return_train_score=True, verbose=0)
     rnd_srch.fit(dataset_prep_red, labels)
     rnd_srch_results_red[rnd_srch.best_estimator_] = rnd_srch.best_score_
     print("Iteration number", str(i), "completed.")

In [42]:
best = rnd_srch.best_estimator_
best

RandomForestClassifier(max_depth=2, min_samples_leaf=11, min_samples_split=22,
                       n_estimators=37, n_jobs=-1)

In [44]:
best.predict(dataset_prep_red)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [45]:
rnd_srch.best_score_

0.7222222222222222

In [46]:
for attr, coef in zip(attributes, best.feature_importances_.tolist()):
    print(coef, attr)

0.6666666666666666 PastureSurface
0.0 CattlePercentage
0.3333333333333333 FarmerSince
0.0 PercentRentedLand


In [47]:
test_classifier(best, dataset_prep_red, labels)

Cross validation scores
F1 score mean: 0.7222222222222222
precision mean: 0.5666666666666668
recall mean: 1.0

Prediction on training set results
F1 score: 0.7234042553191489
Precision: 0.5666666666666667
Recall: 1.0
