This notebook uses logistic regression on the Animal Future survey data to predict farmers' adoption of SBP (the `AdoptedSBP` label).

In [1]:
import pandas as pd
import numpy as np

# Data preparation

## Data upload

In [2]:
path_to_data = "./survey_data/AF_survey_data_30.xlsx"

In [3]:
dataset_original = pd.read_excel(path_to_data, index_col=0)
dataset_original.head()

Unnamed: 0_level_0,AdoptedSBP,PastureSurface,CattlePercentage,Distrito,Concelho,FarmerSince,PercentRentedLand,LegalForm,HighestEducationalDegree,HighestAgriculturalEducationalDegree,ExpectationFamilySuccession
FARM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PT02,0,364.0,0.0,Setúbal,Grândola,29,0.0,Individual,Undergraduate,Undergraduate,Yes
PT13,1,542.58,1.0,Portalegre,Avis,11,0.0,Associated,Undergraduate,Undergraduate,Yes
PT15,1,262.7,1.0,Portalegre,Monforte,11,1.0,Associated,Undergraduate,Undergraduate,Yes
PT16,0,23.0,1.0,Évora,Évora,3,1.0,Individual,Undergraduate,Undergraduate,Yes
PT17,1,250.0,1.0,Évora,Montemor,10,1.0,Associated,Undergraduate,,Yes


## Features reduction

In [4]:
features_to_drop = ['Concelho', 'HighestAgriculturalEducationalDegree']

In [5]:
dataset = dataset_original.drop(features_to_drop, axis=1)

In [6]:
dataset.head()

Unnamed: 0_level_0,AdoptedSBP,PastureSurface,CattlePercentage,Distrito,FarmerSince,PercentRentedLand,LegalForm,HighestEducationalDegree,ExpectationFamilySuccession
FARM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PT02,0,364.0,0.0,Setúbal,29,0.0,Individual,Undergraduate,Yes
PT13,1,542.58,1.0,Portalegre,11,0.0,Associated,Undergraduate,Yes
PT15,1,262.7,1.0,Portalegre,11,1.0,Associated,Undergraduate,Yes
PT16,0,23.0,1.0,Évora,3,1.0,Individual,Undergraduate,Yes
PT17,1,250.0,1.0,Évora,10,1.0,Associated,Undergraduate,Yes


## Data preparation

In [7]:
dataset_att = dataset.drop('AdoptedSBP', axis=1)
labels = dataset['AdoptedSBP'].copy()

### Categorical attributes

Ordinal encoding

In [8]:
ordinal_attributes = ['HighestEducationalDegree']

In [9]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder_education = OrdinalEncoder(categories=[['Primary', 'Secondary', 'Undergraduate', 'Graduate']])

One-hot encoding

In [10]:
onehot_attributes = ['Distrito']

In [11]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()
onehot_encoder.fit_transform(dataset_att[onehot_attributes])

<30x5 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in Compressed Sparse Row format>

Binary attributes

In [12]:
binary_attributes = ['ExpectationFamilySuccession', 'LegalForm']

### Numerical attributes

In [13]:
dataset_att.columns

Index(['PastureSurface', 'CattlePercentage', 'Distrito', 'FarmerSince',
       'PercentRentedLand', 'LegalForm', 'HighestEducationalDegree',
       'ExpectationFamilySuccession'],
      dtype='object')

In [14]:
# Numerical columns are whatever is left once the ordinal, one-hot and binary
# columns have been set aside; the original column order is kept.
numerical_attributes = [
    feat
    for feat in dataset_att.columns
    if feat not in ordinal_attributes + onehot_attributes + binary_attributes
]

In [15]:
numerical_attributes

['PastureSurface', 'CattlePercentage', 'FarmerSince', 'PercentRentedLand']

### Preparation pipeline

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [17]:
# Column-wise preprocessing. Each entry is (name, transformer, columns it is applied to).
preparation_pipeline = ColumnTransformer([
    # Standardise the numerical columns.
    ('num', StandardScaler(), numerical_attributes),
    # Encode education using the explicit category order given to `ordinal_encoder_education`.
    ('ord_cat_edu', ordinal_encoder_education, ['HighestEducationalDegree']),
    # Encode the binary columns with a default OrdinalEncoder.
    ('other_ord_cat', OrdinalEncoder(), binary_attributes),
    # One-hot encode the columns listed in `onehot_attributes`.
    ('onehot_cat', onehot_encoder, onehot_attributes)
])

In [18]:
dataset_prep = preparation_pipeline.fit_transform(dataset_att)

### Extract all attributes

In [19]:
# Column names of `dataset_prep`, listed in the same order as the transformers
# declared in `preparation_pipeline`.
#
# The one-hot category names are read from the encoder that the pipeline actually
# fitted. ColumnTransformer fits clones of the transformers it is given, so the
# standalone `onehot_encoder` object only has `categories_` if it was fitted
# separately in an earlier cell.
fitted_onehot_encoder = preparation_pipeline.named_transformers_['onehot_cat']

attributes = (numerical_attributes
              + ['HighestEducationalDegree']
              + binary_attributes)
for cat_name in fitted_onehot_encoder.categories_:
    attributes += cat_name.tolist()

In [20]:
# Print prepared data as a DataFrame
pd.DataFrame(dataset_prep, columns=attributes, index=dataset_att.index).head()

Unnamed: 0_level_0,PastureSurface,CattlePercentage,FarmerSince,PercentRentedLand,HighestEducationalDegree,ExpectationFamilySuccession,LegalForm,Beja,Portalegre,Santarém,Setúbal,Évora
FARM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PT02,-0.23919,-2.378406,1.146256,-0.615057,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
PT13,0.054486,0.556318,-0.444948,-0.615057,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
PT15,-0.405778,0.556318,-0.444948,1.85887,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
PT16,-0.799967,0.556318,-1.15215,1.85887,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
PT17,-0.426663,0.556318,-0.533348,1.85887,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# Logistic regression

## Functions for model analysis

In [21]:
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_recall_curve, roc_curve, log_loss

In [22]:
def display_scores(scores, score_name):
    """Print the mean of `scores`, prefixed with the name of the score."""
    label = score_name + ' mean:'
    mean_value = scores.mean()
    print(label, mean_value)

In [23]:
def cross_val_scores(clsf, dataset_prepared, labels, cv=None):
    """
    Cross-validate a classifier and collect its test scores per metric.

    Parameters
    ----------
    clsf : estimator handed as-is to ``cross_validate``.
    dataset_prepared : prepared feature matrix.
    labels : target labels.
    cv : optional cross-validation splitter. When omitted, the notebook-level
        ``cross_val_split`` is used, which is the behaviour this function
        always had.

    Returns
    -------
    dict mapping each metric's display name to the per-fold test scores
    returned by ``cross_validate``.
    """
    # Scorer identifier -> display name. Add entries here to report more metrics
    # (e.g. 'roc_auc', 'accuracy', 'precision', 'recall').
    metric_names = {'f1': 'f1 score'}

    if cv is None:
        # Looked up at call time, so the splitter may be defined after this function.
        cv = cross_val_split

    scores = cross_validate(clsf, dataset_prepared, labels,
                            scoring=list(metric_names),
                            cv=cv)

    return {name: scores['test_' + metric] for metric, name in metric_names.items()}

In [24]:
def predict_train_set(clsf, dataset_prepared, labels):
    """
    Fit `clsf` on the given data and score its predictions on those same data.

    The classifier is fitted in place. A sample is predicted as positive when
    its predicted probability for the second class is at least 0.5. The F1
    score is printed and returned in a dict under the key 'f1 score'.
    """
    clsf.fit(dataset_prepared, labels)

    # Column 1 of predict_proba holds the probability of the positive class.
    positive_probs = clsf.predict_proba(dataset_prepared)[:, 1]
    f1 = f1_score(labels, positive_probs >= 0.5)

    print('Prediction on training set results')
    print('f1 score:', f1)

    return {'f1 score': f1}

In [25]:
from sklearn.base import clone

def test_classifier(clsf, dataset_prepared, labels):
    """
    Evaluate a classifier and print the results. Two checks are run:
    - cross-validation, reporting the performance measures
    - predictions on the training set, to check for overfitting

    All work is done on a clone of `clsf`. Nothing is returned.
    """
    working_copy = clone(clsf)

    fold_scores = cross_val_scores(working_copy, dataset_prepared, labels)
    print("Cross validation scores")
    for metric_name in fold_scores:
        display_scores(fold_scores[metric_name], metric_name)
    print("")

    predict_train_set(working_copy, dataset_prepared, labels)

## All features

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import reciprocal, uniform

In [27]:
# Fixed seed: with shuffle=True and no random_state the folds change on every
# run, and so does every score computed by `test_classifier`.
cross_val_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [641]:
# Repeated (not nested) cross-validated random search: each trial re-runs the
# hyper-parameter search on a different stratified 3-fold split of the same data,
# and the best estimator of each trial is stored with its mean CV F1 score.
log_reg = LogisticRegression()

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    "C": uniform(0.001, 1)
    }

NUM_TRIALS = 30
rnd_srch_results = {}
for i in range(NUM_TRIALS):
    # The splitter built here must be the one passed to the search. It was
    # previously assigned to `cv` while the search received `cv_split`, which
    # is undefined on a fresh kernel.
    cv_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
    # random_state seeds the parameter sampling so each trial can be reproduced.
    rnd_srch = RandomizedSearchCV(log_reg, param_grid, cv=cv_split, n_iter=1000,
                                  scoring='f1', return_train_score=True, verbose=0,
                                  random_state=i)
    rnd_srch.fit(dataset_prep, labels)
    rnd_srch_results[rnd_srch.best_estimator_] = rnd_srch.best_score_
    print("Iteration number", str(i), "completed.")

Iteration number 0 completed.
Iteration number 1 completed.
Iteration number 2 completed.
Iteration number 3 completed.
Iteration number 4 completed.
Iteration number 5 completed.
Iteration number 6 completed.
Iteration number 7 completed.
Iteration number 8 completed.
Iteration number 9 completed.
Iteration number 10 completed.
Iteration number 11 completed.
Iteration number 12 completed.
Iteration number 13 completed.
Iteration number 14 completed.
Iteration number 15 completed.
Iteration number 16 completed.
Iteration number 17 completed.
Iteration number 18 completed.
Iteration number 19 completed.
Iteration number 20 completed.
Iteration number 21 completed.
Iteration number 22 completed.
Iteration number 23 completed.
Iteration number 24 completed.
Iteration number 25 completed.
Iteration number 26 completed.
Iteration number 27 completed.
Iteration number 28 completed.
Iteration number 29 completed.


In [650]:
rnd_srch_results.values()

dict_values([0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222])

In [651]:
best_score = max(rnd_srch_results.values())
best_score

0.7222222222222222

In [652]:
best_models = [k for k, v in rnd_srch_results.items() if v == best_score]
best_models

[LogisticRegression(C=0.18420009497390522, l1_ratio=0.8798503520474761,
                    max_iter=5000, penalty='elasticnet', solver='saga'),
 LogisticRegression(C=0.15314534840090865, l1_ratio=0.7678299455403869,
                    max_iter=5000, penalty='elasticnet', solver='saga'),
 LogisticRegression(C=0.17750873563564762, l1_ratio=0.9154102173849715,
                    max_iter=5000, penalty='elasticnet', solver='saga'),
 LogisticRegression(C=0.21511936934400033, l1_ratio=0.9978175997771237,
                    max_iter=5000, penalty='elasticnet', solver='saga'),
 LogisticRegression(C=0.09204863617862546, l1_ratio=0.6613775331282453,
                    max_iter=5000, penalty='elasticnet', solver='saga'),
 LogisticRegression(C=0.05634322030912864, l1_ratio=0.35866453265846854,
                    max_iter=5000, penalty='elasticnet', solver='saga'),
 LogisticRegression(C=0.16303404355076823, l1_ratio=0.689482669064816,
                    max_iter=5000, penalty='elasticnet', s

In [653]:
best_log_reg = best_models[0]

In [654]:
best_log_reg.predict(dataset_prep)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [655]:
for attr, coef in zip(attributes, best_log_reg.coef_.tolist()[0]):
    print(coef, attr)

0.0 PastureSurface
0.0 CattlePercentage
0.0 FarmerSince
0.0 PercentRentedLand
0.0 HighestEducationalDegree
0.0 ExpectationFamilySuccession
0.0 LegalForm
0.0 Beja
0.0 Portalegre
0.0 Santarém
0.0 Setúbal
0.0 Évora


In [656]:
test_classifier(best_log_reg, dataset_prep, labels)

Cross validation scores
f1 score mean: 0.7222222222222222

Prediction on training set results
f1 score: 0.7234042553191489


## Reduced number of features

In [28]:
numerical_attributes_red = ['PastureSurface', 'PercentRentedLand']
binary_attributes_red = ['LegalForm']

In [29]:
# Preprocessing for the reduced feature set: the `_red` numerical and binary
# columns plus education; there is no one-hot block here.
preparation_pipeline_red = ColumnTransformer([
    # Standardise the reduced set of numerical columns.
    ('num', StandardScaler(), numerical_attributes_red),
    # Encode education using the explicit category order given to `ordinal_encoder_education`.
    ('ord_cat_edu', ordinal_encoder_education, ['HighestEducationalDegree']),
    # Encode the reduced set of binary columns with a default OrdinalEncoder.
    ('other_ord_cat', OrdinalEncoder(), binary_attributes_red),
])

In [30]:
dataset_prep_red = preparation_pipeline_red.fit_transform(dataset_att)

In [31]:
attributes_red = (numerical_attributes_red + ['HighestEducationalDegree'] + binary_attributes_red)

In [39]:
# Repeated (not nested) cross-validated random search on the reduced feature set:
# each trial re-runs the hyper-parameter search on a different stratified 3-fold
# split, and the best estimator of each trial is stored with its mean CV F1 score.
log_reg = LogisticRegression()

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    "C": reciprocal(0.001, 1)
    }

NUM_TRIALS = 30
rnd_srch_results_red = {}
for i in range(NUM_TRIALS):
    cv_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
    # random_state seeds the parameter sampling so each trial can be reproduced.
    rnd_srch = RandomizedSearchCV(log_reg, param_grid, cv=cv_split, n_iter=1000,
                                  scoring='f1', return_train_score=True, verbose=0,
                                  random_state=i)
    rnd_srch.fit(dataset_prep_red, labels)
    rnd_srch_results_red[rnd_srch.best_estimator_] = rnd_srch.best_score_
    print("Iteration number", str(i), "completed.")

Iteration number 0 completed.
Iteration number 1 completed.
Iteration number 2 completed.
Iteration number 3 completed.
Iteration number 4 completed.
Iteration number 5 completed.
Iteration number 6 completed.
Iteration number 7 completed.
Iteration number 8 completed.
Iteration number 9 completed.
Iteration number 10 completed.
Iteration number 11 completed.
Iteration number 12 completed.
Iteration number 13 completed.
Iteration number 14 completed.
Iteration number 15 completed.
Iteration number 16 completed.
Iteration number 17 completed.
Iteration number 18 completed.
Iteration number 19 completed.
Iteration number 20 completed.
Iteration number 21 completed.
Iteration number 22 completed.
Iteration number 23 completed.
Iteration number 24 completed.
Iteration number 25 completed.
Iteration number 26 completed.
Iteration number 27 completed.
Iteration number 28 completed.
Iteration number 29 completed.


In [40]:
rnd_srch_results_red.values()

dict_values([0.7222222222222222, 0.7261904761904762, 0.738095238095238, 0.7222222222222222, 0.7428571428571429, 0.7222222222222222, 0.7388888888888889, 0.7222222222222222, 0.7269841269841271, 0.7222222222222222, 0.7428571428571429, 0.7222222222222222, 0.7388888888888889, 0.7472527472527473, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.7428571428571429, 0.7509157509157509, 0.7222222222222222, 0.7286324786324787, 0.7222222222222222, 0.7388888888888889, 0.7388888888888889, 0.7388888888888889, 0.7388888888888889, 0.7269841269841271, 0.7388888888888889, 0.7222222222222222, 0.7222222222222222])

In [41]:
best_score_red = max(rnd_srch_results_red.values())
best_score_red

0.7509157509157509

In [42]:
best_models_red = [k for k, v in rnd_srch_results_red.items() if v == best_score_red]
best_models_red

[LogisticRegression(C=0.5159745211945587, l1_ratio=0.033494396058209475,
                    max_iter=5000, penalty='elasticnet', solver='saga')]

In [43]:
best_log_reg_red = best_models_red[0]

In [44]:
best_log_reg_red.predict(dataset_prep_red)

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [45]:
for attr, coef in zip(attributes_red, best_log_reg_red.coef_.tolist()[0]):
    print(coef, attr)

0.1649394077264545 PastureSurface
-0.23522868039886305 PercentRentedLand
0.16962977865355552 HighestEducationalDegree
-0.3296090864746737 LegalForm


In [46]:
best_log_reg_red.intercept_

array([-0.02515551])

In [49]:
test_classifier(best_log_reg_red, dataset_prep_red, labels)

Cross validation scores
f1 score mean: 0.6151515151515151

Prediction on training set results
f1 score: 0.7


# Logistic regression selected on whole training set with 4 variables (same as FL Calibrated ABM)

In [32]:
from sklearn.model_selection import ParameterSampler
from sklearn.base import clone

def RandomizedSearch(estimator, param_distribution, n_iter, X, y, random_state=None):
    """
    Random hyper-parameter search scored by F1 on the training data itself
    (no cross-validation).

    Parameters
    ----------
    estimator : estimator used as a template; it is cloned and left unmodified.
    param_distribution : parameter distributions passed to ``ParameterSampler``.
    n_iter : number of parameter settings to sample.
    X, y : data used both to fit each candidate and to score it.
    random_state : optional seed forwarded to ``ParameterSampler`` so the
        sampled settings can be reproduced.

    Returns
    -------
    (best_model, best_f1) : the estimator refitted on (X, y) with the first
    parameter setting that reached the highest F1, and that F1 score.

    Raises
    ------
    ValueError if no parameter setting was sampled.
    """
    # Start below any achievable F1 so a best candidate is always recorded, even
    # when every candidate scores 0.
    best_f1 = -1.0
    best_params = None

    # Work on a clone so the caller's estimator keeps its own parameters.
    candidate = clone(estimator)
    for params in ParameterSampler(param_distribution, n_iter, random_state=random_state):
        candidate.set_params(**params)
        candidate.fit(X, y)
        f1 = f1_score(y, candidate.predict(X))
        # Strict comparison: on ties the earliest sampled setting is kept.
        if f1 > best_f1:
            best_f1 = f1
            best_params = params

    if best_params is None:
        raise ValueError("No parameter setting was sampled; n_iter must be at least 1.")

    best_model = clone(estimator)
    best_model.set_params(**best_params)
    best_model.fit(X, y)
    return (best_model, best_f1)

In [33]:
# Select the elastic-net hyper-parameters on the reduced feature set without
# cross-validation: `RandomizedSearch` fits and scores every candidate on the
# same data.
log_reg = LogisticRegression()

param_grid = {
    #"tol": [5e-4],
    "max_iter": [5000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": uniform(0., 1.),
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e. [0.001, 1.001] here.
    "C": uniform(0.001, 1)
    }

# 10000 parameter settings are sampled.
best_log_reg_nocv, best_f1 = RandomizedSearch(log_reg, param_grid, 10000, dataset_prep_red, labels)

In [34]:
best_log_reg_nocv

LogisticRegression(C=0.2011450499485996, l1_ratio=0.3034600016432272,
                   max_iter=5000, penalty='elasticnet', solver='saga')

In [35]:
best_log_reg_nocv.predict(dataset_prep_red)

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [36]:
for attr, coef in zip(attributes_red, best_log_reg_nocv.coef_.tolist()[0]):
    print(coef, attr)

0.06431054471062879 PastureSurface
-0.11718656859772143 PercentRentedLand
0.009297341584584468 HighestEducationalDegree
0.0 LegalForm


In [37]:
best_log_reg_nocv.intercept_

array([0.24807761])

In [38]:
test_classifier(best_log_reg_nocv, dataset_prep_red, labels)

Cross validation scores
f1 score mean: 0.7222222222222222

Prediction on training set results
f1 score: 0.7555555555555554


# Trial with random forest

In [640]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [157]:
# Random search over random-forest hyper-parameters on the full feature set,
# scored by F1 on the folds of `cross_val_split`.
rnd_clsf = RandomForestClassifier(n_jobs=-1)

# np.arange excludes the stop value, e.g. max_depth takes the values 1 to 4.
param_grid = {
    "n_estimators": np.arange(2, 100, 1),
    "max_depth": np.arange(1, 5, 1),
    "min_samples_leaf": np.arange(2, 30, 1),
    "min_samples_split": np.arange(2, 30, 1),
    "max_leaf_nodes": np.arange(2, 100, 1)
    }
        
# Despite the variable name this is a randomized search: 100 candidates are sampled.
grid_search_rnd_clsf = RandomizedSearchCV(rnd_clsf, param_grid, cv=cross_val_split, n_iter=100,
                                    scoring='f1',
                                    return_train_score=True, verbose=1)

grid_search_rnd_clsf.fit(dataset_prep, labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  2.1min finished


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=42, shuffle=True),
                   estimator=RandomForestClassifier(n_jobs=-1), n_iter=100,
                   param_distributions={'max_depth': array([1, 2, 3, 4]),
                                        'max_leaf_nodes': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 5...
                                        'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
       70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
       87, 88, 89, 90, 91, 92, 93, 94, 95, 9

In [158]:
best = grid_search_rnd_clsf.best_estimator_
best

RandomForestClassifier(max_depth=1, max_leaf_nodes=92, min_samples_leaf=5,
                       min_samples_split=11, n_estimators=22, n_jobs=-1)

In [159]:
best.predict(dataset_prep)

array([1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [160]:
grid_search_rnd_clsf.best_score_

0.7286324786324787

In [161]:
for attr, coef in zip(attributes, best.feature_importances_.tolist()):
    print(coef, attr)

0.13636363636363635 PastureSurface
0.045454545454545456 CattlePercentage
0.09090909090909091 FarmerSince
0.13636363636363635 PercentRentedLand
0.09090909090909091 HighestEducationalDegree
0.045454545454545456 ExpectationFamilySuccession
0.22727272727272727 LegalForm
0.13636363636363635 Beja
0.0 Portalegre
0.0 Santarém
0.0 Setúbal
0.09090909090909091 Évora


In [165]:
# NOTE(review): `best` was tuned on `dataset_prep` (all features) but is evaluated
# here on `dataset_prep_red` (reduced features) — confirm this is intended.
test_classifier(best, dataset_prep_red, labels)

Cross validation scores
negative log loss mean: -0.7090943212578465
negative log loss stdv: 0.026601056056504573
ROC AUC mean: 0.5002777777777778
ROC AUC stdv: 0.029268341300270756
accuracy mean: 0.5
accuracy stdv: 0.08164965809277258
f1 score mean: 0.6055555555555555
f1 score stdv: 0.14927809825049332
average precision / PR AUC mean: 0.6582451499118165
average precision / PR AUC stdv: 0.10584216774723841
precision mean: 0.5185185185185185
precision stdv: 0.08574694002066832
recall mean: 0.7444444444444445
recall stdv: 0.2528845928164676

Prediction on training set results
negative log loss score: -0.6275624571275097
ROC AUC score: 0.7013574660633485
accuracy score: 0.6
f1 score: 0.7
