# Compare classification methods for identifying org. science perspectives in JSTOR articles
## Using grid search and balanced samples from hand-labeled set of articles

@author: Thomas Lu, Jaren Haber PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: September 2021

'''
Trains classifiers to predict whether an article is about a given perspective in org. science. To train the classifiers, uses preliminary labeled articles, broken down as follows: 
Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no
Compares f1_weighted scores of four model structures using 10-Fold Cross Validation: Logistic regression, SVM, Naive Bayes, and Decision Tree. Oversamples training data to .7 (7:10 minority:majority class).
'''

# Initialize

In [1]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date
from tqdm import tqdm
import os

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

stemmer = WordNetLemmatizer()

from gensim.models.keyedvectors import KeyedVectors

import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron, PassiveAggressiveClassifier, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold
# from sklearn.experimental import enable_hist_gradient_boosting

# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline


import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings(action='once')

import sys; sys.path.insert(0, "../preprocess/") # For loading functions from files in other directory
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
######################################################
# Define filepaths
######################################################

data_folder = 'classification'
folder = 'tlu_test'

cwd = os.getcwd()

root = str.replace(cwd, f'{folder}/modeling', '')

thisday = date.today().strftime("%m%d%y")

# Directory for prepared data and trained models: save files here
data_fp = root + f'{data_folder}/data/'
model_fp = root + f'{folder}/models/'
logs = root + f'{folder}/modeling/logs/'

w2v_fp = root + 'models_storage/word_embeddings_data/word2vec_phrased_filtered_300d_2020_sept5.bin'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths

# Preprocessed training data
cult_labeled_fp = data_fp + 'training_cultural_preprocessed_022621.pkl'
relt_labeled_fp = data_fp + 'training_relational_preprocessed_022621.pkl'
demog_labeled_fp = data_fp + 'training_demographic_preprocessed_022621.pkl'
orgs_labeled_fp = data_fp + 'training_orgs_preprocessed_022621.pkl'

# Model filepaths
cult_model_fp = model_fp + f'classifier_cult_MLP_{str(thisday)}.joblib'
relt_model_fp = model_fp + f'classifier_relt_MLP_{str(thisday)}.joblib'
demog_model_fp = model_fp + f'classifier_demog_MLP_{str(thisday)}.joblib'
orgs_model_fp = model_fp + f'classifier_orgs_MLP_{str(thisday)}.joblib'

# # Vectorizers trained on hand-coded data (use to limit vocab of input texts)
# cult_vec_fp = model_fp + 'vectorizer_cult_022621.joblib'
# relt_vec_fp = model_fp + 'vectorizer_relt_022621.joblib'
# demog_vec_fp = model_fp + 'vectorizer_demog_022621.joblib'
# orgs_vec_fp = model_fp + 'vectorizer_orgs_022621.joblib'

In [3]:
# Load the word2vec model

w2v_model = KeyedVectors.load(w2v_fp)

## Load & inspect data

In [4]:
cult_df = quickpickle_load(cult_labeled_fp)
relt_df = quickpickle_load(relt_labeled_fp)
demog_df = quickpickle_load(demog_labeled_fp)
orgs_df = quickpickle_load(orgs_labeled_fp)

cult_df.head(10)

Unnamed: 0,text,cultural_score,primary_subject,edited_filename,article_name
0,"[[research, note, church_membership, netherlan...",0.0,Sociology,10.1086_210179,Where Do Interorganizational Networks Come From?
1,"[[polish, io_oo, sociological_review, issn, co...",1.0,Sociology,10.1086_210317,Civil Rights Law at Work: Sex Discrimination a...
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",0.0,Sociology,10.1086_231084,Between Markets and Politics: Organizational R...
3,"[[reply, allison, more, comparing, regression_...",1.0,Sociology,10.1086_231174,World Society and the Nation‐State
4,"[[determinants, spousal, interaction, marital,...",1.0,Sociology,10.1086_382347,Kinship Networks and Entrepreneurs in China’s ...
5,"[[wsê, ih, ompany, profile, john, porter, musé...",1.0,Sociology,10.1086_517899,What Is Organizational Imprinting? Cultural En...
6,"[[andrew_christensen, university_california, l...",1.0,Sociology,10.1086_588742,"Homeward Bound? Interest, Identity, and Invest..."
7,"[[lawyers, consumer_protection, laws, stewart_...",0.0,Sociology,10.1086_657524,Corporate Unity in American Trade Policy: A Ne...
8,"[[establishing, sense, personal, control, tran...",1.0,Sociology,10.1086_659639,The Credit Crisis as a Problem in the Sociolog...
9,"[[guess, who, coming, town, white_supremacy, e...",0.0,Sociology,10.1525_irqr.2011.4.3.199,"Science, Health, and Nationhood"


In [5]:
# Check score distribution across classes
print(cult_df.groupby('cultural_score').size())
print()
print(relt_df.groupby('relational_score').size())
print()
print(demog_df.groupby('demographic_score').size())
print()
print(orgs_df.groupby('orgs_score').size())

cultural_score
0.0    475
0.5     24
1.0    234
dtype: int64

relational_score
0.0    420
0.5     29
1.0    287
dtype: int64

demographic_score
0.0    477
0.5      7
1.0    256
dtype: int64

orgs_score
0.0    303
0.5     10
1.0    511
dtype: int64


In [6]:
# Drop unsure cases: where X_score = 0.5
drop_unsure = True

if drop_unsure:
    cult_df_yes = cult_df[cult_df['cultural_score'] == 1.0]
    cult_df_no = cult_df[cult_df['cultural_score'] == 0.0]
    cult_df = pd.concat([cult_df_yes, cult_df_no])
    
    relt_df_yes = relt_df[relt_df['relational_score'] == 1.0]
    relt_df_no = relt_df[relt_df['relational_score'] == 0.0]
    relt_df = pd.concat([relt_df_yes, relt_df_no])
    
    demog_df_yes = demog_df[demog_df['demographic_score'] == 1.0]
    demog_df_no = demog_df[demog_df['demographic_score'] == 0.0]
    demog_df = pd.concat([demog_df_yes, demog_df_no])
    
    orgs_df_yes = orgs_df[orgs_df['orgs_score'] == 1.0]
    orgs_df_no = orgs_df[orgs_df['orgs_score'] == 0.0]
    orgs_df = pd.concat([orgs_df_yes, orgs_df_no])

In [7]:
######################################################
# Baseline: Create a paragraph embedding by averaging word embeddings
######################################################

def obtain_mean_vector(list_of_sentences, dim=300):
    """
    Obtains the preprocessed article and returns the mean of all existing word embeddings
    
    Args:
        list_of_sentences: a list of the tokenized sentences of words or phrases which constitute the article
        dim: the dimensions of the word vectors (default is 300)
        
    Returns:
        A vector of shape (300, ) characterizing the input sentences
    
    
    Stopwords and infrequent words don't need to be filtered 
    because they are not in the word2vec instance
    """
    
    i = 0
    sum_vec = np.zeros(shape=(dim,))
    for sent in list_of_sentences:
        for word in sent:
            if word in w2v_model.wv:
                sum_vec += w2v_model.wv[word]
                i += 1
    return sum_vec / i


def transform_dataframe(df):
    return np.stack(df.text.apply(obtain_mean_vector))


In [8]:
# transform the data

X_orgs = transform_dataframe(orgs_df)

X_cult = transform_dataframe(cult_df)

X_relt = transform_dataframe(relt_df)

X_demog = transform_dataframe(demog_df)

X_orgs.shape

(814, 300)

## Setup for modeling

In [9]:
######################################################
# Generates Balanced Pipelines
######################################################

def make_model_pipeline(model, undersample=False, sampling_ratio = 1.0, use_SMOTE=False, random_state=None):
    """
    Creates an sklearn pipeline object to handle over/under sampling or SMOTE sampling
    This is to avoid data leakage from oversampling before partitioning for cross-validation
    Apparently, this will not effect the test data when running grid search, which is desired
    
    Args:
        model: An sklearn classifier model to be packaged with an over/under sampling model
        undersample: boolean for over or undersampling
        sampling_ratio: ratio of minority to majority class
        use_SMOTE: boolean for enabling SMOTE (overrides undersample)
        
    Returns:
        A pipeline that will handle under/over sampling without the prior data leakage errors
    """
    # All sampling methods use the same arguments, so do this to save time
    kwargs = {'sampling_strategy':sampling_ratio, 'random_state': random_state}
    if use_SMOTE:
        sampler = SMOTE(**kwargs)
    elif undersample:
        sampler = RandomUnderSampler(**kwargs)
    else:
        sampler = RandomOverSampler(**kwargs)
    return Pipeline(steps=[('sampler', sampler), ('model', model)])


## Define algorithms and hyperparameter grids

In [21]:
######################################################
# Create different models with different hyperparameter grids
######################################################
seed = 42
models = []

# The first item in the tuple is the name of the algorithm
# The second item is the estimator object itself
# The third item is the param grid

models.append(('DecisionTree', DecisionTreeClassifier(random_state=seed), {
                        'criterion': ['gini','entropy'],
                        'min_samples_split': range(0, 21, 7),
                        'max_depth': range(6, 21, 5)
                    }))

models.append(('RandomForest', 
               RandomForestClassifier(random_state=seed),
                    {
                        'criterion': ['gini','entropy'],
                        'n_estimators': range(100, 301, 100),
                        'max_features': ['auto', 'log2'],
                        'max_depth': list(range(2, 11, 5))
                    }))

# models.append(('LogisticRegression', 
#                LogisticRegression(random_state=seed), 
#                {'penalty': ['l2', 'elasticnet', 'none'],'C': np.logspace(-4, 6, num=5)}))

# models.append(('SupportVectorMachine', SVC(gamma='auto'), 
#                {'C': np.logspace(-4, 6, num=5), 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'linear']}
#               ))

models.append(('PassiveAggressive', 
               PassiveAggressiveClassifier(random_state=seed, n_jobs=-1), 
               {'C': np.logspace(-4, 6, num=5), 'loss': ['hinge', 'squared_hinge']}))


models.append(('AdaBoost', AdaBoostClassifier(random_state=seed), 
               {'n_estimators': range(200, 1001, 400), 'learning_rate':[0.001, 0.01]}))

models.append(('MultiLayerPerceptron', 
               MLPClassifier(random_state=seed, max_iter=300),
            {'hidden_layer_sizes': [(50,50), (100,50), (100,)],
            'activation': ['relu', 'tanh'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.001, 0.01, 0.05],
            'learning_rate': ['adaptive', 'constant']}))

In [16]:
def gridsearch(X, y, cv=10, metrics=[
    'mean_test_balanced_accuracy', 'mean_test_f1', 
    'mean_test_precision', 'mean_test_recall',
    'mean_test_accuracy', 'mean_train_accuracy']):
    
    """
    Runs a grid search over all the models in the models variable for the given X and y dataset.
    This produces a cross-validation and gathers some useful metrics.
    
    Args:
        X: the input training data of shape (n_samples, n_features)
        y: the vector of labels of shape (n_samples,) or (n_samples, n_targets)
        cv: int, cross-validation generator or an iterable, Determines the cross-validation splitting strategy
        metrics: list of metrics to gather, from https://scikit-learn.org/stable/modules/model_evaluation.html
                the first argument of this is used as the refit parameter
    Returns:
        A dataframe containing the parameters of the best fit model for each model and sampling type
    """
    
    data = []
    
    # Make raw_metrics a list of the unique metrics (f1, accuracy, etc) without the "mean_test_" prefix
    temp_metrics = [re.sub(r'\w+?_\w+?_', '', metric) for metric in metrics]
    raw_metrics = []
    for metric in temp_metrics:
        if metric not in raw_metrics:
            raw_metrics.append(metric)
    
    # iterate over every model in the models list
    for model_name, model, params in models:
        
        # modify the param keys because the pipeline changes some names
        cv_params = {f'model__{key}': value for key, value in params.items()}
        
        # also iterate over each sampling type
        for sampling_name, under, smote in [
            ('under', True, False), ('over', False, False), ('smote', False, True)]:
            
            # create a resampling pipeline object
            pipeline = make_model_pipeline(model, undersample=under, use_SMOTE=smote)
            
            # run gridsearch 
            gscv = GridSearchCV(
                pipeline, param_grid=cv_params, cv=cv, 
                scoring=raw_metrics,
                verbose=1, n_jobs=1,
                return_train_score=True, refit=raw_metrics[0])
            gscv.fit(X, y)

            # find and store the best performing parameters
            row = {'Name':f'{model_name} {sampling_name}',
                   'Params':gscv.best_params_}
            results, ind = gscv.cv_results_, gscv.best_index_
            
            for metric in metrics:
                row[metric[5:]] = results[metric][ind]
                print(model_name, sampling_name, metric[5:], results[metric][ind])
            data.append(row)
            
            print()
        print()
        
    return pd.DataFrame(data)

In [17]:
# function to simplify running the parameter searches

def run_search(df, score_name, X, prefix=''):
    Y = df[[score_name+'_score']].values[:, 0].astype('float')
    print(len(df), 'cases |', len(Y), 'codes')
    params = gridsearch(X, Y)
    params.to_csv(logs+f'{prefix}{score_name}_grid_{str(thisday)}.csv', index=False)
    return params

In [23]:
run_search(orgs_df, 'orgs', X_orgs, 'ave_embed_')

814 cases | 814 codes
Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree under test_balanced_accuracy 0.6659004768160366
DecisionTree under test_f1 0.7364177482387059
DecisionTree under test_precision 0.7564161692866013
DecisionTree under test_recall 0.7277149321266968
DecisionTree under test_accuracy 0.6818428184281843
DecisionTree under train_accuracy 0.89776090473315

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree over test_balanced_accuracy 0.7027039848197344
DecisionTree over test_f1 0.8113362042820518
DecisionTree over test_precision 0.7542862698228431
DecisionTree over test_recall 0.8941176470588236
DecisionTree over test_accuracy 0.7517765733212887
DecisionTree over train_accuracy 0.9473134211526849

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree smote test_balanced_accuracy 0.7000474383301707
DecisionTree smote test_f1 0.8039462310821499
DecisionTree smote test_precision 0.7507399537154517
Decis

Unnamed: 0,Name,Params,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,train_accuracy
0,DecisionTree under,"{'model__criterion': 'gini', 'model__max_depth...",0.681843,0.6659,0.736418,0.756416,0.727715,0.897761
1,DecisionTree over,"{'model__criterion': 'entropy', 'model__max_de...",0.751777,0.702704,0.811336,0.754286,0.894118,0.947313
2,DecisionTree smote,"{'model__criterion': 'gini', 'model__max_depth...",0.746838,0.700047,0.803946,0.75074,0.882353,0.950587
3,RandomForest under,"{'model__criterion': 'entropy', 'model__max_de...",0.708898,0.696698,0.755072,0.78571,0.743288,0.896259
4,RandomForest over,"{'model__criterion': 'gini', 'model__max_depth...",0.754441,0.706629,0.813544,0.762137,0.894118,0.954137
5,RandomForest smote,"{'model__criterion': 'gini', 'model__max_depth...",0.780247,0.74938,0.825598,0.808464,0.870588,0.950588
6,PassiveAggressive under,"{'model__C': 3162.2776601683795, 'model__loss'...",0.5528,0.566594,0.518988,0.682645,0.519532,0.607574
7,PassiveAggressive over,"{'model__C': 0.03162277660168379, 'model__loss...",0.572388,0.55955,0.604326,0.695071,0.610068,0.674336
8,PassiveAggressive smote,"{'model__C': 10.0, 'model__loss': 'squared_hin...",0.580789,0.568519,0.584211,0.712791,0.622738,0.629024
9,AdaBoost under,"{'model__learning_rate': 0.01, 'model__n_estim...",0.595905,0.579483,0.662225,0.688841,0.64359,0.770812


In [22]:
run_search(cult_df, 'cultural', X_cult, 'ave_embed_')

709 cases | 709 codes
Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree under test_balanced_accuracy 0.5248448581560284
DecisionTree under test_f1 0.41683977493546004
DecisionTree under test_precision 0.3511300238464924
DecisionTree under test_recall 0.5208333333333333
DecisionTree under test_accuracy 0.526297786720322
DecisionTree under train_accuracy 0.7878022576419856

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree over test_balanced_accuracy 0.5315554656182548
DecisionTree over test_f1 0.39208244522956354
DecisionTree over test_precision 0.37503928246392093
DecisionTree over test_recall 0.4182971014492754
DecisionTree over test_accuracy 0.5699195171026157
DecisionTree over train_accuracy 0.9612899269528702

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree smote test_balanced_accuracy 0.5547997610237434
DecisionTree smote test_f1 0.42712080710960887
DecisionTree smote test_precision 0.38808364815261365

Unnamed: 0,Name,Params,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,train_accuracy
0,DecisionTree under,"{'model__criterion': 'entropy', 'model__max_de...",0.526298,0.524845,0.41684,0.35113,0.520833,0.787802
1,DecisionTree over,"{'model__criterion': 'gini', 'model__max_depth...",0.56992,0.531555,0.392082,0.375039,0.418297,0.96129
2,DecisionTree smote,"{'model__criterion': 'gini', 'model__max_depth...",0.581107,0.5548,0.427121,0.388084,0.478261,0.942488
3,RandomForest under,"{'model__criterion': 'entropy', 'model__max_de...",0.540262,0.531776,0.419824,0.359883,0.50779,0.708823
4,RandomForest over,"{'model__criterion': 'entropy', 'model__max_de...",0.623441,0.52726,0.29529,0.388169,0.243659,0.991851
5,RandomForest smote,"{'model__criterion': 'gini', 'model__max_depth...",0.596579,0.531051,0.353147,0.38424,0.3375,0.985738
6,PassiveAggressive under,"{'model__C': 3162.2776601683795, 'model__loss'...",0.461026,0.523207,0.406857,0.423717,0.700181,0.507149
7,PassiveAggressive over,"{'model__C': 3162.2776601683795, 'model__loss'...",0.549779,0.532972,0.346727,0.361074,0.480616,0.646329
8,PassiveAggressive smote,"{'model__C': 0.03162277660168379, 'model__loss...",0.578129,0.524107,0.332347,0.354697,0.372283,0.685008
9,AdaBoost under,"{'model__learning_rate': 0.001, 'model__n_esti...",0.564286,0.549221,0.424652,0.379696,0.501812,0.623722


In [24]:
run_search(relt_df, 'relational', X_relt, 'ave_embed_')

707 cases | 707 codes
Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree under test_balanced_accuracy 0.5318144499178983
DecisionTree under test_f1 0.485673273448416
DecisionTree under test_precision 0.4328582065314007
DecisionTree under test_recall 0.5588669950738917
DecisionTree under test_accuracy 0.5264587525150904
DecisionTree under train_accuracy 0.8904581716576325

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree over test_balanced_accuracy 0.5334154351395731
DecisionTree over test_f1 0.45369530977339234
DecisionTree over test_precision 0.4397617387697174
DecisionTree over test_recall 0.4834975369458128
DecisionTree over test_accuracy 0.5431589537223339
DecisionTree over train_accuracy 0.8035462515920738

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree smote test_balanced_accuracy 0.5308702791461413
DecisionTree smote test_f1 0.5018415245206129
DecisionTree smote test_precision 0.4380418485273216
Dec

Unnamed: 0,Name,Params,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,train_accuracy
0,DecisionTree under,"{'model__criterion': 'gini', 'model__max_depth...",0.526459,0.531814,0.485673,0.432858,0.558867,0.890458
1,DecisionTree over,"{'model__criterion': 'gini', 'model__max_depth...",0.543159,0.533415,0.453695,0.439762,0.483498,0.803546
2,DecisionTree smote,"{'model__criterion': 'entropy', 'model__max_de...",0.517666,0.53087,0.501842,0.438042,0.602217,0.741772
3,RandomForest under,"{'model__criterion': 'gini', 'model__max_depth...",0.520262,0.524938,0.479245,0.426832,0.549877,0.905544
4,RandomForest over,"{'model__criterion': 'entropy', 'model__max_de...",0.555815,0.539943,0.451696,0.455087,0.453695,0.753249
5,RandomForest smote,"{'model__criterion': 'gini', 'model__max_depth...",0.524668,0.518124,0.449288,0.424307,0.483867,0.744461
6,PassiveAggressive under,"{'model__C': 1000000.0, 'model__loss': 'hinge'}",0.476439,0.510099,0.478591,0.402316,0.684483,0.582287
7,PassiveAggressive over,"{'model__C': 3162.2776601683795, 'model__loss'...",0.506197,0.509093,0.387882,0.440504,0.520567,0.601331
8,PassiveAggressive smote,"{'model__C': 0.03162277660168379, 'model__loss...",0.507686,0.514881,0.470355,0.416802,0.553571,0.616843
9,AdaBoost under,"{'model__learning_rate': 0.01, 'model__n_estim...",0.51338,0.51954,0.475729,0.423429,0.550985,0.771801


In [25]:
run_search(demog_df, 'demographic', X_demog, 'ave_embed_')

733 cases | 733 codes
Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree under test_balanced_accuracy 0.577548690671031
DecisionTree under test_f1 0.4949171700101128
DecisionTree under test_precision 0.4257698428654783
DecisionTree under test_recall 0.5973846153846154
DecisionTree under test_accuracy 0.571380970011107
DecisionTree under train_accuracy 0.8247728422311124

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree over test_balanced_accuracy 0.536913188761593
DecisionTree over test_f1 0.412268574645917
DecisionTree over test_precision 0.3888221531671742
DecisionTree over test_recall 0.44461538461538463
DecisionTree over test_accuracy 0.5647908182154757
DecisionTree over train_accuracy 0.8770607440106681

Fitting 10 folds for each of 18 candidates, totalling 180 fits
DecisionTree smote test_balanced_accuracy 0.540636524822695
DecisionTree smote test_f1 0.41114315986107774
DecisionTree smote test_precision 0.39028714233078843
Decis

Unnamed: 0,Name,Params,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,train_accuracy
0,DecisionTree under,"{'model__criterion': 'gini', 'model__max_depth...",0.571381,0.577549,0.494917,0.42577,0.597385,0.824773
1,DecisionTree over,"{'model__criterion': 'entropy', 'model__max_de...",0.564791,0.536913,0.412269,0.388822,0.444615,0.877061
2,DecisionTree smote,"{'model__criterion': 'gini', 'model__max_depth...",0.570233,0.540637,0.411143,0.390287,0.442,0.979537
3,RandomForest under,"{'model__criterion': 'entropy', 'model__max_de...",0.560626,0.569183,0.487309,0.413366,0.597231,0.853869
4,RandomForest over,"{'model__criterion': 'entropy', 'model__max_de...",0.628823,0.560678,0.383833,0.460587,0.335231,0.990298
5,RandomForest smote,"{'model__criterion': 'entropy', 'model__max_de...",0.598852,0.566204,0.442165,0.432395,0.457231,0.969834
6,PassiveAggressive under,"{'model__C': 0.03162277660168379, 'model__loss...",0.561773,0.54114,0.403332,0.421624,0.467077,0.662888
7,PassiveAggressive over,"{'model__C': 0.03162277660168379, 'model__loss...",0.591984,0.550068,0.387611,0.40141,0.412769,0.681509
8,PassiveAggressive smote,"{'model__C': 0.03162277660168379, 'model__loss...",0.585135,0.563036,0.443061,0.417271,0.491231,0.658642
9,AdaBoost under,"{'model__learning_rate': 0.01, 'model__n_estim...",0.534672,0.54743,0.467793,0.391253,0.589231,0.715172
