# Compare classification methods for identifying org. science perspectives in JSTOR articles
## Using grid search and balanced samples from hand-labeled set of articles

@author: Thomas Lu, Jaren Haber PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: September 2021

'''
Trains classifiers to predict whether an article is about a given perspective in org. science. To train the classifiers, uses preliminary labeled articles, broken down as follows: 
Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no
Compares f1_weighted scores of four model structures using 10-Fold Cross Validation: Logistic regression, SVM, Naive Bayes, and Decision Tree. Oversamples training data to .7 (7:10 minority:majority class).
'''

# Initialize

In [1]:
!pip install nltk

[31mERROR: Error checking for conflicts.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pip/_vendor/pkg_resources/__init__.py", line 3012, in _dep_map
    return self.__dep_map
  File "/opt/conda/lib/python3.7/site-packages/pip/_vendor/pkg_resources/__init__.py", line 2806, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pip/_vendor/pkg_resources/__init__.py", line 3003, in _parsed_pkg_info
    return self._pkg_info
  File "/opt/conda/lib/python3.7/site-packages/pip/_vendor/pkg_resources/__init__.py", line 2806, in __getattr__
    raise AttributeError(attr)
AttributeError: _pkg_info

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pip/_internal/commands/in

In [2]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date
from tqdm import tqdm
import os

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

stemmer = WordNetLemmatizer()

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron, PassiveAggressiveClassifier, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold
# from sklearn.experimental import enable_hist_gradient_boosting

# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings(action='once')

import sys; sys.path.insert(0, "../preprocess/") # For loading functions from files in other directory
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
######################################################
# Define filepaths
######################################################

data_folder = 'classification'
folder = 'tlu_test'

cwd = os.getcwd()
root = str.replace(cwd, f'{folder}/modeling', '')

thisday = date.today().strftime("%m%d%y")

# Directory for prepared data and trained models: save files here
data_fp = root + f'{data_folder}/data/'
model_fp = root + f'{data_folder}/models/'
logs = root + f'{folder}/modeling/logs/'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths

# Preprocessed training data
cult_labeled_fp = data_fp + 'training_cultural_preprocessed_022621.pkl'
relt_labeled_fp = data_fp + 'training_relational_preprocessed_022621.pkl'
demog_labeled_fp = data_fp + 'training_demographic_preprocessed_022621.pkl'
orgs_labeled_fp = data_fp + 'training_orgs_preprocessed_022621.pkl'

# Model filepaths
cult_model_fp = model_fp + f'classifier_cult_MLP_{str(thisday)}.joblib'
relt_model_fp = model_fp + f'classifier_relt_MLP_{str(thisday)}.joblib'
demog_model_fp = model_fp + f'classifier_demog_MLP_{str(thisday)}.joblib'
orgs_model_fp = model_fp + f'classifier_orgs_MLP_{str(thisday)}.joblib'

# Vectorizers trained on hand-coded data (use to limit vocab of input texts)
cult_vec_fp = model_fp + 'vectorizer_cult_022621.joblib'
relt_vec_fp = model_fp + 'vectorizer_relt_022621.joblib'
demog_vec_fp = model_fp + 'vectorizer_demog_022621.joblib'
orgs_vec_fp = model_fp + 'vectorizer_orgs_022621.joblib'

## Load & inspect data

In [4]:
cult_df = quickpickle_load(cult_labeled_fp)
relt_df = quickpickle_load(relt_labeled_fp)
demog_df = quickpickle_load(demog_labeled_fp)
orgs_df = quickpickle_load(orgs_labeled_fp)

cult_df.head(10)

Unnamed: 0,text,cultural_score,primary_subject,edited_filename,article_name
0,"[[research, note, church_membership, netherlan...",0.0,Sociology,10.1086_210179,Where Do Interorganizational Networks Come From?
1,"[[polish, io_oo, sociological_review, issn, co...",1.0,Sociology,10.1086_210317,Civil Rights Law at Work: Sex Discrimination a...
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",0.0,Sociology,10.1086_231084,Between Markets and Politics: Organizational R...
3,"[[reply, allison, more, comparing, regression_...",1.0,Sociology,10.1086_231174,World Society and the Nation‐State
4,"[[determinants, spousal, interaction, marital,...",1.0,Sociology,10.1086_382347,Kinship Networks and Entrepreneurs in China’s ...
5,"[[wsê, ih, ompany, profile, john, porter, musé...",1.0,Sociology,10.1086_517899,What Is Organizational Imprinting? Cultural En...
6,"[[andrew_christensen, university_california, l...",1.0,Sociology,10.1086_588742,"Homeward Bound? Interest, Identity, and Invest..."
7,"[[lawyers, consumer_protection, laws, stewart_...",0.0,Sociology,10.1086_657524,Corporate Unity in American Trade Policy: A Ne...
8,"[[establishing, sense, personal, control, tran...",1.0,Sociology,10.1086_659639,The Credit Crisis as a Problem in the Sociolog...
9,"[[guess, who, coming, town, white_supremacy, e...",0.0,Sociology,10.1525_irqr.2011.4.3.199,"Science, Health, and Nationhood"


In [5]:
orgs_df.head()

Unnamed: 0,text,orgs_score,edited_filename,article_name
0,"[[research, note, church_membership, netherlan...",1.0,10.1086_210179,Where Do Interorganizational Networks Come From?
1,"[[polish, io_oo, sociological_review, issn, co...",1.0,10.1086_210317,Civil Rights Law at Work: Sex Discrimination a...
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",1.0,10.1086_231084,Between Markets and Politics: Organizational R...
3,"[[reply, allison, more, comparing, regression_...",1.0,10.1086_231174,World Society and the Nation‐State
4,"[[determinants, spousal, interaction, marital,...",1.0,10.1086_382347,Kinship Networks and Entrepreneurs in China’s ...


In [6]:
# Check score distribution across classes
print(cult_df.groupby('cultural_score').size())
print()
print(relt_df.groupby('relational_score').size())
print()
print(demog_df.groupby('demographic_score').size())
print()
print(orgs_df.groupby('orgs_score').size())

cultural_score
0.0    475
0.5     24
1.0    234
dtype: int64

relational_score
0.0    420
0.5     29
1.0    287
dtype: int64

demographic_score
0.0    477
0.5      7
1.0    256
dtype: int64

orgs_score
0.0    303
0.5     10
1.0    511
dtype: int64


In [7]:
# Drop unsure cases: where X_score = 0.5
drop_unsure = True

if drop_unsure:
    cult_df_yes = cult_df[cult_df['cultural_score'] == 1.0]
    cult_df_no = cult_df[cult_df['cultural_score'] == 0.0]
    cult_df = pd.concat([cult_df_yes, cult_df_no])
    
    relt_df_yes = relt_df[relt_df['relational_score'] == 1.0]
    relt_df_no = relt_df[relt_df['relational_score'] == 0.0]
    relt_df = pd.concat([relt_df_yes, relt_df_no])
    
    demog_df_yes = demog_df[demog_df['demographic_score'] == 1.0]
    demog_df_no = demog_df[demog_df['demographic_score'] == 0.0]
    demog_df = pd.concat([demog_df_yes, demog_df_no])
    
    orgs_df_yes = orgs_df[orgs_df['orgs_score'] == 1.0]
    orgs_df_no = orgs_df[orgs_df['orgs_score'] == 0.0]
    orgs_df = pd.concat([orgs_df_yes, orgs_df_no])

### Check vocab size and frequent words

In [8]:
def collect_article_tokens(article, return_string=False):
    '''
    Collects words from already-tokenized sentences representing each article.
    
    Args:
        article: list of lists of words (each list is a sentence)
        return_string: whether to return single, long string representing article
    Returns:
        tokens: string if return_string, else list of tokens
    '''
    
    tokens = [] # initialize
    
    if return_string:
        for sent in article:
            sent = ' '.join(sent) # make sentence into a string
            tokens.append(sent) # add sentence to list of sentences
        tokens = ' '.join(tokens) # join sentences into string
        return tokens # return string
    
    else:
        for sent in article:
            tokens += [word for word in sent] # add each word to list of tokens
        return tokens # return list of tokens

# For capturing word frequencies, add all words from each article to single, shared list (can't use this to create models)
cult_tokens = []; cult_df['text'].apply(lambda article: cult_tokens.extend([word for word in collect_article_tokens(article)]))
relt_tokens = []; relt_df['text'].apply(lambda article: relt_tokens.extend([word for word in collect_article_tokens(article)]))
demog_tokens = []; demog_df['text'].apply(lambda article: demog_tokens.extend([word for word in collect_article_tokens(article)]))
orgs_tokens = []; orgs_df['text'].apply(lambda article: orgs_tokens.extend([word for word in collect_article_tokens(article)]))
print()




In [None]:
# Look at size of vocabulary and most frequent words
tokens = ((cult_tokens + relt_tokens) + demog_tokens) + orgs_tokens
print('Vocab size:', len(set(tokens)))
print()

# Check out most frequent words in labeled texts
freq = Counter(tokens)
print('20 most frequent words in labeled articles:')
freq.most_common(20)

Vocab size: 169701

20 most frequent words in labeled articles:


[('from', 89011),
 ('social', 77638),
 ('have', 73580),
 ('we', 70788),
 ('more', 68737),
 ('which', 66698),
 ('were', 65677),
 ('one', 55317),
 ('other', 44993),
 ('than', 43319),
 ('may', 43175),
 ('also', 41037),
 ('all', 40362),
 ('can', 39250),
 ('research', 38752),
 ('between', 38287),
 ('who', 37469),
 ('time', 35868),
 ('has', 33474),
 ('when', 33441)]

### Check frequent sentences (to improve cleaning)

In [None]:
# Add sentences from each article to empty list:
cult_sents = []; cult_df['text'].apply(
    lambda article: cult_sents.extend(
        [' '.join([word for word in sent]) for sent in article]))
relt_sents = []; relt_df['text'].apply(
    lambda article: relt_sents.extend(
        [' '.join([word for word in sent]) for sent in article]))
demog_sents = []; demog_df['text'].apply(
    lambda article: demog_sents.extend(
        [' '.join([word for word in sent]) for sent in article]))
orgs_sents = []; orgs_df['text'].apply(
    lambda article: orgs_sents.extend(
        [' '.join([word for word in sent]) for sent in article]))

sents = ((cult_sents + relt_sents) + demog_sents) + orgs_sents
print('Number of sentences:', len(sents))
print()

# Check out most frequent sentences in labeled texts
freq = Counter(sents)
print('20 most frequent sentences in labeled articles:')
freq.most_common(100)

Number of sentences: 113692

20 most frequent sentences in labeled articles:


[('entry', 12932),
 ('bottom_entry', 8227),
 ('align', 8045),
 ('row', 4734),
 ('align_center', 2573),
 ('bottom', 2279),
 ('align_left', 2004),
 ('colspec_colnum', 774),
 ('top entry', 542),
 ('caption', 430),
 ('label_label', 424),
 ('tex_math notation_latex', 404),
 ('documentclass_aastex', 404),
 ('usepackage_amsbsy', 404),
 ('usepackage_amsfonts', 404),
 ('usepackage_amssymb', 404),
 ('usepackage_bm', 404),
 ('usepackage_mathrsfs', 404),
 ('usepackage_pifont', 404),
 ('usepackage_stmaryrd', 404),
 ('usepackage_textcomp', 404),
 ('usepackage_portland xspace', 404),
 ('usepackage_amsmath amsxtra', 404),
 ('usepackage_ot ot_fontenc', 404),
 ('newcommand_cyr', 404),
 ('renewcommand_rmdefault wncyr', 404),
 ('renewcommand_sfdefault wncyss', 404),
 ('renewcommand_encodingdefault ot', 404),
 ('normalfont', 404),
 ('selectfont', 404),
 ('declaretextfontcommand_extcyr cyr', 404),
 ('pagestyle_empty', 404),
 ('declaremathsizes', 404),
 ('begin_document', 404),
 ('landscape', 404),
 ('end_do

### Load and apply text vectorizers

In [None]:
# Collect articles: Add each article as single str to list of str:
cult_docs = [] # empty list
cult_df['text'].apply(
    lambda article: cult_docs.append(
        collect_article_tokens(
            article, 
            return_string=True)))

relt_docs = [] # empty list
relt_df['text'].apply(
    lambda article: relt_docs.append(
       collect_article_tokens(
            article, 
            return_string=True)))

demog_docs = [] # empty list
demog_df['text'].apply(
    lambda article: demog_docs.append(
        collect_article_tokens(
            article, 
            return_string=True)))

orgs_docs = [] # empty list
orgs_df['text'].apply(
    lambda article: orgs_docs.append(
        collect_article_tokens(
            article, 
            return_string=True)))

print() # skip weird output




In [None]:
# Define stopwords used by JSTOR
jstor_stopwords = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"])

# Uses TFIDF weighted DTM because results in better classifier accuracy than unweighted
cult_vectorizer = joblib.load(cult_vec_fp, "r+")
X_cult = cult_vectorizer.transform(cult_docs)
print('Number of features in cultural vectorizer:', len(cult_vectorizer.get_feature_names()))
print('Every 1000th word:\n{}'.format(cult_vectorizer.get_feature_names()[::1000])) # get every 1000th word
print()

relt_vectorizer = joblib.load(relt_vec_fp, "r+")
X_relt = relt_vectorizer.transform(relt_docs)
print('Number of features in relational vectorizer:', len(relt_vectorizer.get_feature_names()))
print('Every 1000th word:\n{}'.format(relt_vectorizer.get_feature_names()[::1000])) # get every 1000th word
print()

demog_vectorizer = joblib.load(demog_vec_fp, "r+")
X_demog = demog_vectorizer.transform(demog_docs)
print('Number of features in demographic vectorizer:', len(demog_vectorizer.get_feature_names()))
print('Every 1000th word:\n{}'.format(demog_vectorizer.get_feature_names()[::1000])) # get every 1000th word
print()

orgs_vectorizer = joblib.load(orgs_vec_fp, "r+")
X_orgs = orgs_vectorizer.transform(orgs_docs)
print('Number of features in organizational soc vectorizer: {}'.format(len(orgs_vectorizer.get_feature_names())))
print('Every 1000th word:\n{}'.format(orgs_vectorizer.get_feature_names()[::1000])) # get every 1000th word

Number of features in cultural vectorizer: 100000
Every 1000th word:
['_author', 'achivement', 'aerotic', 'alam', 'amazon_basin', 'anjlated', 'appropriated', 'attacks', 'barbara_entwisle', 'berlin_berlin', 'boles', 'brotherhood', 'cantonments', 'cfi_comparative', 'ciu', 'collects', 'conceptually', 'convoluted', 'criticised', 'daunting_task', 'demonized', 'diesel_engines', 'distributed', 'dst', 'efforts', 'enhance_interpretability', 'euroqol', 'fabrigar', 'fictional', 'foreignness', 'furstenberg', 'giddens', 'greenfeld', 'hard_currency', 'het_erogeneity', 'hospital_bed', 'illuminate', 'infants', 'internally_consistent', 'jaffee_david', 'kagono_nonaka', 'kockott', 'latham_wexley', 'liebling', 'loseke_eds', 'manager', 'mazur', 'mexican_immigrants', 'modifier', 'murdie', 'neter_john', 'norfolk', 'oil_spills', 'ot_ot', 'paris_harmattan', 'perks', 'plea_negotiation', 'praeger_publishers', 'processors', 'psych_bull', 'qtj', 'ramadan', 'reappeared', 'referrals', 'relmo', 'residen_tial', 'rffir

## Setup for modeling

In [None]:
######################################################
# Generates Balanced Pipelines
######################################################

def make_model_pipeline(model, undersample=False, sampling_ratio = 1.0, use_SMOTE=False, random_state=None):
    """
    Creates an sklearn pipeline object to handle over/under sampling or SMOTE sampling
    This is to avoid data leakage from oversampling before partitioning for cross-validation
    Apparently, this will not effect the test data when running grid search, which is desired
    
    Args:
        model: An sklearn classifier model to be packaged with an over/under sampling model
        undersample: boolean for over or undersampling
        sampling_ratio: ratio of minority to majority class
        use_SMOTE: boolean for enabling SMOTE (overrides undersample)
        
    Returns:
        A pipeline that will handle under/over sampling without the prior data leakage errors
    """
    # All sampling methods use the same arguments, so do this to save time
    kwargs = {'sampling_strategy':sampling_ratio, 'random_state': random_state}
    if use_SMOTE:
        sampler = SMOTE(**kwargs)
    elif undersample:
        sampler = RandomUnderSampler(**kwargs)
    else:
        sampler = RandomOverSampler(**kwargs)
    return Pipeline(steps=[('sampler', sampler), ('model', model)])


In [None]:
def compute_predictions(text, vectorizer_model, class_model):
    '''
    Predicts the label for an input text using a given model trained to classify the texts. 
    Uses vectorizer_model to restrict the vocab of the input text so it's consistent with vocab in class_model (avoids errors).
    
    Args:
        text: preprocessed text in format of list of sentences, each a str or list of tokens
        vectorizer_model: fitted text vectorizer
        class_model: trained classification model
    Returns:
        label: label for text predicted by model, false for tie
        prob: probability for label
    '''
    
    X = vectorizer_model.transform(text) # create TF-IDF-weighted DTM from text
    try:
        probabilities = class_model.predict_proba(X)
    except: 
        return
    
    label = 'no'
    prob_no = probabilities[0][0]
    prob_yes = probabilities[0][1]
    
    # predicted label is one with greater probability
    if probabilities[0][0] < probabilities[0][1]:
        label = 'yes'
        
    return label, prob_yes, prob_no

## Define algorithms and hyperparameter grids

In [None]:
######################################################
# Create different models with different hyperparameter grids
######################################################
seed = 42
models = []

# The first item in the tuple is the name of the algorithm
# The second item is the estimator object itself
# The third item is the param grid

models.append(('DecisionTree', DecisionTreeClassifier(random_state=seed), {
                        'criterion': ['gini','entropy'],
                        'min_samples_split': range(0, 21, 6),
                        'max_depth': range(6, 21, 4)
                    }))

models.append(('RandomForest', 
               RandomForestClassifier(random_state=seed),
                    {
                        'criterion': ['gini','entropy'],
                        'n_estimators': range(100, 301, 100),
                        'max_features': ['auto', 'log2'],
                        'max_depth': list(range(2, 11, 4))
                    }))

# models.append(('LogisticRegression', 
#                LogisticRegression(random_state=seed), 
#                {'penalty': ['l2', 'elasticnet', 'none'],'C': np.logspace(-4, 6, num=5)}))

# models.append(('SupportVectorMachine', SVC(gamma='auto'), 
#                {'C': np.logspace(-4, 6, num=10), 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'linear']}
#               ))

# models.append(('PassiveAggressive', 
#                PassiveAggressiveClassifier(random_state=seed, n_jobs=-1), 
#                {'C': np.logspace(-4, 6, num=5), 'loss': ['hinge', 'squared_hinge']}))


models.append(('AdaBoost', AdaBoostClassifier(random_state=seed), 
               {'n_estimators': range(200, 1001, 200), 'learning_rate':[0.001, 0.01, 0.1]}))

models.append(('MultiLayerPerceptron', 
               MLPClassifier(random_state=seed, max_iter=300),
            {'hidden_layer_sizes': [(50,50), (100,50), (100,)],
            'activation': ['relu', 'tanh'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.001, 0.01, 0.05],
            'learning_rate': ['adaptive', 'constant']}))

In [None]:
def gridsearch(X, y, cv=10, metrics=[
    'mean_test_balanced_accuracy', 'mean_test_f1', 
    'mean_test_precision', 'mean_test_recall',
    'mean_test_accuracy', 'mean_train_accuracy']):
    
    """
    Runs a grid search over all the models in the models variable for the given X and y dataset.
    This produces a cross-validation and gathers some useful metrics.
    
    Args:
        X: the input training data of shape (n_samples, n_features)
        y: the vector of labels of shape (n_samples,) or (n_samples, n_targets)
        cv: int, cross-validation generator or an iterable, Determines the cross-validation splitting strategy
        metrics: list of metrics to gather, from https://scikit-learn.org/stable/modules/model_evaluation.html
                the first argument of this is used as the refit parameter
    Returns:
        A dataframe containing the parameters of the best fit model for each model and sampling type
    """
    
    data = []
    
    # Make raw_metrics a list of the unique metrics (f1, accuracy, etc) without the "mean_test_" prefix
    temp_metrics = [re.sub(r'\w+?_\w+?_', '', metric) for metric in metrics]
    raw_metrics = []
    for metric in temp_metrics:
        if metric not in raw_metrics:
            raw_metrics.append(metric)
    
    # iterate over every model in the models list
    for model_name, model, params in models:
        
        # modify the param keys because the pipeline changes some names
        cv_params = {f'model__{key}': value for key, value in params.items()}
        
        # also iterate over each sampling type
        for sampling_name, under, smote in [
            ('under', True, False), ('over', False, False), ('smote', False, True)]:
            
            # create a resampling pipeline object
            pipeline = make_model_pipeline(model, undersample=under, use_SMOTE=smote)
            
            # run gridsearch 
            gscv = GridSearchCV(
                pipeline, param_grid=cv_params, cv=cv, 
                scoring=raw_metrics,
                verbose=1, n_jobs=1,
                return_train_score=True, refit=raw_metrics[0])
            gscv.fit(X, y)

            # find and store the best performing parameters
            row = {'Name':f'{model_name} {sampling_name}',
                   'Params':gscv.best_params_}
            results, ind = gscv.cv_results_, gscv.best_index_
            
            for metric in metrics:
                row[metric[5:]] = results[metric][ind]
                print(model_name, sampling_name, metric[5:], results[metric][ind])
            data.append(row)
            
            print()
        print()
        
    return pd.DataFrame(data)

In [None]:
# Run the grid search on the orgs data

orgs_df = orgs_df[['text', 'orgs_score']]
print("Number of cases:", str(X_orgs.shape[0]))

valueArray = orgs_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
print("Number of codes (should match):", str(len(Y)))

orgs_params = gridsearch(X_orgs, Y)

# save gathered params and accuracy to log
orgs_params.to_csv(path_or_buf=logs + f'orgs_grid_{thisday}.csv', index=False)
orgs_params

Number of cases: 814
Number of codes (should match): 814
Fitting 10 folds for each of 32 candidates, totalling 320 fits
DecisionTree under test_balanced_accuracy 0.665369532428356
DecisionTree under test_f1 0.7361151403545533
DecisionTree under test_precision 0.7504329078432519
DecisionTree under test_recall 0.7374057315233786
DecisionTree under test_accuracy 0.684131285757302
DecisionTree under train_accuracy 0.8819247198801243

Fitting 10 folds for each of 32 candidates, totalling 320 fits
DecisionTree over test_balanced_accuracy 0.7035863377609107
DecisionTree over test_f1 0.8154183487890998
DecisionTree over test_precision 0.7538265211363433
DecisionTree over test_recall 0.9058823529411765
DecisionTree over test_accuracy 0.7554351099066545
DecisionTree over train_accuracy 0.9545467761053832

Fitting 10 folds for each of 32 candidates, totalling 320 fits
DecisionTree smote test_balanced_accuracy 0.7200664136622391
DecisionTree smote test_f1 0.8200566608492114
DecisionTree smote test

In [None]:
# Run the grid search on the culture data

cult_df = cult_df[['text', 'cultural_score']]
print("Number of cases:", str(X_cult.shape[0]))

valueArray = cult_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
print("Number of codes (should match):", str(len(Y)))

cult_params = gridsearch(X_cult, Y)

# save gathered params and accuracy to log
cult_params.to_csv(path_or_buf=logs + f'cult_grid_{thisday}.csv', index=False)
cult_params

In [None]:
# Run the grid search on the relational data

relt_df = relt_df[['text', 'relational_score']]
print("Number of cases:", str(X_relt.shape[0]))

valueArray = relt_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
print("Number of codes (should match):", str(len(Y)))

relt_params = gridsearch(X_relt, Y)

# save gathered params and accuracy to log
relt_params.to_csv(path_or_buf=logs + f'relt_grid_{thisday}.csv', index=False)
relt_params

In [None]:
# Run the grid search on the demographic data

demog_df = demog_df[['text', 'demographic_score']]
print("Number of cases:", str(X_demog.shape[0]))

valueArray = demog_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
print("Number of codes (should match):", str(len(Y)))

demog_params = gridsearch(X_demog, Y)

# save gathered params and accuracy to log
demog_params.to_csv(path_or_buf=logs + f'demog_grid_{thisday}.csv', index=False)
demog_params