# Compare classification methods for identifying org. science perspectives in JSTOR articles
## Using balanced samples from hand-coding

@author: Jaren Haber, PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: December 2020

'''
Trains classifiers to predict whether an article is about a given perspective in org. science. To train the classifiers, uses preliminary labeled articles, broken down as follows: 
Cultural: 105 yes, 209 no
Relational: 92 yes, 229 no
Demographic: 77 yes, 248 no
Compares three model structures using 10-Fold Cross Validation: K-Nearest Neighbors, Random Forest, and Decision Tree, reporting accuracy, confusion matrices, and classification reports for each. Oversamples training data to a ratio of 1.0 (1:1 minority:majority class).
'''

## Initialize

In [108]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date
import os

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

stemmer = WordNetLemmatizer()

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold

# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import sys; sys.path.insert(0, "../preprocess/") # For loading functions from files in other directory
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [183]:
######################################################
# Define filepaths
######################################################

# Today's date formatted as MMDDYY; used below to stamp the classifier filenames
thisday = date.today().strftime("%m%d%y")

# Derive the project root by stripping 'classification/modeling' from the working directory.
# NOTE(review): this assumes the notebook is launched from .../classification/modeling/.
# If that substring is absent, root is simply cwd and the paths below are concatenated
# onto it without a separator -- confirm before running from another directory.
cwd = os.getcwd()
root = str.replace(cwd, 'classification/modeling', '')

# Directory for prepared data and trained models: save files here
data_fp = root + 'classification/data/'
model_fp = root + 'classification/models/'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths

# Preprocessed training data (one pickle per perspective; filenames carry a fixed date stamp)
cult_labeled_fp = data_fp + 'training_cultural_preprocessed_112420.pkl'
relt_labeled_fp = data_fp + 'training_relational_preprocessed_112420.pkl'
demog_labeled_fp = data_fp + 'training_demographic_preprocessed_112420.pkl'

# Model filepaths (stamped with today's date; thisday is already a string, so str() is a no-op)
cult_model_fp = model_fp + f'classifier_cult_{str(thisday)}.joblib'
relt_model_fp = model_fp + f'classifier_relt_{str(thisday)}.joblib'
demog_model_fp = model_fp + f'classifier_demog_{str(thisday)}.joblib'

# Vectorizers trained on hand-coded data (use to limit vocab of input texts)
cult_vec_fp = model_fp + 'vectorizer_cult_113020.joblib'
relt_vec_fp = model_fp + 'vectorizer_relt_113020.joblib'
demog_vec_fp = model_fp + 'vectorizer_demog_113020.joblib'

## Load & inspect data

In [87]:
cult_df = quickpickle_load(cult_labeled_fp)
relt_df = quickpickle_load(relt_labeled_fp)
demog_df = quickpickle_load(demog_labeled_fp)

cult_df.head(10)

Unnamed: 0,cultural_score,text
0,1.0,"[[journal, managerial, issues, vol], [xxiii, n..."
1,1.0,"[[organization, ht, icna, vol], [may, june, pp..."
2,1.0,"[[from, fiefs, clans, network, capitalism, exp..."
3,1.0,"[[collective, strategy, framework, application..."
4,1.0,"[[manag, int, rev, doi, sl, research, article,..."
5,1.0,"[[int], [studies, ofmgt], [amp, org, vol], [pp..."
6,1.0,"[[linking, organizational, values, relationshi..."
7,1.0,"[[journal, organizational, behavior, organiz],..."
8,1.0,"[[®, academy, oí, management, learning, amp, e..."
9,1.0,"[[strategie, management, journal, strat], [mgm..."


In [88]:
# Check score distribution across classes
print(cult_df.groupby('cultural_score').size())
print()
print(relt_df.groupby('relational_score').size())
print()
print(demog_df.groupby('demographic_score').size())

cultural_score
0.0    209
0.5     12
1.0    105
dtype: int64

relational_score
0.0    229
0.5      7
1.0     92
dtype: int64

demographic_score
0.0    248
0.5      5
1.0     77
dtype: int64


In [89]:
# Drop unsure cases: where X_score = 0.5
drop_unsure = True

if drop_unsure:
    # For each perspective keep only rows coded exactly 1.0 or 0.0.
    # The result is re-assembled with all 1.0 rows first, then all 0.0 rows,
    # so row order differs from the loaded frame.
    cult_df_yes = cult_df[cult_df['cultural_score'] == 1.0]
    cult_df_no = cult_df[cult_df['cultural_score'] == 0.0]
    cult_df = pd.concat([cult_df_yes, cult_df_no])
    
    relt_df_yes = relt_df[relt_df['relational_score'] == 1.0]
    relt_df_no = relt_df[relt_df['relational_score'] == 0.0]
    relt_df = pd.concat([relt_df_yes, relt_df_no])
    
    demog_df_yes = demog_df[demog_df['demographic_score'] == 1.0]
    demog_df_no = demog_df[demog_df['demographic_score'] == 0.0]
    demog_df = pd.concat([demog_df_yes, demog_df_no])

### Check vocab size and frequent words

In [90]:
def collect_article_tokens(article, return_string=False):
    '''
    Flattens one article, stored as tokenized sentences, into a single sequence of words.
    
    Args:
        article: list of sentences, where each sentence is a list of word strings
        return_string: if truthy, return the whole article as one space-separated string
    Returns:
        A flat list of the article's words in their original order, or (when
        return_string is truthy) those words joined by single spaces, with
        sentences also separated by a single space.
    '''
    
    # Default case: hand back every word of every sentence as one flat list
    if not return_string:
        return [word for sentence in article for word in sentence]
    
    # String case: join the words of each sentence, then join the sentences
    sentence_strings = (' '.join(sentence) for sentence in article)
    return ' '.join(sentence_strings)

# For capturing word frequencies, pool all words from every article into a single flat
# list per perspective. Article boundaries are lost, so these lists can't be used to
# create models. Built with comprehensions rather than Series.apply, which was being
# called only for its side effect on an outer list.
cult_tokens = [word for article in cult_df['text'] for word in collect_article_tokens(article)]
relt_tokens = [word for article in relt_df['text'] for word in collect_article_tokens(article)]
demog_tokens = [word for article in demog_df['text'] for word in collect_article_tokens(article)]
print()




In [91]:
# Look at size of vocabulary and most frequent words
tokens = (cult_tokens + relt_tokens) + demog_tokens
print('Vocab size:', len(set(tokens)))
print()

# Check out most frequent words in labeled texts
freq = Counter(tokens)
print('20 most frequent words in labeled articles:')
freq.most_common(20)

Vocab size: 83116

20 most frequent words in labeled articles:


[('oasis', 40297),
 ('from', 36256),
 ('entry', 34917),
 ('we', 31897),
 ('social', 30067),
 ('have', 27000),
 ('which', 26867),
 ('more', 25646),
 ('were', 21277),
 ('char', 20564),
 ('new', 20161),
 ('one', 19987),
 ('rowsep', 18993),
 ('colsep', 18958),
 ('other', 18714),
 ('between', 17821),
 ('than', 17793),
 ('can', 17436),
 ('has', 17067),
 ('organizational', 16700)]

### Check frequent sentences (to improve cleaning)

In [92]:
# Collect every sentence of every article as a space-joined string, one list per perspective.
# Built with comprehensions rather than Series.apply, which was being called only for its
# side effect on an outer list.
cult_sents = [' '.join(sent) for article in cult_df['text'] for sent in article]
relt_sents = [' '.join(sent) for article in relt_df['text'] for sent in article]
demog_sents = [' '.join(sent) for article in demog_df['text'] for sent in article]

sents = (cult_sents + relt_sents) + demog_sents
print('Number of sentences:', len(sents))
print()

# Check out most frequent sentences in labeled texts
# (repeated boilerplate here points at text that still needs cleaning)
freq = Counter(sents)
print('20 most frequent sentences in labeled articles:')
freq.most_common(20)

Number of sentences: 471679

20 most frequent sentences in labeled articles:


[('valign bottom oasis entry oasis entry colname colsep rowsep align char char',
  3751),
 ('pp', 1419),
 ('oasis entry oasis entry colname colsep rowsep align char char', 1271),
 ('oasis entry colname colsep rowsep align char char', 1077),
 ('american sociological review', 1041),
 ('american journal sociology', 1006),
 ('administrative science quarterly', 947),
 ('valign bottom oasis entry colname colsep rowsep align char char', 938),
 ('sci', 491),
 ('chicago university chicago press', 463),
 ('academy management journal', 458),
 ('academy management review', 387),
 ('organ', 366),
 ('colsep rowsep oasis entry align char char', 360),
 ('new york free press', 356),
 ('new york oxford university press', 317),
 ('oasis entry oasis entry colsep rowsep align char char', 296),
 ('cambridge ma harvard university press', 291),
 ('ed', 285),
 ('cambridge cambridge university press', 271)]

### Load and apply text vectorizers

In [93]:
# Collect article tokens into a list of strings, each string an article.
# These per-article strings are the documents fed to the vectorizers.
# Built with comprehensions rather than Series.apply, which was being called only for its
# side effect on an outer list.
cult_tokens = [collect_article_tokens(article, return_string=True) for article in cult_df['text']]
relt_tokens = [collect_article_tokens(article, return_string=True) for article in relt_df['text']]
demog_tokens = [collect_article_tokens(article, return_string=True) for article in demog_df['text']]
print()




In [94]:
# Define stopwords used by JSTOR
# NOTE(review): this set is not referenced anywhere else in this cell -- confirm whether
# it is still needed or is already baked into the saved vectorizers.
jstor_stopwords = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"])

# A TF-IDF-weighted document-term matrix is used because it gave better classifier
# accuracy than an unweighted one.
# NOTE(review): the second positional argument of joblib.load is mmap_mode; "r+" looks
# like a file-open mode carried over by habit -- confirm it is intended.
# NOTE(review): fit_transform re-fits each loaded vectorizer on the labeled texts, so
# whatever was learned when the vectorizer was saved is presumably replaced; if the goal
# is to reuse the saved vocabulary, transform() would be the call to use -- confirm.
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn releases in
# favor of get_feature_names_out() -- check the installed version before upgrading.
cult_vectorizer = joblib.load(cult_vec_fp, "r+")
X_cult = cult_vectorizer.fit_transform(cult_tokens)
print('Number of features in cultural vectorizer:', len(cult_vectorizer.get_feature_names()))
print(cult_vectorizer.get_feature_names()[::1000]) # get every 1000th word
print()

relt_vectorizer = joblib.load(relt_vec_fp, "r+")
X_relt = relt_vectorizer.fit_transform(relt_tokens)
print('Number of features in relational vectorizer:', len(relt_vectorizer.get_feature_names()))
print(relt_vectorizer.get_feature_names()[::1000]) # get every 1000th word
print()

demog_vectorizer = joblib.load(demog_vec_fp, "r+")
X_demog = demog_vectorizer.fit_transform(demog_tokens)
print('Number of features in demographic vectorizer:', len(demog_vectorizer.get_feature_names()))
print(demog_vectorizer.get_feature_names()[::1000]) # get every 1000th word

Number of features in cultural vectorizer: 65619
['0000779e2340', '37i', 'abortive', 'agnes', 'anne', 'asean', 'bagdad', 'besier', 'bouma', 'c06', 'centrating', 'classt', 'competidiversity', 'contrived', 'crush', 'decouple', 'develple', 'dissolves', 'dwarfs', 'emigre', 'eschewing', 'extertion', 'finalised', 'franchised', 'gegenstandstheorie', 'grande', 'harden', 'historiques', 'ice', 'incredibly', 'intentioned', 'itching', 'kalra', 'kryukov', 'lenders', 'looping', 'manitoba', 'meetproactive', 'misunderstood', 'munford', 'newl', 'nr170', 'opener', 'oç', 'peptide', 'plo', 'preferable', 'provinces', 'ranged', 'regime', 'resulting', 'routed', 'schachter', 'serves', 'sist', 'specialists', 'straatweg', 'surmised', 'ted', 'tinbergen', 'tribesmen', 'underagain', 'usefully', 'vociferously', 'wiki', 'yoshida']

Number of features in relational vectorizer: 67106
['00000000000000000000000000000000z000', '1979c', '64cic', 'acrylic', 'alitomatic', 'apopriate', 'atrrmon', 'barton', 'biostatistics', '

## Setup for modeling

In [123]:
######################################################
# Balance x_train, y_train
######################################################

def resample_data(X_train, Y_train, undersample = False, sampling_ratio = 1.0, random_state = None):
    """
    Randomly resamples training data to reach a target minority:majority class ratio.
    
    Args:
        X_train: X training data
        Y_train: Y training data
        undersample: if truthy, resample with RandomUnderSampler;
            otherwise resample with RandomOverSampler
        sampling_ratio: ratio of minority to majority class after resampling
            (forwarded to the sampler as sampling_strategy)
        random_state: optional seed forwarded to the sampler so the resampling
            can be reproduced; the default (None) leaves it unseeded, as before
            
    Returns:
        X_balanced: predictors at balanced ratio
        Y_balanced: outcomes at balanced ratio
    """
    
    # Choose the sampler without rebinding the `undersample` flag itself
    sampler_class = RandomUnderSampler if undersample else RandomOverSampler
    sampler = sampler_class(sampling_strategy=sampling_ratio, random_state=random_state)
    X_balanced, Y_balanced = sampler.fit_resample(X_train, Y_train)
    
    # Show class counts before and after resampling
    print(f'Y_train: {Counter(Y_train)}\nY_resample: {Counter(Y_balanced)}')
    
    return X_balanced, Y_balanced

In [172]:
######################################################
# k-fold cross validation for model evaluation
######################################################

# Define test options for k-fold CV
num_folds = 10 
seed = 3
scoring='f1_weighted' # set scoring metric (not used here)

def show_kfold_output(models, 
                      X, 
                      Y, 
                      num_folds = num_folds, 
                      random_state = seed, 
                      shuffle = True):
    '''
    Runs k-fold cross-validated prediction for each model and prints a summary per model:
    accuracy, confusion matrix, and classification report, all computed against Y.
    Uses cross_val_predict, which unlike cross_val_score cannot define scoring option/evaluation metric.
    
    Note that the "Mean (std)" line summarizes the predicted labels themselves
    (for 0/1 labels, the share of cases predicted as 1); it is not a model score.
    
    Args:
        models: list of (name, model) tuples
        X: predictors
        Y: outcomes (true labels the out-of-fold predictions are compared with)
        num_folds: Split data into num_folds parts: (num_folds-1) for training, 1 for scoring
        random_state: seed for shuffling; ignored when shuffle is False
        shuffle: whether to shuffle the cases before splitting them into folds
    
    Returns:
        results: list of arrays of out-of-fold predictions, one per model
        names: list of model names (matches results)
        
    Source: 
        https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
    '''
    
    results = []
    names = []
    
    # Honor the caller's shuffle/random_state arguments. KFold does not accept a
    # seed when shuffling is off, so only pass one when shuffling.
    kfold = KFold(
        n_splits=num_folds, 
        random_state=random_state if shuffle else None, 
        shuffle=shuffle)
    
    for name, model in models:
        # Get out-of-fold predictions (cross_val_predict takes no scoring argument)
        cv_results = cross_val_predict(
            model, 
            X, 
            Y, 
            cv=kfold, 
            n_jobs=-1) # use all cores = faster
        
        # Add results and name of each algorithm to the model array
        results.append(cv_results)
        names.append(name)
        
        # Print results, scoring against the Y passed in rather than a notebook global
        print(f'{name}:\nMean (std):\t {round(cv_results.mean(),4)} ({round(cv_results.std(),4)})')
        print('Accuracy:\t', accuracy_score(Y, cv_results))
        print()
        print('Confusion matrix:\n', confusion_matrix(Y, cv_results))
        print()
        print('Report:\n', classification_report(Y, cv_results))
        print()
        
    # Return arrays
    return results, names

## Evaluate algorithms: Cultural perspective

In [173]:
######################################################
# Prepare training and validation data
######################################################

# Separate the training data from the final validation set.
# X is the document-term matrix built earlier (X_cult); Y is the hand-coded
# cultural_score column. Rows of Y are assumed to line up with X_cult, which
# was vectorized from cult_df['text'].
# The validation set (X_validate, Y_validate) is 20% of the entire
# set of labeled data (test_size = 0.2).

cult_df = cult_df[['text', 'cultural_score']]
print("Number of cases:", str(X_cult.shape[0]))

# Take the label column (position 1 after the column selection above) as floats
valueArray = cult_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
print("Number of codes (should match):", str(len(Y)))

test_size = 0.2
seed = 3
# NOTE(review): no stratify argument is passed, so the class mix of the two
# splits is left to chance -- confirm this is intended for such a small sample.
X_train, X_validate, Y_train, Y_validate = train_test_split(
    X_cult, 
    Y, 
    test_size=test_size, 
    random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

Number of cases: 314
Number of codes (should match): 314
Y_train Distribution: [(0.0, 170), (1.0, 81)]


In [174]:
######################################################
# Oversample to desirable ratio
######################################################

# Use these settings here and below
sampling_ratio = 1.0 # ratio of minority to majority cases
undersample = False # whether to undersample or oversample

# Only the training split is resampled; X_validate/Y_validate are left untouched.
# NOTE(review): X_balanced is later passed to k-fold cross-validation. Because random
# oversampling repeats minority cases, copies of the same case can presumably end up in
# both the training and held-out folds, which would inflate the cross-validated
# scores relative to the validation-set scores -- confirm and consider resampling
# inside each fold instead.
X_balanced, Y_balanced = resample_data(
    X_train, 
    Y_train, 
    undersample=undersample, 
    sampling_ratio=sampling_ratio)

Y_train: Counter({0.0: 170, 1.0: 81})
Y_resample: Counter({0.0: 170, 1.0: 170})


### 10-Fold Cross Validation: Cultural perspective

In [175]:
######################################################
# Use different algorithms to build models
######################################################

models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

results, names = show_kfold_output(models=models, 
                                   X=X_balanced, 
                                   Y=Y_balanced)

KNN:
Mean (std):	 0.6176 (0.486)
Accuracy:	 0.8176470588235294

Confusion matrix:
 [[119  51]
 [ 11 159]]

Report:
               precision    recall  f1-score   support

         0.0       0.92      0.70      0.79       170
         1.0       0.76      0.94      0.84       170

    accuracy                           0.82       340
   macro avg       0.84      0.82      0.82       340
weighted avg       0.84      0.82      0.82       340


RF:
Mean (std):	 0.4912 (0.4999)
Accuracy:	 0.9382352941176471

Confusion matrix:
 [[161   9]
 [ 12 158]]

Report:
               precision    recall  f1-score   support

         0.0       0.93      0.95      0.94       170
         1.0       0.95      0.93      0.94       170

    accuracy                           0.94       340
   macro avg       0.94      0.94      0.94       340
weighted avg       0.94      0.94      0.94       340


DT:
Mean (std):	 0.5471 (0.4978)
Accuracy:	 0.8941176470588236

Confusion matrix:
 [[144  26]
 [ 10 160]]

Repor

### KNN: Cultural perspective

In [159]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_cult = KNeighborsClassifier()
knn_cult.fit(X_train, Y_train)
knn_predictions = knn_cult.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_cult = KNeighborsClassifier()
knn_cult.fit(X_balanced, Y_balanced)
knn_predictions = knn_cult.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(0.0, 170), (1.0, 81)]
0.7619047619047619
[[29 10]
 [ 5 19]]
              precision    recall  f1-score   support

         0.0       0.85      0.74      0.79        39
         1.0       0.66      0.79      0.72        24

    accuracy                           0.76        63
   macro avg       0.75      0.77      0.76        63
weighted avg       0.78      0.76      0.76        63


Balanced Classifier [(0.0, 170), (1.0, 170)]
0.6190476190476191
[[18 21]
 [ 3 21]]
              precision    recall  f1-score   support

         0.0       0.86      0.46      0.60        39
         1.0       0.50      0.88      0.64        24

    accuracy                           0.62        63
   macro avg       0.68      0.67      0.62        63
weighted avg       0.72      0.62      0.61        63



### Random Forest: Cultural perspective

In [129]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

rf_cult = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_cult.fit(X_train, Y_train) 
rf_predictions = rf_cult.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################
rf_cult = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_cult.fit(X_balanced, Y_balanced) 
rf_predictions = rf_cult.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(0.0, 170), (1.0, 81)]
0.7777777777777778
[[38  1]
 [13 11]]
              precision    recall  f1-score   support

         0.0       0.75      0.97      0.84        39
         1.0       0.92      0.46      0.61        24

    accuracy                           0.78        63
   macro avg       0.83      0.72      0.73        63
weighted avg       0.81      0.78      0.76        63


Balanced Classifier [(0.0, 170), (1.0, 170)]
0.7619047619047619
[[35  4]
 [11 13]]
              precision    recall  f1-score   support

         0.0       0.76      0.90      0.82        39
         1.0       0.76      0.54      0.63        24

    accuracy                           0.76        63
   macro avg       0.76      0.72      0.73        63
weighted avg       0.76      0.76      0.75        63



### Decision Tree: Cultural perspective

In [130]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################

dt_cult = DecisionTreeClassifier()
dt_cult.fit(X_train, Y_train)
dt_predictions = dt_cult.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################
dt_cult = DecisionTreeClassifier()
dt_cult.fit(X_balanced, Y_balanced)
dt_predictions = dt_cult.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(0.0, 170), (1.0, 81)]
0.7142857142857143
[[28 11]
 [ 7 17]]
              precision    recall  f1-score   support

         0.0       0.80      0.72      0.76        39
         1.0       0.61      0.71      0.65        24

    accuracy                           0.71        63
   macro avg       0.70      0.71      0.71        63
weighted avg       0.73      0.71      0.72        63


Balanced Classifier [(0.0, 170), (1.0, 170)]
0.6666666666666666
[[26 13]
 [ 8 16]]
              precision    recall  f1-score   support

         0.0       0.76      0.67      0.71        39
         1.0       0.55      0.67      0.60        24

    accuracy                           0.67        63
   macro avg       0.66      0.67      0.66        63
weighted avg       0.68      0.67      0.67        63



In [184]:
######################################################
# Save best model
######################################################

#joblib.dump(rf_cult, cult_model_fp)

['/home/jovyan/work/classification/models/classifier_cult_120220.joblib']

## Evaluate algorithms: Relational perspective

In [176]:
######################################################
# Prepare training and validation data
######################################################

# Separate the training data from the final validation set.
# X is the document-term matrix built earlier (X_relt); Y is the hand-coded
# relational_score column. Rows of Y are assumed to line up with X_relt, which
# was vectorized from relt_df['text'].
# The validation set (X_validate, Y_validate) is 20% of the entire
# set of labeled data (test_size = 0.2).

relt_df = relt_df[['text', 'relational_score']]
print("Number of cases:", str(X_relt.shape[0]))

# Take the label column (position 1 after the column selection above) as floats
valueArray = relt_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
print("Number of codes (should match):", str(len(Y)))

test_size = 0.2
seed = 3
# NOTE(review): no stratify argument is passed, so the class mix of the two
# splits is left to chance -- confirm this is intended for such a small sample.
X_train, X_validate, Y_train, Y_validate = train_test_split(X_relt, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

Number of cases: 321
Number of codes (should match): 321
Y_train Distribution: [(0.0, 185), (1.0, 71)]


In [177]:
######################################################
# Oversample to desirable ratio
######################################################

X_balanced, Y_balanced = resample_data(
    X_train, Y_train, 
    undersample=undersample, 
    sampling_ratio=sampling_ratio)

Y_train: Counter({0.0: 185, 1.0: 71})
Y_resample: Counter({0.0: 185, 1.0: 185})


### 10-Fold Cross Validation: Relational perspective

In [178]:
######################################################
# Use different algorithms to build models
######################################################

models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

results, names = show_kfold_output(models=models, 
                                   X=X_balanced, 
                                   Y=Y_balanced)

KNN:
Mean (std):	 0.5811 (0.4934)
Accuracy:	 0.8756756756756757

Confusion matrix:
 [[147  38]
 [  8 177]]

Report:
               precision    recall  f1-score   support

         0.0       0.95      0.79      0.86       185
         1.0       0.82      0.96      0.89       185

    accuracy                           0.88       370
   macro avg       0.89      0.88      0.87       370
weighted avg       0.89      0.88      0.87       370


RF:
Mean (std):	 0.4919 (0.4999)
Accuracy:	 0.981081081081081

Confusion matrix:
 [[183   2]
 [  5 180]]

Report:
               precision    recall  f1-score   support

         0.0       0.97      0.99      0.98       185
         1.0       0.99      0.97      0.98       185

    accuracy                           0.98       370
   macro avg       0.98      0.98      0.98       370
weighted avg       0.98      0.98      0.98       370


DT:
Mean (std):	 0.5703 (0.495)
Accuracy:	 0.9243243243243243

Confusion matrix:
 [[158  27]
 [  1 184]]

Report

### KNN: Relational perspective

In [138]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_relt = KNeighborsClassifier()
knn_relt.fit(X_train, Y_train)
knn_predictions = knn_relt.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################
# Make predictions on validation dataset
knn_relt = KNeighborsClassifier()
knn_relt.fit(X_balanced, Y_balanced)
knn_predictions = knn_relt.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(0.0, 185), (1.0, 71)]
0.8461538461538461
[[38  6]
 [ 4 17]]
              precision    recall  f1-score   support

         0.0       0.90      0.86      0.88        44
         1.0       0.74      0.81      0.77        21

    accuracy                           0.85        65
   macro avg       0.82      0.84      0.83        65
weighted avg       0.85      0.85      0.85        65


Balanced Classifier [(0.0, 185), (1.0, 185)]
0.7692307692307693
[[31 13]
 [ 2 19]]
              precision    recall  f1-score   support

         0.0       0.94      0.70      0.81        44
         1.0       0.59      0.90      0.72        21

    accuracy                           0.77        65
   macro avg       0.77      0.80      0.76        65
weighted avg       0.83      0.77      0.78        65



### Random Forest: Relational perspective

In [139]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

rf_relt = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_relt.fit(X_train, Y_train) 
rf_predictions = rf_relt.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################

rf_relt = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_relt.fit(X_balanced, Y_balanced) 
rf_predictions = rf_relt.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(0.0, 185), (1.0, 71)]
0.8923076923076924
[[44  0]
 [ 7 14]]
              precision    recall  f1-score   support

         0.0       0.86      1.00      0.93        44
         1.0       1.00      0.67      0.80        21

    accuracy                           0.89        65
   macro avg       0.93      0.83      0.86        65
weighted avg       0.91      0.89      0.89        65


Balanced Classifier [(0.0, 185), (1.0, 185)]
0.8923076923076924
[[44  0]
 [ 7 14]]
              precision    recall  f1-score   support

         0.0       0.86      1.00      0.93        44
         1.0       1.00      0.67      0.80        21

    accuracy                           0.89        65
   macro avg       0.93      0.83      0.86        65
weighted avg       0.91      0.89      0.89        65



### Decision Tree: Relational perspective

In [140]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################

dt_relt = DecisionTreeClassifier()
dt_relt.fit(X_train, Y_train)
dt_predictions = dt_relt.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################

dt_relt = DecisionTreeClassifier()
dt_relt.fit(X_balanced, Y_balanced)
dt_predictions = dt_relt.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(0.0, 185), (1.0, 71)]
0.8923076923076924
[[42  2]
 [ 5 16]]
              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92        44
         1.0       0.89      0.76      0.82        21

    accuracy                           0.89        65
   macro avg       0.89      0.86      0.87        65
weighted avg       0.89      0.89      0.89        65


Balanced Classifier [(0.0, 185), (1.0, 185)]
0.8307692307692308
[[40  4]
 [ 7 14]]
              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88        44
         1.0       0.78      0.67      0.72        21

    accuracy                           0.83        65
   macro avg       0.81      0.79      0.80        65
weighted avg       0.83      0.83      0.83        65



In [185]:
######################################################
# Save best model
######################################################

# NOTE(review): the dump below is commented out, yet the recorded cell output
# shows a saved model path — presumably it was run once and then disabled to
# avoid overwriting the stored model; confirm before re-enabling.
#joblib.dump(rf_relt, relt_model_fp)

['/home/jovyan/work/classification/models/classifier_relt_120220.joblib']

## Evaluate algorithms: Demographic perspective

In [179]:
######################################################
# Prepare training and validation data
######################################################

# Keep only the article text and its hand-coded label, pull the label column
# out as the float target vector (Y), then hold out 20% of the labeled cases
# (test_size below) as the final validation set (X_validate, Y_validate).
# X_demog is the feature matrix built outside this cell; it is presumably
# row-aligned with demog_df, and the two printed counts are the check for that.

demog_df = demog_df[['text', 'demographic_score']]
print("Number of cases:", X_demog.shape[0])

valueArray = demog_df.values
Y = valueArray[:, 1].astype('float')  # column 1 is 'demographic_score'
print("Number of codes (should match):", len(Y))

test_size = 0.2
seed = 3  # fixed so the train/validation split is the same on every run
X_train, X_validate, Y_train, Y_validate = train_test_split(
    X_demog, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

Number of cases: 325
Number of codes (should match): 325
Y_train Distribution: [(0.0, 200), (1.0, 60)]


In [180]:
######################################################
# Oversample to desirable ratio
######################################################

# Only the training split is passed in, so the validation split keeps its
# original class mix. resample_data, undersample and sampling_ratio are all
# defined outside this cell.
X_balanced, Y_balanced = resample_data(X_train, Y_train,
                                       undersample=undersample,
                                       sampling_ratio=sampling_ratio)

Y_train: Counter({0.0: 200, 1.0: 60})
Y_resample: Counter({1.0: 200, 0.0: 200})


### 10-Fold Cross Validation: Demographic perspective

In [181]:
######################################################
# Use different algorithms to build models
######################################################

# (name, estimator) pairs handed to show_kfold_output (defined outside this
# cell). Both tree-based estimators are seeded: an unseeded decision tree
# permutes features at each split, so its scores would change between runs.
models = [
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier(n_estimators=1000, random_state=0)),
    ('DT', DecisionTreeClassifier(random_state=0)),
]

# NOTE(review): X_balanced / Y_balanced are the resampled training data. If
# resample_data duplicates minority rows (presumably random oversampling),
# copies of one article can fall in both the train and test folds and inflate
# these scores relative to the hold-out validation results — consider
# resampling inside each fold instead; confirm against resample_data.
results, names = show_kfold_output(models=models,
                                   X=X_balanced,
                                   Y=Y_balanced)

KNN:
Mean (std):	 0.665 (0.472)
Accuracy:	 0.835

Confusion matrix:
 [[134  66]
 [  0 200]]

Report:
               precision    recall  f1-score   support

         0.0       1.00      0.67      0.80       200
         1.0       0.75      1.00      0.86       200

    accuracy                           0.83       400
   macro avg       0.88      0.83      0.83       400
weighted avg       0.88      0.83      0.83       400


RF:
Mean (std):	 0.505 (0.5)
Accuracy:	 0.995

Confusion matrix:
 [[198   2]
 [  0 200]]

Report:
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99       200
         1.0       0.99      1.00      1.00       200

    accuracy                           0.99       400
   macro avg       1.00      0.99      0.99       400
weighted avg       1.00      0.99      0.99       400


DT:
Mean (std):	 0.555 (0.497)
Accuracy:	 0.94

Confusion matrix:
 [[177  23]
 [  1 199]]

Report:
               precision    recall  f1-score

### KNN: Demographic perspective

In [147]:
######################################################
# Compare algorithms on validation test: KNN
# (unbalanced training split first, then the balanced one)
######################################################

# Bagging (BaggingClassifier around KNeighborsClassifier, max_samples=0.5,
# max_features=0.5) was considered here: it improves estimates but is hard
# with so little data, so a plain KNN is fit instead.

# Each entry: header line to print, training features, training labels.
knn_runs = (
    (f'Unbalanced Classifier {Counter(Y_train).most_common()}', X_train, Y_train),
    (f'Balanced Classifier {Counter(Y_balanced).most_common()}', X_balanced, Y_balanced),
)

for knn_header, knn_X, knn_Y in knn_runs:
    # Fresh estimator per training set, scored on the same validation split
    knn_demog = KNeighborsClassifier()
    knn_demog.fit(knn_X, knn_Y)
    knn_predictions = knn_demog.predict(X_validate)

    print()
    print(knn_header)
    print(accuracy_score(Y_validate, knn_predictions))
    print(confusion_matrix(Y_validate, knn_predictions))
    print(classification_report(Y_validate, knn_predictions))

# knn_demog / knn_predictions now refer to the balanced-data model and its
# predictions (the last run of the loop).


Unbalanced Classifier [(0.0, 200), (1.0, 60)]
0.9076923076923077
[[43  5]
 [ 1 16]]
              precision    recall  f1-score   support

         0.0       0.98      0.90      0.93        48
         1.0       0.76      0.94      0.84        17

    accuracy                           0.91        65
   macro avg       0.87      0.92      0.89        65
weighted avg       0.92      0.91      0.91        65


Balanced Classifier [(1.0, 200), (0.0, 200)]
0.7076923076923077
[[30 18]
 [ 1 16]]
              precision    recall  f1-score   support

         0.0       0.97      0.62      0.76        48
         1.0       0.47      0.94      0.63        17

    accuracy                           0.71        65
   macro avg       0.72      0.78      0.69        65
weighted avg       0.84      0.71      0.72        65



### Random Forest: Demographic perspective

In [148]:
######################################################
# Compare algorithms on validation test: Random Forest
# (unbalanced training split first, then the balanced one)
######################################################

# A BaggingClassifier wrapper (max_samples=0.5, max_features=0.5) around the
# forest was tried as an alternative and left disabled.

# Each entry: header line to print, training features, training labels.
rf_runs = (
    (f'Unbalanced Classifier {Counter(Y_train).most_common()}', X_train, Y_train),
    (f'Balanced Classifier {Counter(Y_balanced).most_common()}', X_balanced, Y_balanced),
)

for rf_header, rf_X, rf_Y in rf_runs:
    # Same seeded 1000-tree forest for both training sets
    rf_demog = RandomForestClassifier(n_estimators=1000, random_state=0)
    rf_demog.fit(rf_X, rf_Y)
    rf_predictions = rf_demog.predict(X_validate)

    print()
    print(rf_header)
    print(accuracy_score(Y_validate, rf_predictions))
    print(confusion_matrix(Y_validate, rf_predictions))
    print(classification_report(Y_validate, rf_predictions))

# rf_demog / rf_predictions now refer to the balanced-data forest and its
# predictions (the last run of the loop).


Unbalanced Classifier [(0.0, 200), (1.0, 60)]
0.8923076923076924
[[48  0]
 [ 7 10]]
              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93        48
         1.0       1.00      0.59      0.74        17

    accuracy                           0.89        65
   macro avg       0.94      0.79      0.84        65
weighted avg       0.91      0.89      0.88        65


Balanced Classifier [(1.0, 200), (0.0, 200)]
0.9230769230769231
[[47  1]
 [ 4 13]]
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95        48
         1.0       0.93      0.76      0.84        17

    accuracy                           0.92        65
   macro avg       0.93      0.87      0.89        65
weighted avg       0.92      0.92      0.92        65



### Decision Tree: Demographic perspective

In [149]:
######################################################
# Compare algorithms on validation test: Decision Tree
# (unbalanced training split first, then the balanced one)
######################################################

# A BaggingClassifier wrapper (max_samples=0.5, max_features=0.5) around the
# tree was tried as an alternative and left disabled.

# Fit one tree per training set and score each on the same held-out
# validation split. random_state is pinned (same seed as the random forest
# cell) because scikit-learn permutes candidate features at every split, so
# an unseeded tree can report different metrics on every run.
dt_runs = (
    (f'Unbalanced Classifier {Counter(Y_train).most_common()}', X_train, Y_train),
    (f'Balanced Classifier {Counter(Y_balanced).most_common()}', X_balanced, Y_balanced),
)

for dt_header, dt_X, dt_Y in dt_runs:
    dt_demog = DecisionTreeClassifier(random_state=0)
    dt_demog.fit(dt_X, dt_Y)
    dt_predictions = dt_demog.predict(X_validate)

    print()
    print(dt_header)
    print(accuracy_score(Y_validate, dt_predictions))
    print(confusion_matrix(Y_validate, dt_predictions))
    print(classification_report(Y_validate, dt_predictions))

# After the loop, dt_demog / dt_predictions hold the balanced-data tree and
# its predictions, exactly as when the two fits were written out separately.


Unbalanced Classifier [(0.0, 200), (1.0, 60)]
0.8615384615384616
[[44  4]
 [ 5 12]]
              precision    recall  f1-score   support

         0.0       0.90      0.92      0.91        48
         1.0       0.75      0.71      0.73        17

    accuracy                           0.86        65
   macro avg       0.82      0.81      0.82        65
weighted avg       0.86      0.86      0.86        65


Balanced Classifier [(1.0, 200), (0.0, 200)]
0.8923076923076924
[[44  4]
 [ 3 14]]
              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93        48
         1.0       0.78      0.82      0.80        17

    accuracy                           0.89        65
   macro avg       0.86      0.87      0.86        65
weighted avg       0.89      0.89      0.89        65



In [186]:
######################################################
# Save best model
######################################################

# Write the demographic-perspective random forest to demog_model_fp (defined
# outside this cell). In notebook order, rf_demog was last fit on the
# resampled training data (X_balanced, Y_balanced).
joblib.dump(value=rf_demog, filename=demog_model_fp)

['/home/jovyan/work/classification/models/classifier_demog_120220.joblib']