# Compare classification methods for identifying org. science perspectives in JSTOR articles
## Using balanced samples from hand-coding

@author: Jaren Haber, PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: December 2020

'''
Trains classifiers to predict whether an article is about a given perspective in org. science. To train the classifiers, uses preliminary labeled articles, broken down as follows: 
Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no
Compares f1_weighted scores of four model structures using 10-Fold Cross Validation: Logistic regression, SVM, Naive Bayes, and Decision Tree. Oversamples training data to .7 (7:10 minority:majority class).
'''

## Initialize

In [1]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date
import os

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

stemmer = WordNetLemmatizer()

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold

# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import sys; sys.path.insert(0, "../preprocess/") # For loading functions from files in other directory
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
######################################################
# Define filepaths
######################################################

thisday = date.today().strftime("%m%d%y")

cwd = os.getcwd()
root = str.replace(cwd, 'classification/modeling', '')

# Directory for prepared data and trained models: save files here
data_fp = root + 'classification/data/'
model_fp = root + 'classification/models/'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths

# Preprocessed training data
cult_labeled_fp = data_fp + 'training_cultural_preprocessed_112420.pkl'
relt_labeled_fp = data_fp + 'training_relational_preprocessed_112420.pkl'
demog_labeled_fp = data_fp + 'training_demographic_preprocessed_112420.pkl'

# Model filepaths
cult_model_fp = model_fp + f'classifier_cult_{str(thisday)}.joblib'
relt_model_fp = model_fp + f'classifier_relt_{str(thisday)}.joblib'
demog_model_fp = model_fp + f'classifier_demog_{str(thisday)}.joblib'

# Vectorizers trained on hand-coded data (use to limit vocab of input texts)
cult_vec_fp = model_fp + f'vectorizer_cult_{str(thisday)}.joblib'
relt_vec_fp = model_fp + f'vectorizer_relt_{str(thisday)}.joblib'
demog_vec_fp = model_fp + f'vectorizer_demog_{str(thisday)}.joblib'

# Vocab of vectorizers (for verification purposes)
cult_vec_feat_fp = model_fp + f'vectorizer_features_cult_{str(thisday)}.csv'
relt_vec_feat_fp = model_fp + f'vectorizer_features_relt_{str(thisday)}.csv'
demog_vec_feat_fp = model_fp + f'vectorizer_features_demog_{str(thisday)}.csv'

## Load & inspect data

In [3]:
cult_df = quickpickle_load(cult_labeled_fp)
relt_df = quickpickle_load(relt_labeled_fp)
demog_df = quickpickle_load(demog_labeled_fp)

cult_df.head(10)

Unnamed: 0,cultural_score,text
0,1.0,"[[journal, managerial, issues, vol], [xxiii, n..."
1,1.0,"[[organization, ht, icna, vol], [may, june, pp..."
2,1.0,"[[from, fiefs, clans, network, capitalism, exp..."
3,1.0,"[[collective, strategy, framework, application..."
4,1.0,"[[manag, int, rev, doi, sl, research, article,..."
5,1.0,"[[int], [studies, ofmgt], [amp, org, vol], [pp..."
6,1.0,"[[linking, organizational, values, relationshi..."
7,1.0,"[[journal, organizational, behavior, organiz],..."
8,1.0,"[[®, academy, oí, management, learning, amp, e..."
9,1.0,"[[strategie, management, journal, strat], [mgm..."


In [4]:
# Check score distribution across classes
print(cult_df.groupby('cultural_score').size())
print()
print(relt_df.groupby('relational_score').size())
print()
print(demog_df.groupby('demographic_score').size())

cultural_score
0.0    209
0.5     12
1.0    105
dtype: int64

relational_score
0.0    229
0.5      7
1.0     92
dtype: int64

demographic_score
0.0    248
0.5      5
1.0     77
dtype: int64


### Collect article tokens

In [5]:
def collect_article_tokens(article):
    '''
    Collects words from tokenized sentences representing each article.
    
    Args:
        article: list of lists of words (each list is a sentence)
    Returns:
        list: single list of tokens
    '''
    
    tokens = []
    
    for sent in article:
        tokens += [word for word in sent]
        
    return tokens

# Add words from each article to empty list:
cult_tokens = []; cult_df['text'].apply(lambda article: cult_tokens.extend([word for word in collect_article_tokens(article)]))
relt_tokens = []; relt_df['text'].apply(lambda article: relt_tokens.extend([word for word in collect_article_tokens(article)]))
demog_tokens = []; demog_df['text'].apply(lambda article: demog_tokens.extend([word for word in collect_article_tokens(article)]))

### Check vocab size and frequent words

In [6]:
# Look at size of vocabulary and most frequent words
tokens = (cult_tokens + relt_tokens) + demog_tokens
print('Vocab size:', len(set(tokens)))
print()

# Check out most frequent words in labeled texts
freq = Counter(tokens)
print('20 most frequent words in labeled articles:')
freq.most_common(20)

Vocab size: 83116

20 most frequent words in labeled articles:


[('oasis', 40297),
 ('from', 36982),
 ('entry', 34945),
 ('we', 32514),
 ('social', 30979),
 ('have', 27674),
 ('which', 27441),
 ('more', 26129),
 ('were', 21858),
 ('char', 20565),
 ('new', 20537),
 ('one', 20489),
 ('other', 19192),
 ('rowsep', 18993),
 ('colsep', 18958),
 ('between', 18184),
 ('than', 18113),
 ('can', 17941),
 ('has', 17494),
 ('organizational', 16978)]

### Check frequent sentences (to improve cleaning)

In [7]:
# Add sentences from each article to empty list:
cult_sents = []; cult_df['text'].apply(lambda article: cult_sents.extend([' '.join([word for word in sent]) for sent in article]))
relt_sents = []; relt_df['text'].apply(lambda article: relt_sents.extend([' '.join([word for word in sent]) for sent in article]))
demog_sents = []; demog_df['text'].apply(lambda article: demog_sents.extend([' '.join([word for word in sent]) for sent in article]))

sents = (cult_sents + relt_sents) + demog_sents
print('Number of sentences:', len(sents))
print()

# Check out most frequent sentences in labeled texts
freq = Counter(sents)
print('20 most frequent sentences in labeled articles:')
freq.most_common(20)

Number of sentences: 482124

20 most frequent sentences in labeled articles:


[('valign bottom oasis entry oasis entry colname colsep rowsep align char char',
  3751),
 ('pp', 1427),
 ('oasis entry oasis entry colname colsep rowsep align char char', 1271),
 ('oasis entry colname colsep rowsep align char char', 1077),
 ('american sociological review', 1046),
 ('american journal sociology', 1015),
 ('administrative science quarterly', 951),
 ('valign bottom oasis entry colname colsep rowsep align char char', 938),
 ('sci', 556),
 ('chicago university chicago press', 469),
 ('academy management journal', 461),
 ('organ', 407),
 ('academy management review', 394),
 ('new york free press', 360),
 ('colsep rowsep oasis entry align char char', 360),
 ('new york oxford university press', 322),
 ('cambridge ma harvard university press', 296),
 ('oasis entry oasis entry colsep rowsep align char char', 296),
 ('admin', 292),
 ('ed', 288)]

## Vectorize texts

In [15]:
# Define stopwords used by JSTOR
jstor_stopwords = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"])

# Use TFIDF weighted DTM because results in better classifier accuracy than unweighted
#vectorizer = CountVectorizer(max_features=100000, min_df=1, max_df=0.8, stop_words=jstor_stopwords) # DTM
vectorizer = TfidfVectorizer(max_features=100000, min_df=1, max_df=0.8, stop_words=jstor_stopwords) # TFIDF

X_cult = vectorizer.fit_transform(cult_tokens)
joblib.dump(vectorizer, open(cult_vec_fp, "wb"))
with open(cult_vec_feat_fp,'w') as f:
    writer = csv.writer(f)
    writer.writerows([vectorizer.get_feature_names()])
    
print('Number of features in cultural vectorizer:', len(vectorizer.get_feature_names()))
print()

X_relt = vectorizer.fit_transform(relt_tokens)
joblib.dump(vectorizer, open(relt_vec_fp, "wb"))
with open(relt_vec_feat_fp,'w') as f:
    writer = csv.writer(f)
    writer.writerows([vectorizer.get_feature_names()])
    
print('Number of features in relational vectorizer:', len(vectorizer.get_feature_names()))
print()

X_demog = vectorizer.fit_transform(demog_tokens)
joblib.dump(vectorizer, open(demog_vec_fp, "wb"))
with open(demog_vec_feat_fp,'w') as f:
    writer = csv.writer(f)
    writer.writerows([vectorizer.get_feature_names()])

print('Number of features in demographic vectorizer:', len(vectorizer.get_feature_names()))
print()

print(vectorizer.get_feature_names()[::1000]) # get every 1000th word

Number of features in cultural vectorizer: 66962

Number of features in relational vectorizer: 67936

Number of features in demographic vectorizer: 65796

['000000000000o', '2r', '9p', 'afag', 'anachronistic', 'arised', 'avowed', 'belie', 'bogran', 'bur', 'cating', 'cinlar', 'commensurable', 'consumptive', 'cristoph', 'debug', 'destructive', 'dispersed', 'dulcimer', 'emancipation', 'erupted', 'externals', 'finalists', 'frank', 'gen', 'graphs', 'harmonious', 'hlend', 'idealization', 'indexing', 'interdependent', 'ix', 'kanov', 'kuczyiski', 'lentils', 'loose', 'maniha', 'megarry', 'misuse', 'multitudinous', 'nevitt', 'novel', 'onoe', 'overseen', 'pedulla1', 'plantación', 'preceed', 'prosharing', 'railroads', 'regain', 'restructurings', 'routley', 'schary', 'setabout', 'sisyphus', 'specialist', 'straighten', 'surowiecki', 'technocrats', 'timie', 'tribution', 'undefined', 'ushered', 'voir', 'wildlife', 'yi']


## Balance x_train, y_train

In [16]:
######################################################
# Balance x_train, y_train
######################################################

def resample_data(X_train, Y_train, undersample, sampling_strategy):
    """
    args
        X_train: X training data
        Y_train: Y training data
        undersmample: boolean for over or undersampling
        sampling_strategy: strategy for resampled distribution
            if oversample: 'majority' makes minority = to majority
            if undersample: 'minority' makes majority = to minority
    """
    
    if undersample == True:
        undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
        X_balanced, Y_balanced = undersample.fit_resample(X_train, Y_train)
    else:
        oversample = RandomOverSampler(sampling_strategy=sampling_strategy)
        X_balanced, Y_balanced = oversample.fit_resample(X_train, Y_train)
    
    print(f'Y_train: {Counter(Y_train)}\nY_resample: {Counter(Y_balanced)}')
    
    return X_balanced, Y_balanced

## Evaluate algorithms: Cultural perspective

In [26]:
X_cult.shape[0]

2060700

In [24]:
######################################################
# Prepare training and validation data
######################################################

# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 10% of the entire
# set of labeled data (X_validate, Y_validate)

cult_df = cult_df[['text', 'cultural_score']]

valueArray = cult_df.values
Y = valueArray[:,1]
Y = Y.astype('float')
test_size = 0.2
seed = 3
X_train, X_validate, Y_train, Y_validate = train_test_split(X_cult, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

# Setup 10-fold cross validation to estimate the accuracy of different models
# Split data into 10 parts
# Test options and evaluation metric
num_folds = 10
# num_instances = len(X_train)
seed = 7
scoring='f1_weighted'

ValueError: Found input variables with inconsistent numbers of samples: [2060700, 326]

In [14]:
######################################################
# Undersample to minority size
######################################################
sampling_strategy = .7
undersample = False

X_balanced, Y_balanced = resample_data(X_train, Y_train, undersample=undersample, sampling_strategy=sampling_strategy)

Y_train: Counter({0.0: 331, 1.0: 21})
Y_resample: Counter({0.0: 331, 1.0: 165})


### 10-Fold Cross Validation: Disinfectants myths

In [15]:
######################################################
# Use different algorithms to build models
######################################################

# Add each algorithm and its name to the model array
models = []
models.append(('KNN',KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

# Evaluate each model, add results to a results array,
# Print the accuracy results (remember these are averages and std)
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_balanced, Y_balanced, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {round(cv_results.mean(),4)}, ({round(cv_results.std(),4)})')

KNN: 0.9643, (0.0291)
RF: 1.0, (0.0)
DT: 0.9879, (0.0161)


### KNN: Disinfectants myths

In [16]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_dis = KNeighborsClassifier()
knn_dis.fit(X_train, Y_train)
knn_predictions = knn_dis.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_dis = KNeighborsClassifier()
knn_dis.fit(X_balanced, Y_balanced)
knn_predictions = knn_dis.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(0.0, 331), (1.0, 21)]
0.9659090909090909
[[82  0]
 [ 3  3]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        82
         1.0       1.00      0.50      0.67         6

    accuracy                           0.97        88
   macro avg       0.98      0.75      0.82        88
weighted avg       0.97      0.97      0.96        88


Balanced Classifier [(0.0, 331), (1.0, 165)]
0.9318181818181818
[[77  5]
 [ 1  5]]
              precision    recall  f1-score   support

         0.0       0.99      0.94      0.96        82
         1.0       0.50      0.83      0.62         6

    accuracy                           0.93        88
   macro avg       0.74      0.89      0.79        88
weighted avg       0.95      0.93      0.94        88



### Random Forest: Disinfectants myths

In [17]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

rf_dis = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_dis.fit(X_train, Y_train) 
rf_predictions = rf_dis.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################
rf_dis = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_dis.fit(X_balanced, Y_balanced) 
rf_predictions = rf_dis.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(0.0, 331), (1.0, 21)]
0.9545454545454546
[[82  0]
 [ 4  2]]
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98        82
         1.0       1.00      0.33      0.50         6

    accuracy                           0.95        88
   macro avg       0.98      0.67      0.74        88
weighted avg       0.96      0.95      0.94        88


Balanced Classifier [(0.0, 331), (1.0, 165)]
0.9772727272727273
[[82  0]
 [ 2  4]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        82
         1.0       1.00      0.67      0.80         6

    accuracy                           0.98        88
   macro avg       0.99      0.83      0.89        88
weighted avg       0.98      0.98      0.98        88



### Decision Tree: Disinfectants myths

In [18]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################

dt_dis = DecisionTreeClassifier()
dt_dis.fit(X_train, Y_train)
dt_predictions = dt_dis.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################
dt_dis = DecisionTreeClassifier()
dt_dis.fit(X_balanced, Y_balanced)
dt_predictions = dt_dis.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(0.0, 331), (1.0, 21)]
0.9886363636363636
[[82  0]
 [ 1  5]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        82
         1.0       1.00      0.83      0.91         6

    accuracy                           0.99        88
   macro avg       0.99      0.92      0.95        88
weighted avg       0.99      0.99      0.99        88


Balanced Classifier [(0.0, 331), (1.0, 165)]
0.9886363636363636
[[82  0]
 [ 1  5]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        82
         1.0       1.00      0.83      0.91         6

    accuracy                           0.99        88
   macro avg       0.99      0.92      0.95        88
weighted avg       0.99      0.99      0.99        88



In [19]:
######################################################
# Save best model
######################################################

# joblib.dump(rf_dis, dis_mod_fp)

## Evaluate algorithms: Relational perspective

In [20]:
######################################################
# Prepare training and validation data
######################################################

# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 10% of the entire
# set of labeled data (X_validate, Y_validate)

valueArray = rem_df_is_myth.values
Y = valueArray[:,1]
Y = Y.astype('float')
test_size = 0.2
seed = 3
X_train, X_validate, Y_train, Y_validate = train_test_split(X_rem, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

# Setup 10-fold cross validation to estimate the accuracy of different models
# Split data into 10 parts
# Test options and evaluation metric
num_folds = 10
# num_instances = len(X_train)
seed = 7
scoring = 'f1_weighted'

Y_train Distribution: [(0.0, 324), (1.0, 28)]


In [21]:
######################################################
# Undersample to minority size
######################################################
sampling_strategy = .5
undersample = False

X_balanced, Y_balanced = resample_data(X_train, Y_train, undersample=undersample, sampling_strategy=sampling_strategy)

Y_train: Counter({0.0: 324, 1.0: 28})
Y_resample: Counter({0.0: 324, 1.0: 162})


### 10-Fold Cross Validation: Home remedies myths

In [22]:
######################################################
# Use different algorithms to build models
######################################################

# Add each algorithm and its name to the model array
models = []
models.append(('KNN',KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

# Evaluate each model, add results to a results array,
# Print the accuracy results (remember these are averages and std)
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_balanced, Y_balanced, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {round(cv_results.mean(),4)}, ({round(cv_results.std(),4)})')

KNN: 0.9241, (0.0531)
RF: 0.9979, (0.0062)
DT: 0.9636, (0.0216)


### KNN: Home remedies myths

In [23]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_rem = KNeighborsClassifier()
knn_rem.fit(X_train, Y_train)
knn_predictions = knn_rem.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################
# Make predictions on validation dataset
knn_rem = KNeighborsClassifier()
knn_rem.fit(X_balanced, Y_balanced)
knn_predictions = knn_rem.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(0.0, 324), (1.0, 28)]
0.9318181818181818
[[81  1]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        82
         1.0       0.50      0.17      0.25         6

    accuracy                           0.93        88
   macro avg       0.72      0.58      0.61        88
weighted avg       0.91      0.93      0.92        88


Balanced Classifier [(0.0, 324), (1.0, 162)]
0.8522727272727273
[[71 11]
 [ 2  4]]
              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92        82
         1.0       0.27      0.67      0.38         6

    accuracy                           0.85        88
   macro avg       0.62      0.77      0.65        88
weighted avg       0.92      0.85      0.88        88



### Random Forest: Home remedies myths

In [24]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

rf_rem = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_rem.fit(X_train, Y_train) 
rf_predictions = rf_rem.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################

rf_rem = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_rem.fit(X_balanced, Y_balanced) 
rf_predictions = rf_rem.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(0.0, 324), (1.0, 28)]
0.9431818181818182
[[82  0]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        82
         1.0       1.00      0.17      0.29         6

    accuracy                           0.94        88
   macro avg       0.97      0.58      0.63        88
weighted avg       0.95      0.94      0.92        88


Balanced Classifier [(0.0, 324), (1.0, 162)]
0.9431818181818182
[[82  0]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        82
         1.0       1.00      0.17      0.29         6

    accuracy                           0.94        88
   macro avg       0.97      0.58      0.63        88
weighted avg       0.95      0.94      0.92        88



### Decision Tree: Home remedies myths

In [25]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################

dt_rem = DecisionTreeClassifier()
dt_rem.fit(X_train, Y_train)
dt_predictions = dt_rem.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################

dt_rem = DecisionTreeClassifier()
dt_rem.fit(X_balanced, Y_balanced)
dt_predictions = dt_rem.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(0.0, 324), (1.0, 28)]
0.8977272727272727
[[78  4]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95        82
         1.0       0.20      0.17      0.18         6

    accuracy                           0.90        88
   macro avg       0.57      0.56      0.56        88
weighted avg       0.89      0.90      0.89        88


Balanced Classifier [(0.0, 324), (1.0, 162)]
0.8863636363636364
[[77  5]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94        82
         1.0       0.17      0.17      0.17         6

    accuracy                           0.89        88
   macro avg       0.55      0.55      0.55        88
weighted avg       0.89      0.89      0.89        88



In [26]:
######################################################
# Save best model
######################################################

# joblib.dump(rf_rem, rem_mod_fp)

## Evaluate algorithms: Demographic perspective

In [27]:
######################################################
# Prepare training and validation data
######################################################

# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 10% of the entire
# set of labeled data (X_validate, Y_validate)

valueArray = wth_df_is_myth.values
Y = valueArray[:,1]
Y = Y.astype('float')
test_size = 0.5
seed = 15
X_train, X_validate, Y_train, Y_validate = train_test_split(X_wth, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

# Setup 10-fold cross validation to estimate the accuracy of different models
# Split data into 10 parts
# Test options and evaluation metric
num_folds = 10
# num_instances = len(X_train)
seed = 7
scoring = 'f1_weighted'

Y_train Distribution: [(1.0, 207), (0.0, 13)]


### 10-Fold Cross Validation: Weather myths

In [28]:
######################################################
# Use different algorithms to build models
######################################################

# Add each algorithm and its name to the model array
models = []
models.append(('KNN',KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

# Evaluate each model, add results to a results array,
# Print the accuracy results (remember these are averages and std)
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_balanced, Y_balanced, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {round(cv_results.mean(),4)}, ({round(cv_results.std(),4)})')

KNN: 0.9241, (0.0531)
RF: 0.9979, (0.0062)
DT: 0.9656, (0.0201)


In [29]:
######################################################
# Undersample to minority size
######################################################
sampling_strategy = .5
undersample = False

X_balanced, Y_balanced = resample_data(X_train, Y_train, undersample=undersample, sampling_strategy=sampling_strategy)

Y_train: Counter({1.0: 207, 0.0: 13})
Y_resample: Counter({1.0: 207, 0.0: 103})


### KNN: Weather myths

In [30]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
# from sklearn.ensemble import BaggingClassifier # improves estimates but hard with so little data
#knn_wth = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

knn_wth = KNeighborsClassifier()
knn_wth.fit(X_train, Y_train)
knn_predictions = knn_wth.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
# from sklearn.ensemble import BaggingClassifier # improves estimates but hard with so little data
#knn_wth = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

knn_wth = KNeighborsClassifier()
knn_wth.fit(X_balanced, Y_balanced)
knn_predictions = knn_wth.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(1.0, 207), (0.0, 13)]
0.9636363636363636
[[  0   8]
 [  0 212]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      1.00      0.98       212

    accuracy                           0.96       220
   macro avg       0.48      0.50      0.49       220
weighted avg       0.93      0.96      0.95       220


Balanced Classifier [(1.0, 207), (0.0, 103)]
0.9181818181818182
[[  1   7]
 [ 11 201]]
              precision    recall  f1-score   support

         0.0       0.08      0.12      0.10         8
         1.0       0.97      0.95      0.96       212

    accuracy                           0.92       220
   macro avg       0.52      0.54      0.53       220
weighted avg       0.93      0.92      0.93       220



  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest: Weather myths

In [31]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

#rf_wth = BaggingClassifier(RandomForestClassifier(n_estimators=1000, random_state=0), max_samples=0.5, max_features=0.5)

rf_wth = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_wth.fit(X_train, Y_train) 
rf_predictions = rf_wth.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################

#rf_wth = BaggingClassifier(RandomForestClassifier(n_estimators=1000, random_state=0), max_samples=0.5, max_features=0.5)

rf_wth = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_wth.fit(X_balanced, Y_balanced) 
rf_predictions = rf_wth.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(1.0, 207), (0.0, 13)]
0.9636363636363636
[[  0   8]
 [  0 212]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      1.00      0.98       212

    accuracy                           0.96       220
   macro avg       0.48      0.50      0.49       220
weighted avg       0.93      0.96      0.95       220



  _warn_prf(average, modifier, msg_start, len(result))



Balanced Classifier [(1.0, 207), (0.0, 103)]
0.9636363636363636
[[  0   8]
 [  0 212]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      1.00      0.98       212

    accuracy                           0.96       220
   macro avg       0.48      0.50      0.49       220
weighted avg       0.93      0.96      0.95       220



  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree: Weather myths

In [32]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################
#dt_wth = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)

dt_wth = DecisionTreeClassifier()
dt_wth.fit(X_train, Y_train)
dt_predictions = dt_wth.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################
#dt_wth = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)

dt_wth = DecisionTreeClassifier()
dt_wth.fit(X_balanced, Y_balanced)
dt_predictions = dt_wth.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(1.0, 207), (0.0, 13)]
0.9363636363636364
[[  0   8]
 [  6 206]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      0.97      0.97       212

    accuracy                           0.94       220
   macro avg       0.48      0.49      0.48       220
weighted avg       0.93      0.94      0.93       220


Balanced Classifier [(1.0, 207), (0.0, 103)]
0.9090909090909091
[[  4   4]
 [ 16 196]]
              precision    recall  f1-score   support

         0.0       0.20      0.50      0.29         8
         1.0       0.98      0.92      0.95       212

    accuracy                           0.91       220
   macro avg       0.59      0.71      0.62       220
weighted avg       0.95      0.91      0.93       220



In [33]:
######################################################
# Save best model
######################################################

# joblib.dump(rf_wth, wth_mod_fp)