# Compare classification methods for identifying org. science perspectives in JSTOR articles using Word Embeddings
## Using grid search and balanced samples from hand-labeled set of articles

@author: Thomas Lu, Jaren Haber PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: September 2021

'''
Trains classifiers to predict whether an article is about a given perspective in org. science. To train the classifiers, uses preliminary labeled articles, broken down as follows: 
Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no
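Compares f1_weighted scores of four model structures using 10-Fold Cross Validation: Logistic regression, SVM, Naive Bayes, and Decision Tree. Oversamples training data to .7 (7:10 minority:majority class). Note: the CNN and LSTM cells in this notebook use a different setup: they are trained on the organizational ("orgs") labels with a single 90/10 train/test split and 1:1 random oversampling, and are evaluated with accuracy and ROC AUC.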
'''

# Initialize

In [None]:
# NOTE(review): this cell has no execution count and repeats the split/oversample
# code of the later "np.random.seed(42)" cell line for line. It uses
# train_test_split, np, X_cult and Y, none of which exist yet at this point in the
# notebook (the import cell and the data-preparation cells come later), so it fails
# with a NameError under Restart & Run All. It looks like a leftover copy — confirm
# and remove.
# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X_train, X_test, y_train, y_test = train_test_split(X_cult, Y, test_size=0.10, random_state=42)
print('original data size: train', X_train.shape, 'test', X_test.shape)

def oversample_shuffle(X, y):
    """
    Randomly oversamples (X, y) with RandomOverSampler(sampling_strategy=1.0) and
    returns the resampled rows in a random order drawn from numpy's global RNG.
    """
    ros = RandomOverSampler(random_state=42, sampling_strategy=1.0)
    X, y = ros.fit_resample(X, y)
    p = np.random.permutation(len(X))
    return X[p], y[p]

X_train, y_train = oversample_shuffle(X_train, y_train)
X_test, y_test = oversample_shuffle(X_test, y_test)
print('new data size: train', X_train.shape, 'test', X_test.shape)

In [1]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date
from tqdm import tqdm
import os

from gensim.models.keyedvectors import KeyedVectors

import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold

# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings
# Silence warnings for the rest of the session by replacing `warnings.warn` with a
# no-op. Code that looks up `warnings.warn` through the module after this point
# calls the stub below instead of the real function.
def warn(*args, **kwargs):
    """No-op stand-in for `warnings.warn`: accepts any arguments and does nothing."""
    pass
warnings.warn = warn
# NOTE(review): with `warnings.warn` stubbed out, this 'once' filter presumably only
# matters for warnings emitted through other routes (e.g. references to the real
# function captured before the patch) — confirm it is still needed.
warnings.filterwarnings(action='once')

import sys; sys.path.insert(0, "../preprocess/") # For loading functions from files in other directory
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

In [2]:
######################################################
# Define filepaths
######################################################

# Folder names used to build the paths below
data_folder = 'classification'
folder = 'tlu_test'

cwd = os.getcwd()

# `root` is the working directory with the '<folder>/modeling' part removed.
# NOTE(review): if the notebook is started from a directory whose path does not
# contain '<folder>/modeling', the replace is a no-op and `root` is simply cwd
# (without a trailing separator), so every path below would be malformed.
root = str.replace(cwd, f'{folder}/modeling', '')

# Today's date as MMDDYY; used as a suffix in the model filenames
thisday = date.today().strftime("%m%d%y")

# Directory for prepared data and trained models: save files here
data_fp = root + f'{data_folder}/data/'
model_fp = root + f'{folder}/models/'
logs = root + f'{folder}/modeling/logs/'

# Pretrained word2vec embeddings (loaded in a later cell with KeyedVectors.load)
w2v_fp = root + 'models_storage/word_embeddings_data/word2vec_phrased_filtered_300d_2020_sept5.bin'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths

# Preprocessed training data
cult_labeled_fp = data_fp + 'training_cultural_preprocessed_022621.pkl'
relt_labeled_fp = data_fp + 'training_relational_preprocessed_022621.pkl'
demog_labeled_fp = data_fp + 'training_demographic_preprocessed_022621.pkl'
orgs_labeled_fp = data_fp + 'training_orgs_preprocessed_022621.pkl'

# Model filepaths
# NOTE(review): the filenames say 'MLP', but the models defined in this notebook are
# a CNN and an LSTM, and none of the cells visible here writes to these paths —
# confirm the naming before saving models with them.
cult_model_fp = model_fp + f'classifier_cult_MLP_{str(thisday)}.joblib'
relt_model_fp = model_fp + f'classifier_relt_MLP_{str(thisday)}.joblib'
demog_model_fp = model_fp + f'classifier_demog_MLP_{str(thisday)}.joblib'
orgs_model_fp = model_fp + f'classifier_orgs_MLP_{str(thisday)}.joblib'


In [3]:
# Load the word2vec model, find and set the special token ids

# NOTE(review): the `.wv` accesses here and in the model classes assume the loaded
# object exposes its vectors under a `.wv` attribute — confirm against the gensim
# version and the way the file at w2v_fp was saved.
w2v_model = KeyedVectors.load(w2v_fp)

# Mapping from word/phrase to its row index in the embedding matrix
key2index = w2v_model.wv.key_to_index

# Used by the CNN and LSTM classes to size the extra embedding rows they append.
# Presumably must equal the width of the loaded vectors (they are concatenated
# row-wise with the pretrained matrix) — TODO confirm.
embedding_dim = 300

# Special token ids are placed directly after the vocabulary ids. The model classes
# append three rows to the pretrained matrix in the same order: one all-zero row
# (padding) followed by two randomly initialised rows (sentence end, unknown word).
PAD_IDX = len(key2index)
PERIOD_IDX = PAD_IDX + 1
UNK_IDX = PAD_IDX + 2

## Load & inspect data

In [4]:
# Only the organizational ("orgs") labeled set is loaded; the loads for the other
# three perspectives are disabled.
# NOTE(review): quickpickle_load is a project helper that presumably unpickles the
# file — only point it at trusted files.
# cult_df = quickpickle_load(cult_labeled_fp)
# relt_df = quickpickle_load(relt_labeled_fp)
# demog_df = quickpickle_load(demog_labeled_fp)
orgs_df = quickpickle_load(orgs_labeled_fp)

# Preview the first ten labeled articles
orgs_df.head(10)

Unnamed: 0,text,orgs_score,edited_filename,article_name
0,"[[research, note, church_membership, netherlan...",1.0,10.1086_210179,Where Do Interorganizational Networks Come From?
1,"[[polish, io_oo, sociological_review, issn, co...",1.0,10.1086_210317,Civil Rights Law at Work: Sex Discrimination a...
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",1.0,10.1086_231084,Between Markets and Politics: Organizational R...
3,"[[reply, allison, more, comparing, regression_...",1.0,10.1086_231174,World Society and the Nation‐State
4,"[[determinants, spousal, interaction, marital,...",1.0,10.1086_382347,Kinship Networks and Entrepreneurs in China’s ...
5,"[[wsê, ih, ompany, profile, john, porter, musé...",1.0,10.1086_517899,What Is Organizational Imprinting? Cultural En...
6,"[[andrew_christensen, university_california, l...",1.0,10.1086_588742,"Homeward Bound? Interest, Identity, and Invest..."
7,"[[lawyers, consumer_protection, laws, stewart_...",1.0,10.1086_657524,Corporate Unity in American Trade Policy: A Ne...
8,"[[establishing, sense, personal, control, tran...",1.0,10.1086_659639,The Credit Crisis as a Problem in the Sociolog...
9,"[[guess, who, coming, town, white_supremacy, e...",1.0,10.1525_irqr.2011.4.3.199,"Science, Health, and Nationhood"


In [5]:
# Check score distribution across classes
# (only the orgs labels are inspected; the checks for the other perspectives are
# disabled because their frames are not loaded)
# print(cult_df.groupby('cultural_score').size())
# print()
# print(relt_df.groupby('relational_score').size())
# print()
# print(demog_df.groupby('demographic_score').size())
# print()
print(orgs_df.groupby('orgs_score').size())

orgs_score
0.0    303
0.5     10
1.0    511
dtype: int64


In [6]:
# Keep only confidently labeled articles: rows scored exactly 1.0 or 0.0
# (this removes the "unsure" rows, where X_score = 0.5)
drop_unsure = True

if drop_unsure:
    # Disabled: the same filter for the cultural, relational and demographic frames
    # cult_df = pd.concat([cult_df.loc[cult_df['cultural_score'].eq(1.0)],
    #                      cult_df.loc[cult_df['cultural_score'].eq(0.0)]])
    # relt_df = pd.concat([relt_df.loc[relt_df['relational_score'].eq(1.0)],
    #                      relt_df.loc[relt_df['relational_score'].eq(0.0)]])
    # demog_df = pd.concat([demog_df.loc[demog_df['demographic_score'].eq(1.0)],
    #                       demog_df.loc[demog_df['demographic_score'].eq(0.0)]])

    # Positives first, then negatives; each row keeps its original index label
    orgs_df_yes = orgs_df.loc[orgs_df['orgs_score'].eq(1.0)]
    orgs_df_no = orgs_df.loc[orgs_df['orgs_score'].eq(0.0)]
    orgs_df = pd.concat([orgs_df_yes, orgs_df_no])

In [7]:
######################################################
# Convert the data into token ids up to length max_len
######################################################

max_len = 500

def obtain_token_ids(list_of_sentences, length = max_len):
    """
    Converts a preprocessed article into a fixed-size list of token ids.

    Words found in `key2index` are mapped to their vocabulary id; every other word
    is mapped to UNK_IDX. A PERIOD_IDX token is appended after each sentence. The
    result is cut off after `length` ids, or right-padded with PAD_IDX when the
    article is shorter than that.

    Args:
        list_of_sentences: a list of the tokenized sentences (lists of words or
            phrases) which constitute the article
        length: the number of token ids to return
    Returns:
        A list of `length` token ids characterizing the input sentences

    """

    token_ids = []
    for sentence in list_of_sentences:
        token_ids.extend(key2index.get(word, UNK_IDX) for word in sentence)
        token_ids.append(PERIOD_IDX)
        if len(token_ids) >= length:
            # Enough ids collected: anything further would be sliced off below
            break

    missing = length - len(token_ids)
    if missing > 0:
        token_ids.extend([PAD_IDX] * missing)
    return token_ids[:length]


def transform_dataframe(df):
    """
    Encodes every article in the `text` column of `df` with obtain_token_ids and
    stacks the resulting id lists into a single numpy array, one row per article.
    """
    encoded_articles = [obtain_token_ids(article) for article in df['text']]
    return np.stack(encoded_articles)



In [8]:
# Seed numpy's global RNG; np.random.permutation inside oversample_shuffle draws
# from it, so the shuffles depend on the order of the calls in this cell.
np.random.seed(42)

# NOTE(review): despite the `X_cult` name, these are the token ids of orgs_df
# (the organizational labels), not of the cultural set.
X_cult = transform_dataframe(orgs_df)
Y = orgs_df['orgs_score'].to_numpy()

# Hold out 10% of the articles for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_cult, Y, test_size=0.10, random_state=42)
print('original data size: train', X_train.shape, 'test', X_test.shape)

def oversample_shuffle(X, y):
    """
    Randomly oversamples (X, y) and returns the resampled rows in shuffled order.

    Uses RandomOverSampler with sampling_strategy=1.0 and a fixed random_state,
    then applies one random permutation (from numpy's global RNG) to both arrays.

    Args:
        X: array of features, one row per sample
        y: array of labels aligned with X
    Returns:
        A tuple (X, y) of the resampled, shuffled arrays
    """
    ros = RandomOverSampler(random_state=42, sampling_strategy=1.0)
    X, y = ros.fit_resample(X, y)
    p = np.random.permutation(len(X))
    return X[p], y[p]

# Resampling is done after the split, separately for each partition.
# NOTE(review): the held-out set is oversampled too, so the dev accuracy / AUC
# printed by train() are computed on a resampled test set that contains repeated
# articles — confirm this is intended.
X_train, y_train = oversample_shuffle(X_train, y_train)
X_test, y_test = oversample_shuffle(X_test, y_test)
print('new data size: train', X_train.shape, 'test', X_test.shape)

original data size: train (732, 500) test (82, 500)
new data size: train (926, 500) test (96, 500)


In [9]:
def get_batches(x, y, batch_size=12):
    """
    Splits features and labels into consecutive mini-batches of tensors.

    The batch boundaries are computed from len(x); y is sliced at the same offsets.
    The last batch holds the remainder and may be smaller than `batch_size`.

    Args:
        x: sequence of token-id rows (converted to torch.LongTensor)
        y: sequence of labels aligned with x (converted to torch.FloatTensor)
        batch_size: maximum number of rows per batch
    Returns:
        A tuple (batches_x, batches_y) of two equally long lists of tensors
    """
    offsets = range(0, len(x), batch_size)
    batches_x = [torch.LongTensor(x[start:start + batch_size]) for start in offsets]
    batches_y = [torch.FloatTensor(y[start:start + batch_size]) for start in offsets]
    return batches_x, batches_y


# Pre-build the mini-batches once (default batch size). These module-level lists
# are read directly by train(): the training pairs drive the optimisation loop and
# batch_X_test is scored against y_test after every epoch.
batch_X_train, batch_y_train = get_batches(X_train, y_train)
batch_X_test, batch_y_test = get_batches(X_test, y_test)


In [10]:
######################################################
# Prepares a CNN model that will take in a list of token ids and output a predicted probability
######################################################

class CNN(nn.Module):
    """
    Convolutional text classifier on top of the pretrained word2vec embeddings.

    An article (a row of `max_len` token ids) is embedded, passed through three
    parallel 1-D convolutions with kernel sizes 1, 2 and 3 (50 filters each, tanh
    activation), max-pooled over the full sequence, concatenated into a
    150-dimensional vector, and mapped by a linear layer and a sigmoid to one
    probability per article.

    The pooling windows are sized from `max_len`, so every input batch must have
    exactly `max_len` token ids per article.
    """

    def __init__(self):
        super().__init__()
        self.seq_len = max_len

        # Embedding rows: the pretrained vectors, then one all-zero row, then two
        # randomly initialised rows. The three extra rows are the ones addressed by
        # the special token ids that follow the vocabulary (padding, sentence end,
        # unknown word). freeze=False keeps the whole table trainable.
        self.embeddings = nn.Embedding.from_pretrained(
            torch.cat([torch.FloatTensor(w2v_model.wv.vectors),
                       torch.zeros((1, embedding_dim), dtype=torch.float),
                       torch.randn((2, embedding_dim), dtype=torch.float)],
                      dim=0),
            freeze=False)

        # A convolution with kernel size k shortens the sequence by k - 1, so each
        # pooling window below spans the entire convolved sequence (global max pool).
        self.conv_1 = nn.Conv1d(in_channels=embedding_dim, out_channels=50, kernel_size=1, stride=1)
        self.pool_1 = nn.MaxPool1d(kernel_size=self.seq_len, stride=1)

        self.conv_2 = nn.Conv1d(in_channels=embedding_dim, out_channels=50, kernel_size=2, stride=1)
        self.pool_2 = nn.MaxPool1d(kernel_size=self.seq_len-1, stride=1)

        self.conv_3 = nn.Conv1d(in_channels=embedding_dim, out_channels=50, kernel_size=3, stride=1)
        self.pool_3 = nn.MaxPool1d(kernel_size=self.seq_len-2, stride=1)

        self.fc = nn.Linear(50 * 3, 1)


    def forward(self, x):
        """
        Computes the predicted probability for each article in a batch.

        Args:
            x: LongTensor of token ids with shape (batch, max_len)
        Returns:
            A 1-D tensor of shape (batch,) with values in [0, 1]
        """
        embedded = self.embeddings(x)

        # Conv1d expects (batch, channels, sequence), so move the embedding axis forward
        embedded = embedded.permute(0, 2, 1)

        x1 = self.pool_1(torch.tanh(self.conv_1(embedded)))
        x2 = self.pool_2(torch.tanh(self.conv_2(embedded)))
        x3 = self.pool_3(torch.tanh(self.conv_3(embedded)))

        # Remove only the pooled length-1 axis. A bare squeeze() would also remove a
        # batch axis of size 1, turning the output for a single-article batch into a
        # 0-dim tensor that predict() cannot concatenate.
        combined = torch.cat((x1, x2, x3), dim=1).squeeze(-1)

        out = self.fc(combined)
        return torch.sigmoid(out.squeeze(-1))


    def predict(self, X):
        """
        Runs the model over a list of batches without tracking gradients.

        Switches the module to eval mode and leaves it there; callers that continue
        training must call .train() again.

        Args:
            X: iterable of LongTensor batches, each of shape (batch, max_len)
        Returns:
            A 1-D tensor with the predicted probabilities of all batches, in order
        """
        self.eval()
        batch_preds = []
        with torch.no_grad():
            for x in X:
                batch_preds.append(self.forward(x))
        return torch.cat(batch_preds, dim=0)

## Setup for modeling

In [11]:
def train(model, optimizer, loss_fn, num_epochs=8):
    """
    Fits `model` on the module-level training batches and reports dev metrics.

    Every epoch makes one pass over (batch_X_train, batch_y_train), then scores the
    model on batch_X_test against y_test and prints the accuracy (probabilities
    thresholded at 0.5) and the ROC AUC. After the last epoch the highest dev
    accuracy seen is printed. No weights are checkpointed, so `model` is left in its
    final-epoch state, which is not necessarily the best-scoring one.

    Args:
        model: module providing forward(x) and predict(list_of_batches)
        optimizer: optimizer built over the model's parameters
        loss_fn: loss taking (predictions, targets), both flattened to 1-D
        num_epochs: number of passes over the training batches
    Returns:
        None
    """

    top_dev_accuracy = 0.

    for epoch_idx in range(num_epochs):

        # The models' predict() in this notebook switches to eval mode, so training
        # mode has to be re-enabled at the start of every epoch
        model.train()
        for features, labels in zip(batch_X_train, batch_y_train):
            optimizer.zero_grad()
            batch_loss = loss_fn(model.forward(features).view(-1), labels.view(-1))
            batch_loss.backward()
            optimizer.step()

        dev_probs = model.predict(batch_X_test).cpu().numpy()

        dev_accuracy = accuracy_score(y_test, dev_probs > 0.5)
        dev_roc = roc_auc_score(y_test, dev_probs)
        top_dev_accuracy = max(top_dev_accuracy, dev_accuracy)
        print("Epoch %s, dev accuracy: %.3f, dev AUC: %.3f" % (epoch_idx, dev_accuracy, dev_roc))

    print("\nBest Performing Model achieves dev accuracy of : %.3f" % (top_dev_accuracy))

In [13]:
# For reproducible results, sets a random seed and trains the CNN model

# Seed torch before building the model: the two random embedding rows are drawn
# with torch.randn inside CNN.__init__
torch.manual_seed(42)

model = CNN()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Binary cross-entropy on probabilities; CNN.forward already applies the sigmoid
loss_fn = nn.BCELoss()

# Runs with train()'s default number of epochs and prints dev metrics per epoch
train(model, optimizer, loss_fn)

Epoch 0, dev accuracy: 0.646, dev AUC: 0.691
Epoch 1, dev accuracy: 0.740, dev AUC: 0.779
Epoch 2, dev accuracy: 0.802, dev AUC: 0.805
Epoch 3, dev accuracy: 0.802, dev AUC: 0.807
Epoch 4, dev accuracy: 0.802, dev AUC: 0.809
Epoch 5, dev accuracy: 0.802, dev AUC: 0.809
Epoch 6, dev accuracy: 0.802, dev AUC: 0.806
Epoch 7, dev accuracy: 0.802, dev AUC: 0.805

Best Performing Model achieves dev accuracy of : 0.802


In [20]:
######################################################
# Prepares an LSTM model that will take in a list of token ids and output a predicted probability
######################################################

class LSTM(torch.nn.Module):
    """
    Recurrent text classifier on top of the pretrained word2vec embeddings.

    Token ids are embedded, passed through dropout and an LSTM; the final hidden
    state of the last layer is mapped by a linear layer and a sigmoid to one
    probability per article.
    """

    def __init__(self, hidden_dim):
        """
        Args:
            hidden_dim: size of the LSTM hidden state
        """
        super().__init__()

        # Embedding rows: the pretrained vectors, one all-zero row, then two random
        # rows; the table stays trainable (freeze=False)
        pretrained_rows = torch.FloatTensor(w2v_model.wv.vectors)
        zero_row = torch.zeros((1, embedding_dim), dtype=torch.float)
        random_rows = torch.randn((2, embedding_dim), dtype=torch.float)
        self.embeddings = nn.Embedding.from_pretrained(
            torch.cat([pretrained_rows, zero_row, random_rows], dim=0),
            freeze=False)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """
        Args:
            x: LongTensor of token ids with shape (batch, sequence)
        Returns:
            A tensor of shape (batch, 1) with the predicted probabilities
        """
        embedded = self.dropout(self.embeddings(x))
        _, (final_hidden, _) = self.lstm(embedded)
        # final_hidden[-1] is the last layer's hidden state after the final time step
        scores = self.linear(final_hidden[-1])
        return F.sigmoid(scores)

    def predict(self, X):
        """
        Runs the model over a list of batches in eval mode without tracking
        gradients and returns all predictions concatenated along the batch axis.
        The module is left in eval mode.
        """
        self.eval()
        with torch.no_grad():
            batch_preds = [self.forward(x) for x in X]
        return torch.cat(batch_preds, dim=0)

In [21]:
# For reproducible results, sets a random seed and trains the LSTM model

# Re-seed torch so this run does not depend on the random numbers consumed by the
# CNN cell above
torch.manual_seed(42)

# NOTE: `model`, `optimizer` and `loss_fn` are rebound here, replacing the CNN
# objects created in the earlier training cell
model = LSTM(300)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Binary cross-entropy on probabilities; LSTM.forward already applies the sigmoid
loss_fn = nn.BCELoss()

train(model, optimizer, loss_fn)

Epoch 0, dev accuracy: 0.573, dev AUC: 0.621
Epoch 1, dev accuracy: 0.781, dev AUC: 0.762
Epoch 2, dev accuracy: 0.729, dev AUC: 0.785
Epoch 3, dev accuracy: 0.646, dev AUC: 0.766
Epoch 4, dev accuracy: 0.698, dev AUC: 0.784
Epoch 5, dev accuracy: 0.677, dev AUC: 0.788
Epoch 6, dev accuracy: 0.677, dev AUC: 0.785
Epoch 7, dev accuracy: 0.677, dev AUC: 0.790

Best Performing Model achieves dev accuracy of : 0.781
