# Compare classification methods for identifying org. science perspectives in JSTOR articles
## Using grid search and balanced samples from hand-labeled set of articles

@author: Thomas Lu, Jaren Haber PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: September 2021

'''
Trains classifiers to predict whether an article is about a given perspective in org. science. To train the classifiers, uses preliminary labeled articles, broken down as follows: 
Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no
Compares the f1_weighted scores of four model types (logistic regression, SVM, Naive Bayes, and decision tree) using 10-fold cross-validation. Oversamples the minority class in the training data to a ratio of 0.7 (7:10 minority:majority).
'''

# Initialize

In [1]:
!pip install nltk



In [1]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date
from tqdm import tqdm
import os

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch NLTK's 'punkt' resource (presumably required by word_tokenize — confirm); the logged
# output shows the download is skipped when the package is already up-to-date.
nltk.download('punkt')

# NOTE(review): despite its name, `stemmer` holds a WordNetLemmatizer (a lemmatizer, not a stemmer).
stemmer = WordNetLemmatizer()

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron, PassiveAggressiveClassifier, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold
# from sklearn.experimental import enable_hist_gradient_boosting

# !pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

import warnings


def warn(*args, **kwargs):
    """Do-nothing replacement for ``warnings.warn``: accepts any arguments, ignores them, returns None."""
    return None


# Silence warnings by routing every ``warnings.warn(...)`` attribute call to the no-op above.
warnings.warn = warn
# NOTE(review): with ``warnings.warn`` replaced, this filter presumably only affects warnings
# emitted through other routes — confirm it is still needed.
warnings.filterwarnings('once')

import sys; sys.path.insert(0, "../preprocess/") # For loading functions from files in other directory
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
######################################################
# Define filepaths
######################################################

data_folder = 'classification'
folder = 'tlu_test'

# Project root = working directory with the '<folder>/modeling' part removed.
# NOTE(review): assumes the notebook is run from '<root>/tlu_test/modeling'; from any other
# directory nothing is stripped and `root` is simply the working directory.
cwd = os.getcwd()
root = cwd.replace(f'{folder}/modeling', '')

thisday = date.today().strftime("%m%d%y")  # today's date stamp, formatted MMDDYY

# Directory for prepared data and trained models: save files here.
# os.path.join inserts a separator only where one is missing, so these paths are well-formed
# whether or not `root` ends with one. The trailing '' keeps the final separator that
# plain string concatenation onto these prefixes (e.g. `data_fp + name`) relies on.
data_fp = os.path.join(root, data_folder, 'data', '')
model_fp = os.path.join(root, data_folder, 'models', '')
logs = os.path.join(root, folder, 'modeling', 'logs', '')

# Current article lists
article_list_fp = os.path.join(data_fp, 'filtered_length_index.csv')  # Filtered index of research articles
article_paths_fp = os.path.join(data_fp, 'filtered_length_article_paths.csv')  # List of article file paths

def get_filepath(name):
    """Return the location of `name` inside the data directory.

    The result is the module-level `data_fp` prefix with `name` appended as-is
    (plain string concatenation; no separator is inserted).
    """
    full_path = data_fp + name
    return full_path

In [3]:
# Total number of rows across the four raw training sets (one pickle per theory)
s = 0
for theory in ('cultural', 'demographic', 'relational', 'orgs'):
    pkl_name = f'training_{theory}_raw_022621.pkl'
    df = quickpickle_load(get_filepath(pkl_name))
    s = s + len(df)
s

3033

In [4]:
# Read the article-names spreadsheet and keep only the name, file-name, and URL columns
names_xlsx = get_filepath('filtered_length_article_names.xlsx')
out = pd.read_excel(names_xlsx)
out2 = out.loc[:, ['article_name', 'file_name', 'jstor_url']]
out2

Unnamed: 0,article_name,file_name,jstor_url
0,360-Degree Assessment: Time for Reinvention,journal-article-10.2307_40214201,https://www.jstor.org/stable/40214201
1,9/11 Montage: Professors Remember,journal-article-10.2307_40214096,https://www.jstor.org/stable/40214096
2,A Blended Value Framework for Educating the Ne...,journal-article-10.2307_23412379,external-fulltext-any
3,A Case for Great Books in Management Education,journal-article-10.2307_40214312,https://www.jstor.org/stable/40214312
4,A Conversation With James G. March on Learning...,journal-article-10.2307_41318070,external-fulltext-any
5,A Conversation with Milton Blood: The New AACS...,journal-article-10.2307_40214313,https://www.jstor.org/stable/40214313
6,A Cross-Cultural Empirical Analysis of Person-...,journal-article-10.2307_40214343,https://www.jstor.org/stable/40214343
7,A Meta-Analysis of the Predictive Validity of ...,journal-article-10.2307_40214516,https://www.jstor.org/stable/40214516
8,A Meta-Analysis on the Effects of Service-Lear...,journal-article-10.2307_23100454,external-fulltext-any
9,A Model of Business School Students' Acceptanc...,journal-article-10.2307_40214228,https://www.jstor.org/stable/40214228


In [8]:
# Keep only the rows whose jstor_url is not the 'external-fulltext-any' placeholder
has_jstor_url = out2['jstor_url'].ne('external-fulltext-any')
urls_only = out2.loc[has_jstor_url]

In [11]:
# Write the full name / file / URL table to a CSV in the current working directory (no index column)
all_urls_csv = 'filtered_article_file_url.csv'
out2.to_csv(all_urls_csv, index=False)

In [10]:
# Write the filtered table (`urls_only`) to a CSV in the current working directory (no index column)
existing_urls_csv = 'filtered_article_file_url_existing.csv'
urls_only.to_csv(existing_urls_csv, index=False)

In [13]:
# Show the current working directory (shell escape)
!pwd

/home/jovyan/work/tlu_test/modeling


## Load & inspect data

In [3]:
# Load the four hand-labeled article sets.
# NOTE(review): the *_labeled_fp path variables are not defined in the cells shown before this
# one — confirm they are set earlier so this cell runs on a fresh kernel.
cult_df, relt_df, demog_df, orgs_df = (
    quickpickle_load(labeled_fp)
    for labeled_fp in (cult_labeled_fp, relt_labeled_fp, demog_labeled_fp, orgs_labeled_fp)
)

# Preview the first ten rows of the cultural set
cult_df.head(10)

Unnamed: 0,text,cultural_score,primary_subject,edited_filename,article_name
0,"[[research, note, church_membership, netherlan...",0.0,Sociology,10.1086_210179,Where Do Interorganizational Networks Come From?
1,"[[polish, io_oo, sociological_review, issn, co...",1.0,Sociology,10.1086_210317,Civil Rights Law at Work: Sex Discrimination a...
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",0.0,Sociology,10.1086_231084,Between Markets and Politics: Organizational R...
3,"[[reply, allison, more, comparing, regression_...",1.0,Sociology,10.1086_231174,World Society and the Nation‐State
4,"[[determinants, spousal, interaction, marital,...",1.0,Sociology,10.1086_382347,Kinship Networks and Entrepreneurs in China’s ...
5,"[[wsê, ih, ompany, profile, john, porter, musé...",1.0,Sociology,10.1086_517899,What Is Organizational Imprinting? Cultural En...
6,"[[andrew_christensen, university_california, l...",1.0,Sociology,10.1086_588742,"Homeward Bound? Interest, Identity, and Invest..."
7,"[[lawyers, consumer_protection, laws, stewart_...",0.0,Sociology,10.1086_657524,Corporate Unity in American Trade Policy: A Ne...
8,"[[establishing, sense, personal, control, tran...",1.0,Sociology,10.1086_659639,The Credit Crisis as a Problem in the Sociolog...
9,"[[guess, who, coming, town, white_supremacy, e...",0.0,Sociology,10.1525_irqr.2011.4.3.199,"Science, Health, and Nationhood"
