# Assemble hand-coded articles and prepare for modeling

@author: Jaren Haber, PhD<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: November 2020<br>

@description: '''Loads and merges two datasets in preparation for classification model training. We're dealing with three theoretical perspectives in org. science (cultural, demographic, and relational) and two subject areas (sociology & management/OB, not differentiated here). The first dataset is of articles hand-coded by the author and Prof. Haveman, and it comes as a clean .csv file. This first contains lots of false positives (from the previous approach based on cosine measures), so it consists of mainly negative cases. The second dataset is of articles identified by Prof. Haveman as being foundation/definitive for each perspective. This comes as a list of citations, one per perspective, and requires some pretty heavy cleaning to match with articles in the main JSTOR articles dataset.'''

## Initialize

In [1]:
# import packages
import imp, importlib # For working with modules
import pandas as pd # for working with dataframes
import numpy as np # for working with numbers
import pickle # For working with .pkl files
from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply"
tqdm.pandas(desc='')
import sys # For terminal tricks
import _pickle as cPickle # Optimized version of pickle
import gc # For managing garbage collector
import timeit # For counting time taken for a process
import datetime # For working with dates & times
import tables
import random
import os; from os import listdir; from os.path import isfile, join

In [2]:
# define filepaths
cwd = os.getcwd()
root = str.replace(cwd, 'classification/preprocess', '')
#root = '/home/jovyan/work/' # set root directory

# dictionary counts (using core dictionaries) and matched subjects 
counts_fp = root + 'dictionary_methods/counts_and_subject.csv'

# per-article info on cosine scores using each dictionary (core or 100-term dictionaries??)
cosines_fp = root + 'models_storage/word_embeddings_data/text_with_cosine_scores_wdg_2020_oct27.csv'

# per-article metadata with URLs
meta_fp = root + 'dictionary_methods/code/metadata_combined.h5' 

# Filtered index of research articles
articles_list_fp = root + 'dictionary_methods/code/filtered_index.csv'

# coded output directory: save files here
output_fp = root + 'classification/data/hand_coded/'
coded_11620 = output_fp + 'coded_sample_cleaned_111620.csv'
coded_cult = output_fp + 'true_positives_cultural.csv'
coded_relt = output_fp + 'true_positives_relational.csv'
coded_demog = output_fp + 'true_positives_demographic.csv'

# for text files
ocr_fp = root + 'jstor_data/ocr/' 

In [None]:
# collect article file list
colnames = ['file_name']
articles = pd.read_csv(articles_list_fp, names=colnames, header=None)

files_to_be_opened = [ocr_fp + file + '.txt' for file in tqdm(articles.file_name)]
all_files = [ocr_fp + f for f in tqdm(listdir(ocr_fp)) if isfile(join(ocr_fp, f))]

files = [file for file in tqdm(all_files) if file in files_to_be_opened]

100%|██████████| 69659/69659 [00:00<00:00, 1436647.60it/s]
100%|██████████| 399129/399129 [00:02<00:00, 170673.53it/s]
 56%|█████▋    | 225355/399128 [05:54<04:26, 650.93it/s]