# Assemble hand-coded articles and prepare for modeling

@author: Jaren Haber, PhD, Georgetown University<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: November 2020<br>

@description: '''Loads and merges two datasets in preparation for classification model training. We're dealing with three theoretical perspectives in org. science (cultural, demographic, and relational) and two subject areas (sociology & management/OB, not differentiated here). The first dataset consists of articles hand-coded by the author and Prof. Haveman, and it comes as a clean .csv file. This first dataset contains many false positives (from the previous approach based on cosine measures), so it consists mainly of negative cases. The second dataset consists of articles identified by Prof. Haveman as being foundational/definitive for each perspective. It comes as lists of citations (one list per perspective) and requires fairly heavy cleaning to match with articles in the main JSTOR articles dataset.'''

## Initialize

In [294]:
# import packages
import imp, importlib # For working with modules
import pandas as pd # for working with dataframes
import numpy as np # for working with numbers
import pickle # For working with .pkl files
import re # for regex magic
from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply"
tqdm.pandas(desc='')
import sys # For terminal tricks
import _pickle as cPickle # Optimized version of pickle
import gc # For managing garbage collector
import timeit # For counting time taken for a process
import datetime # For working with dates & times
!pip install openpyxl
import openpyxl # for saving in excel format
import tables
import random
import os; from os import listdir; from os.path import isfile, join
from quickpickle import quickpickle_dump # custom script for quick saving to pickle format

Collecting openpyxl
[?25l  Downloading https://files.pythonhosted.org/packages/5c/90/61f83be1c335a9b69fa773784a785d9de95c7561d1661918796fd1cba3d2/openpyxl-3.0.5-py2.py3-none-any.whl (242kB)
[K     |████████████████████████████████| 245kB 5.3MB/s eta 0:00:01
[?25hCollecting jdcal (from openpyxl)
  Downloading https://files.pythonhosted.org/packages/f0/da/572cbc0bc582390480bbd7c4e93d14dc46079778ed915b505dc494b37c57/jdcal-1.4.1-py2.py3-none-any.whl
Collecting et-xmlfile (from openpyxl)
  Downloading https://files.pythonhosted.org/packages/22/28/a99c42aea746e18382ad9fb36f64c1c1f04216f41797f2f0fa567da11388/et_xmlfile-1.0.1.tar.gz
Building wheels for collected packages: et-xmlfile
  Building wheel for et-xmlfile (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/2a/77/35/0da0965a057698121fc7d8c5a7a9955cdbfb3cc4e2423cad39
Successfully built et-xmlfile
Installing collected packages: jdcal, et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.0.1 jdcal-

In [269]:
# define filepaths
# Project root: the working directory with the notebook's own subfolder removed
cwd = os.getcwd()
root = cwd.replace('classification/preprocess', '')
#root = '/home/jovyan/work/' # set root directory

# coded output directory: save files here
output_fp = f'{root}classification/data/'

# for text files
ocr_fp = f'{root}jstor_data/ocr/'

# dictionary counts (using core dictionaries) and matched subjects
counts_fp = f'{root}dictionary_methods/counts_and_subject.csv'

# per-article info on cosine scores using each dictionary
# (TODO: confirm whether these use the core or the 100-term dictionaries)
cosines_fp = f'{root}models_storage/word_embeddings_data/text_with_cosine_scores_wdg_2020_oct27.csv'

# per-article metadata with URLs
meta_fp = f'{root}dictionary_methods/code/metadata_combined.h5'

# Filtered index of research articles
article_list_fp = f'{output_fp}filtered_length_index.csv'

# Filtered list of article names and general data, sorted by journal then article name
article_names_fp = f'{output_fp}filtered_length_article_names.xlsx'

# True positives (to be merged): Hand coded and foundational sets from H2
coded_11620 = f'{output_fp}hand_coded/coded_sample_cleaned_111620.csv'
coded_cult_fp = f'{output_fp}hand_coded/true_positives_cultural.csv'
coded_relt_fp = f'{output_fp}hand_coded/true_positives_relational.csv'
coded_demog_fp = f'{output_fp}hand_coded/true_positives_demographic.csv'

# Results of merging files
matched_fp = f'{output_fp}true_positives_matched_112420.xlsx'
not_matched_fp = f'{output_fp}true_positives_match_failed_112420.xlsx'

# Training data (output)
training_cult_fp = f'{output_fp}training_cultural_112420.pkl'
training_relt_fp = f'{output_fp}training_relational_112420.pkl'
training_demog_fp = f'{output_fp}training_demographic_112420.pkl'

In [None]:
# collect article file list
colnames = ['file_name']
articles = pd.read_csv(article_list_fp, names=colnames, header=None)

# Expected OCR path for every article listed in the filtered index
files_to_be_opened = [ocr_fp + file + '.txt' for file in tqdm(articles.file_name)]
# Every regular file actually present in the OCR directory
all_files = [ocr_fp + f for f in tqdm(listdir(ocr_fp)) if isfile(join(ocr_fp, f))]

# Keep the OCR files that belong to indexed articles (order of all_files is preserved).
# Membership is tested against a set rather than the list: a list lookup scans the
# whole index for every file on disk, which makes this step take many minutes.
wanted_files = set(files_to_be_opened)
files = [file for file in tqdm(all_files) if file in wanted_files]

100%|██████████| 69659/69659 [00:00<00:00, 1436647.60it/s]
100%|██████████| 399129/399129 [00:02<00:00, 170673.53it/s]
 56%|█████▋    | 225355/399128 [05:54<04:26, 650.93it/s]

## Read in & merge data

### Meta data

In [280]:
# Read in metadata file and move the file name out of the index into a regular column
df_meta = pd.read_hdf(meta_fp).reset_index(drop=False)

# For merging purposes, get ID alone from file name, e.g. 'journal-article-10.2307_2065002' -> '10.2307_2065002'
# (the slice drops the first 16 characters, i.e. the 'journal-article-' prefix in the example)
df_meta['edited_filename'] = df_meta['file_name'].map(lambda name: name[16:])

# keep only relevant columns
meta_columns = [
    "edited_filename",
    "article_name",
    "jstor_url",
    "abstract",
    "journal_title",
    "given_names",
    "primary_subject",
    "year",
    "type",
]
df_meta = df_meta[meta_columns]

df_meta.head()

Unnamed: 0,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type
0,10.2307_4167860,Cross-Dialectal Variation in Arabic: Competing...,https://www.jstor.org/stable/4167860,Most researchers of Arabic sociolinguistics as...,Language in Society,,Other,1979,research-article
1,10.2307_2578336,,https://www.jstor.org/stable/2578336,,Social Forces,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1983,book-review
2,10.2307_2654760,,https://www.jstor.org/stable/2654760,,Contemporary Sociology,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1998,book-review
3,10.2307_43242281,editor's note: A KNIGHT'S TALE,https://www.jstor.org/stable/43242281,,Corporate Knights,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Other,2005,misc
4,10.2307_42862018,,https://www.jstor.org/stable/42862018,,Social Science Quarterly,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1985,book-review


In [281]:
# Read in filtered index: one file name per row, no header
df = pd.read_csv(article_list_fp, low_memory=False, header=None, names=["file_name"])
df['edited_filename'] = df['file_name'].map(lambda name: name[16:]) # New col with only article ID

# Read in counts, keeping only the merge key and the word count
df_counts = pd.read_csv(counts_fp, low_memory=False)
df_counts['edited_filename'] = df_counts['article_id'].map(lambda name: name[16:]) # New col with only article ID
df_counts = df_counts[['edited_filename', 'word_count']]

# Merge meta data, counts into articles list DF (left joins keep every indexed article)
df = (
    df
    .merge(df_meta, how='left', on='edited_filename') # meta data
    .merge(df_counts, how='left', on='edited_filename') # counts
)

# Filter to only full articles: >=1000 words (eliminates 69659 - 65372 = 4287 cases)
# NOTE(review): this filter is currently disabled, yet the recorded output of this cell
# reports 65372 rows, i.e. the post-filter count -- confirm whether it should be active.
#df = df[df['word_count'] >= 1000]

# Show all columns in resulting DF
print("All columns:\n", list(df.columns))
print()

print("Rows, cols in data:", df.shape)

df.head()

All columns:
 ['file_name', 'edited_filename', 'article_name', 'jstor_url', 'abstract', 'journal_title', 'given_names', 'primary_subject', 'year', 'type', 'word_count']

Rows, cols in data: (65372, 11)


Unnamed: 0,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,journal-article-10.2307_2065002,10.2307_2065002,Toward More Cumulative Inquiry,https://www.jstor.org/stable/2065002,,Contemporary Sociology,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1978,research-article,3529
1,journal-article-10.2307_3380821,10.2307_3380821,An Analysis of an Incentive Sick Leave Policy ...,https://www.jstor.org/stable/3380821,Local health departments are under tremendous ...,Public Productivity & Management Review,"[Werner, Werner, Konrad, Rudi, Paul, Jean, Rob...",Management & Organizational Behavior,1986,research-article,5195
2,journal-article-10.2307_2095822,10.2307_2095822,Local Friendship Ties and Community Attachment...,https://www.jstor.org/stable/2095822,This study presents a multilevel empirical tes...,American Sociological Review,"[Alice O., Peter, W. Erwin, Bert, Robert W., C...",Sociology,1983,research-article,7100
3,journal-article-10.2307_40836133,10.2307_40836133,Knowledge Transfer within the Multinational Fi...,https://www.jstor.org/stable/40836133,This paper examines the process of knowledge t...,MIR: Management International Review,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Management & Organizational Behavior,2005,research-article,7110
4,journal-article-10.2307_2579666,10.2307_2579666,Dynamics of Labor Market Segmentation in Polan...,https://www.jstor.org/stable/2579666,Research in the early 1980s showed that indust...,Social Forces,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1990,research-article,5313


### Coded data

In [282]:
# Read in true positives from H2--in citation format
# The files have no header row (header=None), so the first column is renamed to 'citation'.
coded_cult = pd.read_csv(coded_cult_fp, low_memory=False, header=None, 
                         encoding="windows-1252").rename(columns = {0:'citation'})
coded_relt = pd.read_csv(coded_relt_fp, low_memory=False, header=None, 
                         encoding="windows-1252").rename(columns = {0:'citation'})
coded_demog = pd.read_csv(coded_demog_fp, low_memory=False, header=None, 
                          encoding="windows-1252").rename(columns = {0:'citation'})

# Assign scores for each article from H2's foundational set
# np.nan (lower case) is used because the np.NaN alias was removed in NumPy 2.0.
outside_score = np.nan # score for two perspectives other than one coded intentionally (0 if negative case, NaN otherwise)

# Each foundational set scores 1 on its own perspective and outside_score on the other two.
score_columns = ['cultural_score', 'relational_score', 'demographic_score']
for foundational_set, own_column in [
    (coded_cult, 'cultural_score'),
    (coded_relt, 'relational_score'),
    (coded_demog, 'demographic_score'),
]:
    for score_column in score_columns:
        foundational_set[score_column] = 1 if score_column == own_column else outside_score

# Merge the three sets into one
coded_h2 = pd.concat([coded_cult, coded_relt, coded_demog])

coded_h2.head(10)

Unnamed: 0,citation,cultural_score,relational_score,demographic_score
0,"Barley, Stephen R. 1983. Semiotics and the s...",1.0,,
1,"Barney, Jay B. 1986. Organizational culture:...",1.0,,
2,"Castilla, Emilio J., and Stephen Benard. 2010...",1.0,,
3,"Dutton, Jane E., and Janet M. Dukerich. 1991....",1.0,,
4,"Fine, Gary Alan. 1984. Negotiated orders and...",1.0,,
5,"Fiol, C. Marlene. 2002. Capitalizing on para...",1.0,,
6,"Goldberg, Amir, Sameer B. Srivastava, V. Govin...",1.0,,
7,"Morrill, Calvin. 1991. Conflict management, ...",1.0,,
8,"Ouchi, William G., and Alan L. Wilkins. 1985....",1.0,,
9,"Pettigrew, Andrew M. 1979. On studying organ...",1.0,,


In [283]:
# Read in hand-coded data (the first row of the file is the header)
coded_df = pd.read_csv(coded_11620, header=0, low_memory=False)

# restore original filenames by putting the 'journal-article-' prefix back in front of each ID
coded_df['file_name'] = coded_df['edited_filename'].map(
    lambda article_id: f'journal-article-{article_id}')

coded_df.head()

Unnamed: 0,cultural_score,relational_score,demographic_score,article_name,abstract,jstor_url,year,journal_title,edited_filename,culture_word2vec_cosine,culture_ngram_count.1,cultural_author_count,relational_word2vec_cosine,relational_ngram_count.1,relational_author_count,demographic_word2vec_cosine,demographic_ngram_count.1,demographic_author_count,file_name
0,1.0,0.0,0.0,"Intersecting Three Muddy Roads: Stability, Leg...",Several decades of research by multiple academ...,https://www.jstor.org/stable/25822540,2011.0,Journal of Managerial Issues,10.2307_25822540,0.754487,227.0,7.0,0.61303,33.0,1.0,0.560983,119.0,0.0,journal-article-10.2307_25822540
1,1.0,0.0,0.0,Rational Decision Making as Performative Praxi...,Organizational theorists built their knowledge...,external-fulltext-any,2011.0,Organization Science,10.2307_20868880,0.721939,55.0,6.0,0.588276,16.0,1.0,0.534615,2.0,0.0,journal-article-10.2307_20868880
2,1.0,1.0,0.0,From Fiefs to Clans and Network Capitalism: Ex...,China's rapid economic development is being ac...,https://www.jstor.org/stable/2393869,1986.0,Administrative Science Quarterly,10.2307_2393869,0.715111,73.0,0.0,0.644378,66.0,0.0,0.530408,17.0,0.0,journal-article-10.2307_2393869
3,1.0,1.0,0.0,The Collective Strategy Framework: An Applicat...,This paper investigates empirically the compet...,https://www.jstor.org/stable/2392643,1984.0,Administrative Science Quarterly,10.2307_2392643,0.702606,114.0,9.0,0.671079,88.0,2.0,0.6746,124.0,9.0,journal-article-10.2307_2392643
4,1.0,0.0,0.0,"Political Institutional Change, Obsolescing Le...",This paper studies the practice of integration...,https://www.jstor.org/stable/41682289,2012.0,MIR: Management International Review,10.2307_41682289,0.68824,218.0,1.0,0.692662,111.0,0.0,0.586081,98.0,0.0,journal-article-10.2307_41682289


## Merge true positives into hand-coded data via titles

### Extract and preprocess article titles

In [284]:
def get_title(citation, is_title = False):
    '''
    Extracts title from citation format & preprocesses it: 
    lower-case, remove punctuation, collapse and strip whitespace.
    
    Args:
        citation: in ASA format: 'author(s). year (four digits). title: subtitle. journal name & issue.'
            Non-string input is converted with str() before matching.
        is_title: binary = True if input is just a title (not full citation)
        
    Returns:
        str: extracted and preprocessed title: lower-case words separated by single spaces.
            Empty string if no title could be found in the citation.
    '''
    
    # Everything after 'YYYY.' up to the last '.', '?' or '!' that is still followed by a character
    title_pattern = r'(?<=\d{4}\.).*[\.\?\!].' # regex pattern for getting title
    
    if not is_title: 
        # Use the matched text itself. Slicing the printed form of the match list is avoided:
        # it drops the first letter of the title when only one space follows the year, and
        # it turns characters such as non-breaking spaces into literal escape text.
        title = ' '.join(re.findall(title_pattern, str(citation)))
    else:
        title = str(citation) # title is input, not citation
        
    # Same cleaning either way: replace each run of non-word characters with a single space
    words = re.sub(r'\W+', ' ', title).split()
    
    # Drop a stand-alone lower-case 'x' suffix (one or more trailing 'x' tokens).
    # Only whole tokens are removed, so a final word such as 'tax' is left intact.
    while words and words[-1] == 'x':
        words.pop()
    
    return ' '.join(words).lower()

In [285]:
# Extract & clean titles in true positives:
coded_h2['article_name_edited'] = coded_h2['citation'].map(get_title)

# Eyeball a random handful of citations next to the titles extracted from them
print("Sample of citations and extracted titles:\n")
for _, example in coded_h2.sample(n=7).iterrows():
    print(example['citation'], example['article_name_edited'], '', sep='\n')

Sample of citations and extracted titles:

Carroll, Glenn R.  1985.  Concentration and specialization:  Dynamics of niche width in populations of organizations.  American Journal of Sociology, 90:  1262-1283.
concentration and specialization dynamics of niche width in populations of organizations

Kanter, Rosabeth Moss.  1977.  Some effects of proportions on group life:  Skewed sex ratios and responses to token women.  American Journal of Sociology, 82:  965-990.
some effects of proportions on group life skewed sex ratios and responses to token women

Baldassarri, Delia, and Mario Diani.  2007.  The integrative power of civic networks.  American Journal of Sociology, 113 (3):  735-780.
the integrative power of civic networks

Gargiulo, Martin.  1993.  Two-step leverage: Managing constraint in organizational politics.  Administrative Science Quarterly, 38 (1):  1-19.
two step leverage managing constraint in organizational politics

Zatzick, Christopher D., Marta M. Elvira, and Lisa E. C

In [286]:
# Preprocess article names in meta data: lower-case, remove punctuation, strip whitespace
df['article_name_edited'] = [
    get_title(raw_title, is_title = True) for raw_title in df['article_name']
]

# Eyeball a random handful of original titles next to their cleaned versions
print("Sample of full and cleaned article titles:\n")
for _, example in df.sample(n=7).iterrows():
    print(example['article_name'], example['article_name_edited'], '', sep='\n')

Sample of full and cleaned article titles:

The Concept of Management Information Systems for Managers
the concept of management information systems for managers

Harnessing the Resources of Community: The Ultimate Performance Agenda
harnessing the resources of community the ultimate performance agenda

Action research: Solving real-world problems
action research solving real world problems

Sovereign Immunity and International Organization: The Case of DeLuca v. the United Nations
sovereign immunity and international organization the case of deluca v the united nations

THORSTEIN VEBLEN, HETERODOX ECONOMIST, IN RETROSPECT
thorstein veblen heterodox economist in retrospect

Dynamics of Organizational Expansion in National Systems of Education
dynamics of organizational expansion in national systems of education

The Legitimation of Inequality: Psychosocial Dispositions, Education, and Attitudes toward Income Inequality in China
the legitimation of inequality psychosocial dispositions e

### Match and check results

In [287]:
# Merge meta data into true positives using preprocessed article names
coded_h2 = coded_h2.merge(df, how = 'left', on = 'article_name_edited')

# A citation counts as matched when the left merge filled in a file name for it
has_match = coded_h2['file_name'].notnull()
uncoded_h2 = coded_h2[~has_match].reset_index(drop=True)
coded_h2 = coded_h2[has_match].reset_index(drop=True)

print("# articles matched so far:", len(coded_h2))
print("# articles left to match:", len(uncoded_h2))

uncoded_h2.head()

# articles matched so far: 207
# articles left to match: 190


Unnamed: 0,citation,cultural_score,relational_score,demographic_score,article_name_edited,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,"Barney, Jay B. 1986. Organizational culture:...",1.0,,,organizational culture can it be a source of s...,,,,,,,,,,,
1,"Dutton, Jane E., and Janet M. Dukerich. 1991....",1.0,,,keeping an eye on the mirror image and identit...,,,,,,,,,,,
2,"Goldberg, Amir, Sameer B. Srivastava, V. Govin...",1.0,,,fitting in or standing out the tradeoffs of st...,,,,,,,,,,,
3,"Morrill, Calvin. 1991. Conflict management, ...",1.0,,,conflict management honor and organizational c...,,,,,,,,,,,
4,"Rafaeli, Anat, and Michael G. Pratt. 1993. T...",1.0,,,tailored meanings a look at dress,,,,,,,,,,,


In [288]:
# Concatenate h2-coded data with hand-coded data
# join='inner' keeps only the columns present in both frames
frames_to_stack = [coded_df, coded_h2]
coded_df = pd.concat(frames_to_stack, join='inner')
coded_df

Unnamed: 0,cultural_score,relational_score,demographic_score,article_name,abstract,jstor_url,year,journal_title,edited_filename,file_name
0,1.0,0.0,0.0,"Intersecting Three Muddy Roads: Stability, Leg...",Several decades of research by multiple academ...,https://www.jstor.org/stable/25822540,2011,Journal of Managerial Issues,10.2307_25822540,journal-article-10.2307_25822540
1,1.0,0.0,0.0,Rational Decision Making as Performative Praxi...,Organizational theorists built their knowledge...,external-fulltext-any,2011,Organization Science,10.2307_20868880,journal-article-10.2307_20868880
2,1.0,1.0,0.0,From Fiefs to Clans and Network Capitalism: Ex...,China's rapid economic development is being ac...,https://www.jstor.org/stable/2393869,1986,Administrative Science Quarterly,10.2307_2393869,journal-article-10.2307_2393869
3,1.0,1.0,0.0,The Collective Strategy Framework: An Applicat...,This paper investigates empirically the compet...,https://www.jstor.org/stable/2392643,1984,Administrative Science Quarterly,10.2307_2392643,journal-article-10.2307_2392643
4,1.0,0.0,0.0,"Political Institutional Change, Obsolescing Le...",This paper studies the practice of integration...,https://www.jstor.org/stable/41682289,2012,MIR: Management International Review,10.2307_41682289,journal-article-10.2307_41682289
5,1.0,0.0,0.0,Culture and Meaning: Making Sense of Conflicti...,,https://www.jstor.org/stable/40397128,1989,International Studies of Management & Organiza...,10.2307_40397128,journal-article-10.2307_40397128
6,1.0,0.0,0.0,Linking Organizational Values to Relationships...,This study explores the organizational values ...,https://www.jstor.org/stable/2640266,1995,Organization Science,10.2307_2640266,journal-article-10.2307_2640266
7,1.0,0.0,0.0,Beyond the red tape: How victims of terrorism ...,We use a storyteller perspective to examine ho...,external-fulltext-any,2011,Journal of Organizational Behavior,10.2307_41415713,journal-article-10.2307_41415713
8,1.0,0.0,0.0,Embedding Sustainability Across the Organizati...,This article is a response to Haugh and Talwar...,https://www-jstor-org.proxy.library.georgetown...,2011,Academy of Management Learning & Education,10.2307_23100442,journal-article-10.2307_23100442
9,1.0,1.0,0.0,When Experience Meets National Institutional E...,We develop an institutional change perspective...,https://www.jstor.org/stable/27735492,2009,Strategic Management Journal,10.2307_27735492,journal-article-10.2307_27735492


In [289]:
# Extract and assess conclusive results for each perspective
# For each perspective, keep only rows that have a score for it, plus the file name
coded_cult_final = coded_df.loc[coded_df['cultural_score'].notnull(), ['cultural_score', 'file_name']]
coded_relt_final = coded_df.loc[coded_df['relational_score'].notnull(), ['relational_score', 'file_name']]
coded_demog_final = coded_df.loc[coded_df['demographic_score'].notnull(), ['demographic_score', 'file_name']]

print('Number of coded files for each perspective:\n')
for label, scored, score_col in (
    ('Cultural', coded_cult_final, 'cultural_score'),
    ('Relational', coded_relt_final, 'relational_score'),
    ('Demographic', coded_demog_final, 'demographic_score'),
):
    n_yes = len(scored[scored[score_col] == 1.0]) # positive cases
    n_no = len(scored[scored[score_col] == 0.0]) # negative cases
    print(f'{label}: {n_yes} yes, {n_no} no')

Number of coded files for each perspective:

Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no


## Merge full text with coded articles

## Save final coded data

In [295]:
# Save reference file, sorted by journal then article name
reference_columns = ['article_name', 'article_name_edited', 'journal_title', 'year', 'file_name', 'jstor_url']
reference_table = df[reference_columns].sort_values(
    by=['journal_title', 'article_name_edited'], ascending=True)
reference_table.to_excel(article_names_fp, index = False)

In [297]:
# Save lists of true positives: those that matched, those that didn't
# Matched: the combined coded data, ordered by article name
matched_table = coded_df.sort_values(by='article_name', ascending=True)
matched_table.to_excel(matched_fp, index=False)

# Not matched: citation plus the cleaned title that failed to match, ordered by that title
unmatched_table = uncoded_h2[['citation', 'article_name_edited']].sort_values(
    by='article_name_edited', ascending=True)
unmatched_table.to_excel(not_matched_fp, index=False)

In [260]:
# Save training data for classifiers: true positives + negatives for each perspective
# One pickle per perspective, written in the order cultural, relational, demographic
for training_set, destination in (
    (coded_cult_final, training_cult_fp),
    (coded_relt_final, training_relt_fp),
    (coded_demog_final, training_demog_fp),
):
    quickpickle_dump(training_set, destination)

Number of coded files for each perspective:
Cultural: 105 yes, 209 no
Relational: 92 yes, 230 no
Demographic: 77 yes, 249 no
