# Assemble hand-coded articles and prepare for modeling

@author: Jaren Haber, PhD, Georgetown University<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: November 2020<br>

@description: '''Loads and merges two datasets in preparation for classification model training. Saves final datasets for preprocessing and model training, one per perspective, each just with scores and raw text (to be preprocessed later). We're dealing with three theoretical perspectives in org. science (cultural, demographic, and relational) and two subject areas (sociology & management/OB, not differentiated here). The first dataset is of articles hand-coded by the author and Prof. Haveman, and it comes as a clean .csv file. This first contains lots of false positives (from the previous approach based on cosine measures), so it consists of mainly negative cases. The second dataset is of articles identified by Prof. Haveman as being foundation/definitive for each perspective. This comes as a list of citations, one per perspective, and requires some pretty heavy cleaning to match with articles in the main JSTOR articles dataset.'''

## Initialize

In [1]:
#!pip install openpyxl
#!pip install spacy
#import nltk; nltk.download('words')

# import packages
import imp, importlib # For working with modules
import pandas as pd # for working with dataframes
import numpy as np # for working with numbers
import pickle # For working with .pkl files
import re # for regex magic
from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply"
import sys # For terminal tricks
import _pickle as cPickle # Optimized version of pickle
import gc # For managing garbage collector
import timeit # For counting time taken for a process
import datetime # For working with dates & times
from datetime import date
import openpyxl # for saving in excel format
import tables
import random
import os; from os import listdir; from os.path import isfile, join
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

In [2]:
# define filepaths

thisday = date.today().strftime("%m%d%y")

cwd = os.getcwd()
root = str.replace(cwd, 'classification/preprocess', '')

# for text files
ocr_fp = root + 'jstor_data/ocr/' 

# Directory for prepared data and trained models: save files here
data_fp = root + 'classification/data/'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths
article_names_fp = data_fp + 'filtered_length_article_names.xlsx' # Filtered list of article names and general data, sorted by journal then article name

# dictionary counts (using core dictionaries) and matched subjects 
counts_fp = root + 'dictionary_methods/counts_and_subject.csv'

# per-article metadata with URLs
meta_fp = root + 'dictionary_methods/code/metadata_combined.h5' 

# per-article info on cosine scores using each dictionary (core or 100-term dictionaries??)
cosines_fp = root + 'models_storage/word_embeddings_data/text_with_cosine_scores_wdg_2020_oct27.csv'

# per-article preprocessed text (lines up with filtered_index.csv)
texts_fp = root + 'models_storage/word_embeddings_data/cleaned_text_nested_2020_sept5.pkl'
article_list_old_fp = root + 'models_storage/word_embeddings_data/filtered_index.csv'

# True positives (to be merged): Hand coded and foundational sets from H2
coded_11620 = data_fp + 'hand_coded/coded_sample_cleaned_111620.csv'
coded_cult_fp = data_fp + 'hand_coded/true_positives_cultural.csv'
coded_relt_fp = data_fp + 'hand_coded/true_positives_relational.csv'
coded_demog_fp = data_fp + 'hand_coded/true_positives_demographic.csv'

# Output: merging results and training data
matched_fp = data_fp + f'true_positives_matched_{str(thisday)}.xlsx'
not_matched_fp = data_fp + f'true_positives_match_failed_{str(thisday)}.xlsx'

training_cult_fp = data_fp + f'training_cultural_raw_{str(thisday)}.pkl'
training_relt_fp = data_fp + f'training_relational_raw_{str(thisday)}.pkl'
training_demog_fp = data_fp + f'training_demographic_raw_{str(thisday)}.pkl'

## Read in & merge data

### Meta data

In [3]:
# Read in metadata file
df_meta = pd.read_hdf(meta_fp)
df_meta.reset_index(drop=False, inplace=True) # extract file name from index

# For merging purposes, get ID alone from file name, e.g. 'journal-article-10.2307_2065002' -> '10.2307_2065002'
df_meta['edited_filename'] = df_meta['file_name'].apply(lambda x: x[16:]) 
df_meta = df_meta[["edited_filename", "article_name", "jstor_url", "abstract", "journal_title", "given_names", "primary_subject", "year", "type"]] # keep only relevant columns

df_meta.head()

Unnamed: 0,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type
0,10.2307_4167860,Cross-Dialectal Variation in Arabic: Competing...,https://www.jstor.org/stable/4167860,Most researchers of Arabic sociolinguistics as...,Language in Society,,Other,1979,research-article
1,10.2307_2578336,,https://www.jstor.org/stable/2578336,,Social Forces,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1983,book-review
2,10.2307_2654760,,https://www.jstor.org/stable/2654760,,Contemporary Sociology,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1998,book-review
3,10.2307_43242281,editor's note: A KNIGHT'S TALE,https://www.jstor.org/stable/43242281,,Corporate Knights,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Other,2005,misc
4,10.2307_42862018,,https://www.jstor.org/stable/42862018,,Social Science Quarterly,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1985,book-review


In [4]:
# Read in filtered index, counts
df = pd.read_csv(article_list_fp, low_memory=False, header=None, names=["file_name"])
df['edited_filename'] = df['file_name'].apply(lambda x: x[16:]) # New col with only article ID

df_counts = pd.read_csv(counts_fp, low_memory=False)
df_counts['edited_filename'] = df_counts['article_id'].apply(lambda x: x[16:]) # New col with only article ID
df_counts = df_counts[['edited_filename', 'word_count']]

# Merge meta data, counts into articles list DF
df = pd.merge(df, df_meta, how='left', on='edited_filename') # meta data
df = pd.merge(df, df_counts, how='left', on='edited_filename') # counts

# Filter to only full articles: >=1000 words (eliminates 69659 - 65372 = 4287 cases)
#df = df[df['word_count'] >= 1000]

# Show all columns in resulting DF
print("All columns:\n", list(df))
print()

print("Rows, cols in data:", df.shape)

df.head()

All columns:
 ['file_name', 'edited_filename', 'article_name', 'jstor_url', 'abstract', 'journal_title', 'given_names', 'primary_subject', 'year', 'type', 'word_count']

Rows, cols in data: (65372, 11)


Unnamed: 0,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,journal-article-10.2307_2065002,10.2307_2065002,Toward More Cumulative Inquiry,https://www.jstor.org/stable/2065002,,Contemporary Sociology,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1978,research-article,3529
1,journal-article-10.2307_3380821,10.2307_3380821,An Analysis of an Incentive Sick Leave Policy ...,https://www.jstor.org/stable/3380821,Local health departments are under tremendous ...,Public Productivity & Management Review,"[Werner, Werner, Konrad, Rudi, Paul, Jean, Rob...",Management & Organizational Behavior,1986,research-article,5195
2,journal-article-10.2307_2095822,10.2307_2095822,Local Friendship Ties and Community Attachment...,https://www.jstor.org/stable/2095822,This study presents a multilevel empirical tes...,American Sociological Review,"[Alice O., Peter, W. Erwin, Bert, Robert W., C...",Sociology,1983,research-article,7100
3,journal-article-10.2307_40836133,10.2307_40836133,Knowledge Transfer within the Multinational Fi...,https://www.jstor.org/stable/40836133,This paper examines the process of knowledge t...,MIR: Management International Review,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Management & Organizational Behavior,2005,research-article,7110
4,journal-article-10.2307_2579666,10.2307_2579666,Dynamics of Labor Market Segmentation in Polan...,https://www.jstor.org/stable/2579666,Research in the early 1980s showed that indust...,Social Forces,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1990,research-article,5313


### Coded data

In [43]:
# Read in true positives from H2--in citation format
coded_cult = pd.read_csv(coded_cult_fp, low_memory=False, header=None, 
                         encoding="windows-1252").rename(columns = {0:'citation'})
coded_relt = pd.read_csv(coded_relt_fp, low_memory=False, header=None, 
                         encoding="windows-1252").rename(columns = {0:'citation'})
coded_demog = pd.read_csv(coded_demog_fp, low_memory=False, header=None, 
                          encoding="windows-1252").rename(columns = {0:'citation'})

# Assign scores for each article from H2's foundational set
outside_score = np.NaN # score for two perspectives other than one coded intentionally (0 if negative case, NaN otherwise)

coded_cult['cultural_score'] = 1
coded_cult['relational_score'] = outside_score
coded_cult['demographic_score'] = outside_score

coded_relt['cultural_score'] = outside_score
coded_relt['relational_score'] = 1
coded_relt['demographic_score'] = outside_score

coded_demog['cultural_score'] = outside_score
coded_demog['relational_score'] = outside_score
coded_demog['demographic_score'] = 1

# Merge the three sets into one
coded_h2 = pd.concat([coded_cult, coded_relt, coded_demog])

coded_h2.head(10)

Unnamed: 0,citation,cultural_score,relational_score,demographic_score
0,"Barley, Stephen R. 1983. Semiotics and the s...",1.0,,
1,"Barney, Jay B. 1986. Organizational culture:...",1.0,,
2,"Castilla, Emilio J., and Stephen Benard. 2010...",1.0,,
3,"Dutton, Jane E., and Janet M. Dukerich. 1991....",1.0,,
4,"Fine, Gary Alan. 1984. Negotiated orders and...",1.0,,
5,"Fiol, C. Marlene. 2002. Capitalizing on para...",1.0,,
6,"Goldberg, Amir, Sameer B. Srivastava, V. Govin...",1.0,,
7,"Morrill, Calvin. 1991. Conflict management, ...",1.0,,
8,"Ouchi, William G., and Alan L. Wilkins. 1985....",1.0,,
9,"Pettigrew, Andrew M. 1979. On studying organ...",1.0,,


In [47]:
len(coded_h2)

391

In [45]:
# Read in hand-coded data
coded_df = pd.read_csv(coded_11620, low_memory=False, header=0)
coded_df.head()

Unnamed: 0,cultural_score,relational_score,demographic_score,article_name,abstract,jstor_url,year,journal_title,edited_filename,culture_word2vec_cosine,culture_ngram_count.1,cultural_author_count,relational_word2vec_cosine,relational_ngram_count.1,relational_author_count,demographic_word2vec_cosine,demographic_ngram_count.1,demographic_author_count
0,1.0,0.0,0.0,"Intersecting Three Muddy Roads: Stability, Leg...",Several decades of research by multiple academ...,https://www.jstor.org/stable/25822540,2011.0,Journal of Managerial Issues,10.2307_25822540,0.754487,227.0,7.0,0.61303,33.0,1.0,0.560983,119.0,0.0
1,1.0,0.0,0.0,Rational Decision Making as Performative Praxi...,Organizational theorists built their knowledge...,external-fulltext-any,2011.0,Organization Science,10.2307_20868880,0.721939,55.0,6.0,0.588276,16.0,1.0,0.534615,2.0,0.0
2,1.0,1.0,0.0,From Fiefs to Clans and Network Capitalism: Ex...,China's rapid economic development is being ac...,https://www.jstor.org/stable/2393869,1986.0,Administrative Science Quarterly,10.2307_2393869,0.715111,73.0,0.0,0.644378,66.0,0.0,0.530408,17.0,0.0
3,1.0,1.0,0.0,The Collective Strategy Framework: An Applicat...,This paper investigates empirically the compet...,https://www.jstor.org/stable/2392643,1984.0,Administrative Science Quarterly,10.2307_2392643,0.702606,114.0,9.0,0.671079,88.0,2.0,0.6746,124.0,9.0
4,1.0,0.0,0.0,"Political Institutional Change, Obsolescing Le...",This paper studies the practice of integration...,https://www.jstor.org/stable/41682289,2012.0,MIR: Management International Review,10.2307_41682289,0.68824,218.0,1.0,0.692662,111.0,0.0,0.586081,98.0,0.0


In [46]:
len(coded_df)

260

## Merge true positives into hand-coded data via titles

### Extract and preprocess article titles

In [48]:
def get_title(citation, is_title = False):
    '''
    Extracts title from citation format & preprocesses it: 
    lower-case, remove punctuation, strip whitespace.
    
    Args:
        citation: in ASA format: 'author(s). year (four digits). title: subtitle. journal name & issue.'
        is_title: binary = True if input is just a title (not full citation)
        
    Returns:
        str: extracted and preprocessed title
    '''
    
    title_pattern = r'(?<=\d{4}\.).*[\.\?\!].' # regex pattern for getting title
    
    if not is_title: 
        title = str(re.findall( # match to title pattern
            title_pattern, citation))[4:-4] # force to string (b/c weird output), slice to remove brackets and quotes
    else:
        title = str(citation) # title is input, not citation
        
    # Same cleaning either way: remove any non-words or whitespace (+any x suffix), then lower case
    title = re.sub('\W+', ' ', title).strip(' x').lower()
    
    return title

In [49]:
# Extract & clean titles in true positives:
coded_h2['article_name_edited'] = coded_h2['citation'].apply(
    lambda cite: get_title(cite))

print("Sample of citations and extracted titles:\n")
for i, row in coded_h2.sample(n=7).iterrows():
    print(row['citation'])
    print(row['article_name_edited'])
    print()

Sample of citations and extracted titles:

Pfeffer, Jeffrey.  1972.  Merger as a response to organizational interdependence.  Administrative Science Quarterly, 17:  382-394.
merger as a response to organizational interdependence

Gibson, Christina B., and Jennifer L. Gibbs.  2006.  Unpacking the concept of virtuality:  The effects of geographic dispersion, electronic dependence, dynamic structure, and national diversity on team innovation.  Administrative Science Quarterly, 51:  451-495.
unpacking the concept of virtuality the effects of geographic dispersion electronic dependence dynamic structure and national diversity on team innovation

Jay, Jason.  2013.  Navigating paradox as a mechanism of change and innovation in hybrid organizations.  Academy of Management Journal, 56 (1):  137-59. 
navigating paradox as a mechanism of change and innovation in hybrid organizations academy of management journal 56 1 137 59

Lomi, Alessandro.  1995.  The population ecology of organizational foun

In [50]:
# Preprocess article names in meta data: lower-case, remove punctuation, strip whitespace
df['article_name_edited'] = df['article_name'].apply(
    lambda title: get_title(title, is_title = True))

print("Sample of full and cleaned article titles:\n")
for i, row in df.sample(n=7).iterrows():
    print(row['article_name'])
    print(row['article_name_edited'])
    print()

Sample of full and cleaned article titles:

THE CHANGING ROLE OF THE JAPANESE MARKET AND ITS IMPACT ON GLOBAL STRATEGY
the changing role of the japanese market and its impact on global strategy

Top Manager and Network Effects on the Adoption of Innovative Management Practices: A Study of TQM in a Public Hospital System
top manager and network effects on the adoption of innovative management practices a study of tqm in a public hospital system

Race, Gender and Class Lessons from Hurricane Katrina
race gender and class lessons from hurricane katrina


Multilevel Covariance Structure Analysis by Fitting Multiple Single-Level Models
multilevel covariance structure analysis by fitting multiple single level models

OUR LANGUAGE POLICY IN A SOCIOLINGUISTIC PERSPECTIVE
our language policy in a sociolinguistic perspective

Management Systems in Multilevel Organizations
management systems in multilevel organizations



### Debugging file name matching

In [53]:
fragment = 'what is organizational imprinting'
#fragment = 'when worlds collide'
print(coded_df[coded_df['article_name'].apply(lambda name: fragment in str(name).lower())]['article_name'].value_counts())
print()
print(df_meta[df_meta['article_name'].apply(lambda name: fragment in str(name).lower())]['article_name_edited'].value_counts())
print()
print(df[df['article_name'].apply(lambda name: fragment in str(name).lower())]['article_name_edited'].value_counts())
print()
print(coded_h2[coded_h2['citation'].apply(lambda name: fragment in str(name).lower())]['citation'].value_counts())

Series([], Name: article_name, dtype: int64)

what is organizational imprinting cultural entrepreneurship in the founding of the paris opera    1
Name: article_name_edited, dtype: int64

what is organizational imprinting cultural entrepreneurship in the founding of the paris opera    1
Name: article_name_edited, dtype: int64

Johnson, Victoria.  2007.  What is organizational imprinting?  Cultural entrepreneurship in the founding of the Paris Opera.  American Journal of Sociology, 113 (1):  97-127.     1
Name: citation, dtype: int64


### Match using file names

In [10]:
# Merge meta data into true positives using preprocessed article names
coded_h2 = pd.merge(coded_h2, df, how = 'left', on = 'article_name_edited')

uncoded_h2 = coded_h2[coded_h2['file_name'].isnull()].reset_index(drop=True)
coded_h2 = coded_h2[coded_h2['file_name'].notnull()].reset_index(drop=True)
print("# articles matched so far:", str(len(coded_h2)))
print("# articles left to match:", str(len(uncoded_h2)))

uncoded_h2.head()

# articles matched so far: 207
# articles left to match: 190


Unnamed: 0,citation,cultural_score,relational_score,demographic_score,article_name_edited,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,"Barney, Jay B. 1986. Organizational culture:...",1.0,,,organizational culture can it be a source of s...,,,,,,,,,,,
1,"Dutton, Jane E., and Janet M. Dukerich. 1991....",1.0,,,keeping an eye on the mirror image and identit...,,,,,,,,,,,
2,"Goldberg, Amir, Sameer B. Srivastava, V. Govin...",1.0,,,fitting in or standing out the tradeoffs of st...,,,,,,,,,,,
3,"Morrill, Calvin. 1991. Conflict management, ...",1.0,,,conflict management honor and organizational c...,,,,,,,,,,,
4,"Rafaeli, Anat, and Michael G. Pratt. 1993. T...",1.0,,,tailored meanings a look at dress,,,,,,,,,,,


In [11]:
# For those not matched in final filtered data, try to match using original metadata 
df_meta['article_name_edited'] = df_meta['article_name'].apply(
    lambda title: get_title(title, is_title = True)) # clean title in metadata

# Filter to only those unmatched articles in metadata
uncoded_meta = uncoded_h2[
    uncoded_h2['article_name_edited'].apply(
        lambda title: title in 
        df_meta['article_name_edited'].tolist()) == True] # apply mask

print(f"Matched {str(len(uncoded_meta))} additional articles using original metadata:") # show count of result

# For consistency, remove from uncoded DF those that match with original meta data
uncoded_h2 = uncoded_h2[
    uncoded_h2['article_name_edited'].apply(
        lambda title: title not in 
        df_meta['article_name_edited'].tolist()) == True] # apply mask

# Drop empty columns, then merge in meta data
uncoded_meta = uncoded_meta.drop(columns = ['file_name', 'edited_filename', 'article_name', 
                                            'jstor_url', 'abstract', 'word_count', 'journal_title', 
                                            'given_names', 'primary_subject', 'year', 'type'])
uncoded_meta = pd.merge(uncoded_meta, df_meta, 
                       how = 'left', on = 'article_name_edited') # merge in meta data
uncoded_meta['file_name'] = uncoded_meta['edited_filename'].apply(lambda name: 'journal-article-' + name)
uncoded_meta['word_count'] = np.NaN # add empty word count column
uncoded_meta = uncoded_meta[list(coded_h2)] # align columns with other DF

# look at unmatched cases that are in metadata
# Should look identical to matched coded cases, makes merging easier
uncoded_meta

Matched 33 additional articles using original metadata:


Unnamed: 0,citation,cultural_score,relational_score,demographic_score,article_name_edited,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,"Berman, Elizabeth Popp 2006. Before the prof...",1.0,,,before the professional project success and fa...,journal-article-10.2307_4501749,10.2307_4501749,Before the Professional Project: Success and F...,https://www.jstor.org/stable/4501749,Theories of the professions do not sufficientl...,Theory and Society,,Sociology,1911,research-article,
1,"Clemens, Elisabeth S. 1993. Organizational r...",1.0,,,organizational repertoires and institutional c...,journal-article-10.2307_2781235,10.2307_2781235,Organizational Repertoires and Institutional C...,https://www.jstor.org/stable/2781235,Although social movements are often presumed t...,American Journal of Sociology,"[Greg, Yves, Kalevi, Hans Christian, Biplab, E...",Sociology,1912,research-article,
2,"Fligstein, Neil. 1985. The spread of the mul...",1.0,,,the spread of the multidivisional form among l...,journal-article-10.2307_2095547,10.2307_2095547,The Spread of the Multidivisional Form Among L...,https://www.jstor.org/stable/2095547,The multidivisional form is the favored form o...,American Sociological Review,"[JADWIGA, Paul J., Wayne, Charles, WILLIAM V.,...",Sociology,1970,research-article,
3,"Mizruchi, Mark S., and Linda C. Fein. 1999. ...",1.0,,,the social construction of organizational know...,journal-article-10.2307_2667051,10.2307_2667051,The Social Construction of Organizational Know...,https://www.jstor.org/stable/2667051,Arguing that knowledge in the social sciences ...,Administrative Science Quarterly,"[Duncan L., Melvin, DAVID L., Andrew, Paul R.,...",Management & Organizational Behavior,1970,research-article,
4,"Rowan, Brian. 1982. Organizational structure...",1.0,,,organizational structure and the institutional...,journal-article-10.2307_2392303,10.2307_2392303,Organizational Structure and the Institutional...,https://www.jstor.org/stable/2392303,This paper develops an institutional approach ...,Administrative Science Quarterly,"[Jean, Stanley L., Robert E., Ussama, Leonard,...",Management & Organizational Behavior,1928,research-article,
5,"Tolbert, Pamela S., and Lynne G. Zucker. 1983...",1.0,,,institutional sources of change in the formal ...,journal-article-10.2307_2392383,10.2307_2392383,Institutional Sources of Change in the Formal ...,https://www.jstor.org/stable/2392383,This paper investigates the diffusion and inst...,Administrative Science Quarterly,"[Eva, Winfried, María Aidé, Kurt H., Jennifer,...",Management & Organizational Behavior,1915,research-article,
6,"Zucker, Lynne G. 1977. The role of instituti...",1.0,,,the role of institutionalization in cultural p...,journal-article-10.2307_2094862,10.2307_2094862,The Role of Institutionalization in Cultural P...,https://www.jstor.org/stable/2094862,Traditional approaches to institutionalization...,American Sociological Review,"[Duncan L., Melvin, DAVID L., Andrew, Paul R.,...",Sociology,1970,research-article,
7,"Boeker, Warren. 1989. The development and in...",,1.0,,the development and institutionalization of su...,journal-article-10.2307_2393150,10.2307_2393150,The Development and Institutionalization of Su...,https://www.jstor.org/stable/2393150,The research reported here examined the effect...,Administrative Science Quarterly,"[ROBERT, Riziki S., Idris S., Patrick, L. A., ...",Management & Organizational Behavior,1965,research-article,
8,"Davis, Gerald F. 1991. Agents without princi...",,1.0,,agents without principles the spread of the po...,journal-article-10.2307_2393275,10.2307_2393275,Agents without Principles? The Spread of the P...,https://www.jstor.org/stable/2393275,This study compares the agency theory of the f...,Administrative Science Quarterly,"[Mabel, David, Jorge A., Benito E., Danièle, L...",Management & Organizational Behavior,1964,research-article,
9,"Fligstein, Neil. 1987. The intraorganization...",,1.0,,the intraorganizational power struggle rise of...,journal-article-10.2307_2095391,10.2307_2095391,The Intraorganizational Power Struggle: Rise o...,https://www.jstor.org/stable/2095391,Choosing a president in an organization is an ...,American Sociological Review,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1970,research-article,


In [23]:
# Concatenate h2-coded data with hand-coded data
coded_df = pd.concat([coded_df, coded_h2, uncoded_meta], axis=0, join='inner')
coded_df = coded_df[coded_df['edited_filename'].notnull()] # Remove empty row
coded_df

Unnamed: 0,cultural_score,relational_score,demographic_score,article_name,abstract,jstor_url,year,journal_title,edited_filename
0,1.0,0.0,0.0,"Intersecting Three Muddy Roads: Stability, Leg...",Several decades of research by multiple academ...,https://www.jstor.org/stable/25822540,2011,Journal of Managerial Issues,10.2307_25822540
1,1.0,0.0,0.0,Rational Decision Making as Performative Praxi...,Organizational theorists built their knowledge...,external-fulltext-any,2011,Organization Science,10.2307_20868880
2,1.0,1.0,0.0,From Fiefs to Clans and Network Capitalism: Ex...,China's rapid economic development is being ac...,https://www.jstor.org/stable/2393869,1986,Administrative Science Quarterly,10.2307_2393869
3,1.0,1.0,0.0,The Collective Strategy Framework: An Applicat...,This paper investigates empirically the compet...,https://www.jstor.org/stable/2392643,1984,Administrative Science Quarterly,10.2307_2392643
4,1.0,0.0,0.0,"Political Institutional Change, Obsolescing Le...",This paper studies the practice of integration...,https://www.jstor.org/stable/41682289,2012,MIR: Management International Review,10.2307_41682289
5,1.0,0.0,0.0,Culture and Meaning: Making Sense of Conflicti...,,https://www.jstor.org/stable/40397128,1989,International Studies of Management & Organiza...,10.2307_40397128
6,1.0,0.0,0.0,Linking Organizational Values to Relationships...,This study explores the organizational values ...,https://www.jstor.org/stable/2640266,1995,Organization Science,10.2307_2640266
7,1.0,0.0,0.0,Beyond the red tape: How victims of terrorism ...,We use a storyteller perspective to examine ho...,external-fulltext-any,2011,Journal of Organizational Behavior,10.2307_41415713
8,1.0,0.0,0.0,Embedding Sustainability Across the Organizati...,This article is a response to Haugh and Talwar...,https://www-jstor-org.proxy.library.georgetown...,2011,Academy of Management Learning & Education,10.2307_23100442
9,1.0,1.0,0.0,When Experience Meets National Institutional E...,We develop an institutional change perspective...,https://www.jstor.org/stable/27735492,2009,Strategic Management Journal,10.2307_27735492


## Merge full text with coded articles

### Create file paths from filenames

In [13]:
tqdm.pandas(desc='Creating file paths...')
coded_df['file_path'] = coded_df['edited_filename'].progress_apply(
    lambda name: ocr_fp + 'journal-article-' + str(name) + '.txt')

# Check out results
for i, row in coded_df.iloc[:10,].iterrows():
    print(row['file_path'])

Creating file paths...: 100%|██████████| 499/499 [00:00<00:00, 357099.08it/s]

/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_25822540.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_20868880.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_2393869.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_2392643.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_41682289.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_40397128.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_2640266.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_41415713.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_23100442.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_27735492.txt





### Read in text files

In [14]:
def read_file(fp):
    '''Reads text file.
    
    Args:
        fp: path to text file
        
    Returns:
        str: text from file'''
    
    with open(fp, 'r') as myfile:
        text = myfile.read()
        
    return text

tqdm.pandas(desc='Reading text files...')
coded_df['text'] = coded_df['file_path'].progress_apply(lambda fp: read_file(fp))

Reading text files...: 100%|██████████| 499/499 [00:00<00:00, 6137.96it/s]


In [15]:
tqdm.pandas(desc='Reading text files...')
coded_df['text'] = coded_df['file_path'].progress_apply(lambda fp: read_file(fp))

# Check out results
for i, row in coded_df.iloc[:10,].iterrows():
    print(row['text'][:100])
    print()

Reading text files...: 100%|██████████| 499/499 [00:00<00:00, 7330.88it/s]

<plain_text><page sequence="1">JOURNAL OF MANAGERIAL ISSUES Vol. XXIII Number 1 Spring 2011: 96-112 

<plain_text><page sequence="1">Organization ht icna Vol. 22, No. 3, May-June 2011, pp. 573-586 issn 

<plain_text><page sequence="1">From Fiefs to Clans and Network Capitalism: Explaining China's Emergi

<plain_text><page sequence="1">The Collective Strategy Framework: An Applica- tion to Competing Pre-

<plain_text><page sequence="1">Manag Int Rev (2012) 52:847-877 DOI 10.1007/sl 1575-012-0141-4 RESEAR

<plain_text><page sequence="1">Int. Studies ofMgt. &amp; Org.t Vol. 19, No. 3, pp. 64-81. M.E.Sharpe

<plain_text><page sequence="1">Linking Organizational Values to Relationships with External Constitu

<plain_text><page sequence="1">Journal of Organizational Behavior, J. Organiz. Behav. 32, 938-954 (2

<plain_text><page sequence="1">® Academy oí Management Learning &amp; Education, 2011, Vol. 10, No. 

<plain_text><page sequence="1">Strategie Management Journal Strat. Mgmt. /.. 30: 1




## Check & save final coded data

In [16]:
# Extract and assess conclusive results for each perspective
coded_cult_final = coded_df[coded_df['cultural_score'].notnull()][['text', 'cultural_score']]
coded_relt_final = coded_df[coded_df['relational_score'].notnull()][['text', 'relational_score']]
coded_demog_final = coded_df[coded_df['demographic_score'].notnull()][['text', 'demographic_score']]

print('Number of coded files for each perspective:\n')
print(f'Cultural: {str(len(coded_cult_final[coded_cult_final["cultural_score"]==1.0]))} yes, {str(len(coded_cult_final[coded_cult_final["cultural_score"]==0.0]))} no')
print(f'Relational: {str(len(coded_relt_final[coded_relt_final["relational_score"]==1.0]))} yes, {str(len(coded_relt_final[coded_relt_final["relational_score"]==0.0]))} no')
print(f'Demographic: {str(len(coded_demog_final[coded_demog_final["demographic_score"]==1.0]))} yes, {str(len(coded_demog_final[coded_demog_final["demographic_score"]==0.0]))} no')

Number of coded files for each perspective:

Cultural: 112 yes, 209 no
Relational: 104 yes, 229 no
Demographic: 91 yes, 248 no


In [17]:
# Save reference file, sorted by journal then article name
df[['article_name', 'article_name_edited', 'journal_title', 'year', 'file_name', 'jstor_url']].sort_values(
    by=['journal_title', 'article_name_edited'], ascending=True).to_excel(
    article_names_fp, index = False)

In [18]:
# Save lists of true positives: those that matched, those that didn't
coded_df.sort_values(by='article_name', ascending=True).to_excel(
    matched_fp, index=False)

uncoded_h2[['citation', 'article_name_edited']].sort_values(
    by='article_name_edited', ascending=True).to_excel(
    not_matched_fp, index=False)

In [19]:
# Save training data for classifiers: true positives + negatives for each perspective
quickpickle_dump(coded_cult_final, training_cult_fp)
quickpickle_dump(coded_relt_final, training_relt_fp)
quickpickle_dump(coded_demog_final, training_demog_fp)