# Assemble hand-coded articles and prepare for modeling

@author: Jaren Haber, PhD, Georgetown University<br>
@coauthors: Prof. Heather Haveman, UC Berkeley; Yoon Sung Hong, Wayfair<br>
@contact: Jaren.Haber@georgetown.edu<br>
@project: Computational Literature Review of Organizational Scholarship<br>
@date: November 2020<br>

@description: '''Loads and merges two datasets in preparation for classification model training. Saves final datasets for preprocessing and model training, one per perspective, each just with scores and raw text (to be preprocessed later). We're dealing with three theoretical perspectives in org. science (cultural, demographic, and relational) and two subject areas (sociology & management/OB, not differentiated here). The first dataset is of articles hand-coded by the author and Prof. Haveman, and it comes as a clean .csv file. This first contains lots of false positives (from the previous approach based on cosine measures), so it consists of mainly negative cases. The second dataset is of articles identified by Prof. Haveman as being foundation/definitive for each perspective. This comes as a list of citations, one per perspective, and requires some pretty heavy cleaning to match with articles in the main JSTOR articles dataset.'''

## Initialize

In [11]:
!pip install openpyxl
!pip install tables
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |████████████████████████████████| 51kB 6.0MB/s eta 0:00:011
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.0


In [12]:
#!pip install openpyxl
#!pip install spacy
#!pip install fuzzywuzzy
#!pip install python-Levenshtein
#import nltk; nltk.download('words')

# import packages
import imp, importlib # For working with modules
import pandas as pd # for working with dataframes
import numpy as np # for working with numbers
import pickle # For working with .pkl files
import re # for regex magic
from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply"
import sys # For terminal tricks
import _pickle as cPickle # Optimized version of pickle
import gc # For managing garbage collector
import timeit # For counting time taken for a process
import datetime # For working with dates & times
from datetime import date
import openpyxl # for saving in excel format
import tables
from fuzzywuzzy import fuzz, process
import random
import os; from os import listdir; from os.path import isfile, join
from quickpickle import quickpickle_dump, quickpickle_load # custom scripts for quick saving & loading to pickle format

In [13]:
# define filepaths

thisday = date.today().strftime("%m%d%y")

cwd = os.getcwd()
root = str.replace(cwd, 'classification/preprocess', '')

# for text files
ocr_fp = root + 'jstor_data/ocr/' 

# Directory for prepared data and trained models: save files here
data_fp = root + 'classification/data/'

# Current article lists
article_list_fp = data_fp + 'filtered_length_index.csv' # Filtered index of research articles
article_paths_fp = data_fp + 'filtered_length_article_paths.csv' # List of article file paths
article_names_fp = data_fp + 'filtered_length_article_names.xlsx' # Filtered list of article names and general data, sorted by journal then article name

# dictionary counts (using core dictionaries) and matched subjects 
counts_fp = root + 'dictionary_methods/counts_and_subject.csv'

# per-article metadata with URLs
meta_fp = root + 'dictionary_methods/code/metadata_combined.h5' 

# per-article info on cosine scores using each dictionary (core or 100-term dictionaries??)
cosines_fp = root + 'models_storage/word_embeddings_data/text_with_cosine_scores_wdg_2020_oct27.csv'

# per-article preprocessed text (lines up with filtered_index.csv)
texts_fp = root + 'models_storage/word_embeddings_data/cleaned_text_nested_2020_sept5.pkl'
article_list_old_fp = root + 'models_storage/word_embeddings_data/filtered_index.csv'

# True positives (to be merged): Hand coded and foundational sets from H2
coded_all_fp1 = data_fp + 'hand_coded/coded_sample_cleaned_111620.csv'
coded_all_fp2 = data_fp + 'hand_coded/coded_sample_cleaned_022221.csv'
coded_orgs_fp = data_fp + 'hand_coded/org_soc_coded_sample_022221.csv'
coded_cult_fp = data_fp + 'hand_coded/true_positives_cultural.csv'
coded_relt_fp = data_fp + 'hand_coded/true_positives_relational.csv'
coded_demog_fp = data_fp + 'hand_coded/true_positives_demographic.csv'

# Output: merging results and training data
matched_fp = data_fp + f'true_positives_matched_{str(thisday)}.xlsx'
not_matched_fp = data_fp + f'true_positives_match_failed_{str(thisday)}.xlsx'

training_cult_fp = data_fp + f'training_cultural_raw_{str(thisday)}.pkl'
training_relt_fp = data_fp + f'training_relational_raw_{str(thisday)}.pkl'
training_demog_fp = data_fp + f'training_demographic_raw_{str(thisday)}.pkl'
training_orgs_fp = data_fp + f'training_orgs_raw_{str(thisday)}.pkl'

## Read in & merge data

### Meta data

In [14]:
# Read in metadata file
df_meta = pd.read_hdf(meta_fp)
df_meta.reset_index(drop=False, inplace=True) # extract file name from index

# For merging purposes, get ID alone from file name, e.g. 'journal-article-10.2307_2065002' -> '10.2307_2065002'
df_meta['edited_filename'] = df_meta['file_name'].apply(lambda x: x[16:]) 
df_meta = df_meta[["edited_filename", "article_name", "jstor_url", "abstract", "journal_title", "given_names", "primary_subject", "year", "type"]] # keep only relevant columns

df_meta.head()

Unnamed: 0,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type
0,10.2307_4167860,Cross-Dialectal Variation in Arabic: Competing...,https://www.jstor.org/stable/4167860,Most researchers of Arabic sociolinguistics as...,Language in Society,,Other,1979,research-article
1,10.2307_2578336,,https://www.jstor.org/stable/2578336,,Social Forces,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1983,book-review
2,10.2307_2654760,,https://www.jstor.org/stable/2654760,,Contemporary Sociology,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1998,book-review
3,10.2307_43242281,editor's note: A KNIGHT'S TALE,https://www.jstor.org/stable/43242281,,Corporate Knights,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Other,2005,misc
4,10.2307_42862018,,https://www.jstor.org/stable/42862018,,Social Science Quarterly,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1985,book-review


In [15]:
# Read in filtered index, counts
df = pd.read_csv(article_list_fp, low_memory=False, header=None, names=["file_name"])
df['edited_filename'] = df['file_name'].apply(lambda x: x[16:]) # New col with only article ID

df_counts = pd.read_csv(counts_fp, low_memory=False)
df_counts['edited_filename'] = df_counts['article_id'].apply(lambda x: x[16:]) # New col with only article ID
df_counts = df_counts[['edited_filename', 'word_count']]

# Merge meta data, counts into articles list DF
df = pd.merge(df, df_meta, how='left', on='edited_filename') # meta data
df = pd.merge(df, df_counts, how='left', on='edited_filename') # counts

# Filter to only full articles: >=1000 words (eliminates 69659 - 65372 = 4287 cases)
#df = df[df['word_count'] >= 1000]

# Eliminate empty rows
df = df[df['article_name'].notnull()]

# Show all columns in resulting DF
print("All columns:\n", list(df))
print()

print("Rows, cols in data:", df.shape)

df.head()

All columns:
 ['file_name', 'edited_filename', 'article_name', 'jstor_url', 'abstract', 'journal_title', 'given_names', 'primary_subject', 'year', 'type', 'word_count']

Rows, cols in data: (65370, 11)


Unnamed: 0,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,journal-article-10.2307_2065002,10.2307_2065002,Toward More Cumulative Inquiry,https://www.jstor.org/stable/2065002,,Contemporary Sociology,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1978,research-article,3529
1,journal-article-10.2307_3380821,10.2307_3380821,An Analysis of an Incentive Sick Leave Policy ...,https://www.jstor.org/stable/3380821,Local health departments are under tremendous ...,Public Productivity & Management Review,"[Werner, Werner, Konrad, Rudi, Paul, Jean, Rob...",Management & Organizational Behavior,1986,research-article,5195
2,journal-article-10.2307_2095822,10.2307_2095822,Local Friendship Ties and Community Attachment...,https://www.jstor.org/stable/2095822,This study presents a multilevel empirical tes...,American Sociological Review,"[Alice O., Peter, W. Erwin, Bert, Robert W., C...",Sociology,1983,research-article,7100
3,journal-article-10.2307_40836133,10.2307_40836133,Knowledge Transfer within the Multinational Fi...,https://www.jstor.org/stable/40836133,This paper examines the process of knowledge t...,MIR: Management International Review,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Management & Organizational Behavior,2005,research-article,7110
4,journal-article-10.2307_2579666,10.2307_2579666,Dynamics of Labor Market Segmentation in Polan...,https://www.jstor.org/stable/2579666,Research in the early 1980s showed that indust...,Social Forces,"[Ariela, ARTHUR J., John A., Marilyn, Janemari...",Sociology,1990,research-article,5313


In [20]:
df['word_count'].max()

76536

### Coded data

In [180]:
# Read in true positives from H2--in citation format
coded_cult = pd.read_csv(coded_cult_fp, low_memory=False, header=None, 
                         encoding="windows-1252").rename(columns = {0:'citation'})
coded_relt = pd.read_csv(coded_relt_fp, low_memory=False, header=None, 
                         encoding="windows-1252").rename(columns = {0:'citation'})
coded_demog = pd.read_csv(coded_demog_fp, low_memory=False, header=None, 
                          encoding="windows-1252").rename(columns = {0:'citation'})

# Assign scores for each article from H2's foundational set
outside_score = np.NaN # score for two perspectives other than one coded intentionally (0 if negative case, NaN otherwise)

coded_cult['cultural_score'] = 1
coded_cult['relational_score'] = outside_score
coded_cult['demographic_score'] = outside_score

coded_relt['cultural_score'] = outside_score
coded_relt['relational_score'] = 1
coded_relt['demographic_score'] = outside_score

coded_demog['cultural_score'] = outside_score
coded_demog['relational_score'] = outside_score
coded_demog['demographic_score'] = 1

# Merge the three sets into one
coded_h2 = pd.concat([coded_cult, coded_relt, coded_demog])

print(str(len(coded_h2)))
coded_h2.head(10)

Unnamed: 0,citation,cultural_score,relational_score,demographic_score
0,"Barley, Stephen R. 1983. Semiotics and the s...",1.0,,
1,"Barney, Jay B. 1986. Organizational culture:...",1.0,,
2,"Castilla, Emilio J., and Stephen Benard. 2010...",1.0,,
3,"Dutton, Jane E., and Janet M. Dukerich. 1991....",1.0,,
4,"Fine, Gary Alan. 1984. Negotiated orders and...",1.0,,
5,"Fiol, C. Marlene. 2002. Capitalizing on para...",1.0,,
6,"Goldberg, Amir, Sameer B. Srivastava, V. Govin...",1.0,,
7,"Morrill, Calvin. 1991. Conflict management, ...",1.0,,
8,"Ouchi, William G., and Alan L. Wilkins. 1985....",1.0,,
9,"Pettigrew, Andrew M. 1979. On studying organ...",1.0,,


In [None]:
# Read in hand-coded organizational soc data
coded_orgs_df = pd.read_csv(coded_orgs_fp, low_memory=False, header=0)
print(str(len(coded_orgs_df)))
coded_orgs_df.head()

In [182]:
# Read in hand-coded perspective data
keepcols = ['cultural_score', 'relational_score', 'demographic_score', 
             'article_name', 'abstract', 'jstor_url', 
             'year', 'journal_title', 'edited_filename']
coded_df1 = pd.read_csv(coded_all_fp1, low_memory=False, header=0)[keepcols]
coded_df2 = pd.read_csv(coded_all_fp2, low_memory=False, header=0)[keepcols]
coded_df = pd.concat([coded_df1, coded_df2])
print(str(len(coded_df)))
coded_df.head()

Unnamed: 0,cultural_score,relational_score,demographic_score,article_name,abstract,jstor_url,year,journal_title,edited_filename,culture_word2vec_cosine,culture_ngram_count.1,cultural_author_count,relational_word2vec_cosine,relational_ngram_count.1,relational_author_count,demographic_word2vec_cosine,demographic_ngram_count.1,demographic_author_count
0,1.0,0.0,0.0,"Intersecting Three Muddy Roads: Stability, Leg...",Several decades of research by multiple academ...,https://www.jstor.org/stable/25822540,2011.0,Journal of Managerial Issues,10.2307_25822540,0.754487,227.0,7.0,0.61303,33.0,1.0,0.560983,119.0,0.0
1,1.0,0.0,0.0,Rational Decision Making as Performative Praxi...,Organizational theorists built their knowledge...,external-fulltext-any,2011.0,Organization Science,10.2307_20868880,0.721939,55.0,6.0,0.588276,16.0,1.0,0.534615,2.0,0.0
2,1.0,1.0,0.0,From Fiefs to Clans and Network Capitalism: Ex...,China's rapid economic development is being ac...,https://www.jstor.org/stable/2393869,1986.0,Administrative Science Quarterly,10.2307_2393869,0.715111,73.0,0.0,0.644378,66.0,0.0,0.530408,17.0,0.0
3,1.0,1.0,0.0,The Collective Strategy Framework: An Applicat...,This paper investigates empirically the compet...,https://www.jstor.org/stable/2392643,1984.0,Administrative Science Quarterly,10.2307_2392643,0.702606,114.0,9.0,0.671079,88.0,2.0,0.6746,124.0,9.0
4,1.0,0.0,0.0,"Political Institutional Change, Obsolescing Le...",This paper studies the practice of integration...,https://www.jstor.org/stable/41682289,2012.0,MIR: Management International Review,10.2307_41682289,0.68824,218.0,1.0,0.692662,111.0,0.0,0.586081,98.0,0.0


## Merge true positives into hand-coded data via titles

### Extract and preprocess article titles

In [184]:
def get_title(citation, is_title = False):
    '''
    Extracts title from citation format & preprocesses it: 
    lower-case, remove punctuation, strip whitespace.
    
    Args:
        citation: in ASA format: 'author(s). year (four digits). title: subtitle. journal name & issue.'
        is_title: binary = True if input is just a title (not full citation)
        
    Returns:
        str: extracted and preprocessed title
    '''
    
    title_pattern = r'(?<=\d{4}\.).*[\.\?\!].' # regex pattern for getting title
    citation = citation.strip() # remove trailing white spaces
    
    if not is_title: # is citation format
        title = str(re.findall( # match to title pattern
            title_pattern, citation))[4:-4] # force to string (b/c weird output), slice to remove brackets and quotes
    else:
        title = str(citation) # title is input, not citation
        
    # Same cleaning either way: remove any non-words or whitespace (+any x suffix), then lower case
    title = re.sub('\W+', ' ', title).strip(' x').lower()
    
    return title

In [185]:
# Test out title extraction using some atypical cases
challenge1 = 'Pache, Anne-Claire, and Filipe Santos.  2010.  When worlds collide:  The internal dynamics of organizational responses to conflicting institutional demands.  Academy of Management Review, 35 (3):  455-476.'
challenge2 = 'Johnson, Victoria.  2007.  What is organizational imprinting?  Cultural entrepreneurship in the founding of the Paris Opera.  American Journal of Sociology, 113 (1):  97-127.'
challenge3 = 'Morrill, Calvin.  1991.  Conflict management, honor, and organizational change.  American Journal of Sociology, 97 (3):  585-621. '
challenge4 = 'Rivera, Lauren A.  2012.  Hiring as cultural matching:  The case of elite professional service firms.  American Sociological Review, 77 (6):  999-1022. '
challenge5 = 'Clemens, Elisabeth S., and James M. Cook.  1999.  Politics and institutionalism:  Explaining durability and change.  Annual Review of Sociology, 25:  441-466. '
challenge6 = 'DiMaggio, Paul J., and Walter W. Powell.  1983.  The iron cage revisited:  Institutional isomorphism and collective rationality in organizational fields.  American Sociological Review, 48:  147-160. '
challenge7 = 'Jay, Jason.  2013.  Navigating paradox as a mechanism of change and innovation in hybrid organizations.  Academy of Management Journal, 56 (1):  137-59. '
challenge8 = 'Meyer, John W., and Brian Rowan.  1977.  Institutionalized organizations:  Formal structure as myth and ceremony.  American Journal of Sociology, 83:  340-363.  (http://www.jstor.org/stable/2778293)'

challenges = [challenge1, challenge2, challenge3, challenge4, challenge5, challenge6, challenge7, challenge8]

for challenge in challenges:
    print(get_title(challenge))
    print()

when worlds collide the internal dynamics of organizational responses to conflicting institutional demands

what is organizational imprinting cultural entrepreneurship in the founding of the paris opera

conflict management honor and organizational change

hiring as cultural matching the case of elite professional service firms

politics and institutionalism explaining durability and change

the iron cage revisited institutional isomorphism and collective rationality in organizational fields

navigating paradox as a mechanism of change and innovation in hybrid organizations

institutionalized organizations formal structure as myth and ceremony american journal of sociology 83 340 363 http www jstor



In [186]:
# Extract & clean titles in true positives:
coded_h2['article_name_edited'] = coded_h2['citation'].apply(
    lambda cite: get_title(cite))

print("Sample of citations and extracted titles:\n")
for i, row in coded_h2.sample(n=7).iterrows():
    print(row['citation'])
    print(row['article_name_edited'])
    print()

Sample of citations and extracted titles:

Jehn, Karen A., Gregory B. Northcraft, and Margaret A. Neale.  1999.  Why differences make a difference:  A field study of diversity, conflict, and performance in workgroups.  Administrative Science Quarterly, 44:  741-763.
why differences make a difference a field study of diversity conflict and performance in workgroups

Dobrev, Stanislav D., Tai-Young Kim, and Glenn R. Carroll.  2003.  Shifting gears, shifting niches:  Organizational inertia and change in the evolution of the U.S. automobile manufacturers 1885–1981.  Organization Science, 14:  264-282.
shifting gears shifting niches organizational inertia and change in the evolution of the u s automobile manufacturers 1885 1981

Gulati, Ranjay.  1995.  Social structure and alliance formation patterns:  A longitudinal analysis.  Administrative Science Quarterly, 40:  619-652.
social structure and alliance formation patterns a longitudinal analysis

Levinthal, Daniel A.  1991.  Random walks a

In [187]:
# Preprocess article names in meta data: lower-case, remove punctuation, strip whitespace
df['article_name_edited'] = df['article_name'].apply(
    lambda title: get_title(title, is_title = True))

print("Sample of full and cleaned article titles:\n")
for i, row in df.sample(n=7).iterrows():
    print(row['article_name'])
    print(row['article_name_edited'])
    print()

Sample of full and cleaned article titles:

Body to Body: On the Political Anatomy of Crowds
body to body on the political anatomy of crowds

Research on Leadership Selection and Training: One View of the Future
research on leadership selection and training one view of the future

Altering the Product Life Cycle of Consumer Durables: The Case of Minivans
altering the product life cycle of consumer durables the case of minivans

Competitive Position and Promotion Rates: Commercial Television Station Top Management, 1953-1988
competitive position and promotion rates commercial television station top management 1953 1988

Nation as a Context for Strategy: The Effects of National Characteristics on Business-Level Strategies
nation as a context for strategy the effects of national characteristics on business level strategies

When “You” Become One of “Them”
when you become one of them

Vulnerable Elderly Households: Expenditures on Necessities by Older Americans
vulnerable elderly household

### Match using file names

In [188]:
# Merge meta data into true positives using preprocessed article names
coded_h2 = pd.merge(coded_h2, df, how = 'left', on = 'article_name_edited')

uncoded_h2 = coded_h2[coded_h2['file_name'].isnull()].reset_index(drop=True)
coded_h2 = coded_h2[coded_h2['file_name'].notnull()].reset_index(drop=True)
print("# articles matched so far:", str(len(coded_h2)))
print("# articles left to match:", str(len(uncoded_h2)))

uncoded_h2.head()

# articles matched so far: 243
# articles left to match: 154


Unnamed: 0,citation,cultural_score,relational_score,demographic_score,article_name_edited,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,"Barney, Jay B. 1986. Organizational culture:...",1.0,,,organizational culture can it be a source of s...,,,,,,,,,,,
1,"Dutton, Jane E., and Janet M. Dukerich. 1991....",1.0,,,keeping an eye on the mirror image and identit...,,,,,,,,,,,
2,"Goldberg, Amir, Sameer B. Srivastava, V. Govin...",1.0,,,fitting in or standing out the tradeoffs of st...,,,,,,,,,,,
3,"Rafaeli, Anat, and Michael G. Pratt. 1993. T...",1.0,,,tailored meanings a look at dress,,,,,,,,,,,
4,"Trice, Harrison M., and Janice M. Beyer. 1984...",1.0,,,studying organizational culture through rites ...,,,,,,,,,,,


In [189]:
# For those not matched in final filtered data, try to match using original metadata 
df_meta = df_meta[df_meta['article_name'].notnull()] # eliminate cases with no name in metadata
df_meta['article_name_edited'] = df_meta['article_name'].apply(
    lambda title: get_title(title, is_title = True)) # clean title in metadata

# Filter to only those unmatched articles in metadata
uncoded_meta = uncoded_h2[
    uncoded_h2['article_name_edited'].apply(
        lambda title: title in 
        df_meta['article_name_edited'].tolist()) == True] # apply mask

# For consistency, remove from uncoded DF those that match with original meta data
uncoded_h2 = uncoded_h2[
    uncoded_h2['article_name_edited'].apply(
        lambda title: title not in 
        df_meta['article_name_edited'].tolist()) == True] # apply mask

print(f"There remain {str(len(uncoded_h2))} unmatched articles.")
print(f"This is after matching {str(len(uncoded_meta))} more articles using original metadata:") # show count of result

# Drop empty columns, then merge in meta data
uncoded_meta = uncoded_meta.drop(columns = ['file_name', 'edited_filename', 'article_name', 
                                            'jstor_url', 'abstract', 'word_count', 'journal_title', 
                                            'given_names', 'primary_subject', 'year', 'type'])
uncoded_meta = pd.merge(uncoded_meta, df_meta, 
                       how = 'left', on = 'article_name_edited') # merge in meta data
uncoded_meta['file_name'] = uncoded_meta['edited_filename'].apply(lambda name: 'journal-article-' + name)
uncoded_meta['word_count'] = np.NaN # add empty word count column
uncoded_meta = uncoded_meta[list(coded_h2)] # align columns with other DF

# look at unmatched cases that are in metadata
# Should look identical to matched coded cases, makes merging easier
uncoded_meta

There remain 116 unmatched articles.
This is after matching 38 more articles using original metadata:


Unnamed: 0,citation,cultural_score,relational_score,demographic_score,article_name_edited,file_name,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,word_count
0,"Berman, Elizabeth Popp 2006. Before the prof...",1.0,,,before the professional project success and fa...,journal-article-10.2307_4501749,10.2307_4501749,Before the Professional Project: Success and F...,https://www.jstor.org/stable/4501749,Theories of the professions do not sufficientl...,Theory and Society,,Sociology,1911,research-article,
1,"Clemens, Elisabeth S. 1993. Organizational r...",1.0,,,organizational repertoires and institutional c...,journal-article-10.2307_2781235,10.2307_2781235,Organizational Repertoires and Institutional C...,https://www.jstor.org/stable/2781235,Although social movements are often presumed t...,American Journal of Sociology,"[Greg, Yves, Kalevi, Hans Christian, Biplab, E...",Sociology,1912,research-article,
2,"Fligstein, Neil. 1985. The spread of the mul...",1.0,,,the spread of the multidivisional form among l...,journal-article-10.2307_2095547,10.2307_2095547,The Spread of the Multidivisional Form Among L...,https://www.jstor.org/stable/2095547,The multidivisional form is the favored form o...,American Sociological Review,"[JADWIGA, Paul J., Wayne, Charles, WILLIAM V.,...",Sociology,1970,research-article,
3,"Meyer, John W., and Brian Rowan. 1977. Insti...",1.0,,,institutionalized organizations formal structu...,journal-article-10.2307_2778293,10.2307_2778293,Institutionalized Organizations: Formal Struct...,https://www.jstor.org/stable/2778293,Many formal organizational structures arise as...,American Journal of Sociology,"[Mary R., Albert W., Helen, R. A., Judith R., ...",Sociology,1965,research-article,
4,"Mizruchi, Mark S., and Linda C. Fein. 1999. ...",1.0,,,the social construction of organizational know...,journal-article-10.2307_2667051,10.2307_2667051,The Social Construction of Organizational Know...,https://www.jstor.org/stable/2667051,Arguing that knowledge in the social sciences ...,Administrative Science Quarterly,"[Duncan L., Melvin, DAVID L., Andrew, Paul R.,...",Management & Organizational Behavior,1970,research-article,
5,"Rowan, Brian. 1982. Organizational structure...",1.0,,,organizational structure and the institutional...,journal-article-10.2307_2392303,10.2307_2392303,Organizational Structure and the Institutional...,https://www.jstor.org/stable/2392303,This paper develops an institutional approach ...,Administrative Science Quarterly,"[Jean, Stanley L., Robert E., Ussama, Leonard,...",Management & Organizational Behavior,1928,research-article,
6,"Tolbert, Pamela S., and Lynne G. Zucker. 1983...",1.0,,,institutional sources of change in the formal ...,journal-article-10.2307_2392383,10.2307_2392383,Institutional Sources of Change in the Formal ...,https://www.jstor.org/stable/2392383,This paper investigates the diffusion and inst...,Administrative Science Quarterly,"[Eva, Winfried, María Aidé, Kurt H., Jennifer,...",Management & Organizational Behavior,1915,research-article,
7,"Zucker, Lynne G. 1977. The role of instituti...",1.0,,,the role of institutionalization in cultural p...,journal-article-10.2307_2094862,10.2307_2094862,The Role of Institutionalization in Cultural P...,https://www.jstor.org/stable/2094862,Traditional approaches to institutionalization...,American Sociological Review,"[Duncan L., Melvin, DAVID L., Andrew, Paul R.,...",Sociology,1970,research-article,
8,"Boeker, Warren. 1989. The development and in...",,1.0,,the development and institutionalization of su...,journal-article-10.2307_2393150,10.2307_2393150,The Development and Institutionalization of Su...,https://www.jstor.org/stable/2393150,The research reported here examined the effect...,Administrative Science Quarterly,"[ROBERT, Riziki S., Idris S., Patrick, L. A., ...",Management & Organizational Behavior,1965,research-article,
9,"Burt, Ronald S., Kenneth P. Christman, and Har...",,1.0,,testing a structural theory of corporate coopt...,journal-article-10.2307_2094897,10.2307_2094897,Testing a Structural Theory of Corporate Coopt...,https://www.jstor.org/stable/2094897,Moving away from description of directorate ti...,American Sociological Review,"[Mabel, David, Jorge A., Benito E., Danièle, L...",Sociology,1967,research-article,


### Debug unmatched cases

In [190]:
# Debug unmatched cases: Look for fragments in article metadata
for piece in uncoded_h2['article_name_edited'].tolist():
    print(piece) # Looking for this unmatched article
    fragment = ' '.join(piece.split()[:3]) # take first chunk
    
    # Look for matches in metadata
    print(df_meta[df_meta['article_name'].apply(lambda name: fragment in str(name).lower())]['article_name'].value_counts())
    print(df_meta[df_meta['article_name_edited'].apply(lambda name: fragment in str(name).lower())]['article_name_edited'].value_counts())
    #print(df[df['article_name'].apply(lambda name: fragment in str(name).lower())]['article_name'].value_counts())
    print()

organizational culture can it be a source of sustained competitive advantage
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

keeping an eye on the mirror image and identity in organizational adaptation
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

fitting in or standing out the tradeoffs of structural and cultural embeddedness
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

tailored meanings a look at dress
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

studying organizational culture through rites and ceremonies
On Studying Organizational Cultures    1
Name: article_name, dtype: int64
on studying organizational cultures    1
Name: article_name_edited, dtype: int64

displacing disney some notes on the flow of culture
Series([], Name: article_name, dtype: int64)
Series([], Name: article_

towards a theory of rights for qualitative researchers                                                                                                                        1
coordinating futures toward a theory of anticipation                                                                                                                          1
the structure and dynamics of love towards a theory of marital quality and stability                                                                                          1
gemeinschaft verstehen a theory of the middle range                                                                                                                           1
a theory of group stability                                                                                                                                                   1
integrating the ideas of dissenting economists into a theory of transformational leadership                             

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

seizing opportunity in emerging fields how entrepreneurs legitimated the professional form of management consulting
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

acquiring organizational legitimacy through illegitimate actions a marriage of institutional and impression management theories
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

understanding radical organizational change bringing together the old and the new institutionalism
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

institutional entrepreneurship in mature fields the big five accounting firms
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

theorizing change the role of professional associations in the transformation of institut

In Search of the Machiavellian Milquetoasts: Comparing Attitudes of Bureaucrats and Ordinary People                                           1
The Philanthropic Poor: In Search of Explanations for the Relative Generosity of Lower Income Households                                      1
In Search of a Protestant Twentieth Century: American Religion and Power Since 1900                                                           1
In Search of Denominational Subcultures: Religious Affiliation and "Pro-Family" Issues Revisited                                              1
In Search of the Master Science: Population Policy in the Seventies                                                                           1
In Search of Kilometer Zero: Digital Archives, Technological Revisionism, and the Sino-Vietnamese Border                                      1
In Search of Smoking Guns: What Makes Income Inequality Vary over Time in Different Countries?                                          

Series([], Name: article_name_edited, dtype: int64)

from ritual to reality demography ideology and decoupling in a post communist government agency
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

explaining institutional decoupling the case of stock repurchase programs
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

multiple institutional logics in organizations explaining their varied nature and implications
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

seeing like the fed the roles of culture cognition and framing in the failure to anticipate the financial crisis of 2008
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

moving institutional logics forward emotion and meaningful material practice
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: in

Series([], Name: article_name_edited, dtype: int64)

when worlds collide the internal dynamics of organizational responses to conflicting institutional demands
When worlds collide                                                                                    1
When Worlds Collide: A Case Study of Ethics, Privatization, and Performance in Juvenile Corrections    1
When Worlds Collide: Scientists Doing Science in the Social World                                      1
When Worlds Collide: Health Surveillance, Privacy, and Public Policy                                   1
Name: article_name, dtype: int64
when worlds collide health surveillance privacy and public policy                                   1
when worlds collide a case study of ethics privatization and performance in juvenile corrections    1
when worlds collide                                                                                 1
when worlds collide scientists doing science in the social world                 

the rise of hamburg as a global marketplace in the seventeenth century a comparative political economy perspective                                                           1
the rise of the west                                                                                                                                                         1
plugging into democracy the rise of cybercitizen activism                                                                                                                    1
the rise of professions and professional organization in modern egypt                                                                                                        1
the rise of absolutism and noble rebellion in early modern habsburg austria 1570 to 1620                                                                                     1
the masculine female the rise of women doctors in colonial india c 1870 1940                                                 

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

toward organizational pluralism institutional intrapreneurship in integrative medicine
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

institutionalization as an interplay between actions meanings and actors the case of a rape crisis center in israel
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

a structural theory of interlocking corporate directorates
A Structural Theory of Personal Consistency                                                                                                           1
A Behavioral Foundation for a Structural Theory of Power in Exchange Networks                                                                         1
Toward a Structural Theory of Psychopathology                                                                                               

Mechanisms of Peer Influence Among Adolescents: Cohesion Versus Structural Equivalence    1
Social Contagion and Innovation: Cohesion versus Structural Equivalence                   1
Name: article_name, dtype: int64
social contagion and innovation cohesion versus structural equivalence                   1
mechanisms of peer influence among adolescents cohesion versus structural equivalence    1
Name: article_name_edited, dtype: int64

a note on social capital and network content
Comments on "A Note on the Location of Depots"                                                                      2
A Note on the Separation of Ownership from Control                                                                  2
A Note on the Integration of AIDS into the Sociology of Human Sexuality                                             1
Reply: A Note on Pert Assumptions                                                                                   1
Picking the Lock: A Note on "Locks at the R

Series([], Name: article_name, dtype: int64)
network related personality and the agency question multirole evidence from a virtual world    1
Name: article_name_edited, dtype: int64

reinforced structural holes
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

risks returns and relational lending personal ties in microfinance
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

a0 the strength of weak ties
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

the search transfer problem the role of weak ties in sharing knowledge across organizational subunits
Series([], Name: article_name, dtype: int64)
the search transfer problem the role of weak ties in sharing knowledge across organization subunits    1
Name: article_name_edited, dtype: int64

race opportunity and diversity of social circles in managerial networks
Series([], Name: article_name, dty

the value of queueing theory a case study                                                                                                                                  2
the value of marketing expertise                                                                                                                                           1
the value of formal planning for strategic decisions review of empirical research                                                                                          1
the value of counsel 20 years of representation before a public housing eviction board                                                                                     1
pharmaceutical high profits the value of r d or oligopolistic rents                                                                                                        1
marriage promises and the value of a woman s testimony in colonial mexico                                                              

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

a0 how employees prior affiliations constrain organizational network change xa0 a study of u s venture capital and private equity
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

preentry contacts and the generation of nascent networks in organizations
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

knowledge transfer and intraorganizational networks effects of network position and absorptive capacity on business unit innovation and performance
When and How Trustworthiness Matters: Knowledge Transfer and the Moderating Effect of Casual Ambiguity       1
Comparing the Resource-Based and Relational Views: Knowledge Transfer and Spillover in Vertical Alliances    1
Knowledge Transfer and International Joint Ventures: The Case of Nummi and General Motors                    1
Name: article_name, d

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

corporate governance and nested authority cohesive network structure actor driven mechanisms and the balance of power in american corporations
Patterns of Corporate Governance and Technical Efficiency in Italian Manufacturing                                                   1
Enron, Corporate Governance and Deterrence                                                                                           1
Corporate Governance and the Former East Germany: the Role of the Treuhandanstalt in Moulding the New German Economy                 1
Corporate Governance and the Bankrupt Firm: An Empirical Assessment                                                                  1
Corporate Governance and Executive Remuneration: A Contingency Framework                                                             1
Corporate Governance and Firm Efficiency: Evidence from China's Publicly Listed Firm

errata who gets the bird or how the communists won power and trust in america s unions the relative autonomy of intraclass political struggles    1
who gets the daddy bonus organizational hegemonic masculinity and the impact of fatherhood on earnings                                            1
who gets the bird or how the communists won power and trust in america s unions the relative autonomy of intraclass political struggles           1
who gets the carrot and who gets the stick evidence of gender disparities in executive remuneration                                               1
Name: article_name_edited, dtype: int64

accounting for the gap a firm study manipulating organizational accountability and transparency in pay decisions
Well-Being Outcomes in Bolivia: Accounting for the Effects of Ethnicity and Regional Location                                                                                    1
A retreat from permanent employment? Accounting for the rise of professiona

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

the power in demography women s social constructions of gender identity at work
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

discrimination in the credential society an audit study of race and college selectivity in the labor market
Sex Discrimination in the Legal Profession: A Study of Promotion                                                                                                      1
Social Discrimination in the Corporate Elite: How Status Affects the Propensity for Minority CEOs to Receive Blame for Low Firm Performance                           1
The Prevalence, Distribution, and Mental Health Correlates of Perceived Discrimination in the United States                                                           1
IS THERE DISCRIMINATION IN THE "BLACK MAN'S GAME"?                                                              

Series([], Name: article_name, dtype: int64)
race related differences in promotions and support underlying effects of human and social capital    1
Name: article_name_edited, dtype: int64

whitened résumés race and self presentation in labor markets
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

demographic diversity and faultlines the compositional dynamics of organizational groups
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

looking up and looking out career mobility effects of demographic similarity among professionals
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

race place and crime how violent crime events affect employer discrimination
Series([], Name: article_name, dtype: int64)
newspaper coverage of the 1992 los angeles uprising race place and the story of the riot racial ideology in african american and korean american new

Series([], Name: article_name_edited, dtype: int64)

unequal hard times the influence of the great recession on gender bias in entrepreneurial financing
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

documenting desgregation segregation in american workplaces by race ethnicity and sex 1966 2003
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

top management team demography and corporate strategic change
Experience-Based Top Management Team Competence and Sustained Growth                                                                                    1
The Impact of U.S. Company Internationalization on Top Management Team Advice Networks: A Tacit Knowledge Perspective                                   1
When Do Chief Marketing Officers Have Influence on Top Management Teams?                                                                                1
Top Management Team Compensatio

why top management team characteristics matter when employing a chief operating officer a strategic contingency perspective                          1
top management team strategic consensus demographic homogeneity and firm performance a report of resounding nonfindings                              1
diversification and top management team complementarity is performance improved by merging similar or dissimilar teams                               1
top management team composition corporate ideology and firm performance                                                                              1
top management team diversity and firm performance moderators of functional background and locus of control diversity                                1
effects of top management team change on performance in downsized us companies                                                                       1
the impact of board monitoring and involvement on top management team affective conflict      

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

a fair game racial bias and repeated interaction between nba coaches and players
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

modelling internal organizational change
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

toward an institutional ecology of organizational founding
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

the organizational ecology of strategic groups in the american brewing industry from 1975 to 1990
The Organizational Ecology of a Technological System                                                      1
When Disciplinary Worlds Collide: The Organizational Ecology of Disciplines in a University Department    1
Name: article_name, dtype: int64
the organizational ecology of a technological system                            

Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

the evolution of inertia
Cultural Transmission, Disproportionate Prior Exposure, and the Evolution of Cooperation                                                                                           2
The Evolution of Corporate Capabilities in Emerging Technologies                                                                                                                   1
From Black Muslim to Bilalian: The Evolution of a Movement                                                                                                                         1
Resource Tangibility and the Evolution of a Publicly Funded Health and Human Services Network                                                                                      1
Erratum: The Evolution of Organization Analysis in ASQ, 1959-1979                                                                                        

cultural transmission disproportionate prior exposure and the evolution of cooperation                                                         2
the evolution of financial planning models at a commercial bank                                                                                1
the evolution of public sector strategy                                                                                                        1
a look at the evolution of meteorological satellites                                                                                           1
the ending of the slave trade and the evolution of european scientific racism                                                                  1
organizational form as a solution to the problem of credible commitment the evolution of naming strategies among u s hotel chains 1896 1980    1
the evolution of number                                                                                                           

Shifting Gears, Shifting Niches: Organizational Inertia and Change in the Evolution of the U.S. Automobile Industry, 1885-1981    1
Name: article_name, dtype: int64
shifting gears shifting niches organizational inertia and change in the evolution of the u s automobile industry 1885 1981    1
Name: article_name_edited, dtype: int64

move to the beat rhythms of change and performance
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

davids against goliath collective identities and the market success of peripheral organizations during resource partitioning
Series([], Name: article_name, dtype: int64)
Series([], Name: article_name_edited, dtype: int64)

the ecology of organizational growth chinese law firms in the age of globalization
Cooperation, Deterrence, and the Ecology of Regulatory Enforcement                                              1
The Ecology of Collective Action and Regional Representation in the European Union               

In [None]:
allnames = df_meta['article_name_edited'].tolist() 
#print(len(allnames))
coded_df['article_name_edited'] = coded_df['article_name'].apply(
    lambda title: get_title(title, is_title = True))

allnames = [name for name in df_meta['article_name_edited'].tolist() if name not in 
            (coded_h2['article_name_edited'].tolist() + 
             coded_df['article_name_edited'].tolist() + 
             uncoded_meta['article_name_edited'].tolist())]

#allnames = df['article_name_edited'].tolist()
print(len(allnames))

#tqdm.pandas(desc = 'Calculating scores')
#uncoded_h2['closest_score'] = uncoded_h2['article_name_edited'].progress_apply(lambda name: max([score for title, score in process.extract(name, allnames)]))
tqdm.pandas(desc = 'Getting best match')
uncoded_h2['closest_match'] = uncoded_h2['article_name_edited'].progress_apply(
    lambda name: process.extractOne(name, allnames))

uncoded_h2[['citation', 'article_name_edited', 'closest_match']].sort_values(
    by='article_name_edited', ascending=True).to_excel(
    not_matched_fp, index=False)

Getting best match:   0%|          | 0/116 [00:00<?, ?it/s]

191272


Getting best match:  16%|█▌        | 18/116 [2:59:45<19:22:49, 711.94s/it]

### Finalize matching

In [191]:
# Concatenate h2-coded data with hand-coded data
coded_df = pd.concat([coded_df, coded_h2, uncoded_meta], axis=0, join='inner')
coded_df = coded_df[coded_df['edited_filename'].notnull()] # Remove empty row
coded_df

Unnamed: 0,cultural_score,relational_score,demographic_score,article_name,abstract,jstor_url,year,journal_title,edited_filename
0,1.0,0.0,0.0,"Intersecting Three Muddy Roads: Stability, Leg...",Several decades of research by multiple academ...,https://www.jstor.org/stable/25822540,2011,Journal of Managerial Issues,10.2307_25822540
1,1.0,0.0,0.0,Rational Decision Making as Performative Praxi...,Organizational theorists built their knowledge...,external-fulltext-any,2011,Organization Science,10.2307_20868880
2,1.0,1.0,0.0,From Fiefs to Clans and Network Capitalism: Ex...,China's rapid economic development is being ac...,https://www.jstor.org/stable/2393869,1986,Administrative Science Quarterly,10.2307_2393869
3,1.0,1.0,0.0,The Collective Strategy Framework: An Applicat...,This paper investigates empirically the compet...,https://www.jstor.org/stable/2392643,1984,Administrative Science Quarterly,10.2307_2392643
4,1.0,0.0,0.0,"Political Institutional Change, Obsolescing Le...",This paper studies the practice of integration...,https://www.jstor.org/stable/41682289,2012,MIR: Management International Review,10.2307_41682289
5,1.0,0.0,0.0,Culture and Meaning: Making Sense of Conflicti...,,https://www.jstor.org/stable/40397128,1989,International Studies of Management & Organiza...,10.2307_40397128
6,1.0,0.0,0.0,Linking Organizational Values to Relationships...,This study explores the organizational values ...,https://www.jstor.org/stable/2640266,1995,Organization Science,10.2307_2640266
7,1.0,0.0,0.0,Beyond the red tape: How victims of terrorism ...,We use a storyteller perspective to examine ho...,external-fulltext-any,2011,Journal of Organizational Behavior,10.2307_41415713
8,1.0,0.0,0.0,Embedding Sustainability Across the Organizati...,This article is a response to Haugh and Talwar...,https://www-jstor-org.proxy.library.georgetown...,2011,Academy of Management Learning & Education,10.2307_23100442
9,1.0,1.0,0.0,When Experience Meets National Institutional E...,We develop an institutional change perspective...,https://www.jstor.org/stable/27735492,2009,Strategic Management Journal,10.2307_27735492


## Merge full text with coded articles

### Create file paths from filenames

In [192]:
tqdm.pandas(desc='Creating file paths...')
coded_df['file_path'] = coded_df['edited_filename'].progress_apply(
    lambda name: ocr_fp + 'journal-article-' + str(name) + '.txt')

# Check out results
for i, row in coded_df.iloc[:10,].iterrows():
    print(row['file_path'])

Creating file paths...: 100%|██████████| 540/540 [00:00<00:00, 167177.75it/s]

/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_25822540.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_20868880.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_2393869.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_2392643.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_41682289.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_40397128.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_2640266.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_41415713.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_23100442.txt
/home/jovyan/work/jstor_data/ocr/journal-article-10.2307_27735492.txt





### Read in text files

In [193]:
def read_file(fp):
    '''Reads text file.
    
    Args:
        fp: path to text file
        
    Returns:
        str: text from file'''
    
    with open(fp, 'r') as myfile:
        text = myfile.read()
        
    return text

tqdm.pandas(desc='Reading text files...')
coded_df['text'] = coded_df['file_path'].progress_apply(lambda fp: read_file(fp))

Reading text files...: 100%|██████████| 540/540 [00:00<00:00, 722.03it/s] 


In [194]:
tqdm.pandas(desc='Reading text files...')
coded_df['text'] = coded_df['file_path'].progress_apply(lambda fp: read_file(fp))

# Check out results
for i, row in coded_df.iloc[:10,].iterrows():
    print(row['text'][:100])
    print()

Reading text files...: 100%|██████████| 540/540 [00:00<00:00, 4070.69it/s]

<plain_text><page sequence="1">JOURNAL OF MANAGERIAL ISSUES Vol. XXIII Number 1 Spring 2011: 96-112 

<plain_text><page sequence="1">Organization ht icna Vol. 22, No. 3, May-June 2011, pp. 573-586 issn 

<plain_text><page sequence="1">From Fiefs to Clans and Network Capitalism: Explaining China's Emergi

<plain_text><page sequence="1">The Collective Strategy Framework: An Applica- tion to Competing Pre-

<plain_text><page sequence="1">Manag Int Rev (2012) 52:847-877 DOI 10.1007/sl 1575-012-0141-4 RESEAR

<plain_text><page sequence="1">Int. Studies ofMgt. &amp; Org.t Vol. 19, No. 3, pp. 64-81. M.E.Sharpe

<plain_text><page sequence="1">Linking Organizational Values to Relationships with External Constitu

<plain_text><page sequence="1">Journal of Organizational Behavior, J. Organiz. Behav. 32, 938-954 (2

<plain_text><page sequence="1">® Academy oí Management Learning &amp; Education, 2011, Vol. 10, No. 

<plain_text><page sequence="1">Strategie Management Journal Strat. Mgmt. /.. 30: 1




## Check & save final coded data

In [195]:
# Extract and assess conclusive results for each perspective
coded_cult_final = coded_df[coded_df['cultural_score'].notnull()][['text', 'cultural_score']]
coded_relt_final = coded_df[coded_df['relational_score'].notnull()][['text', 'relational_score']]
coded_demog_final = coded_df[coded_df['demographic_score'].notnull()][['text', 'demographic_score']]

print('Number of coded files for each perspective:\n')
print(f'Cultural: {str(len(coded_cult_final[coded_cult_final["cultural_score"]==1.0]))} yes, {str(len(coded_cult_final[coded_cult_final["cultural_score"]==0.0]))} no')
print(f'Relational: {str(len(coded_relt_final[coded_relt_final["relational_score"]==1.0]))} yes, {str(len(coded_relt_final[coded_relt_final["relational_score"]==0.0]))} no')
print(f'Demographic: {str(len(coded_demog_final[coded_demog_final["demographic_score"]==1.0]))} yes, {str(len(coded_demog_final[coded_demog_final["demographic_score"]==0.0]))} no')

Number of coded files for each perspective:

Cultural: 133 yes, 209 no
Relational: 114 yes, 229 no
Demographic: 101 yes, 248 no


In [196]:
# Save reference file, sorted by journal then article name
df[['article_name', 'article_name_edited', 'journal_title', 'year', 'file_name', 'jstor_url']].sort_values(
    by=['journal_title', 'article_name_edited'], ascending=True).to_excel(
    article_names_fp, index = False)

In [197]:
# Save lists of true positives: those that matched, those that didn't
coded_df.sort_values(by='article_name', ascending=True).to_excel(
    matched_fp, index=False)

uncoded_h2[['citation', 'article_name_edited']].sort_values(
    by='article_name_edited', ascending=True).to_excel(
    not_matched_fp, index=False)

In [198]:
# Save training data for classifiers: true positives + negatives for each perspective
quickpickle_dump(coded_cult_final, training_cult_fp)
quickpickle_dump(coded_relt_final, training_relt_fp)
quickpickle_dump(coded_demog_final, training_demog_fp)