### Inspect most frequent & distinct words for JSTOR articles




Authors: Nancy Xu, Thomas Lu

Institution: University of California, Berkeley

Date created: March 27, 2022

Date last modified: April 5, 2022

Code to first preprocess the JSTOR articles, then use ngram to find the most frequentc words, and TFIDF to find the most distinct words for articles labeled positive for each perspective.

In [1]:

import pickle
import re


import os
import random as rand
from tqdm import tqdm, trange

import numpy as np
import pandas as pd
# from clean_text import stopwords_make, punctstr_make, unicode_make, apache_tokenize, clean_sentence_apache 

import matplotlib.pyplot as plt

import numpy as np
import re
import random
from tqdm import tqdm
from collections import Counter

from collections import Counter


## preprocess text data

In [2]:
def open_test_data(path):
    return open(path, 'rb')
with open_test_data('/home/jovyan/work/tlu_storage/training_cultural_preprocessed_100321.pkl') as f:
    cult = pickle.load(f)

with open_test_data('/home/jovyan/work/tlu_storage/training_demographic_preprocessed_100321.pkl') as f:
    demog = pickle.load(f)

with open_test_data('/home/jovyan/work/tlu_storage/training_orgs_preprocessed_100321.pkl') as f:
    orgs = pickle.load(f)

with open_test_data('/home/jovyan/work/tlu_storage/training_relational_preprocessed_100321.pkl') as f:
    rela = pickle.load(f)
    


In [3]:
import itertools
full_text = []

for i in cult['text']:
    joined = list(itertools.chain(*i))
    full_text.append(" ".join(joined))


# ' '.join(cult['text'][0][0])

In [4]:
full_text_demog = []
for i in demog['text']:
    joined = list(itertools.chain(*i))
    full_text_demog.append(" ".join(joined))

In [5]:
full_text_orgs = []
for j in orgs['text']:
    joined = list(itertools.chain(*j))
    full_text_orgs.append(" ".join(joined))

In [6]:
full_text_rela = []
for j in rela['text']:
    joined = list(itertools.chain(*j))
    full_text_rela.append(" ".join(joined))

In [7]:
cult['full_text'] = full_text
demog['full_text'] = full_text_demog
orgs['full_text'] = full_text_orgs
rela['full_text'] = full_text_rela

In [8]:
def remove_tags(article):
    article = re.sub('<plain_text> <page sequence="1">', '', article)
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)
    # xml tags
    article = re.sub(r'<.*?>', '', article)
    article = re.sub(r'<body.*\n\s*.*\s*.*>', '', article)
    return article

tags_removed = [remove_tags(art) for art in cult['full_text']]
tags_removed_demog = [remove_tags(art) for art in demog['full_text']]
tags_removed_org = [remove_tags(art) for art in orgs['full_text']]
tags_removed_rela = [remove_tags(art) for art in rela['full_text']]
cult['text_no_tags'] = tags_removed
demog['text_no_tags'] = tags_removed_demog
orgs['text_no_tags'] = tags_removed_org
rela['text_no_tags'] = tags_removed_rela

In [9]:
cult = cult[cult['cultural_score']==1]
demog = demog[demog['demographic_score']==1]
orgs = orgs[orgs['orgs_score']==1]
rela = rela[rela['relational_score']==1]

In [10]:
cult

Unnamed: 0,text,cultural_score,primary_subject,edited_filename,article_name,full_text,text_no_tags
1,"[[Civil, Rights, Law, at, Work:, Sex, Discrimi...",1.0,Sociology,10.1086_210317,Civil Rights Law at Work: Sex Discrimination a...,Civil Rights Law at Work: Sex Discrimination a...,Civil Rights Law at Work: Sex Discrimination a...
3,"[[World, Society, and, the, Nation-State, John...",1.0,Sociology,10.1086_231174,World Society and the Nation‐State,World Society and the Nation-State John Meyer ...,World Society and the Nation-State John Meyer ...
4,"[[<body, xmlns:xlink=""http://www..org//xlink""]...",1.0,Sociology,10.1086_382347,Kinship Networks and Entrepreneurs in China’s ...,"<body xmlns:xlink=""http://www..org//xlink"" xml...",Introduction Economists have long concurred t...
5,"[[<body, xmlns:xlink=""http://www..org//xlink""]...",1.0,Sociology,10.1086_517899,What Is Organizational Imprinting? Cultural En...,"<body xmlns:xlink=""http://www..org//xlink"" xml...",Introduction Organizational sociologists have...
6,"[[<body, xmlns:xlink=""http://www..org//xlink""]...",1.0,Sociology,10.1086_588742,"Homeward Bound? Interest, Identity, and Invest...","<body xmlns:xlink=""http://www..org//xlink"" xml...",Introduction Are indigenous investors in Thir...
8,"[[<body, xmlns:xlink=""http://www..org//xlink""]...",1.0,Sociology,10.1086_659639,The Credit Crisis as a Problem in the Sociolog...,"<body xmlns:xlink=""http://www..org//xlink"" xml...",Introduction At the heart of the credit crisi...
17,"[[Government, Regulatory, Powers, and, Church,...",1.0,Sociology,10.2307_1385815,Government Regulatory Powers and Church Autono...,Government Regulatory Powers and Church Autono...,Government Regulatory Powers and Church Autono...
23,"[[Reexamining, Resistance, as, Oppositional, B...",1.0,Sociology,10.2307_1519868,Reexamining Resistance as Oppositional Behavio...,Reexamining Resistance as Oppositional Behavio...,Reexamining Resistance as Oppositional Behavio...
30,"[[Enforcement, is, the, Name, of, the, Game:, ...",1.0,Sociology,10.2307_20831088,Enforcement is the Name of the Game: An Essay ...,Enforcement is the Name of the Game: An Essay ...,Enforcement is the Name of the Game: An Essay ...
32,"[[Protest, Group, Success:, The, Impact, of, G...",1.0,Sociology,10.2307_20831203,Protest Group Success: The Impact of Group Cha...,Protest Group Success: The Impact of Group Cha...,Protest Group Success: The Impact of Group Cha...


In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['one', 'two', 'three', 'amp', 'may', 'can', 'new', 'also', 'and'])

import string
import re
import nltk

def word_process(tt):
    """
    helper function to lower text, remove stop words, numbers, and empty 
    """
    
    tt = tt.lower()
    
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~=\n'''
    # Removing punctuations in string 
    # Using loop + punctuation string 


    for ele in tt:  
        if ele in punc:  
            tt = tt.replace(ele, " ")  

    # read tokens
    tokens = tt.split()
    lst = [token.translate(punc).lower() for token in tokens ]
    
    #remove stop words
    filtered = []
    for i in lst:
        if i not in stop_words:
            filtered.append(i)
    
    # removing singular numbers and singular letters
    pattern = '[0-9]'
    filtered = [re.sub(pattern, '', i) for i in filtered] 
    new = []
    for inp in filtered:
        new.append(' '.join( [w for w in inp.split() if len(w)>1] ))
        
    # filter out empty strings 
    new = [i for i in new if i] 

    dt = [d.split() for d in new]
    
  
    st = ''
    for i in dt:
        st = st + " " + i[0]
    return st
    

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
def get_most_common(df):
    processed = [word_process(i) for i in df['text_no_tags']]
    df['processed'] = processed
    
    cult_words = []
    for k in processed:
        cult_words.extend(k.split())
    
    cult_counts = Counter(cult_words)
    return cult_counts.most_common()

In [15]:
import pickle

with open('cult_most_common.pkl', 'wb') as f:
    pickle.dump(get_most_common(cult), f)

In [None]:
import pickle

with open('rela_most_common.pkl', 'wb') as f:
    pickle.dump(get_most_common(rela), f)

In [None]:
with open('demog_most_common.pkl', 'wb') as f:
    pickle.dump(get_most_common(demog), f)

In [None]:
with open('orgs_most_common.pkl', 'wb') as f:
    pickle.dump(get_most_common(orgs), f)

In [None]:
with open('orgs_most_common.pkl', 'rb') as f:
    orgs_most_common = pickle.load(f)

In [16]:
cult_processed = [word_process(i) for i in cult['text_no_tags']]

In [17]:
cult['processed'] = cult_processed

In [18]:
orgs_processed = [word_process(i) for i in orgs['text_no_tags']]
orgs['processed'] = orgs_processed

In [19]:
demog_processed = [word_process(i) for i in demog['text_no_tags']]
demog['processed'] = demog_processed

In [20]:
rela_processed = [word_process(i) for i in rela['text_no_tags']]
rela['processed'] = rela_processed

In [21]:
cult_words = []
for k in cult_processed:
    cult_words.extend(k.split())

## Use n grams to inspect the most common words

In [22]:
from collections import Counter
import numpy as np

In [23]:
from itertools import tee, islice

def ngrams(lst, n):
  tlst = lst
  while True:
    a, b = tee(tlst)
    l = tuple(islice(a, n))
    if len(l) == n:
      yield l
      next(b)
      tlst = b
    else:
      break



In [30]:
def most_common_terms(cult_processed, num=30):
    cult_words = []
    for k in cult_processed:
        cult_words.extend(k.split())
    cult_counts = Counter(cult_words)
    top_uni = cult_counts.most_common()[:num]
    cult_counts = Counter(ngrams(cult_words, 2))
    top_bi = cult_counts.most_common()[:num]
    return top_uni, top_bi

In [71]:
top_uni, top_bi = most_common_terms(demog['processed'][demog.demographic_score == 1.0])
print(top_uni)
# print('\n' + '='*50,'\n')
# print(top_bi)

[('organizational', 11703), ('organizations', 9717), ('women', 7894), ('social', 7299), ('entry', 7053), ('bottom', 5817), ('align', 5733), ('firms', 5697), ('valign', 5685), ('organization', 5656), ('model', 5571), ('change', 5535), ('work', 5512), ('level', 5249), ('firm', 5151), ('effects', 5108), ('job', 4925), ('research', 4799), ('time', 4677), ('group', 4656), ('performance', 4649), ('gender', 4584), ('size', 4377), ('men', 4253), ('management', 4203), ('char', 4162), ('industry', 4040), ('number', 3953), ('data', 3940), ('effect', 3863)]


In [72]:
top_uni, top_bi = most_common_terms(rela['processed'][rela.relational_score == 1.0])
print(top_uni)
# print('\n' + '='*50,'\n')
# print(top_bi)

[('social', 12351), ('firms', 9929), ('network', 9284), ('ties', 7994), ('organizations', 7628), ('firm', 7608), ('organizational', 6759), ('entry', 6033), ('organization', 5999), ('research', 5925), ('power', 5841), ('market', 5592), ('align', 5283), ('networks', 5210), ('valign', 5163), ('model', 5153), ('bottom', 5053), ('information', 4771), ('data', 4727), ('management', 4646), ('number', 4610), ('industry', 4417), ('work', 4395), ('american', 4341), ('group', 4257), ('time', 4174), ('study', 4171), ('analysis', 4042), ('control', 4032), ('would', 4011)]


In [73]:
top_uni, top_bi = most_common_terms(cult['processed'][cult.cultural_score == 1.0])
print(top_uni)
# print('\n' + '='*50,'\n')
# print(top_bi)

[('organizational', 9200), ('organizations', 8361), ('social', 7200), ('institutional', 6218), ('organization', 5246), ('management', 4843), ('firms', 4811), ('research', 4115), ('change', 3902), ('time', 3799), ('state', 3796), ('university', 3793), ('american', 3716), ('model', 3523), ('work', 3448), ('press', 3446), ('theory', 3387), ('market', 3271), ('firm', 3216), ('would', 3117), ('entry', 3049), ('analysis', 3004), ('culture', 2947), ('study', 2883), ('journal', 2792), ('science', 2738), ('first', 2721), ('business', 2715), ('data', 2700), ('political', 2688)]


## New Distinctive Word Method 1: Summed TF-IDF
####  Inspect most distinctive words for positively labeled set for each perspective

In [88]:
tfidf = TfidfVectorizer(stop_words='english')

   
X = tfidf.fit_transform(corpus)
feature_names = np.array(tfidf.get_feature_names())


# for k in [demog, cult, rela, orgs]:
def get_terms(rela,word_count = 50):
    responses = tfidf.transform(rela['processed'])
    word_inds = np.argsort(np.array(responses.sum(axis=0))[0])[:-word_count+1:-1]
    return [feature_names[i] for i in word_inds]

rela_set = get_terms(rela)
cult_set = get_terms(cult)
demog_set = get_terms(demog)

shared = set(rela_set) & set(cult_set) & set(demog_set)

In [89]:
[t for t in rela_set if t not in shared]

['network',
 'ties',
 'power',
 'networks',
 'board',
 'job',
 'information',
 'control',
 'capital',
 'resources',
 'structure',
 'status',
 'business',
 'economic',
 'corporate',
 'relations',
 'alliance',
 'exchange',
 'table',
 'directors',
 'members',
 'knowledge',
 'results',
 'groups',
 'tie',
 'strategic',
 'effect',
 'likely',
 'ownership',
 'embeddedness',
 'resource',
 'align',
 'press',
 'relationships',
 'state',
 'structural',
 'high',
 'partners',
 'political',
 'percent',
 'contacts',
 'institutional',
 'people',
 'centrality',
 'dependence',
 'variable',
 'hypothesis',
 'influence']

In [90]:
[t for t in cult_set if t not in shared]

['institutional',
 'culture',
 'state',
 'change',
 'cultural',
 'press',
 'legitimacy',
 'women',
 'business',
 'practices',
 'political',
 'public',
 'science',
 'control',
 'logics',
 'education',
 'adoption',
 'law',
 'actors',
 'structure',
 'institutions',
 'diffusion',
 'process',
 'york',
 'environment',
 'corporate',
 'movement',
 'sustainability',
 'labor',
 'environmental',
 'et',
 'meyer',
 'review',
 'different',
 'al',
 'field',
 'economic',
 'knowledge',
 'models',
 'likely',
 'schools',
 'government',
 'legal',
 'isomorphism',
 'resources',
 'institutionalization',
 'structures',
 'action']

In [91]:
[t for t in demog_set if t not in shared]

['women',
 'gender',
 'men',
 'job',
 'density',
 'sex',
 'female',
 'change',
 'size',
 'jobs',
 'male',
 'labor',
 'diversity',
 'team',
 'workers',
 'population',
 'hannan',
 'rates',
 'age',
 'groups',
 'effect',
 'status',
 'race',
 'rate',
 'black',
 'founding',
 'table',
 'members',
 'percent',
 'employment',
 'white',
 'align',
 'differences',
 'results',
 'niche',
 'carroll',
 'populations',
 'tenure',
 'char',
 'inequality',
 'managerial',
 'models',
 'occupational',
 'segregation',
 'competition',
 'mortality',
 'variable',
 'discrimination']