In [None]:
#!/usr/bin/env python
# -*- coding: UTF-8

# <p style="text-align: center;"> JSTOR Text Analysis Project:<br/> Refining Expert-Built Dictionaries with word2vec
Authors: Jaren Haber, Rebecca Abraham, Laiming Huang, Zekai Fan<br/>
Institution: University of California, Berkeley<br/>
Date created: July 20, 2018<br/>
Date last modified: September 22, 2020

## Initialize Python

In [1]:
# Install missing packages
!pip install gensim
!pip install nltk
!pip install tqdm

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 2.5MB/s eta 0:00:01     |██████████████▋                 | 11.1MB 2.5MB/s eta 0:00:06     |██████████████████████████████▏ | 22.9MB 2.5MB/s eta 0:00:01
Collecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/11/9a/ba2d5f67f25e8d5bbf2fcec7a99b1e38428e83cb715f64dd179ca43a11bb/smart_open-3.0.0.tar.gz (113kB)
[K     |████████████████████████████████| 122kB 53.7MB/s eta 0:00:01
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/18/88/7c/f06dabd5e9cabe02d2269167bcacbbf9b47d0c0ff7d6ebcb78
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully inst

In [18]:
# IMPORTING KEY PACKAGES
import gensim # for word embedding models
import _pickle as cPickle # Optimized version of pickle
import gc # For managing garbage collector
from collections import Counter # For counting terms across the corpus
import re # For filtering most_similar() output--remove surnames
import csv # For saving csv files

import sys; sys.path.insert(0, "../../../data_management/tools/") # To load functions from files in data_management/tools
from textlist_file import write_list, load_list # For saving and loading text lists to/from file

In [3]:
# FOR VISUALIZATIONS
import matplotlib
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE # For visualizing word embeddings
from scipy.spatial import distance # To use cosine distances for tSNE metric

# Visualization parameters
%pylab inline 
%matplotlib inline
#matplotlib.style.use('white')

import seaborn as sns # To make matplotlib prettier
sns.set(style='white')
#sns.despine()

Populating the interactive namespace from numpy and matplotlib


In [41]:
# Define model file paths (Your job to figure out how to load these in!)
wem_path = "../../../models_storage/word_embeddings_data/word2vec_phrased_filtered_300d_2020_sept5.bin" # old: dec11
#wem_path_npy = "../../../models_storage/word_embeddings_data/word2vec_phrased_filtered_300d_aug14.bin.wv.vectors.npy"
#wem_path_old = "../yoon/word2vec_phrased_filtered_300d_july18.bin"

# Define dictionary file paths:
culture_path = "../../Dictionary Mapping/Dictionaries/core/cultural_core.csv"
relational_path = "../../Dictionary Mapping/Dictionaries/core/relational_core.csv"
demographic_path = "../../Dictionary Mapping/Dictionaries/core/demographic_core.csv"

culture_orgs_path = "../../Dictionary Mapping/Dictionaries/core/cultural_core_orgs.csv"
relational_orgs_path = "../../Dictionary Mapping/Dictionaries/core/relational_core_orgs.csv"
demographic_orgs_path = "../../Dictionary Mapping/Dictionaries/core/demographic_core_orgs.csv"

culture_full_path = "../../Dictionary Mapping/Dictionaries/cultural_original.csv"
relational_full_path = "../../Dictionary Mapping/Dictionaries/relational_original.csv"
demographic_full_path = "../../Dictionary Mapping/Dictionaries/demographic_original.csv"

## Define helper functions

In [5]:
def dict_cohere(thisdict, wem_model):
    '''Computes the average cosine similarity score of terms within one dictionary with all other terms in that same dictionary,
    effectively measuring the coherence of the dictionary.
    ...question for development: does it make sense to compare the average cosine similarity score between all terms 
    in thisdict and the average cosine similarity among the total model vocabulary? (Could that be, by definition, 0?)
    
    NOTE: For an unknown reason, calling this function deletes terms from thisdict.
    
    Inputs: List of key terms, word2vec model.
    Output: Average cosine similarity score of each word with all other words in the list of key terms.'''
    
    # Initialize average distance variables:
    word_avg_dist = 0
    word_avg_dists = 0
    dict_avg_sim = 0
    all_avg_dists = 0
    model_avg_dists = 0
    
    # Compute average cosine similarity score of each word with other dict words:
    for word in thisdict:
        word_avg_dist = (wem_model.wv.distances(word, other_words=thisdict).sum())/len(thisdict) # Total diffs of word with all other words, take average
        word_avg_dists += word_avg_dist # Add up each average distance, incrementally
    dict_avg_sim = 1 - word_avg_dists/len(thisdict) # Find average cosine similarity score by subtracting avg. distance from 1

    #print("Dictionary coherence (avg. cosine similarity): " + str(dict_avg_sim))
    
    return dict_avg_sim

In [6]:
def term_or_part_in_blacklist(term, blacklist):
    if term in blacklist:
        return True
    for part in term.split("_"):
        if part in blacklist:
            return True
    return False


## Load & check word2vec model

In [45]:
# Load word2vec model using gensim:
model = gensim.models.KeyedVectors.load(wem_path)

# For reference, standard code looks like:
#model = gensim.models.KeyedVectors.load_word2vec_format(wem_path_old, binary=True)

### Check similar terms to authors of foundational texts per perspective

In [46]:
model.most_similar("pfeffer_salancik", topn=50) # foundational relational author

  """Entry point for launching an IPython kernel.


[('resource_dependence', 0.7903361320495605),
 ('pfeffer', 0.6995372772216797),
 ('pfeffer_pfeffer', 0.6906613111495972),
 ('salancik', 0.6521415710449219),
 ('aldrich_pfeffer', 0.6360293030738831),
 ('resource_dependencies', 0.6351190805435181),
 ('marshfield_ma_pittman', 0.6303338408470154),
 ('lawrence_lorsch', 0.6219888925552368),
 ('hannan_freeman', 0.6107117533683777),
 ('dimaggio_powell', 0.60467529296875),
 ('meyer_rowan', 0.5991320610046387),
 ('pfeffer_salanick', 0.5950024724006653),
 ('yuchtman_seashore', 0.590412974357605),
 ('pfeffer_nowak', 0.5882325172424316),
 ('pfeffer_salan', 0.578839898109436),
 ('selznick', 0.5740965008735657),
 ('lorsch', 0.5680735111236572),
 ('salancik_pfeffer', 0.5662677884101868),
 ('cyert', 0.5653761625289917),
 ('perrow', 0.5612665414810181),
 ('salancik_burt', 0.5577161908149719),
 ('aldrich', 0.5575342178344727),
 ('mintz_schwartz', 0.5542018413543701),
 ('pennings', 0.5510926246643066),
 ('terreberry', 0.5499744415283203),
 ('galaskiewicz'

In [47]:
model.most_similar(positive=["meyer_rowan", "dimaggio_powell"], topn=50) # foundational cultural authors

  """Entry point for launching an IPython kernel.


[('powell_dimaggio', 0.761844277381897),
 ('scott_meyer', 0.7530192732810974),
 ('meyer_scott', 0.7147403955459595),
 ('tolbert_zucker', 0.6939845085144043),
 ('institutional_isomorphism', 0.6822067499160767),
 ('isomorphism', 0.6744877696037292),
 ('zucker', 0.6465785503387451),
 ('dimaggio', 0.6441019773483276),
 ('pfeffer_salancik', 0.6335450410842896),
 ('mimetic_isomorphism', 0.6331244707107544),
 ('isomorphism_dimaggio', 0.633003294467926),
 ('strang_meyer', 0.6318851709365845),
 ('hannan_freeman', 0.6291985511779785),
 ('neoinstitutional', 0.6288586854934692),
 ('selznick', 0.6187233328819275),
 ('fligstein', 0.6174168586730957),
 ('friedland_alford', 0.6157428026199341),
 ('ruef_scott', 0.6070772409439087),
 ('new_institutionalism', 0.6011690497398376),
 ('zucker_dimaggio', 0.594767689704895),
 ('neoinstitutional_theory', 0.592765212059021),
 ('baum_oliver', 0.5890399217605591),
 ('aldrich_fiol', 0.5884919762611389),
 ('rowan', 0.5872538089752197),
 ('rowan_scott', 0.5865305662

In [48]:
model.most_similar("hannan_freeman", topn=50) # foundational demographic author

  """Entry point for launching an IPython kernel.


[('carroll_hannan', 0.7988797426223755),
 ('hannan_carroll', 0.7319571375846863),
 ('structural_inertia', 0.7238198518753052),
 ('freeman_hannan', 0.7156655788421631),
 ('barron_west', 0.6987178325653076),
 ('hannan', 0.6956249475479126),
 ('delacroix_swaminathan', 0.6848131418228149),
 ('barnett_carroll', 0.681363582611084),
 ('liability_newness', 0.6671488881111145),
 ('amburgey_kelly', 0.6660052537918091),
 ('carroll_delacroix', 0.6617243885993958),
 ('baum_singh', 0.6586145162582397),
 ('ecologists_hannan', 0.653207004070282),
 ('density_dependence', 0.6524397134780884),
 ('singh_tucker', 0.6485813856124878),
 ('singh_lumsden', 0.6445951461791992),
 ('ecologists', 0.6372841000556946),
 ('freeman_carroll', 0.6322972774505615),
 ('hannan_hannan', 0.6316306591033936),
 ('baum_oliver', 0.6286548376083374),
 ('aldrich_auster', 0.6276682019233704),
 ('amburgey', 0.6273452043533325),
 ('brittain_freeman', 0.6200803518295288),
 ('aldrich_hannan', 0.6162484884262085),
 ('hannan_1989a', 0.61

## Load and refine black list

In [49]:
# Load the text file of surnames for further processing.
with open('../../surnames.txt', 'r') as file:
    data = file.read().replace('\n', '')

surname = data.lower().split('|')

with open('../../expanded_dict_blacklist.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        surname += row


In [50]:
print(surname)

['magnuson-martinson', 'holmén', 'watson', 'parra-cardona', 'stanton-salazar', 'taliman', 'wilson', 'fahy', 'lakoff', 'leonard,', 'valdez', 'spires', 'diez', 'mahtani', 'patrício', 'wargon', 'polsky', 'takahashi', 'liu', 'eick', 'buller', 'leitch', 'windsperger', 'morillo', 'ferraro', 'gephart,', 'rothausen', 'olmos', 'carlile', 'levine', 'sine', 'sagie', 'mattar', 'cherrington', 'carrizales', 'arford', 'bersani', 'leuner', 'crea', 'goulding', 'sullivan', 'tanvir', 'freimer', 'messer', 'etzioni', "fenton-o'creevy", 'heinonen', 'zou', 'banton', 'sampson', 'leventhal', 'ward', 'prussia', 'ramu', 'gaddis', 'brill,', 'bendor', 'holtzworth-munroe', 'sobek', 'hodgkin', 'weeks', 'shinagawa', 'skvoretz', 'rodero-cosano', 'hintz', 'kauhanen', 'keysar', 'van de werfhorst', 'felmlee', 'huang', 'chinitz', 'ingenue', 'krasman', 'kirby', 'desoucey', 'iyogun', 'seamans', 'sell', 'heerwig', 'oxelheim', 'cocks', 'langlie', 'fontana', 'schriver', 'zacher', 'bhrolcháin', 'ejorh', 'senter,', 'kloyer', 'cr

In [51]:
# Add garbage terms into blacklist.
for i in model.wv.vocab:
    if ("valign" in i) or ("oasis" in i) or ("colwidth" in i):
        surname.append(i)

In [52]:
len(surname)

26602

## Remove blacklist terms from model

In [53]:
# https://stackoverflow.com/questions/48941648/how-to-remove-a-word-completely-from-a-word2vec-model-in-gensim
from tqdm.notebook import tqdm
import json
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet

with open('../../Dictionary Mapping/Dictionaries/words_dictionary.json') as f:
    whitelist = json.load(f)
blacklist = set(surname)

def remove_from_w2v(w2v, blacklist):
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    new_vectors_norm = []

    for i in tqdm(range(len(w2v.wv.vocab))):
        word = w2v.wv.index2entity[i]
        vec = w2v.wv.vectors[i]
        vocab = w2v.wv.vocab[word]
        vec_norm = w2v.wv.vectors_norm[i]
        if len(wordnet.synsets(word)) > 0 or all([len(wordnet.synsets(x)) > 0 for x in word.split("_")]):
        # if word in whitelist or all([x in whitelist for x in word.split("_")]):
            if not word in blacklist or any([w in blacklist for w in word.split("_")]):
                vocab.index = len(new_index2entity)
                new_index2entity.append(word)
                new_vocab[word] = vocab
                new_vectors.append(vec)
                new_vectors_norm.append(vec_norm)

    w2v.wv.vocab = new_vocab
    w2v.wv.vectors = np.array(new_vectors)
    w2v.wv.index2entity = np.array(new_index2entity)
    w2v.wv.index2word = np.array(new_index2entity)
    w2v.wv.vectors_norm = np.array(new_vectors_norm)

model.wv.init_sims()      # needs to be called for remove_from_w2v to work
print("Vocab size before removal: " + str(len(model.wv.vocab)))
remove_from_w2v(model, surname)
print("Vocab size after: " + str(len(model.wv.vocab)))

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Vocab size before removal: 758989


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=758989.0), HTML(value='')))


Vocab size after: 216541


## Load and clean dictionaries

### Cultural dictionary

In [54]:
# Load the raw culture dictionary (full and seed) and expanded version (50 terms + 22 orgs terms).
culture_full = [elem.strip('\n').replace(",", " ") for elem in load_list(culture_full_path)] # Load full culture dictionary
culture_full = list(set(culture_full)) # Remove duplicates

culture_orgs = [elem.strip('\n').replace(",", " ") for elem in load_list(culture_orgs_path)] # Load orgs-specific culture dictionary
culture_orgs = list(set(culture_orgs)) # Remove duplicates

culture = [elem.strip('\n').replace(",", " ") for elem in load_list(culture_path)] # Load seed culture dictionary
culture = list(set(culture)) # Remove duplicates
sorted(culture)

['ceremonial',
 'coercion',
 'coercive',
 'conform',
 'conformity',
 'cultural',
 'decouple',
 'decoupled',
 'diffuse',
 'diffusion',
 'imitate',
 'imitation',
 'innovation',
 'institutional',
 'institutionalize',
 'interorganizational field',
 'isomorphic',
 'isomorphism',
 'legitimacy',
 'legitimate',
 'legitimation',
 'loosely coupled',
 'mimetic',
 'norm',
 'normative',
 'norms',
 'organizational field',
 'profession',
 'professional',
 'rationalize',
 'rationalized',
 'ritual',
 'socially constructed',
 'structuration',
 'taken granted']

In [55]:
# Remove any terms from culture dict NOT in current model (these will have to be replaced):
for i in range(5):
    for word in culture:
        if word not in list(model.wv.vocab):
            culture.remove(word)
            print("Removed " + str(word) + " from culture dictionary.")
    # Repeat for quality (second pass sometimes catches more):
    for word in culture:
        if word not in list(model.wv.vocab):
            culture.remove(word)
            print("Removed " + str(word) + " from culture dictionary.")

    # Remove surnames in culture
    for word in culture:
        if word in surname:
            culture.remove(word)
            print("Removed " + str(word) + " from culture dictionary.")

print("Length of culture dictionary filtered into vector space:", len(culture))
culture

Removed interorganizational field from culture dictionary.
Removed taken granted from culture dictionary.
Removed socially constructed from culture dictionary.
Removed loosely coupled from culture dictionary.
Removed organizational field from culture dictionary.
Removed structuration from culture dictionary.
Length of culture dictionary filtered into vector space: 29


['diffusion',
 'conform',
 'normative',
 'innovation',
 'institutionalize',
 'rationalized',
 'coercion',
 'legitimacy',
 'norms',
 'ceremonial',
 'cultural',
 'imitate',
 'mimetic',
 'coercive',
 'professional',
 'ritual',
 'isomorphic',
 'decouple',
 'decoupled',
 'conformity',
 'isomorphism',
 'institutional',
 'diffuse',
 'norm',
 'rationalize',
 'legitimation',
 'imitation',
 'profession',
 'legitimate']

In [70]:
coherence, coherence_orgs = dict_cohere(culture, model), dict_cohere(culture_orgs, model)

print("Coherence of " + str(len(culture)) + "-term cultural dictionary: ", str(coherence))
print("Coherence of " + str(len(culture_orgs)) + "-term cultural dictionary: ", str(coherence_orgs))

Coherence of 29-term cultural dictionary:  0.3163296430771474
Coherence of 71-term cultural dictionary:  0.2747724699940952


In [77]:
# Check out most similar words
model.wv.most_similar(culture_orgs, topn=20)

[('dimaggio_powell', 0.619671106338501),
 ('institutions', 0.6093469262123108),
 ('governmental', 0.583474338054657),
 ('logics', 0.571103572845459),
 ('formal', 0.5684301853179932),
 ('structures', 0.5678442120552063),
 ('rules_regulations', 0.5665346384048462),
 ('apparatuses', 0.5656915903091431),
 ('authority', 0.5621172785758972),
 ('rationalizing', 0.5608727931976318),
 ('institutional_arrangements', 0.5596417784690857),
 ('institutional_isomorphism', 0.559412956237793),
 ('administrative', 0.5563393831253052),
 ('entrenched', 0.552812933921814),
 ('universalistic', 0.5456591844558716),
 ('legitimizes', 0.5424147248268127),
 ('embody', 0.5424008369445801),
 ('institutional_logics', 0.54092937707901),
 ('actors', 0.5400744676589966),
 ('external_constituents', 0.5397895574569702)]

### Relational dictionary

In [58]:
# Load the raw + orgs relational dictionaries.
relational = []
for item in load_list(relational_path):
    relational.append(item.strip("\n").replace(",", " "))

relational_orgs = [elem.strip('\n').replace(",", " ") for elem in load_list(relational_orgs_path)] # Load orgs-specific culture dictionary
relational_orgs = list(set(relational_orgs)) # Remove duplicates

relational

['board directors',
 'buffer',
 'coalition',
 'constrain',
 'constraint',
 'control',
 'cooperate',
 'cooperation',
 'coopt',
 'cooptation',
 'dependence',
 'dependent',
 'director interlock',
 'director interlocks',
 'diversification',
 'diversify',
 'dominance',
 'exchange',
 'external',
 'horizontal',
 'influence',
 'interdependence',
 'interdependent',
 'interlock',
 'interlocking',
 'joint venture',
 'merge',
 'merged',
 'merger',
 'network',
 'network',
 'power',
 'pressure',
 'resource dependence',
 'sanction',
 'vertical']

In [59]:
# Remove any terms from relational dict NOT in current model (these will have to be replaced):
for i in range(5):
    for word in relational:
        if word not in list(model.wv.vocab):
            relational.remove(word)
            print("Removed " + str(word) + " from relational dictionary.")
    # Repeat for quality (second pass sometimes catches more):
    for word in relational:
        if word not in list(model.wv.vocab):
            relational.remove(word)
            print("Removed " + str(word) + " from relational dictionary.")
    # Remove surnames in relational
    for word in relational:
        if word in surname:
            relational.remove(word)
            print("Removed " + str(word) + " from relational dictionary.")
            

print()
print("Length of relational dictionary filtered into vector space:", len(relational))
relational

Removed board directors from relational dictionary.
Removed coopt from relational dictionary.
Removed director interlock from relational dictionary.
Removed joint venture from relational dictionary.
Removed resource dependence from relational dictionary.
Removed cooptation from relational dictionary.
Removed director interlocks from relational dictionary.
Removed coalition from relational dictionary.
Removed power from relational dictionary.

Length of relational dictionary filtered into vector space: 27


['buffer',
 'constrain',
 'constraint',
 'control',
 'cooperate',
 'cooperation',
 'dependence',
 'dependent',
 'diversification',
 'diversify',
 'dominance',
 'exchange',
 'external',
 'horizontal',
 'influence',
 'interdependence',
 'interdependent',
 'interlock',
 'interlocking',
 'merge',
 'merged',
 'merger',
 'network',
 'network',
 'pressure',
 'sanction',
 'vertical']

In [72]:
coherence, coherence_orgs = dict_cohere(relational, model), dict_cohere(relational_orgs, model)
print("Coherence of " + str(len(relational)) + "-term relational dictionary: ", str(coherence))
print("Coherence of " + str(len(relational_orgs)) + "-term relational dictionary: ", str(coherence_orgs))

Coherence of 27-term relational dictionary:  0.2512116713451915
Coherence of 70-term relational dictionary:  0.25794814051414006


In [76]:
# Check out most similar words
model.wv.most_similar(relational_orgs, topn=20)

[('customers_suppliers', 0.5795645117759705),
 ('collaborations', 0.561689019203186),
 ('corporations', 0.5608978271484375),
 ('agencies', 0.5543973445892334),
 ('governmental_agencies', 0.5404126644134521),
 ('coordinating', 0.5383496284484863),
 ('alliance_partners', 0.5357421040534973),
 ('boundary_spanning', 0.5354406237602234),
 ('geographically_dispersed', 0.5345038175582886),
 ('boundary_spanners', 0.5342847108840942),
 ('strategic_alliances', 0.5335747599601746),
 ('governmental', 0.5315826535224915),
 ('buyer_supplier', 0.5290824174880981),
 ('subsidiaries', 0.5274680256843567),
 ('actors', 0.5270282030105591),
 ('suppliers_distributors', 0.5269871950149536),
 ('centralization', 0.5267918109893799),
 ('centralized', 0.5262429714202881),
 ('interlocking_directorates', 0.5255770683288574),
 ('suppliers', 0.5231723785400391)]

### Demographic dictionary

In [61]:
# Load the raw + orgs demographic dictionaries.
demographic = []
for item in load_list(demographic_path):
    demographic.append(item.strip("\n").replace(",", " "))
    
demographic_orgs = [elem.strip('\n').replace(",", " ") for elem in load_list(demographic_orgs_path)] # Load orgs-specific culture dictionary
demographic_orgs = list(set(demographic_orgs)) # Remove duplicates

demographic

['age dependence',
 'birth rate',
 'carrying capacity',
 'chance survival',
 'competition',
 'death rate',
 'density',
 'ecological',
 'ecology',
 'evolution',
 'evolutionary',
 'failure rate',
 'fitness',
 'founding rate',
 'generalism',
 'generalist',
 'inertia',
 'inertial',
 'legitimacy',
 'legitimate',
 'legitimation',
 'liability newness',
 'natural selection',
 'niche',
 'organizational form',
 'population',
 'population ecology',
 'reliability',
 'resistance change',
 'selection',
 'selection',
 'size dependence',
 'specialism',
 'specialist',
 'structural inertia',
 'survival chance']

In [62]:
# Remove any terms from demographic dict NOT in current model (these will have to be replaced):
for i in range(5):
    for word in demographic:
        if word not in list(model.wv.vocab):
            demographic.remove(word)
            print("Removed " + str(word) + " from demographic dictionary.")
    # Repeat for quality (second pass sometimes catches more):
    for word in demographic:
        if word not in list(model.wv.vocab):
            demographic.remove(word)
            print("Removed " + str(word) + " from demographic dictionary.")
    # Remove surnames in demographic
    for word in demographic:
        if word in surname:
            demographic.remove(word)
            print("Removed " + str(word) + " from demographic dictionary.")
            
print()
print("Length of demographic dictionary filtered into vector space:", len(demographic))
demographic

Removed age dependence from demographic dictionary.
Removed carrying capacity from demographic dictionary.
Removed death rate from demographic dictionary.
Removed failure rate from demographic dictionary.
Removed founding rate from demographic dictionary.
Removed liability newness from demographic dictionary.
Removed organizational form from demographic dictionary.
Removed population ecology from demographic dictionary.
Removed resistance change from demographic dictionary.
Removed size dependence from demographic dictionary.
Removed structural inertia from demographic dictionary.
Removed birth rate from demographic dictionary.
Removed generalism from demographic dictionary.
Removed natural selection from demographic dictionary.
Removed survival chance from demographic dictionary.
Removed fitness from demographic dictionary.
Removed chance survival from demographic dictionary.

Length of demographic dictionary filtered into vector space: 19


['competition',
 'density',
 'ecological',
 'ecology',
 'evolution',
 'evolutionary',
 'generalist',
 'inertia',
 'inertial',
 'legitimacy',
 'legitimate',
 'legitimation',
 'niche',
 'population',
 'reliability',
 'selection',
 'selection',
 'specialism',
 'specialist']

In [67]:
coherence, coherence_orgs = dict_cohere(demographic, model), dict_cohere(demographic_orgs, model)
print("Coherence of " + str(len(demographic)) + "-term demographic dictionary: ", str(coherence))
print("Coherence of " + str(len(demographic_orgs)) + "-term demographic dictionary: ", str(coherence_orgs))

Coherence of 19-term demographic dictionary:  0.3017470050716663
Coherence of 69-term demographic dictionary:  0.24961601598651884


In [75]:
# Check out most similar words
model.wv.most_similar(demographic_orgs, topn=20)

[('localized_competition', 0.5877772569656372),
 ('carroll_delacroix', 0.5832249522209167),
 ('survival_chances', 0.5825402140617371),
 ('baum_oliver', 0.5726785063743591),
 ('niche_overlap', 0.5687468647956848),
 ('niche_width', 0.5670601725578308),
 ('isomorphism_col', 0.5662622451782227),
 ('overlap_density', 0.5592220425605774),
 ('freeman_carroll', 0.5579248666763306),
 ('foundings_kosher', 0.5555108189582825),
 ('delacroix_carroll', 0.5530387759208679),
 ('newly_founded', 0.5501166582107544),
 ('primer_prolegomenon', 0.5444112420082092),
 ('foundings_failures', 0.5442254543304443),
 ('foundings_california_wine', 0.543562650680542),
 ('liabilities_newness', 0.5431041717529297),
 ('biosphere_ecologies', 0.5412288904190063),
 ('industry', 0.5401310920715332),
 ('dimaggio_powell', 0.5401141047477722),
 ('carroll_wade', 0.5392552018165588)]

## Expand dictionaries

By using the model to look at similar words across terms, create a list of candidate terms for a bigger conceptual dictionary. Manually search all these candidate terms for those that are tightly conceptually related to the seed dictionary. This process blends data-driven search from the model with hand-driven search across the candidate terms.

By searching through the above candidate terms/phrases, expand from the seed terms into a larger--but still conceptually coherent--list! 

In [37]:
dictionary_lengths = list(range(30, 100, 10)) + list(range(100, 1001, 100))
expanded_dicts_path = '../../Dictionary Mapping/Dictionaries/Expanded/wordnet_english3/'
filename_template = 'closest_{}_{}.csv'         # filename_template.format(perspective, length)

In [40]:
expanded_dict = culture.copy()
for length in dictionary_lengths:
    expanded_dict += [x for x, _ in model.wv.most_similar(expanded_dict, topn = length - len(expanded_dict))]
    coherence = dict_cohere(expanded_dict, model)
    print("Coherence of " + str(length) + "-term cultural dictionary: ", str(coherence))
    wtr = csv.writer(open(expanded_dicts_path + filename_template.format('culture', str(length)), 'w'), delimiter=',', lineterminator='\n')   
    for x in expanded_dict: 
        wtr.writerow([x])
culture_expanded = expanded_dict.copy()
print()
        
expanded_dict = relational.copy()
for length in dictionary_lengths:
    expanded_dict += [x for x, _ in model.wv.most_similar(expanded_dict, topn = length - len(expanded_dict))]
    coherence = dict_cohere(expanded_dict, model)
    print("Coherence of " + str(length) + "-term relational dictionary: ", str(coherence))
    wtr = csv.writer(open(expanded_dicts_path + filename_template.format('relational', str(length)), 'w'), delimiter=',', lineterminator='\n')   
    for x in expanded_dict: 
        wtr.writerow([x])
relational_expanded = expanded_dict.copy()
print()
        
expanded_dict = demographic.copy()
for length in dictionary_lengths:
    if length == 1000:
        i = 0
    expanded_dict += [x for x, _ in model.wv.most_similar(expanded_dict, topn = length - len(expanded_dict))]
    coherence = dict_cohere(expanded_dict, model)
    print("Coherence of " + str(length) + "-term demographic dictionary: ", str(coherence))
    with open(expanded_dicts_path + filename_template.format('demographic', str(length)), 'w') as f:
        wtr = csv.writer(f)   
        for x in expanded_dict:  
            wtr.writerow([x])
demographic_expanded = expanded_dict.copy()

Coherence of 30-term cultural dictionary:  0.32140290154351137
Coherence of 40-term cultural dictionary:  0.3457211291790008
Coherence of 50-term cultural dictionary:  0.34597452926635763
Coherence of 60-term cultural dictionary:  0.344760231441922
Coherence of 70-term cultural dictionary:  0.3471236007067624
Coherence of 80-term cultural dictionary:  0.34965586066246046
Coherence of 90-term cultural dictionary:  0.3517327664222245
Coherence of 100-term cultural dictionary:  0.35300685577392565
Coherence of 200-term cultural dictionary:  0.34873132076263413
Coherence of 300-term cultural dictionary:  0.34332141452365517
Coherence of 400-term cultural dictionary:  0.3372111212730412
Coherence of 500-term cultural dictionary:  0.33244912963867135
Coherence of 600-term cultural dictionary:  0.32829182679918056
Coherence of 700-term cultural dictionary:  0.3246671057876276
Coherence of 800-term cultural dictionary:  0.3211471792221061
Coherence of 900-term cultural dictionary:  0.317932315

## Find more blacklist candidates

In [33]:
counter = 0
for w in tqdm(demographic_expanded):
    if w not in whitelist and "".join(w.split("_")) not in whitelist:
        for part in w.split("_"):
            if part not in whitelist:
                print(w)
                counter += 1
                break
print(counter)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

foundings_california_wine
foundings
carroll_delacroix
baum_oliver
delacroix_carroll
delacroix_freeman
foundings_failures
foundings_kosher
foundings_ecological
nonmonotonic_density
foundings_delacroix
foundings_specialist
nonmonotonic
baum_joel
baum
dimaggio_powell
microbreweries_carroll
microbreweries
foundings_failings
ginsberg_baum
delacroix
panics_1870s
baum_powell
microbreweries_brewpubs
brewpubs
nonmonotonic_inverted
foundings_disbandings
disbandings
yue_luo
microbrewery_foundings
joel_baum
wineries_delacroix
waves_disbandings
powell_dimaggio
sorensen_stuart
mcpherson_mcpherson
ssa_linkages
foundings_deaths
lagged_foundings
brewpub_density
nonmonotonic_pattern
richardson_gb
producer_foundings
nonintegrated_engine
kimberley_miles
league_foundings
burton_sorensen
microbrewery_mortality
isomorphism_dimaggio
walker_liston
mcpherson
visible_brewpub
reagans_ray
rutledge_mo_fellowship
pennings
generalist_automakers
ceo_succession
foundings_dissolutions
farrell_monroe
brewpub
lyons_br
com

In [34]:
counter = 0
for w in tqdm(culture_expanded):
    if w not in whitelist and "".join(w.split("_")) not in whitelist:
        for part in w.split("_"):
            if part not in whitelist:
                print(w)
                counter += 1
                break
print(counter)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

dimaggio_powell
legitimates
subjectivities
knowledges
deconstruct
foregrounding
reifications
signifiers
foregrounded
deconstruction
postmodern
deconstructing
deconstructs
fetishized
postmodernist
counterposed
postmodernism
deconstructionist
objectifies
deconstructed

20


In [35]:
counter = 0
for w in tqdm(relational_expanded):
    if w not in whitelist and "".join(w.split("_")) not in whitelist:
        for part in w.split("_"):
            if part not in whitelist:
                print(w)
                counter += 1
                break
print(counter)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

interdependences
complementarities
transactors
jvs
spillovers
sourcing
outsourcing
external_stakeholders
knowledge_spillovers
technology_sourcing
lyons_br
jv
core_competences
partnerships_ipo
economics_tce
richardson_gb
stakeholders
ipo
tce
automakers
lane_salk
equity_jvs
outsourced
tce_predicts
japanese_automakers
outsource
ipos
sourced_externally
concurrent_sourcing
internalisation_advantages
lerner_merges
outsourcees
post_ipo
ip_protection
pitts_salter
global_sourcing
offshore_outsourcing
gompers_lerner
psfs
shareholdings
ceos
eecs
risc_microprocessor
upfront_investments
founder_ceos
nonintegrated_acquirer
sourced_internally
telecom
outsourcing_mitigates
settlements_basle
gompers_pa_lerner
firm_outsources
roth_ricks
shareholding
ceo
nonintegrated_engine
eib_papers
goldsmith_lyons
undertaking_ipo
galbraith_stiles
nonintegrated_channels
comment_jarrell
mediated_sourcing
klein_crawford
exploit_complementarities
competences
jensen_murphy
retailers_standardisation
hostages_klein
ip_tough