# Studying Corpora Frequencies

In [2]:
# import packages
import os
from pathlib import Path
import sys
from datetime import datetime
from time import time
import json
import pickle
from collections import Counter
from collections import defaultdict
from collections import OrderedDict
import re

## Setup

In [3]:
"""modify according to your setup"""

# Root directory of the model; every other location below is resolved against it.
program_dir = '../'

# Input: processed corpora. Input: phrase lists. Output: frequency files.
input_dir_path = Path(program_dir, 'processed-corpora')
phrases_dir_path = Path(program_dir, 'phrases')
output_dir_path = Path(program_dir, 'frequencies')

# A missing input directory is only reported here; nothing is created or aborted.
if not input_dir_path.is_dir():
    print("No processed corpora input directory found")
if not phrases_dir_path.is_dir():
    print("No phrases directory found")

# Create the output directory if needed (its parent must already exist).
output_dir_path.mkdir(exist_ok=True)

In [4]:
# Journal data: tuples of (journal name, number of volumes, volume number for 1981).
# The third field starts as a placeholder and is filled in just below.
journals = [("JCR", 47, 0), ("JM", 84, 0), ("JMR", 57, 0), ("MS", 39, 0)]
max_vols_after_1980 = 2020 - 1980  # 40

# Compute the volume number that corresponds to 1981 for every journal.
# A start volume <= 0 means the journal has no articles from that year; this is
# okay because no files will match those volumes when the corpora are parsed.
journals = [
    (name, volumes, volumes - max_vols_after_1980 + 1)
    for name, volumes, _ in journals
]

print(journals)

[('JCR', 47, 8), ('JM', 84, 45), ('JMR', 57, 18), ('MS', 39, 0)]


## Helper Functions

In [5]:
def convert_year_to_index(yr, start_yr, end_yr, step):
    """Converts a year into a usable index for the word vector matrix.

    Args:
        yr:
            The year of interest.
        start_yr:
            The first year, inclusive, of the corpora.
        end_yr:
            The last year, inclusive, of the corpora.
        step:
            How many years per timeslice.

    Returns:
        An integer index that represents the time slice containing the desired year in the word vector matrix.

    Raises:
        Exception:
            The year specified is not contained in the year range of the corpora.
    """
    if yr < start_yr or yr > end_yr:
        # The year must be converted explicitly: concatenating an int to the
        # message raised a TypeError that hid the intended error text.
        raise Exception('Trying to access a year that is not in the corpora: ' + str(yr))
    return (yr - start_yr) // step

In [6]:
def load_phrases_to_dict(input_file_name):
    """Load a phrase list file into a dict mapping spaced phrases to hyphenated phrases.

    The file holds one phrase per line; the words of a phrase may be separated
    by spaces or hyphens. Every phrase is lower-cased. Blank lines are skipped,
    and surrounding whitespace or repeated separators do not produce empty words.

    e.g. Given list:
        cross culture
        cross-culture
        decision making
        in-store marketing

    Example entries of the resulting dictionary:
        phrases["cross culture"] == "cross-culture"
        phrases["decision making"] == "decision-making"
        phrases["in store marketing"] == "in-store-marketing"

    Args:
        input_file_name:
            Path of the phrase list file.

    Returns:
        A dict whose keys are the phrases with words joined by single spaces and
        whose values are the same phrases with words joined by single hyphens.
    """
    phrases = {}
    with open(input_file_name, 'r') as infile:
        for line in infile:
            # Hyphens and whitespace are both treated as word separators, so
            # intra-word hyphens in the list are disregarded.
            words = line.lower().replace("-", " ").split()
            if not words:
                # Blank (or separator-only) line: would otherwise add an empty phrase.
                continue
            phrases[" ".join(words)] = "-".join(words)
    return phrases

In [7]:
def get_phrase_list():
    """Return the combined list of hyphenated phrases from all phrase list files.

    Reads 'Method.txt', 'Topic.txt' and 'autophrase_6147.txt' from
    phrases_dir_path with load_phrases_to_dict and merges them into a single
    dictionary, so a phrase that appears in several lists is returned once.

    Returns:
        A list with the hyphenated form of every phrase.

    Raises:
        FileNotFoundError:
            One or more of the phrase list files does not exist. (An integer
            sentinel was returned before, which only failed later when the
            result was iterated.)
    """
    # replace with phrase list file names
    list_paths = [
        phrases_dir_path / 'Method.txt',
        phrases_dir_path / 'Topic.txt',
        phrases_dir_path / 'autophrase_6147.txt',
    ]

    missing = [str(path) for path in list_paths if not path.exists()]
    if missing:
        raise FileNotFoundError("Phrase lists not found: " + ", ".join(missing))

    # Merge in file order (method, topic, AutoPhrase); later files overwrite
    # identical keys from earlier ones.
    phrases = {}
    for path in list_paths:
        phrases.update(load_phrases_to_dict(path))

    return list(phrases.values())

In [8]:
def get_full_vocab_list():
    """Return the words of the word-ID file that meet the minimum count of 20.

    The file 'dw2v-master/wordID_corpus.txt' under program_dir is read as
    comma-separated rows. The second field of a row is returned as the word when
    the integer in the third field (presumably its occurrence count) is at
    least 20.
    """
    min_count = 20
    print('- loading word id file')
    with open(program_dir + 'dw2v-master/wordID_corpus.txt') as f:
        rows = [raw_line.split(',') for raw_line in f.readlines()]

    word_list = [row[1] for row in rows if int(row[2]) >= min_count]
    print('got full vocab list')

    return word_list

In [9]:
def count_freq_for_timeslice(timeslice, yrs_in_timeslice, phrase_list):
    """
    Returns a frequency dictionary for a single time slice. Keys are journal names, and values are Counter objects.

    Args:
        timeslice:
            Index of the time slice (0 is the slice that starts at the 1981 volume).
        yrs_in_timeslice:
            Number of consecutive volumes covered by one time slice.
        phrase_list:
            Iterable of the words/phrases to keep; all other tokens are dropped.

    For every journal in `journals`, the '*.txt' files in its directory under
    input_dir_path are selected by the integer that precedes the first '#' in
    the file name (taken to be the volume number), the selected files are split
    on whitespace, and the tokens are counted.
    """
    # Build the lookup set once: testing membership in the raw list was a
    # linear scan for every distinct token of every journal.
    wanted_phrases = set(phrase_list)
    journal_freq_for_timeslice = {}

    for name, _volumes, vol_1981 in journals:
        start_vol = vol_1981 + timeslice * yrs_in_timeslice
        wanted_vols = range(start_vol, start_vol + yrs_in_timeslice)

        journal_dir_path = input_dir_path / name

        # Keep only the files whose volume number falls inside this time slice.
        read_files = [
            str(filename)
            for filename in Path(journal_dir_path).glob('*.txt')
            if int(filename.name.split("#")[0]) in wanted_vols
        ]
        # Natural sort (digit runs compared as numbers). The order matters because
        # it fixes the insertion order of the Counter, and with it the order of
        # ties in most_common().
        read_files = sorted(read_files, key=lambda x: [int(c) if c.isdigit() else c for c in re.split(r'(\d+)', x)])

        count = Counter()
        for f in read_files:
            with open(f, 'r') as infile:
                count.update(infile.read().split())

        journal_freq_for_timeslice[name] = Counter(
            {key: val for key, val in count.items() if key in wanted_phrases}
        )

    return journal_freq_for_timeslice

In [10]:
def count_freq_for_timeslices(num_timeslices, yrs_in_timeslice, phrase_list):
    """
    Collects the per-journal frequencies of every time slice.

    Returns a dictionary keyed by time slice index (0 to num_timeslices - 1); each value is
    the dictionary of journal frequencies that count_freq_for_timeslice returns for that slice.
    """
    frequencies_by_slice = {}

    for slice_index in range(num_timeslices):
        frequencies_by_slice[slice_index] = count_freq_for_timeslice(
            slice_index, yrs_in_timeslice, phrase_list
        )
        # Progress message, one per finished time slice.
        print("Got frequencies for period", slice_index)

    return frequencies_by_slice

In [11]:
def count_freq_across_timeslices(journal_freq_for_timeslices):
    """
    Returns a frequency dictionary for all time slices summed together. Keys are journal names, values are Counter objects.

    Args:
        journal_freq_for_timeslices:
            Dictionary keyed by time slice index (0 to n - 1) whose values map
            journal names to Counter objects, as returned by count_freq_for_timeslices.

    The result always contains "JCR", "JM", "JMR" and "MS" (empty Counters when
    no slice mentions them); any other journal found in a time slice is summed
    as well. The input Counters are not modified.
    """
    num_timeslices = len(journal_freq_for_timeslices)
    journal_freq_across_timeslices = {"JCR": Counter(), "JM": Counter(), "JMR": Counter(), "MS": Counter()}

    for timeslice_i in range(num_timeslices):
        for name, freq in journal_freq_for_timeslices[timeslice_i].items():
            if name not in journal_freq_across_timeslices:
                journal_freq_across_timeslices[name] = Counter()
            # In-place Counter addition; mutates only the accumulator created above.
            journal_freq_across_timeslices[name] += freq

    return journal_freq_across_timeslices

## Main Program

In [12]:
# Corpus coverage: first and last year (both inclusive) and years per time slice.
start_yr = 1981
end_yr = 2020
step = 2

# Number of whole time slices, and the first year of each slice
# (1981, 1983, ... up to end_yr inclusive).
num_timeslices = (end_yr + 1 - start_yr) // step
timeslice_yrs = range(start_yr, end_yr + 1, step)

# Phrases of interest, and the full vocabulary read from the word-ID file.
phrase_list = get_phrase_list()
full_vocab_list = get_full_vocab_list()
print(len(full_vocab_list))

- loading word id file
got full vocab list
39354


## Main Program - Frequencies

### Save + Load File Setup

#### Count frequencies in corpora (skip the cell below if the frequency files have already been saved)

In [23]:
# Find frequencies for each time slice and save file
journal_freq_for_timeslices = count_freq_for_timeslices(num_timeslices, step, full_vocab_list)

freq_savefile = output_dir_path / 'journal_freq_for_timeslices.p'
pickle.dump(journal_freq_for_timeslices, open(freq_savefile, 'wb'), pickle.HIGHEST_PROTOCOL)

# Sum frequencies across all time slices and save file
journal_freq_across_timeslices = count_freq_across_timeslices(journal_freq_for_timeslices)

freq_savefile = output_dir_path / 'journal_freq_across_timeslices.p'
pickle.dump(journal_freq_across_timeslices, open(freq_savefile, 'wb'), pickle.HIGHEST_PROTOCOL)

Got frequencies for period 0
Got frequencies for period 1
Got frequencies for period 2
Got frequencies for period 3
Got frequencies for period 4
Got frequencies for period 5
Got frequencies for period 6
Got frequencies for period 7
Got frequencies for period 8
Got frequencies for period 9
Got frequencies for period 10
Got frequencies for period 11
Got frequencies for period 12
Got frequencies for period 13
Got frequencies for period 14
Got frequencies for period 15
Got frequencies for period 16
Got frequencies for period 17
Got frequencies for period 18
Got frequencies for period 19


#### Load journal frequencies from pre-saved file:

In [24]:
# Load the frequency files written by the counting cell.
# NOTE: pickle must only be used on files this notebook produced itself;
# unpickling data from an untrusted source can execute arbitrary code.
# The files are opened with `with` so the handles are closed after loading.
freq_savefile = output_dir_path / 'journal_freq_for_timeslices.p'
try:
    with open(freq_savefile, 'rb') as infile:
        journal_freq_for_timeslices = pickle.load(infile)
    print("journal frequencies for individual timeslices loaded succesfully")
except(IOError):
    # Best effort: the variable stays undefined (or keeps its earlier value).
    print("Error trying to load frequencies")

freq_savefile = output_dir_path / 'journal_freq_across_timeslices.p'
try:
    with open(freq_savefile, 'rb') as infile:
        journal_freq_across_timeslices = pickle.load(infile)
    print("journal frequencies across summed timeslices loaded succesfully")
except(IOError):
    print("Error trying to load frequencies")

journal frequencies for individual timeslices loaded succesfully
journal frequencies across summed timeslices loaded succesfully


### Analysis

#### Investigate and validate journal frequencies by period:

In [12]:
# How many of the most frequent phrases to list per journal and period.
top_n = 15

for yr in timeslice_yrs:
    timeslice_i = convert_year_to_index(yr, start_yr, end_yr, step)
    journal_freq_for_timeslice = journal_freq_for_timeslices[timeslice_i]

    print("Frequencies for period", str(timeslice_i) + ", years", yr, "-", yr+step-1)

    JCR_freq = journal_freq_for_timeslice["JCR"]
    MS_freq = journal_freq_for_timeslice["MS"]

    # Spot check: the count of one known word in each of the two journals.
    print(JCR_freq["econometrics"])
    print(MS_freq["econometrics"])

    # Most frequent phrases of the period, JCR first and then MS.
    print("top", top_n, "most frequent in JCR:", JCR_freq.most_common(top_n))
    print("top", top_n, "most frequent in MS:", MS_freq.most_common(top_n))

Frequencies for period 0, years 1981 - 1982
12
1
top 15 most frequent in JCR: [('choice', 1058), ('brand', 911), ('advertising', 776), ('preference', 493), ('attitude', 412), ('memory', 395), ('learning', 386), ('attention', 358), ('satisfaction', 337), ('consumer-behavior', 315), ('affect', 310), ('decision-making', 302), ('leisure', 292), ('knowledge', 258), ('information-processing', 245)]
top 15 most frequent in MS: [('brand', 169), ('advertising', 144), ('regression', 62), ('survey', 58), ('maximum-likelihood', 53), ('pricing', 51), ('choice', 48), ('anova', 48), ('network', 45), ('market-share', 44), ('experience', 42), ('preference', 41), ('repeat-purchase', 37), ('decision-support', 35), ('explanatory-variables', 34)]
Frequencies for period 1, years 1983 - 1984
7
15
top 15 most frequent in JCR: [('brand', 1299), ('choice', 1166), ('involvement', 614), ('advertising', 611), ('communication', 463), ('attitude', 411), ('consumer-behavior', 334), ('learning', 317), ('knowledge', 31

#### Looking at journal frequencies across all time periods:

In [13]:
# How many of the most frequent phrases to list per journal.
top_n = 15

# Counts summed over every time slice, most frequent first.
print(
    "top", top_n, "most frequent in JCR across all timeslices:",
    journal_freq_across_timeslices["JCR"].most_common(top_n),
)
print(
    "top", top_n, "most frequent in MS across all timeslices:",
    journal_freq_across_timeslices["MS"].most_common(top_n),
)

top 15 most frequent in JCR across all timeslices: [('brand', 37152), ('choice', 30166), ('experience', 17284), ('affect', 12878), ('advertising', 11767), ('memory', 10886), ('preference', 10705), ('knowledge', 10431), ('attitude', 9287), ('attention', 9185), ('goals', 8154), ('motivation', 6384), ('learning', 5613), ('involvement', 5494), ('satisfaction', 5292)]
top 15 most frequent in MS across all timeslices: [('brand', 32494), ('advertising', 27598), ('marketing-science', 23518), ('choice', 16853), ('pricing', 10553), ('preference', 6286), ('learning', 5149), ('market-share', 5107), ('promotions', 4618), ('affect', 4340), ('experience', 4124), ('network', 4085), ('technology', 3674), ('a-model', 3624), ('survey', 3104)]


## Main Program - Ratios

### Save + Load File Setup

#### Calculate JCR/MS ratios (skip the cell below if the ratio files have already been saved)

In [70]:
# find JCR to MS ratios for each word in each timeslice
ratios_for_timeslices = {}

for yr in timeslice_yrs:
    timeslice_i = convert_year_to_index(yr, start_yr, end_yr, step)
    journal_freq_for_timeslice = journal_freq_for_timeslices[timeslice_i]
    
    JCR_freq = journal_freq_for_timeslice["JCR"]
    MS_freq = journal_freq_for_timeslice["MS"]
    
    ratios = {}
    for phrase in phrase_list:
        JCR_phrase_freq = JCR_freq[phrase]
        MS_phrase_freq = MS_freq[phrase]
        
        # if the word has 0 occurrence in either journal, set to 0 (will end up in middle of list)
        if (JCR_phrase_freq == 0 and MS_phrase_freq == 0):
            ratios[phrase] = 0
        # if the word has 0 occurrence in MS, set to 1 * JCR freq (most freq in JCR will end up at top of list)
        elif MS_phrase_freq == 0:
            ratios[phrase] = JCR_phrase_freq
        # if the word has 0 occurrence in JCR, set to -1 * MS freq (most freq in MS will end up at bottom of list)
        elif JCR_phrase_freq == 0:
            ratios[phrase] = -1 * MS_phrase_freq
        else:
            ratios[phrase] = JCR_phrase_freq / MS_phrase_freq
    
    ratios_for_timeslices[timeslice_i] = ratios

# Save into file
ratios_savefile = output_dir_path / 'ratios_for_timeslices.p'
pickle.dump(ratios_for_timeslices, open(ratios_savefile, 'wb'), pickle.HIGHEST_PROTOCOL)

# find JCR to MS ratios for each word in all timeslices
ratios_across_timeslices = {}

for phrase in phrase_list:
    JCR_phrase_freq = journal_freq_across_timeslices["JCR"][phrase]
    MS_phrase_freq = journal_freq_across_timeslices["MS"][phrase]

    # if the word has 0 occurrence in either journal, set to 0 (will end up in middle of list)
    if (JCR_phrase_freq == 0 and MS_phrase_freq == 0):
        ratios_across_timeslices[phrase] = 0
    # if the word has 0 occurrence in MS, set to 1 * JCR freq (most freq in JCR will end up at top of list)
    elif MS_phrase_freq == 0:
        ratios_across_timeslices[phrase] = JCR_phrase_freq
    # if the word has 0 occurrence in JCR, set to -1 * MS freq (most freq in MS will end up at bottom of list)
    elif JCR_phrase_freq == 0:
        ratios_across_timeslices[phrase] = -1 * MS_phrase_freq
    else:
        ratios_across_timeslices[phrase] = JCR_phrase_freq / MS_phrase_freq

# Save into file
ratios_savefile = output_dir_path / 'ratios_across_timeslices.p'
pickle.dump(ratios_across_timeslices, open(ratios_savefile, 'wb'), pickle.HIGHEST_PROTOCOL)

#### Load ratios from pre-saved file:

In [14]:
# Load the ratio files written by the ratio cell.
# NOTE: pickle must only be used on files this notebook produced itself;
# unpickling data from an untrusted source can execute arbitrary code.
# The files are opened with `with` so the handles are closed after loading.
ratios_savefile = output_dir_path / 'ratios_for_timeslices.p'
try:
    with open(ratios_savefile, 'rb') as infile:
        ratios_for_timeslices = pickle.load(infile)
    print("ratios for individual timeslices loaded succesfully")
except(IOError):
    # Best effort: the variable stays undefined (or keeps its earlier value).
    print("Error trying to load ratios")

ratios_savefile = output_dir_path / 'ratios_across_timeslices.p'
try:
    with open(ratios_savefile, 'rb') as infile:
        ratios_across_timeslices = pickle.load(infile)
    print("ratios across timeslices loaded succesfully")
except(IOError):
    print("Error trying to load ratios")

ratios for individual timeslices loaded succesfully
ratios across timeslices loaded succesfully


### Analysis

#### Determine the most quantitative and most behavioral words in each time slice (skip if the ranked lists have already been saved as files)
The most behavioral words are the ones with the highest JCR/MS ratios, and the most quantitative ("quant") words are the ones with the lowest ratios.

In [15]:
# For every time slice: rank the phrases by JCR/MS ratio in both directions,
# save each ranking as a pickle and as a text file, and print the top entries.
for yr in timeslice_yrs:
    timeslice_i = convert_year_to_index(yr, start_yr, end_yr, step)
    ratios_for_timeslice = ratios_for_timeslices[timeslice_i]

    ranked_most_behavioral = OrderedDict(sorted(ratios_for_timeslice.items(), key=lambda kv: kv[1], reverse=True)) # JCR/MS larger = more behavioral
    ranked_most_quant = OrderedDict(sorted(ratios_for_timeslice.items(), key=lambda kv: kv[1])) # JCR/MS smaller = more quant

    # Save ranked behavioral words (files opened with `with` so every handle
    # is closed before the next time slice is processed)
    ranked_behavioral_pickle_savefile = output_dir_path / ('ranked_behavioral_for_timeslice_' + str(timeslice_i) + '.p')
    with open(ranked_behavioral_pickle_savefile, 'wb') as outfile:
        pickle.dump(ranked_most_behavioral, outfile, pickle.HIGHEST_PROTOCOL)

    ranked_behavioral_txt_savefile = output_dir_path / ('ranked_behavioral_for_timeslice_' + str(timeslice_i) + '.txt')
    with open(ranked_behavioral_txt_savefile, 'w') as outfile:
        for key, value in ranked_most_behavioral.items():
            outfile.write(str(key) + ": " + str(value) + '\n')

    # Save ranked quant words
    ranked_quant_pickle_savefile = output_dir_path / ('ranked_quant_for_timeslice_' + str(timeslice_i) + '.p')
    with open(ranked_quant_pickle_savefile, 'wb') as outfile:
        pickle.dump(ranked_most_quant, outfile, pickle.HIGHEST_PROTOCOL)

    ranked_quant_txt_savefile = output_dir_path / ('ranked_quant_for_timeslice_' + str(timeslice_i) + '.txt')
    with open(ranked_quant_txt_savefile, 'w') as outfile:
        for key, value in ranked_most_quant.items():
            outfile.write(str(key) + ": " + str(value) + '\n')

    print("\n*** Period " + str(timeslice_i) + ", years", str(yr) + "-" + str(yr+step-1) + " ***")

    top_n = 15
    print("Top", top_n, "most behavioral words:")
    for phrase, ratio in list(ranked_most_behavioral.items())[:top_n]:
        print(phrase, ratio)
    print("\nTop", top_n, "most quant words:")
    for phrase, ratio in list(ranked_most_quant.items())[:top_n]:
        print(phrase, ratio)


*** Period 0, years 1981-1982 ***
Top 15 most behavioral words:
leisure 292
involvement 215.0
energy-conservation 203.0
energy-consumption 151
product-information 148
attribution 124
mode-choice 102.0
memory 98.75
motivation 82
united-states 82.0
credit-cards 73
social-class 71
cognition 68
status-quo 65
public-policy 62

Top 15 most quant words:
decision-support -35
piecewise-linear -22
consumer-surplus -16
innovation-diffusion -16
customer-behavior -13
technological-innovations -12
binomial-distribution -8
budget-allocation -8
markov-process -8
continuous-variables -7
higher-dimensional -6
mercedes-benz -6
random-variables -6
optimal-pricing -6
input-variables -6

*** Period 1, years 1983-1984 ***
Top 15 most behavioral words:
family-life 189
social-class 171.0
gift-giving 115
classical-conditioning 104
familiarity 96.5
latent-class 96
opinion-leaders 86
attitude-change 70
family-members 59.0
lifestyle 56
persuasion 54.333333333333336
information-overload 53.0
information-integratio

#### Load ranked behavioral + quant phrases from pre-saved file:

In [78]:
# Load ranked behavioral and quant words for time slice 0
timeslice_i = 0
ranked_behavioral_pickle_savefile = output_dir_path / ('ranked_behavioral_for_timeslice_' + str(timeslice_i) + '.p')
try:
    ranked_most_behavioral_0 = list(pickle.load(open(ranked_behavioral_pickle_savefile, 'rb')))
    print("Ranked behavioral phrases for time slice " + str(timeslice_i) + " loaded succesfully")
except(IOError):
    print("Error trying to load ranked behavioral phrases for time slice " + str(timeslice_i))
    
ranked_quant_pickle_savefile = output_dir_path / ('ranked_quant_for_timeslice_' + str(timeslice_i) + '.p')
try:
    ranked_most_quant_0 = list(pickle.load(open(ranked_quant_pickle_savefile, 'rb')))
    print("Ranked quant phrases for time slice " + str(timeslice_i) + " loaded succesfully")
except(IOError):
    print("Error trying to load ranked quant phrases for time slice " + str(timeslice_i))
    
# Load ranked behavioral and quant words for time slice 0
timeslice_i = 19
ranked_behavioral_pickle_savefile = output_dir_path / ('ranked_behavioral_for_timeslice_' + str(timeslice_i) + '.p')
try:
    ranked_most_behavioral_19 = list(pickle.load(open(ranked_behavioral_pickle_savefile, 'rb')))
    print("Ranked behavioral phrases for time slice " + str(timeslice_i) + " loaded succesfully")
except(IOError):
    print("Error trying to load ranked behavioral phrases for time slice " + str(timeslice_i))
    
ranked_quant_pickle_savefile = output_dir_path / ('ranked_quant_for_timeslice_' + str(timeslice_i) + '.p')
try:
    ranked_most_quant_19 = list(pickle.load(open(ranked_quant_pickle_savefile, 'rb')))
    print("Ranked quant phrases for time slice " + str(timeslice_i) + " loaded succesfully")
except(IOError):
    print("Error trying to load ranked quant phrases for time slice " + str(timeslice_i))

Ranked behavioral phrases for time slice 0 loaded succesfully
Ranked quant phrases for time slice 0 loaded succesfully
Ranked behavioral phrases for time slice 19 loaded succesfully
Ranked quant phrases for time slice 19 loaded succesfully


#### Determine most volatile words across time slices (attempt #1)

In [79]:
def _rank_lookup(ranked_phrases):
    """Map every phrase to the position of its first occurrence in a ranked list."""
    positions = {}
    for position, ranked_phrase in enumerate(ranked_phrases):
        positions.setdefault(ranked_phrase, position)
    return positions


# Precompute the positions once: calling list.index four times per phrase
# scanned the ranked lists again for every phrase (quadratic overall).
# A phrase missing from a ranking now raises KeyError instead of ValueError.
behavioral_rank_0 = _rank_lookup(ranked_most_behavioral_0)
behavioral_rank_19 = _rank_lookup(ranked_most_behavioral_19)
quant_rank_0 = _rank_lookup(ranked_most_quant_0)
quant_rank_19 = _rank_lookup(ranked_most_quant_19)

phrase_rank_changes = {}
for phrase in phrase_list:
    # positive number indicates it is ranked higher in period 19 than period 0
    behavioral_rank_change = behavioral_rank_0[phrase] - behavioral_rank_19[phrase]
    quant_rank_change = quant_rank_0[phrase] - quant_rank_19[phrase]

    phrase_rank_changes[phrase] = {"behavioral-rank-change": behavioral_rank_change, "quant-rank-change": quant_rank_change}

# ordered from 'became more behavioral' to 'became less behavioral'
ranked_behavioral_change = OrderedDict(sorted(phrase_rank_changes.items(), key=lambda kv: kv[1]["behavioral-rank-change"], reverse=True))
# ordered from 'became more quant' to 'became less quant'
ranked_quant_change = OrderedDict(sorted(phrase_rank_changes.items(), key=lambda kv: kv[1]["quant-rank-change"], reverse=True))

top_n = 15
print("\n*** Top", top_n, "words that became more behavioral ***")
for phrase, rank_change in list(ranked_behavioral_change.items())[:top_n]:
    print(phrase, rank_change["behavioral-rank-change"])
print("\n*** Top", top_n, "words that became more quant ***")
for phrase, rank_change in list(ranked_quant_change.items())[:top_n]:
    print(phrase, rank_change["quant-rank-change"])

print("\n*** Top", top_n, "words that became less behavioral ***")
for phrase, rank_change in list(ranked_behavioral_change.items())[-1*top_n:]:
    print(phrase, rank_change["behavioral-rank-change"])
print("\n*** Top", top_n, "words that became less quant ***")
for phrase, rank_change in list(ranked_quant_change.items())[-1*top_n:]:
    print(phrase, rank_change["quant-rank-change"])


*** Top 15 words that became more behavioral ***
boundary-condition 6078
review-helpfulness 5996
cognitive-flexibility 5967
body-temperature 5904
stress-induced 5904
product-catalog 5889
visual-patterns 5879
technological-innovations 5861
holidays 5746
maximum-coverage 5737
initial-state 5726
psychological-stress 5726
word-completion 5724
frequency-distribution 5720
moral-emotions 5709

*** Top 15 words that became more quant ***
assistant-professor 6238
washington-dc 6216
associate-professor 6213
conjoint-analysis 6190
complaining 6152
consumer-demand 6100
british-columbia 6087
human-capital 6077
utility-function 6066
working-paper 6054
john-wiley 6030
graduate-school 6029
business-administration 6026
failure-rate 6017
yale-university 6000

*** Top 15 words that became less behavioral ***
yale-university -6006
failure-rate -6015
john-wiley -6025
graduate-school -6029
business-administration -6029
working-paper -6054
utility-function -6066
human-capital -6086
british-columbia -6091
co

In [None]:
"""
delete below code later
"""

def get_stability(self, w, start_t, end_t, stable_type='avg'):
    """
    Summarise how similar word `w` is to itself across pairs of time indices.

    self.sim_by_word_year(w, y1, w, y2) is evaluated for every pair
    0 <= y1 < y2 < end_t - start_t, and the collected values are reduced
    according to `stable_type`: 'avg' (np.mean), 'max' or 'min'.

    Raises ValueError for any other `stable_type`; the similarities are
    computed before the check is made.

    NOTE(review): start_t only determines how many indices are used; the
    indices themselves start at 0, not at start_t - confirm this is intended.
    NOTE(review): `np` is presumably numpy; it is not imported in the part of
    the file visible here.
    """
    span = end_t - start_t
    similarities = [
        self.sim_by_word_year(w, first, w, second)
        for first in range(span)
        for second in range(first + 1, span)
    ]

    if stable_type == 'avg':
        reducer = np.mean
    elif stable_type == 'max':
        reducer = max
    elif stable_type == 'min':
        reducer = min
    else:
        raise ValueError('stable_type should be in [avg, max, min]')
    return reducer(similarities)

#### Determine most stable words across time slices