 Bags of words
===
**Computational social sciences**

---
In this notebook we show how to extract bags of words for the diachronic analysis of historical texts.

---

## Section 1: Using word lists ...

... from psychometric tools and obtain synonyms and hyponyms using WordNet

In [None]:
import nltk
from nltk.corpus import wordnet as wn


#nltk.download('all') 

### Functions

In [None]:
def generate_meanings(word_list):
    """
    Look up the WordNet senses of each word and pair them with their definitions.

    Parameters
    ----------
    word_list : a list of words

    Returns
    -------
    A list of [word, synset, definition] entries in lookup order, with
    repeated entries removed.

    """
    # One entry per (word, sense) pair.
    # NOTE(review): the three POS constants are concatenated into one string
    # and passed as `pos`; presumably WordNet reads that as "nouns, verbs and
    # adjectives" -- confirm against the NLTK documentation.
    all_entries = [
        [word, sense, sense.definition()]
        for word in word_list
        for sense in wn.synsets(word, pos=wn.NOUN+wn.VERB+wn.ADJ)
    ]

    # Keep the first occurrence of every entry (equality-based comparison).
    unique_entries = []
    for entry in all_entries:
        if entry in unique_entries:
            continue
        unique_entries.append(entry)

    return unique_entries


def word_check(list_pre):
    """
    Manually review each item and keep only the ones marked as relevant.

    Parameters
    ----------
    list_pre : a list of items to review (words or any printable entries)

    Returns
    -------
    list_post : the items that were accepted, in their original order

    Use
    ------
    Each item is printed and one line is read from standard input.
    Type 1 to keep the item; any other answer discards it.

    """
    list_post = []

    for item in list_pre:
        print(item)

        # Strip surrounding whitespace so an answer such as "1 " still counts
        # as "keep" instead of silently discarding the item.
        answer = input().strip()

        if answer == '1':
            list_post.append(item)

    return list_post


def synonyms_hyponyms(meaning_list, languages):
    """
    Generates synonyms and hyponyms from the pruned word base.

    Parameters
    ----------
    meaning_list : list of entries whose second element (index 1) is a synset,
        e.g. the [word, synset, definition] entries built by generate_meanings
    languages : languages for search, e.g. ['eng']

    Returns
    -------
    A single alphabetically sorted list, without duplicates, of the lemma
    names of every synset and of each synset's direct hyponyms, gathered for
    all the requested languages. Empty if either argument is empty.

    """
    # Only the synset of each entry is needed.
    synsets = [entry[1] for entry in meaning_list]

    # A set gives duplicate removal and constant-time membership checks.
    collected = set()

    # Words are accumulated inside the language loop so that every requested
    # language contributes to the result, not only the last one.
    for language in languages:
        for synset in synsets:
            # synonyms: lemmas of the synset itself
            for lemma in synset.lemmas(language):
                collected.add(lemma.name())

            # hyponyms: lemmas of each direct hyponym of the synset
            for hyponym in synset.hyponyms():
                for lemma in hyponym.lemmas(language):
                    collected.add(lemma.name())

    return sorted(collected)
    

def bag_of_words(word_list):
    """
    Build the final bag of words: take the first element of every entry and
    keep only its first occurrence, preserving the original order.
    """
    first_elements = [entry[0] for entry in word_list]

    unique_words = []
    for candidate in first_elements:
        if candidate in unique_words:
            continue
        unique_words.append(candidate)

    return unique_words

### Seed words related to financial behaviors.

In [None]:
# Seed vocabulary for the financial domain.
# Every entry needs its own comma: Python silently joins adjacent string
# literals, which would merge two seed words into one unknown word.
word_base_financial = [
    'earn', 'income', 'compensation', 'livelihood', 'pay', 'paycheck', 'salary', 'wage', 'save', 'assets',
    'profit', 'accumulation', 'deposit', 'installment', 'wealth', 'spend', 'purchase', 'consume', 'payment',
    'fund', 'invest', 'stockmarket', 'stock', 'bond', 'securities', 'dividends', 'shareholder', 'portfolio',
    'debt', 'liability', 'credit', 'loan', 'money', 'cash', 'currency', 'bills', 'coins', 'rich', 'poor',
    'bank', 'economy', 'financial',
]

# Create a list of meanings and definitions
financial_list_meanings = generate_meanings(word_base_financial)

# Create a pruned version of the list by removing irrelevant words
# (interactive: type 1 to keep an entry, anything else to drop it)
financial_list_pruned = word_check(financial_list_meanings)

#### Optional: look at pruned word list


In [None]:
# Optional inspection: show the entries that were kept by the manual review.
print(financial_list_pruned)

In [None]:
# FINANCIAL TERMS

# Expand the manually pruned meanings with their synonyms and hyponyms
languages = ['eng']
financial_list_syn_hyp = synonyms_hyponyms(financial_list_pruned, languages)

# Look up the meanings of the expanded words
# (this rebinds financial_list_meanings, replacing the seed-word meanings)
financial_list_meanings = generate_meanings(financial_list_syn_hyp)

# Exclude irrelevant words (interactive review)
# type 0 to exclude, 1 to include
financial_list = word_check(financial_list_meanings)

In [None]:
# Optional: inspect the entries kept after the second manual review

print(financial_list)

In [None]:
# FINANCIAL TERMS
# Create final word list for next steps: keep only the word (first element)
# of each reviewed entry, without repetitions.

final_financial_list = bag_of_words(financial_list)
# Disabled: an extra manual review pass over the final list.
#final_Financial_list = word_check(final_financial_list)

# Optional: check final list
#print(final_Financial_list)
print(final_financial_list)

In [None]:
# Compare lists for inter-rater reliability

---
# Section 2: Generate a semantic vector map with word2vec

In [None]:
from gensim.models.word2vec import Word2Vec
import os
from os import path

## Functions

In [None]:
def literary_words_list(file_path):
    """
    Read the .txt files found under a folder and store them as a list of
    tokenised lines, to use as input to Word2Vec.

    Parameters
    ----------
    file_path : folder where the .txt files are located (e.g. '/path/to/texts');
        sub-folders are searched as well

    Returns
    -------
    A list with one entry per line of text; each entry is the list of strings
    obtained by splitting that line on single spaces (so empty lines and
    repeated spaces produce empty strings).

    Notes
    -----
    The working directory of the process is changed to ``file_path`` and is
    not restored, so relative paths used afterwards resolve inside that folder.
    The name of every file that is read is printed.

    """

    # Specify root folder for file search.
    # The directory change is kept as-is: code that runs afterwards may rely
    # on relative paths being resolved inside the corpus folder.
    os.chdir(file_path)
    root_folder = os.getcwd()

    # One list of tokens per line of text
    words_list = []

    # Iterates over the path, folders and subfolders looking for txt files
    for folder, _subdirs, files in os.walk(root_folder):
        for file in files:
            # Names containing 'model' are skipped (e.g. a model file saved
            # with a .txt name in the same folder).
            if '.txt' in file and 'model' not in file:
                print(file)
                name = os.path.join(folder, file)

                # Context manager so the file handle is always closed
                with open(name, encoding='utf-8') as handle:
                    file_text = handle.read()

                # One paragraph per line; each is split into space-separated tokens
                for paragraph in file_text.split('\n'):
                    words_list.append(paragraph.split(' '))

    return words_list

## Create list of words from .txt files of literary plays

In [None]:
# NOTE: machine-specific absolute path -- point it at the folder that holds
# your corpus. The call also makes that folder the current working directory.
word2vec_input = literary_words_list('c:\\Users\\Maria\\Dropbox\\Maria Brackin\\Finance and Text Analysis\\texts\\english early modern')

## Build and save the vector space with Word2Vec

In [None]:
# Train the model on the tokenised lines read from the corpus.
# NOTE(review): min_count=1 presumably keeps even tokens that occur only
# once -- confirm against the gensim documentation.
SentenceCorpus = word2vec_input
word2vec_output = Word2Vec(SentenceCorpus, min_count=1)

# save to file (relative path: written to the current working directory,
# which literary_words_list changed to the corpus folder)
word2vec_output.save('w2v_model.txt')

---
# Section 3: Use the semantic vector map ...
...to evaluate if the bags of words created in section 1 are ecologically valid

## Functions

In [None]:
def get_word2vec_list(word_list, model, topn=10):
    """
    Uses word2vec to find the semantically most similar words to each seed
    word in word_list.

    Parameters
    ----------
    word_list : a list of seed words
    model : a trained model exposing ``model.wv.most_similar``
    topn : how many similar words to request per seed word (default 10)

    Returns
    -------
    A list of lists: each inner list starts with the seed word, followed by
    its most similar words in the order returned by the model. Seed words for
    which the lookup raises KeyError (e.g. words missing from the model's
    vocabulary) are skipped.

    """
    list_of_word2vec_lists = []

    for word in word_list:
        try:
            # The crucial line: query the model trained on our corpus for the
            # words most similar to the seed word.
            neighbours = model.wv.most_similar([word], topn=topn)
        except KeyError:
            # The lookup failed for this seed word: leave it out.
            continue

        # Each result is indexed at 0 to get the word; the rest is ignored.
        list_of_word2vec_lists.append([word] + [item[0] for item in neighbours])

    return list_of_word2vec_lists


def choose_word2vec_list(w2v_word_list):
    """
    Manually select which word2vec similarity lists to keep.

    Parameters
    ----------
    w2v_word_list : a list of word lists (one list per seed word)

    Returns
    -------
    A flat list with the words of every accepted list, in the original order
    (a word appearing in several accepted lists is repeated).

    Use
    ------
    Each list is printed with its index and one line is read from standard
    input. Type 1 to keep the list (its words are coherent with the topic);
    any other answer discards it.

    """
    selected_words = []

    for index, w2v_list in enumerate(w2v_word_list):
        print(index, w2v_list)

        # Strip surrounding whitespace so an answer such as "1 " still counts
        # as "keep" instead of silently discarding the list.
        if input().strip() == '1':
            selected_words.extend(w2v_list)

    return selected_words

## Load vector space for English in the early modern period

In [None]:
# Load the saved Word2Vec model (relative path: resolved against the current
# working directory)
model = Word2Vec.load('w2v_model.txt')


final_financial_list = ['Maundy_money', 'absorb', 'account_payable', 'accounts_receivable', 'accumulation', 'acquirer', 'affluence', 'afford', 'amortisation', 'amortization', 'amount', 'amount_of_money', 'ante_up', 'arrears', 'assets', 'bank', 'bank_bill', 'bank_building', 'bank_deposit', 'bank_line', 'bank_loan', 'bank_note', "banker's_bill", 'banking_company', 'banking_concern', 'banknote', 'bankroll', 'bargain', 'bawbee', 'bear', 'benefit', 'big_bucks', 'big_money', 'bill', 'bond', 'bond_certificate', 'bond_issue', 'boodle', 'bread', 'bread_and_butter', 'bring_home', 'bring_in', 'budget', 'bundle', 'buy', 'buy_back', 'buy_food', 'buy_into', 'buy_out', 'buy_up', 'buyback', 'buying', 'cache', 'call_loan', 'capital', 'cash', 'cash_flow', 'cash_in', 'cash_in_hand', 'change', 'charge', 'cheap_money', 'chickenfeed', 'chump_change', 'cleanup', 'clear', 'coin', 'coinage', 'cold_cash', 'comforts', 'commercial_bank', 'commercial_credit', 'commit', 'consume', 'consumer_loan', 'credit', 'credit_line', 'credit_union', 'crown', 'currency', 'current_assets', 'debenture', 'debt', 'deep_pocket', 'deferred_payment', 'defray', 'defrayal', 'defrayment', 'demand_deposit', 'demand_loan', 'denier', 'deplete', 'deposit', 'direct_loan', 'disbursal', 'disburse', 'disbursement', 'disposable_income', 'dissipate', 'dividend', 'divvy', 'dough', 'down_payment', 'ducat', 'earn', 'earning_per_share', 'earnings', 'easy_money', 'economic_system', 'economise', 'economize', 'economy', 'economy_of_scale', 'eightpence', 'eke_out', 'equity', 'exhaust', 'expend', 'farthing', 'final_payment', 'finance', 'finances', 'financial', 'financial_obligation', 'fiscal', 'fivepence', 'folding_money', 'fool_away', 'foot', 'fourpence', 'free_enterprise', 'fritter', 'fritter_away', 'frivol_away', 'fund', 'funds', 'gain', 'gelt', 'give_back', 'government_income', 'government_note', 'government_revenue', 'government_security', 'gravy_train', 'greenback', 'groat', 'gross', 'gross_profit', 'gross_profit_margin', 
'gross_revenue', 'gross_sales', 'growth_stock', 'guinea', "ha'penny", 'half-pay', 'half_crown', 'halfpenny', 'hard_cash', 'hard_currency', 'hedge_fund', 'hedgefund', 'hive_up', 'hoard', 'hole_card', 'home_equity_credit', 'home_equity_loan', 'home_loan', 'homeless', 'immediate_payment', 'impulse-buy', 'income', 'indebtedness', 'industrialism', 'inherited_wealth', 'installment', 'installment_credit', 'installment_loan', 'invest', 'investment', 'investment_funds', 'invoice', 'job', 'keep', 'laissez-faire_economy', 'lavish', 'lay_aside', 'lay_away', 'lay_out', 'letter_of_credit', 'letter_security', 'liability', 'limited_liability', 'line_of_credit', "line_one's_pockets", 'liquid_assets', 'liquidate', 'listed_security', 'livelihood', 'living', 'living_wage', 'loan', 'lucre', 'lump_sum', 'luxuriate', 'luxuriousness', 'luxury', 'maintenance', 'make', 'margin', 'market_economy', 'markup', 'material_resource', 'meal_ticket', 'megabucks', 'member_bank', 'merchant_bank', 'metal_money', 'minimum_wage', 'mintage', 'misspend', 'mixed_economy', 'monetary_fund', 'monetary_resource', 'money', 'moolah', 'mortgage_loan', 'mutual_fund', 'needy', 'nest_egg', 'net_income', 'net_profit', 'net_sales', 'nickel-and-dime', 'ninepence', 'non-market_economy', 'note', 'opulence', 'ordinary_shares', 'outlay', 'overage', 'overpay', 'overpayment', 'overspend', 'paper_currency', 'paper_money', 'pay', 'pay_cash', 'pay_envelope', 'pay_off', 'pay_out', 'pay_packet', 'pay_up', 'payable', 'paycheck', 'payment', 'payoff', 'payroll_check', 'pecuniary_resource', 'pelf', 'penny', 'penny-pinch', 'pension_fund', 'per_capita_income', 'personal_credit_line', 'personal_income', 'personal_line_of_credit', 'personal_loan', 'petty_cash', 'pick_up', 'pile', 'pin_money', 'pittance', 'pocket_money', 'poor', 'poor_people', 'premium', 'prepay', 'prepayment', 'proceeds', 'profit', 'profits', 'protection', 'pull_in', 'purchase', 'purchasing', 'quick_assets', 'quick_buck', 'quittance', 'rake_in', 'ready_cash', 
'ready_money', 'realise', 'realize', 'receivables', 'recompense', 'redeem', 'redeposit', 'refund', 'regular_payment', 'remission', 'remit', 'remitment', 'remittal', 'remittance', 'remunerate', 'remuneration', 'rental_income', 'repay', 'repayment', 'repurchase', 'reserve', 'reserve_assets', 'reserve_bank', 'resource', 'retrenchment', 'return', 'reward', 'rich', 'rich_people', 'richness', 'salary', 'sales', 'save', 'save_up', 'saving', 'savings', 'scrimp', 'secured_bond', 'security', 'security_deposit', 'shareholder', 'shareowner', 'shekels', 'shilling', 'shinplaster', 'shoot', 'shower', 'sick_pay', 'sixpence', 'skimp', 'small_change', 'specie', 'spend', 'spending', 'spending_money', 'splurge', 'squander', 'stash', 'state_bank', 'sterling', 'stint', 'stock', 'stock_certificate', 'stock_dividend', 'stockholder', 'stockpile', 'subscribe', 'subscribe_to', 'subscription', 'subsidisation', 'subsidization', 'subsistence', 'sufficiency', 'sum', 'sum_of_money', 'sumptuousness', 'support', 'support_payment', 'surcharge', 'sustenance', 'take-home_pay', 'take_home', 'take_in', 'takings', 'tenpence', 'threepence', 'thriftiness', "tighten_one's_belt", 'time_loan', 'token_money', 'token_payment', 'treasury_stock', 'trifle', 'trifle_away', 'trust_fund', 'tuppence', 'turn_a_nice_penny', 'turn_a_profit', 'twopence', 'undercharge', 'underpay', 'underpayment', 'underspend', 'unearned_income', 'unearned_revenue', 'use_up', 'wage', 'wampum', 'wanton', 'wanton_away', 'ware', 'waste', 'wealth', 'wealthiness', 'working_capital', 'yield']

# create lists of ecologically valid words
# (final_financial_list is the hard-coded list assigned just above in this
# cell, which replaces any value computed in Section 1)
#list_of_financial_w2v = get_word2vec_list(final_Financial_list, model)
list_of_financial_w2v = get_word2vec_list(final_financial_list, model)

# choose most relevant lists of words (interactive review)
# type 0 to exclude, 1 to include

financial_BoW = choose_word2vec_list(list_of_financial_w2v)

# show the selected words without repetitions (set order is arbitrary)
print(list(set(financial_BoW)))

## Check words again using word_check

In [None]:
# Second manual pass, word by word (type 1 to keep a word).
# NOTE: Financial_BoW differs from financial_BoW only in capitalisation.
Financial_BoW = word_check(financial_BoW)

## Final list

In [None]:
# Remove duplicates (going through a set does not preserve the word order)
FINANCIAL_BoW = list(set(Financial_BoW))

## Final bags of words for frequency analyses
print('financial terms\n\n', FINANCIAL_BoW)

In [None]:
## See next script