In [24]:
import pandas as pd
from pandas import HDFStore
import time, gensim, logging, os
import numpy as np
import multiprocessing as mp
import pandas
from functools import partial
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import datapath
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [52]:
cd /mnt/ide0/home/valalvern/models

/mnt/ide0/home/valalvern/models


In [15]:
def get_words_and_frequencies(model, target_word, N, threshold):
    """
    Take a w2v model and extract most similar words and frequencies
    
    Input: 
      model: the name of gensim w2v model
      N: N top words most similar to the target word
      target_word: the word of interest
      threshold: number of counts below which words will be removed
    
    Return a dictionary that maps word to frequency.     
    """
    dic = {}
    similars = list(model.wv.most_similar(target_word, topn=N))
    for pair in similars:
        if model.wv.vocab[pair[0]].count >= threshold:
            dic[pair[0]] = (pair[1],model.wv.vocab[pair[0]].count)
        
    return dic

In [16]:
def get_context_dict(year, model, target_word, dic):
    """
    From a dictionary of most similar words to target word and their
    frequencies, produce a dictionary of context similaries between the
    target word and those similar words
    """
    context_dict = {}
    c_score = {}
    s_score = {}
    word1 = target_word
    vect1 = model.trainables.syn1[model.wv.vocab[word1].index]
    for word in dic.keys():
        vect2 = model.trainables.syn1[model.wv.vocab[word].index]
        context = cosine_similarity([vect1], [vect2])
        context_dict[word] = context[0][0].round(3)
        
        if year not in c_score: 
            c_score[year] = {}
        c_score[year][word] = context[0][0]
        if year not in s_score: 
            s_score[year] = {}
        s_score[year][word] = dic[word][0]
        
    return s_score, c_score, context_dict

In [18]:
def get_comp_sub(context_dict, threshold1, threshold2):
    """
    From a dictionary of context similaries, get a list of 
    complements and substitutes.
    
    threshold1: top N most similar contexts words that counts as complements
    threshold2: bottom N most similar contexts words that counts as substitutes
    """
    sub = []
    for word in sorted(context_dict.items(), key=lambda x: x[1])[:threshold2]:
        sub.append(word[0])
    comp = []
    for word in sorted(context_dict .items(), key=lambda x: x[1], reverse=True)[:threshold1]:
        comp.append(word[0])
        
    return comp, sub


In [44]:
def extract_from_models(target_word, N, threshold0, threshold1, threshold2):
    """
    Extract a dataframe of most similar words to a target word from
    models stored in current directory
    
    threshold0: top N most similar words
    threshold1: top N most similar contexts words that counts as complements
    threshold2: bottom N most similar contexts words that counts as substitutes    
    """
    comps = []
    subs = []
    years = []
    contexts = []
    freqs = []
    c = {}
    s = {}
    
    for file in os.listdir():
        #if file == '1990.sg':
            #continue
        if file.endswith('.sg'): 
            
            year = file[:4]
            model = gensim.models.Word2Vec.load(file)
            try:
                dic = get_words_and_frequencies(model, target_word, N, threshold0)
                s_score, c_score, context_dict = get_context_dict(year, model, target_word, dic)
                comp, sub = get_comp_sub(context_dict, threshold1, threshold2)
            except: 
                continue

            freqs.append(dic)
            contexts.append(context_dict)
            comps.append(comp)
            subs.append(sub)
            years.append(year)
            c.update(c_score)
            s.update(s_score)
    
    return s, c, pandas.DataFrame({'complements': comps,
                             'substitutes': subs,
                             'frequency_dict': freqs,
                             'context_dict': contexts}, index =years)

In [51]:
word1 = '民主'
chi_model.wv.syn1[chi_model.wv.key_to_index[word1]]

AttributeError: 'KeyedVectors' object has no attribute 'syn1'

In [53]:
s, c, df = extract_from_models('民主', 500, 100, 100, 100)

2022-03-13 21:16:08,108 : INFO : loading Word2Vec object from 1985.sg
2022-03-13 21:16:08,826 : INFO : loading wv recursively from 1985.sg.wv.* with mmap=None
2022-03-13 21:16:08,827 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:08,827 : INFO : loading vocabulary recursively from 1985.sg.vocabulary.* with mmap=None
2022-03-13 21:16:08,828 : INFO : loading trainables recursively from 1985.sg.trainables.* with mmap=None
2022-03-13 21:16:08,828 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:08,829 : INFO : loaded 1985.sg
2022-03-13 21:16:08,871 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:08,951 : INFO : loading Word2Vec object from 1993.sg
2022-03-13 21:16:09,823 : INFO : loading wv recursively from 1993.sg.wv.* with mmap=None
2022-03-13 21:16:09,824 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:09,824 : INFO : loading vocabulary recursively from 1993.sg.vocabulary.* with mmap=None
2022

2022-03-13 21:16:17,548 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:17,548 : INFO : loaded 1956.sg
2022-03-13 21:16:17,586 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:17,650 : INFO : loading Word2Vec object from 2004.sg
2022-03-13 21:16:18,436 : INFO : loading wv recursively from 2004.sg.wv.* with mmap=None
2022-03-13 21:16:18,437 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:18,437 : INFO : loading vocabulary recursively from 2004.sg.vocabulary.* with mmap=None
2022-03-13 21:16:18,438 : INFO : loading trainables recursively from 2004.sg.trainables.* with mmap=None
2022-03-13 21:16:18,438 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:18,439 : INFO : loaded 2004.sg
2022-03-13 21:16:18,491 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:18,623 : INFO : loading Word2Vec object from 1978.sg
2022-03-13 21:16:19,211 : INFO : loading wv recursively from 1978.sg.wv.* w

2022-03-13 21:16:28,310 : INFO : loading vocabulary recursively from 1989.sg.vocabulary.* with mmap=None
2022-03-13 21:16:28,310 : INFO : loading trainables recursively from 1989.sg.trainables.* with mmap=None
2022-03-13 21:16:28,311 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:28,311 : INFO : loaded 1989.sg
2022-03-13 21:16:28,367 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:28,424 : INFO : loading Word2Vec object from 2005.sg
2022-03-13 21:16:29,242 : INFO : loading wv recursively from 2005.sg.wv.* with mmap=None
2022-03-13 21:16:29,243 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:29,243 : INFO : loading vocabulary recursively from 2005.sg.vocabulary.* with mmap=None
2022-03-13 21:16:29,244 : INFO : loading trainables recursively from 2005.sg.trainables.* with mmap=None
2022-03-13 21:16:29,244 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:29,244 : INFO : loaded 2005.sg
2022-03-13 21

2022-03-13 21:16:37,751 : INFO : loading Word2Vec object from 1980.sg
2022-03-13 21:16:38,445 : INFO : loading wv recursively from 1980.sg.wv.* with mmap=None
2022-03-13 21:16:38,445 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:38,446 : INFO : loading vocabulary recursively from 1980.sg.vocabulary.* with mmap=None
2022-03-13 21:16:38,446 : INFO : loading trainables recursively from 1980.sg.trainables.* with mmap=None
2022-03-13 21:16:38,447 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:38,447 : INFO : loaded 1980.sg
2022-03-13 21:16:38,509 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:38,550 : INFO : loading Word2Vec object from 1977.sg
2022-03-13 21:16:39,095 : INFO : loading wv recursively from 1977.sg.wv.* with mmap=None
2022-03-13 21:16:39,096 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:39,096 : INFO : loading vocabulary recursively from 1977.sg.vocabulary.* with mmap=None
2022

2022-03-13 21:16:46,692 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:46,693 : INFO : loaded 1974.sg
2022-03-13 21:16:46,738 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:46,792 : INFO : loading Word2Vec object from 2000.sg
2022-03-13 21:16:47,559 : INFO : loading wv recursively from 2000.sg.wv.* with mmap=None
2022-03-13 21:16:47,560 : INFO : setting ignored attribute vectors_norm to None
2022-03-13 21:16:47,561 : INFO : loading vocabulary recursively from 2000.sg.vocabulary.* with mmap=None
2022-03-13 21:16:47,561 : INFO : loading trainables recursively from 2000.sg.trainables.* with mmap=None
2022-03-13 21:16:47,561 : INFO : setting ignored attribute cum_table to None
2022-03-13 21:16:47,562 : INFO : loaded 2000.sg
2022-03-13 21:16:47,629 : INFO : precomputing L2-norms of word weight vectors
2022-03-13 21:16:47,753 : INFO : loading Word2Vec object from 2008.sg
2022-03-13 21:16:48,688 : INFO : loading wv recursively from 2008.sg.wv.* w

In [50]:
df 

Unnamed: 0,complements,substitutes,frequency_dict,context_dict
1985,"[psychological, organisation, realm, evolution...","[constitution, organization, discipline, scien...","{'reform': (0.7672901749610901, 166), 'organis...","{'reform': -0.03, 'organisation': 0.266, 'disc..."
1943,"[doctrine, revolution, religion, planning, agr...","[science, organisation, management, humanity, ...","{'organisation': (0.7327439188957214, 144), 'o...","{'organisation': -0.776, 'organization': 0.011..."
1993,"[intervention, reform, culture, constitution, ...","[capitalism, politics, leadership, collapse, i...","{'democratic': (0.7494760155677795, 127), 'eco...","{'democratic': -0.269, 'economics': 0.101, 'ca..."
1995,"[perception, unity, organisation, politics, ev...","[practices, institution, democratic, separatio...","{'economics': (0.7507821917533875, 114), 'refo...","{'economics': 0.64, 'reform': 0.4, 'democratic..."
1958,"[reform, standards, machinery, culture, sphere...","[climate, customs, model, economy, training, s...","{'organization': (0.8030379414558411, 197), 's...","{'organization': -0.049, 'security': -0.085, '..."
...,...,...,...,...
2008,"[equality, culture, regional, sciences, revolu...","[unity, discipline, landscape, transformation,...","{'democratic': (0.7658092975616455, 122), 'pol...","{'democratic': 0.552, 'politics': -0.708, 'med..."
1965,"[science, objective, unity, democratic, tradit...","[agriculture, programme, society, management, ...","{'organisation': (0.8025902509689331, 110), 'o...","{'organisation': -0.023, 'organization': -0.06..."
1909,"[simplicity, morality]","[morality, simplicity]","{'simplicity': (0.7961676120758057, 108), 'mor...","{'simplicity': 0.055, 'morality': 0.021}"
1988,"[democratic, nation, medicine, culture, philos...","[realm, perception, politics, arts, organisati...","{'democratic': (0.786578893661499, 104), 'orga...","{'democratic': 0.464, 'organisation': -0.317, ..."


In [51]:
df.to_csv('/mnt/ide0/home/valalvern/chi_comsub.csv')

In [23]:
df

NameError: name 'df' is not defined