In [31]:
import pandas as pd
from pandas import HDFStore
import time, gensim, logging, os
import numpy as np
import multiprocessing as mp
from functools import partial
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import datapath
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
cd chi_models

/mnt/ide0/home/isabelladuan/chi_models


In [26]:
def get_words_and_frequencies(model, target_word, N, threshold):
    """
    Take a w2v model and extract most similar words and frequencies
    
    Input: 
      model: the name of gensim w2v model
      N: N top words most similar to the target word
      target_word: the word of interest
      threshold: number of counts below which words will be removed
    
    Return a dictionary that maps word to frequency.     
    """
    dic = {}
    similars = list(model.wv.most_similar(target_word, topn=N))
    for pair in similars:
        if model.wv.get_vecattr(pair[0], "count") >= threshold:
            dic[pair[0]] = model.wv.get_vecattr(pair[0], "count")
        
    return dic

In [43]:
def get_context_dict(model, target_word, dic):
    """
    From a dictionary of most similar words to target word and their
    frequencies, produce a dictionary of context similaries between the
    target word and those similar words
    """
    context_dict = {}
    word1 = target_word
    vect1 = model.syn1[model.wv.key_to_index[word1]]
    for word in dic.keys():
        vect2 = model.syn1[model.wv.key_to_index[word]]
        context = cosine_similarity([vect1], [vect2])
        context_dict[word] = context[0][0].round(3)
        
    return context_dict

In [36]:
def get_comp_sub(context_dic, threshold1, threshold2):
    """
    From a dictionary of context similaries, get a list of 
    complements and substitutes.
    
    threshold1: top N most similar contexts words that counts as complements
    threshold2: bottom N most similar contexts words that counts as substitutes
    """
    sub = []
    for word in sorted(context_dict.items(), key=lambda x: x[1])[:threshold2]:
        sub.append(word[0])
    comp = []
    for word in sorted(context_dict .items(), key=lambda x: x[1], reverse=True)[:threshold1]:
        comp.append(word[0])
        
    return comp, sub


In [32]:
def extract_from_models(target_word, N, threshold0, threshold1, threshold2):
    """
    Extract a dataframe of most similar words to a target word from
    models stored in current directory
    
    threshold0: top N most similar words
    threshold1: top N most similar contexts words that counts as complements
    threshold2: bottom N most similar contexts words that counts as substitutes    
    """
    comps = []
    subs = []
    years = []
    contexts = []
    freqs = []
    for file in os.listdir():
        if file.endswith('.sg'):
            model = gensim.models.Word2Vec.load(file)
            dic = get_words_and_frequencies(model, target_word, N, threshold0)
            context_dict = get_context_dict(model, target_word, dic)
            comp, sub = get_comp_sub(context_dic, threshold1, threshold2)
            
            freqs.append(dic)
            contexts.append(context_dic)
            comps.append(comp)
            subs.append(sub)
            years.append(file)
            
    return pandas.DataFrame({'complements': comps,
                             'substitutes': subs,
                             'frequency_dict': freqs,
                             'context_dict': contexts}, index =years)

In [33]:
dic = get_words_and_frequencies(chi_model, '民主', 500, 100)

In [44]:
context_dict = get_context_dict(chi_model, '民主', dic)

AttributeError: 'Word2Vec' object has no attribute 'syn1'

In [53]:
chi_model.wv.syn0

AttributeError: 'KeyedVectors' object has no attribute 'syn0'

In [51]:
word1 = '民主'
chi_model.wv.syn1[chi_model.wv.key_to_index[word1]]

AttributeError: 'KeyedVectors' object has no attribute 'syn1'

In [30]:
df = extract_from_models('民主', 500, 100)

2022-03-02 09:25:41,983 : INFO : loading Word2Vec object from 1985.sg
2022-03-02 09:25:42,545 : INFO : loading wv recursively from 1985.sg.wv.* with mmap=None
2022-03-02 09:25:42,546 : INFO : setting ignored attribute vectors_norm to None
2022-03-02 09:25:42,929 : INFO : loading vocabulary recursively from 1985.sg.vocabulary.* with mmap=None
2022-03-02 09:25:42,930 : INFO : loading trainables recursively from 1985.sg.trainables.* with mmap=None
2022-03-02 09:25:42,930 : INFO : setting ignored attribute cum_table to None
2022-03-02 09:25:43,134 : INFO : Word2Vec lifecycle event {'fname': '1985.sg', 'datetime': '2022-03-02T09:25:43.134238', 'gensim': '4.1.2', 'python': '3.7.4 (default, Aug 13 2019, 20:35:49) \n[GCC 7.3.0]', 'platform': 'Linux-3.10.0-1160.45.1.el7.x86_64-x86_64-with-redhat-7.9-Maipo', 'event': 'loaded'}
2022-03-02 09:25:43,142 : INFO : loading Word2Vec object from 1993.sg
2022-03-02 09:25:43,774 : INFO : loading wv recursively from 1993.sg.wv.* with mmap=None
2022-03-02 0

2022-03-02 09:26:01,802 : INFO : loading Word2Vec object from 1971.sg
2022-03-02 09:26:06,149 : INFO : loading wv recursively from 1971.sg.wv.* with mmap=None
2022-03-02 09:26:06,150 : INFO : setting ignored attribute vectors_norm to None
2022-03-02 09:26:06,438 : INFO : loading vocabulary recursively from 1971.sg.vocabulary.* with mmap=None
2022-03-02 09:26:06,439 : INFO : loading trainables recursively from 1971.sg.trainables.* with mmap=None
2022-03-02 09:26:06,439 : INFO : setting ignored attribute cum_table to None
2022-03-02 09:26:06,591 : INFO : Word2Vec lifecycle event {'fname': '1971.sg', 'datetime': '2022-03-02T09:26:06.591867', 'gensim': '4.1.2', 'python': '3.7.4 (default, Aug 13 2019, 20:35:49) \n[GCC 7.3.0]', 'platform': 'Linux-3.10.0-1160.45.1.el7.x86_64-x86_64-with-redhat-7.9-Maipo', 'event': 'loaded'}
2022-03-02 09:26:06,619 : INFO : loading Word2Vec object from 1955.sg
2022-03-02 09:26:07,062 : INFO : loading wv recursively from 1955.sg.wv.* with mmap=None
2022-03-02 0

2022-03-02 09:26:31,283 : INFO : loading Word2Vec object from 1962.sg
2022-03-02 09:26:31,721 : INFO : loading wv recursively from 1962.sg.wv.* with mmap=None
2022-03-02 09:26:31,722 : INFO : setting ignored attribute vectors_norm to None
2022-03-02 09:26:32,008 : INFO : loading vocabulary recursively from 1962.sg.vocabulary.* with mmap=None
2022-03-02 09:26:32,009 : INFO : loading trainables recursively from 1962.sg.trainables.* with mmap=None
2022-03-02 09:26:32,010 : INFO : setting ignored attribute cum_table to None
2022-03-02 09:26:32,158 : INFO : Word2Vec lifecycle event {'fname': '1962.sg', 'datetime': '2022-03-02T09:26:32.158865', 'gensim': '4.1.2', 'python': '3.7.4 (default, Aug 13 2019, 20:35:49) \n[GCC 7.3.0]', 'platform': 'Linux-3.10.0-1160.45.1.el7.x86_64-x86_64-with-redhat-7.9-Maipo', 'event': 'loaded'}
2022-03-02 09:26:32,184 : INFO : loading Word2Vec object from 1970.sg


KeyboardInterrupt: 

In [50]:
# I load one model for testing the pipeline ...
chi_model = gensim.models.Word2Vec.load('1955.sg')

2022-03-02 09:57:34,811 : INFO : loading Word2Vec object from 1955.sg
2022-03-02 09:57:35,380 : INFO : loading wv recursively from 1955.sg.wv.* with mmap=None
2022-03-02 09:57:35,381 : INFO : setting ignored attribute vectors_norm to None
2022-03-02 09:57:35,689 : INFO : loading vocabulary recursively from 1955.sg.vocabulary.* with mmap=None
2022-03-02 09:57:35,690 : INFO : loading trainables recursively from 1955.sg.trainables.* with mmap=None
2022-03-02 09:57:35,690 : INFO : setting ignored attribute cum_table to None
2022-03-02 09:57:35,845 : INFO : Word2Vec lifecycle event {'fname': '1955.sg', 'datetime': '2022-03-02T09:57:35.845751', 'gensim': '4.1.2', 'python': '3.7.4 (default, Aug 13 2019, 20:35:49) \n[GCC 7.3.0]', 'platform': 'Linux-3.10.0-1160.45.1.el7.x86_64-x86_64-with-redhat-7.9-Maipo', 'event': 'loaded'}


In [12]:
chi_model.wv.get_vecattr('民主', 'count')

20098