In [16]:
import pandas as pd
import numpy as np
import gensim
import json
import math

from gensim.models import Word2Vec

In [30]:
# Load the verbal fluency table; its 'animals' column is parsed into word lists below.
p = pd.read_csv('../Downloads/verbal fluency.csv')

In [31]:
# One list of words per response in the 'animals' column: each string cell is
# split on '; ', pieces are whitespace-trimmed and empty pieces are dropped.
# Missing cells and cells that are not plain strings are skipped.
animals = [
    [piece.strip() for piece in response.split('; ') if piece.strip()]
    for response in p['animals'].dropna()
    if type(response) is str
]

In [32]:
# Append part-of-speech suffixes to every entry of `animals`, in place:
#   single word        -> '<word>_NOUN'
#   two-word entry     -> '<first>_ADJ <second>_NOUN'
# Presumably this matches the vocabulary keys of the '*_upos_*' models loaded
# later -- confirm against the model documentation.
for patient_word_list in animals:
    for j, word in enumerate(patient_word_list):
        # Every tagged entry ends in '_NOUN'. Skipping those makes the cell safe
        # to re-run: without the guard a second run would stack another suffix
        # onto entries that are already tagged.
        if word.endswith('_NOUN'):
            continue
        if ' ' in word:
            parts = word.split()
            if len(parts) != 2:
                # Only 'adjective noun' collocations are supported; say which
                # entry is the problem instead of failing on tuple unpacking.
                raise ValueError(
                    'expected a one- or two-word entry, got: ' + repr(word)
                )
            first, second = parts
            patient_word_list[j] = first + '_ADJ' + ' ' + second + '_NOUN'
        else:
            patient_word_list[j] = word + '_NOUN'

In [24]:
# Models to evaluate: model name -> path of the file handed to the word2vec loader.
model_names = {
    'ruscorpora_upos_cbow_300_20_2019': '../Downloads/180/model.bin',
    'ruscorpora_upos_skipgram_600_10_2017': '../Downloads/ruscorpora_upos_skipgram_600_10_2017.bin',
    'tayga_upos_skipgram_300_2_2019': '../Downloads/185/model.bin',
    'ruwikiruscorpora_upos_skipgram_300_2_2019': '../Downloads/182/model.bin',
}

In [9]:
def cos_sim(v1, v2):
    '''
    returns the cosine similarity of v1 and v2:
    their inner product divided by the product of their norms (np.linalg.norm)
    '''
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.inner(v1, v2) / norm_product

In [36]:
def _entry_vector(model, entry):
    '''
    returns the vector for one list entry, or None if the entry cannot be looked up

    an entry is split on whitespace; a single token is looked up directly,
    several tokens (a collocation) are represented by the average of their vectors
    None is returned when any token raises KeyError or the entry has no tokens
    '''
    try:
        token_vectors = [model[token] for token in entry.split()]
    except KeyError:
        return None
    if not token_vectors:
        return None
    if len(token_vectors) == 1:
        return token_vectors[0]
    return np.mean(token_vectors, axis=0)


def get_cos_sim_list(model, word_lists=None):
    '''
    takes a model (anything indexable by word that raises KeyError for unknown words)
    and, optionally, a list of lists with words; when word_lists is None
    the animals list of lists from the global environment is used

    returns a tuple of
        a list with one list per input list, holding the cosine similarity of every
        pair of neighbouring entries (pairs with a missing entry are left out)
        and the number of entries missing from the model vocabulary

    missing entries are counted once per occurrence, over all entries of all lists
    (including entries that have no neighbour); a collocation counts as one entry
    and is missing as soon as one of its words is missing

    cosine similarity for a word pair that has a collocation is calculated as
    cosine similarity between a word vector and an average of word vectors from the collocation
    '''
    if word_lists is None:
        word_lists = animals

    patient_animals_cos_sim = []
    not_found = 0
    for patient_word_list in word_lists:
        # Look every entry up exactly once. Counting the None results directly
        # gives an exact count, where halving the number of failed pairs only
        # approximated it.
        vectors = [_entry_vector(model, word) for word in patient_word_list]
        not_found += sum(vector is None for vector in vectors)
        patient_animals_cos_sim.append([
            cos_sim(current, previous)
            for previous, current in zip(vectors, vectors[1:])
            if previous is not None and current is not None
        ])
    return (patient_animals_cos_sim, not_found)

In [35]:
def evaluate_model(model, model_name): 
    '''
    takes a model and its name
    dumps a table of cosine similarities to a json file
        (named 'COS_SIM' + model name + '.json', values stored as strings)
    appends a row with evaluation (model name, cosine range and number of words
        missing in the model vocabulary) to 'evaluation.csv'

    cosine range is the largest minus the smallest similarity over all patients;
    it is nan when there are no similarities at all
    '''
    patient_animals_cos_sim, not_found = get_cos_sim_list(model)
    # Flatten before taking the extremes: a patient with fewer than two usable
    # words has an empty row, and max()/min() of an empty row raises ValueError.
    all_cos_sim = [el for patient in patient_animals_cos_sim for el in patient]
    if all_cos_sim:
        cos_range = max(all_cos_sim) - min(all_cos_sim)
    else:
        cos_range = float('nan')
    model_table_name = 'COS_SIM' + model_name + '.json'
    with open(model_table_name, 'w') as f:
        json.dump([[str(el) for el in patient] for patient in patient_animals_cos_sim],f)
    # Opened in append mode so rows from successive models accumulate in one file.
    with open('evaluation.csv', 'a') as f:
        f.write('\n'+', '.join([str(x) for x in [model_name, cos_range, not_found]]))

In [37]:
# Load every model listed in model_names in turn and evaluate it.
for model_name, path in model_names.items():
    print('working on: ' + model_name)
    # Paths containing '.bin' are read as binary word2vec files, all others as
    # text. Both cases load from `path`; the text branch used to reference an
    # undefined name and would have raised NameError.
    is_binary = '.bin' in path
    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=is_binary)
    evaluate_model(model, model_name)

working on: ruscorpora_upos_cbow_300_20_2019
working on: ruscorpora_upos_skipgram_600_10_2017
working on: tayga_upos_skipgram_300_2_2019
working on: ruwikiruscorpora_upos_skipgram_300_2_2019
