In [55]:
import gensim.models
import math
import numpy as np
import os.path
import pandas as pd

from zipfile import ZipFile

**let us get the data**

In [17]:
def get_patients_data(path_to_csv, columns=None, dropna=True):
    """
    Load a csv dataset into pandas, optionally narrowing it to some columns and dropping na's.

    The first csv column becomes the index. Requested columns that are absent
    from the file are ignored; if none of them is present, all columns are kept.

    :param path_to_csv: str, path to the csv file
    :param columns: list of str, columns to keep, None by default (keeps all)
    :param dropna: bool, whether to drop rows containing na's, True by default
    :return: pd.DataFrame
    """
    frame = pd.read_csv(path_to_csv, index_col=0)
    # keep only the requested columns that actually exist in the file
    present = [name for name in columns if name in frame] if columns else []
    if present:
        frame = frame[present]
    return frame.dropna() if dropna else frame

**let us turn ;-separated str into lists**

In [26]:
csv = "verbal fluency.csv"
animals = get_patients_data(csv, ['animals'])
# each answer is a single ';'-separated string: turn it into a list of trimmed words,
# dropping the empty pieces that a trailing or doubled ';' would otherwise leave behind
animals['animals'] = animals['animals'].apply(
    lambda s: [x.strip() for x in s.split(';') if x.strip()]
)
animals

Unnamed: 0_level_0,animals
ID,Unnamed: 1_level_1
1,"[слон, заяц, волк, олень, кенгуру, жираф, сусл..."
2,"[собака, кошка, лошадь, кролик, заяц, волк, ме..."
3,"[собака, кошка, черепаха, слон, крокодил, обез..."
4,"[кошка, мышка, собака, тигр, лев, канарейка, с..."
5,"[лось, медведь, лиса, волк, корова, собака, ко..."
6,"[кошка, собака, корова, овца, свинья, лиса, во..."
7,"[медведь, лось, волк, заяц, куропатка, утка]"
8,"[корова, лошадь, овца, свинья, кролик, медведь..."
9,"[кошка, мышка, собачка, слон, носорог, леопард..."
10,"[поросенок, слон, собака, кошка, черепаха, мед..."


**let us add dummy POS-tags for the model**

In [19]:
def add_POS_tags(patient_word_list):
    """
    adds "POS tags" to words in a word list
    if single word is given, _NOUN tag is added
    if a bi-gram is given, _ADJ and _NOUN tags are added

    Entries are tokenised on any whitespace (tabs, non-breaking or repeated
    spaces included), so padding around a word does not matter and the two parts
    of a tagged bi-gram are always joined by exactly one space.
    Blank entries are skipped.

    :param patient_word_list: list of str, words
    :return: list of str, words with POS tags
    :raises ValueError: if an entry consists of more than two words
    """
    res = []
    for word in patient_word_list:
        tokens = word.split()
        if not tokens:
            # nothing to tag, e.g. an empty piece left by a trailing separator
            continue
        if len(tokens) == 1:
            res.append(tokens[0] + '_NOUN')
        elif len(tokens) == 2:
            first, second = tokens
            res.append(first + '_ADJ' + ' ' + second + '_NOUN')
        else:
            # only single words and ADJ+NOUN bi-grams have a defined tagging
            raise ValueError(
                'expected a single word or a bi-gram, got %d words: %r' % (len(tokens), word)
            )
    return res

In [21]:
# quick check: a single word gets _NOUN, a bi-gram gets _ADJ and _NOUN
add_POS_tags(['жираф', 'морской свинка'])

['жираф_NOUN', 'морской_ADJ свинка_NOUN']

In [27]:
# tag every patient's word list; NOTE: this overwrites the column in place, so
# re-running the cell would append the tags a second time
animals['animals'] = animals['animals'].apply(add_POS_tags)
animals

Unnamed: 0_level_0,animals
ID,Unnamed: 1_level_1
1,"[слон_NOUN, заяц_NOUN, волк_NOUN, олень_NOUN, ..."
2,"[собака_NOUN, кошка_NOUN, лошадь_NOUN, кролик_..."
3,"[собака_NOUN, кошка_NOUN, черепаха_NOUN, слон_..."
4,"[кошка_NOUN, мышка_NOUN, собака_NOUN, тигр_NOU..."
5,"[лось_NOUN, медведь_NOUN, лиса_NOUN, волк_NOUN..."
6,"[кошка_NOUN, собака_NOUN, корова_NOUN, овца_NO..."
7,"[медведь_NOUN, лось_NOUN, волк_NOUN, заяц_NOUN..."
8,"[корова_NOUN, лошадь_NOUN, овца_NOUN, свинья_N..."
9,"[кошка_NOUN, мышка_NOUN, собачка_NOUN, слон_NO..."
10,"[поросенок_NOUN, слон_NOUN, собака_NOUN, кошка..."


**let us unzip the model into a temporary directory and load it**

In [14]:
model_zip_name = 'models/tayga_upos_skipgram_300_2_2019.zip'
# avoid re-extracting the archive on every run: unpack only when the model file
# is not in place yet (delete models/tmp to force a fresh extraction, e.g. after
# replacing the archive or an interrupted unpack)
if not os.path.isfile('models/tmp/model.bin'):
    with ZipFile(model_zip_name, 'r') as myzip:
        myzip.extractall('models/tmp')

In [15]:
# load the extracted model file, stored in the binary word2vec format
model_path = 'models/tmp/model.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [28]:
# quick check that the model is loaded: look up the vector of a tagged word
model['слон_NOUN']

array([ 0.08601318, -0.6828482 , -0.22675529, -0.22827153, -0.01361705,
        0.24698313, -0.04402011, -0.2475504 ,  0.07414407, -0.27594137,
        0.1739021 , -0.00158702,  0.01958552,  0.03729556,  0.5432982 ,
        0.0611247 ,  0.14772898,  0.02181707,  0.20535886, -0.11394621,
       -0.17934537,  0.2540597 ,  0.19710991, -0.06129597, -0.5700048 ,
        0.00850122,  0.2670366 ,  0.3233264 , -0.03196438, -0.18470246,
        0.38322222, -0.02588548,  0.28374165, -0.19383   , -0.08458902,
        0.4244925 , -0.15542516,  0.19162974, -0.06713919, -0.20103349,
        0.06785759,  0.05808949,  0.03733297,  0.07999419, -0.10370225,
        0.06420214, -0.11548744,  0.00239497, -0.2272254 , -0.19429892,
       -0.15440041, -0.34388942, -0.31899017,  0.14440675, -0.00867636,
        0.24022427, -0.21015681, -0.36691037,  0.08985427, -0.0185655 ,
       -0.16598868,  0.10741134,  0.14276543, -0.22300471, -0.11224405,
       -0.05493741,  0.48073494,  0.1417331 , -0.2366079 , -0.10

**let us define a way of handling bi-grams**

In [40]:
def collocation_handler(model, word, collocation_function=None):
    """
    gets a vector of a word or a bigram using a function to combine two vectors in a collocation

    The input is tokenised on any whitespace, so padding or unusual separators
    (tabs, non-breaking spaces) do not change the lookup. If only one part of a
    bi-gram is known to the model, the vector of that part is returned on its own.

    :param model: gensim.Word2Vec model (anything supporting `key in model` and `model[key]`)
    :param word: str, word or bi-gram
    :param collocation_function: function to combine two word vectors, None by default (np.mean is used)
    :return word_vector: np.ndarray, word vector, or None if nothing was found
    :return not_found: int, number of words that were absent in the model
    :raises ValueError: if `word` consists of more than two words
    """
    tokens = word.split()
    if len(tokens) > 2:
        raise ValueError(
            'expected a single word or a bi-gram, got %d words: %r' % (len(tokens), word)
        )

    if len(tokens) < 2:
        # single word; a blank string is looked up as is and is simply not found
        key = tokens[0] if tokens else word
        if key in model:
            return model[key], 0
        return None, 1

    if collocation_function is None:
        collocation_function = lambda vec1, vec2: np.mean([vec1, vec2], axis=0)
    # vectors of the parts the model knows, in their original order
    known = [model[token] for token in tokens if token in model]
    not_found = len(tokens) - len(known)
    if len(known) == 2:
        word_vector = collocation_function(known[0], known[1])
    elif known:
        # only one part is known: fall back to its vector
        word_vector = known[0]
    else:
        word_vector = None
    return word_vector, not_found

In [41]:
# bi-gram: the two tagged parts are looked up separately and combined (mean by default)
collocation_handler(model, 'морской_ADJ свинка_NOUN')

(array([-2.40569502e-01, -2.65819699e-01,  2.47903019e-02,  3.60203013e-02,
         2.68500924e-01,  3.60444307e-01, -1.26190349e-01,  2.53701117e-03,
        -3.17839533e-02,  1.25960782e-02,  1.35463476e-01, -1.68274075e-01,
        -1.90845385e-01,  7.88793340e-03, -9.61406678e-02, -1.11334942e-01,
         3.70166034e-01,  4.08405542e-01, -1.55779645e-01,  1.14374891e-01,
        -5.48553877e-02,  9.86299813e-02, -1.45992398e-01,  1.50981732e-02,
        -5.47238231e-01,  1.22930489e-01,  2.63999104e-01, -7.07200095e-02,
        -2.63602316e-01, -1.15159288e-01,  2.86967874e-01, -9.13257897e-03,
        -1.56521797e-02, -1.82685494e-01,  1.65465757e-01,  1.50530890e-01,
        -4.82448116e-02,  2.58174300e-01, -2.40019649e-01, -3.00360192e-02,
         1.40154660e-02, -1.70106724e-01, -8.29464346e-02, -1.76284164e-01,
         2.51417272e-02, -2.48923823e-01,  2.01466367e-01, -4.91439104e-01,
        -3.01076591e-01,  6.46086335e-02, -3.41826528e-02, -7.60434270e-02,
         2.8

In [44]:
# a single word missing from the model: no vector, one word not found
collocation_handler(model, 'kiufuer')

(None, 1)

In [45]:
# bi-gram whose first part is missing from the model: the vector of the known part is returned
collocation_handler(model, 'kiufuer свинка_NOUN')

(array([-0.24443445, -0.5771195 , -0.13867165,  0.20017748,  0.3237785 ,
         0.2328046 , -0.34049097,  0.03349691,  0.12646833,  0.18448459,
         0.24623515, -0.12471765, -0.16479154, -0.07681715, -0.03212069,
        -0.08459495,  0.36097503,  0.38839626,  0.05949978,  0.05217168,
        -0.19348028,  0.09540386, -0.47699255, -0.12457881, -0.57800716,
         0.14449762,  0.3292182 ,  0.16089065, -0.32228065, -0.08594431,
         0.36661896, -0.03241307,  0.18824233, -0.22222158,  0.11502609,
         0.19788364,  0.01979579,  0.24609703, -0.21542795,  0.03033229,
         0.17665893,  0.07056163, -0.2209753 , -0.15497443, -0.08215433,
         0.0810919 ,  0.18020569, -0.45008856,  0.00310612, -0.07636507,
         0.19055578,  0.04309632,  0.04444195,  0.23624417, -0.43727016,
         0.24869345, -0.19005972, -0.06291488,  0.00923101, -0.08208547,
         0.0858584 ,  0.20341316, -0.181405  , -0.06822711,  0.05965808,
         0.12983981,  0.37337086,  0.48328945, -0.0

**let us compute cosine similarities of a list of words**

In [56]:
def cos_sim(v1, v2):
    """
    Cosine similarity of two vectors: their inner product divided by the product of their norms.

    :param v1: array-like, first vector
    :param v2: array-like, second vector
    :return: float (numpy scalar), cosine of the angle between v1 and v2
    """
    dot_product = np.inner(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [60]:
def get_cos_sim_list(model, patient_word_list, collocation_function=None):
    """
    calculates cosine similarities of consecutive words or collocations in a word list

    cosine similarity for a word pair that has a collocation is calculated as
    cosine similarity between a word vector and the combined vector of the collocation
    (by default the average of its two word vectors)

    a pair is skipped when either of its items has no vector in the model

    :param model: gensim.word2vec model
    :param patient_word_list: list of strings, words produced by the patient
    :param collocation_function: function combining two word vectors in a collocation, if None (default) mean is taken
    :return patient_cos_sim_list: list of float, cosine similarities of consecutive items
    :return not_found: int, number of words missing from the model vocabulary
    """
    word_vectors = []
    not_found = 0
    # resolve every item exactly once: the missing-word count is then exact for
    # any list (including the first/last items and one-element lists) and no
    # vector is looked up twice
    for word in patient_word_list:
        word_vector, nf = collocation_handler(model, word, collocation_function)
        word_vectors.append(word_vector)
        not_found += nf
    patient_cos_sim_list = [
        cos_sim(current, previous)
        for previous, current in zip(word_vectors, word_vectors[1:])
        if current is not None and previous is not None
    ]
    return patient_cos_sim_list, not_found

In [61]:
# four items (the first one a bi-gram) give three consecutive-pair similarities
get_cos_sim_list(model, ['морской_ADJ свинка_NOUN', 'мышь_NOUN', 'уж_NOUN', 'жираф_NOUN'])

([0.39405695, 0.33655587, 0.24216779], 0)

In [62]:
# an unknown word drops both pairs it takes part in and is counted as not found
get_cos_sim_list(model, ['морской_ADJ свинка_NOUN', 'мышь_NOUN', 'уж_NOUN', 'kjgfeagf','жираф_NOUN'])

([0.39405695, 0.33655587], 1)

In [68]:
# custom combiner: the second word of a bi-gram gets twice the weight of the first
cf = lambda vec1, vec2: vec1 + 2*vec2
get_cos_sim_list(model, ['морской_ADJ свинка_NOUN', 'мышь_NOUN', 'уж_NOUN', 'жираф_NOUN'], collocation_function=cf)

([0.45510814, 0.33655587, 0.24216779], 0)