In [4]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import os
import re
# Every path below is relative to the project root, and the `pythesis` package
# is imported from it, so the working directory must be set before the import.
# The root can be overridden with the BP_PROJECT_ROOT environment variable so
# the notebook also runs outside the original author's machine; the default
# keeps the original location.
PROJECT_ROOT = os.environ.get('BP_PROJECT_ROOT', '/home/daniel/school/BP')
os.chdir(PROJECT_ROOT)
# NOTE(review): the star import hides where helpers such as `get_solutions`
# come from — prefer explicit names once the module's contents are confirmed.
from pythesis.data.prepare_data import *

# Raw dataset; later cells read its 'question' column.
slova_po_b = pd.read_csv('data/processed/vyjmenovana_slova_po_b.csv')
# Previously trained word2vec model, loaded from disk.
model = Word2Vec.load('pythesis/utils/word2vec.model')

def get_word2vec_items_tuple(model, data):
    """Collect word vectors and matching metadata for items known to the model.

    Duplicate questions are dropped first (the first occurrence is kept).
    Items whose fill-in word is missing from the model vocabulary are skipped
    and reported on stdout, so all returned lists describe in-vocabulary
    items only.

    Parameters
    ----------
    model
        Loaded Word2Vec model; vectors are looked up as ``model.wv[word]``.
    data
        DataFrame with a 'question' column. It is also handed to the project
        helper ``get_solutions``, which (per the original documentation)
        additionally needs a 'correct_answer' column.

    Returns
    -------
    tuple of four parallel lists, in this order:
        X                     word vectors, usable as input to projection methods
        labels                'question' values, usable to annotate plotted points
        solutions_vocab       the 'fill-in-the-blank' words only
        full_solutions_vocab  the full solutions
    """
    data = data.drop_duplicates(subset=['question'], keep='first')

    solutions = get_solutions(data, method='fillin')
    full_solutions = get_solutions(data, method='full')

    X, labels = [], []
    solutions_vocab, full_solutions_vocab = [], []

    # NOTE: in slova_po_b the word 'bicí' is a big outlier in the
    # visualization; it is deliberately NOT filtered out here (an earlier
    # skip for it was disabled).
    for full_solution, solution, label in zip(full_solutions, solutions, data['question']):
        # Only the vocabulary lookup can raise KeyError, so it is the only
        # statement guarded by the try block.
        try:
            vector = model.wv[solution]
        except KeyError:
            print("Word '{}' is not in vocabulary of your model, therefore it won't be in your visualization.".format(solution))
            continue

        X.append(vector)
        labels.append(label)
        solutions_vocab.append(solution)
        full_solutions_vocab.append(full_solution)

    return X, labels, solutions_vocab, full_solutions_vocab

In [5]:
# Unpack the four parallel lists for the in-vocabulary items of slova_po_b:
# word vectors, question labels, fill-in words and full solutions (in that order).
# Words missing from the model vocabulary are reported in the output below.
X, labels, solutions, full_solutions = get_word2vec_items_tuple(model, slova_po_b)

Word 'Bystrouška' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'zabydlený' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'Bystřice' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'vybydlený' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'Pardubice' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'Přibyslav' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'Robinson' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'obydlit' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'zbystřit' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'zabydlit' is not in vocabulary of your model, therefore it won't be in your visualization.
Word 'bystřit' is not in 

In [10]:
def create_word2vec_similarity_matrix(full_solutions, solutions, w2v_model=None):
    """Build the pairwise word2vec similarity matrix of the given items.

    Parameters
    ----------
    full_solutions
        Labels used for both the rows and the columns of the result. They
        represent the assignment in the matrix, because the same fill-in word
        can occur in several assignments.
    solutions
        The 'fill-in-the-blank' words on which the similarity is computed.
        Must be aligned with, and as long as, ``full_solutions``.
    w2v_model
        Optional model exposing ``wv.similarity(word_a, word_b)``. When
        omitted, the notebook-level ``model`` is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Square matrix in which cell (i, j) holds the similarity between
        ``solutions[i]`` and ``solutions[j]``.

    Raises
    ------
    ValueError
        If ``full_solutions`` and ``solutions`` differ in length.
    """
    if w2v_model is None:
        # Backward-compatible fallback to the model loaded at notebook level.
        w2v_model = model

    labels = list(full_solutions)
    words = list(solutions)
    if len(labels) != len(words):
        raise ValueError(
            "full_solutions and solutions must have the same length "
            "(got {} and {})".format(len(labels), len(words))
        )

    similarity = w2v_model.wv.similarity
    # Fill by position rather than by label: label-based assignment overwrites
    # every matching cell when a label repeats, and is slow cell by cell.
    rows = [[similarity(word_a, word_b) for word_b in words] for word_a in words]
    # Building the frame in one shot lets pandas infer the dtype from the values.
    return pd.DataFrame(rows, index=labels, columns=labels)

In [11]:
# Build the pairwise word2vec similarity matrix for the in-vocabulary items
# and save it as CSV.
# NOTE(review): the file written here ('trololol.csv') is not the file the
# following cell reads ('word2vec_words_similarity_matrix_slova_po_b.csv') —
# confirm which file is the intended artifact.
dataframe = create_word2vec_similarity_matrix(full_solutions, solutions)
dataframe.to_csv('data/processed/trololol.csv')

In [61]:
# Load a previously saved similarity matrix; the first CSV column becomes the index.
word2vec_similarity_matrix = pd.read_csv('data/processed/word2vec_words_similarity_matrix_slova_po_b.csv', index_col=0)
# Preview the first rows.
word2vec_similarity_matrix.head()

In [6]:
# Reload the raw dataset (the same CSV that is loaded at the top of the notebook).
slova_po_b = pd.read_csv('data/processed/vyjmenovana_slova_po_b.csv')

# only words which are also in my word2vec model
# (`labels` holds the 'question' values returned by get_word2vec_items_tuple)
vyjm_slova_filtered = slova_po_b.loc[slova_po_b['question'].isin(labels)]
# The helpers below are not defined in this notebook — presumably they come
# from the star import of pythesis.data.prepare_data; verify.
# Reshape the filtered rows into a correctness matrix, then derive two
# similarity matrices from it using the 'doublepearson' and 'pearson' methods.
correctness_matrix = reshape_to_correctness_matrix(vyjm_slova_filtered)
similarity_matrix = correctness_matrix_to_similarity_matrix('doublepearson', correctness_matrix)
similarity_matrix2 = correctness_matrix_to_similarity_matrix('pearson', correctness_matrix)

In [69]:
# Sanity check: compare the number of rows of the three matrices.
# NOTE(review): `data` is not defined in any visible cell — presumably it is
# the word2vec similarity matrix loaded from CSV; confirm, otherwise this cell
# fails on a fresh kernel.
len(data), len(similarity_matrix), len(similarity_matrix2)

(239, 239, 239)

In [8]:
# Replace the row/column labels of similarity_matrix with those of `data`, so
# the two frames can be addressed with the same labels.
# NOTE(review): this assumes both frames list the items in the same order —
# nothing here verifies it. It also mutates similarity_matrix in place.
similarity_matrix.columns = data.columns
similarity_matrix.index = data.index

In [10]:
# Spot check: correlation (Series.corr, Pearson by default) between one column
# of each matrix.
# NOTE(review): the two columns belong to different items ('sbírka známek' vs
# 'bývalý') — confirm this comparison is intended.
similarity_matrix['sbírka známek'].corr(data['bývalý'])

-0.04083208545254325

In [19]:
# Flatten each matrix into a 1-D Series so whole matrices can be correlated
# element by element. Every cell is included, the diagonal as well.
# NOTE(review): if the matrices are symmetric with a constant diagonal, the
# duplicated and self-similarity cells bias the correlation — consider using
# only one triangle without the diagonal.
sim_matrix_doublepear = pd.Series(similarity_matrix.values.flatten())
sim_matrix_pear = pd.Series(similarity_matrix2.values.flatten())
sim_matrix_word2vector = pd.Series(data.values.flatten())

In [20]:
# Correlation between the flattened 'doublepearson' and 'pearson' similarity matrices.
sim_matrix_doublepear.corr(sim_matrix_pear)

0.8858815250933174

In [31]:
# Correlation between the flattened 'pearson' similarity matrix and the flattened `data` matrix.
sim_matrix_pear.corr(sim_matrix_word2vector)

0.23676038278328249

In [32]:
# Correlation between the flattened 'doublepearson' similarity matrix and the flattened `data` matrix.
sim_matrix_doublepear.corr(sim_matrix_word2vector)

0.14759275116284798

In [34]:
# The same three pairwise correlations as one 3x3 matrix; row/column order is
# 'doublepearson', 'pearson', `data`.
# NOTE(review): `np` is not imported in any visible cell — presumably it is
# provided by the star import; confirm or import numpy explicitly.
np.corrcoef([similarity_matrix.values.flatten(), similarity_matrix2.values.flatten(), data.values.flatten()])

array([[1.        , 0.88588153, 0.14759275],
       [0.88588153, 1.        , 0.23676038],
       [0.14759275, 0.23676038, 1.        ]])