### Load association network

In [1]:
# Execute the companion notebook so that the names it defines are available to the cells below.
# NOTE(review): presumably the source of `fan_vocab` and `fan_dict_k`, which are used later but
# never defined in this notebook — confirm.
%run 'get-graph-norms.ipynb'

In [2]:
# Execute the companion notebook so that the names it defines are available to the cells below.
# NOTE(review): presumably the source of `get_similarity_matrix`, `get_knn_edges` and
# `get_cs_edges`, which are called later but never defined in this notebook — confirm.
%run 'methods-dir-graphs.ipynb'

In [3]:
import os
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import powerlaw
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from scipy import sparse

In [4]:
# Locations of the GloVe data, relative to the notebook's parent directory:
#   path_glove       directory holding the vectors and every edge set written below
#   path_glove_raw   GloVe vectors in their original text format
#   path_glove_proc  the same vectors converted to word2vec text format
path_glove = os.path.join(os.pardir, 'data', 'glove')
path_glove_raw = os.path.join(path_glove, 'glove.840B.300d.txt')
path_glove_proc = os.path.join(path_glove, 'gensim_glove_vectors.txt')

In [5]:
# Convert the raw GloVe text file into word2vec text format, written to `path_glove_proc`,
# which is the file the next cell loads with `load_word2vec_format`.
# The value displayed below the cell is the call's return value; per the gensim docs this is
# (number of vectors, dimensionality).
# NOTE(review): the conversion is redone on every run of this cell — consider skipping it when
# `path_glove_proc` already exists.
glove2word2vec(glove_input_file=path_glove_raw, word2vec_output_file=path_glove_proc)

(395975, 300)

In [6]:
# Load the converted vectors from the word2vec-format text file (binary=False).
# NOTE(review): limit=395974 is one fewer than the 395975 vectors reported by the conversion
# cell, so the final vector in the file is not loaded — confirm this is intentional and
# document why.
glove = KeyedVectors.load_word2vec_format(path_glove_proc, binary=False, limit=395974)
# Per the gensim 3.x docs this precomputes the L2-normalised vectors; the raw vectors are kept
# because `replace` is left at its default — TODO confirm against the installed gensim version.
glove.init_sims()

In [7]:
# Words present both in `fan_vocab` (not defined in this notebook) and in the loaded GloVe
# vocabulary.
# The intersection is sorted because `vocab` is the index -> word mapping used for the
# similarity matrices below (e.g. `vocab[w1]` in the undirected-graph cell). Iteration order
# of a set of strings changes between interpreter sessions (hash randomisation), so an
# unsorted list would give a different row/column order on every kernel restart.
vocab = sorted(set(fan_vocab).intersection(glove.vocab))
print(len(vocab))

4988


Create similarity matrices for both kinds of similarity measure (inner product and cosine):

In [8]:
# Similarity matrix for `vocab` under the 'dot' measure, computed by `get_similarity_matrix`
# (defined outside this notebook).
# NOTE(review): presumably the pairwise inner product of the GloVe vectors, with rows/columns
# in `vocab` order — confirm against the helper.
mat_inner = get_similarity_matrix(vocab, 'dot', glove)

In [9]:
# Similarity matrix for `vocab` under the 'cos' measure, computed by `get_similarity_matrix`
# (defined outside this notebook).
# NOTE(review): presumably the pairwise cosine similarity of the GloVe vectors, with
# rows/columns in `vocab` order — confirm against the helper.
mat_cosine = get_similarity_matrix(vocab, 'cos', glove)

## Directed graphs

### K-nn 

In [10]:
# Build the directed k-NN edge sets for both similarity matrices, then pickle each one into
# the GloVe data directory. `get_knn_edges` and `fan_dict_k` are defined outside this notebook.
dir_edges_knn_dot = get_knn_edges(mat_inner, vocab, fan_dict_k)
dir_edges_knn_cos = get_knn_edges(mat_cosine, vocab, fan_dict_k)

knn_outputs = (
    ('glove_fan_directed-knn-dot_edgeset.pkl', dir_edges_knn_dot),
    ('glove_fan_directed-knn-cos_edgeset.pkl', dir_edges_knn_cos),
)
for file_name, edge_set in knn_outputs:
    with open(os.path.join(path_glove, file_name), 'wb+') as f:
        pickle.dump(edge_set, f)

### cs-method 

In [11]:
# Directed edge set from the inner-product matrix via `get_cs_edges` (defined outside this
# notebook); the helper prints the diagnostics shown below the cell.
# NOTE(review): the meaning of the third argument (0.9 here, 38 for the cosine matrix) is not
# visible in this notebook. Both calls report an average degree of about 12.5, so the values
# were presumably tuned to match — confirm and document.
dir_edges_cs_dot = get_cs_edges(mat_inner, vocab, 0.9)

excedrin -1563.6736028529704
britannica -3404.253553329094
Average degree: 12.478548516439455


In [12]:
# Directed edge set from the cosine matrix via `get_cs_edges` (defined outside this notebook);
# the helper prints the diagnostics shown below the cell.
# NOTE(review): the meaning of the third argument (38 here, 0.9 for the inner-product matrix)
# is not visible in this notebook. Both calls report an average degree of about 12.5, so the
# values were presumably tuned to match — confirm and document.
dir_edges_cs_cos = get_cs_edges(mat_cosine, vocab, 38)

excedrin -39.782184662879445
britannica -83.74106586963444
Average degree: 12.677024859663192


In [13]:
# Pickle both cs-method edge sets into the GloVe data directory.
cs_outputs = (
    ('glove_fan_directed-cs-dot_edgeset.pkl', dir_edges_cs_dot),
    ('glove_fan_directed-cs-cos_edgeset.pkl', dir_edges_cs_cos),
)
for file_name, edge_set in cs_outputs:
    with open(os.path.join(path_glove, file_name), 'wb+') as f:
        pickle.dump(edge_set, f)

## Undirected graph

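The similarity thresholds below were found by manual exploration; for each threshold, every word pair whose similarity is at or above it becomes an edge: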

In [14]:
# Threshold sweep for the undirected graphs.
# Each setting is (thresholds, similarity matrix, method tag used in the output file name).
# The thresholds are listed explicitly rather than generated with np.arange: with a float step
# np.arange may or may not include the stop value (it did for the cosine range but not for the
# dot range), so the set of thresholds was not obvious from the code.
undirected_settings = [
    ((0.52, 0.53, 0.54), mat_cosine, 'cos'),
    ((22.6, 22.7, 22.8), mat_inner, 'dot')]

for th, mat, method in undirected_settings:
    print(method)
    for t in th:
        # Every matrix entry at or above the threshold becomes a weighted edge between the
        # two words at those row/column positions.
        rows, cols = np.where(mat >= t)
        edges = [(vocab[i], vocab[j], mat[i, j]) for i, j in zip(rows, cols)]

        g_un = nx.Graph()
        g_un.add_weighted_edges_from(edges)

        # Average degree = 2 * |E| / |V|; guard against a threshold that selects no entries,
        # which would leave the graph without nodes.
        n_nodes = nx.number_of_nodes(g_un)
        avg_degree = 2 * nx.number_of_edges(g_un) / n_nodes if n_nodes else 0.0
        print("tau: {:.2f}, m: {}, k: {:.2f}".format(t, len(edges), avg_degree))

        # One pickle per (threshold, method), holding the raw edge list (not the graph).
        save_path = 'glove_fan_%.2f_%s_edgeset.pkl' % (t, method)
        with open(os.path.join(path_glove, save_path), 'wb+') as f:
            pickle.dump(edges, f)

cos
tau: 0.52, m: 59139, k: 25.18
tau: 0.53, m: 51244, k: 22.13
tau: 0.54, m: 44549, k: 19.59
dot
tau: 22.60, m: 47726, k: 22.42
tau: 22.70, m: 46534, k: 22.01
tau: 22.80, m: 45330, k: 21.54
