In [1]:
import spacy
import numpy as np
import pandas as pd
import unicodedata
nlp = spacy.load("pt_core_news_lg")

In [1]:
#Based https://alvinntnu.github.io/python-notes/statistical-analyses/network-analysis.html

In [2]:
sinonimos = ['sentir', 'pensar', 'reivindicar', 'representar', 'enfatizar', 'mostrar', 'explicar', 'apontar', 'elevar', 'reivindicar']

In [3]:
words_vectors = nlp.vocab.vectors
print('Shape: (len, dim)', words_vectors.shape)
len(words_vectors)

Shape: (len, dim) (500000, 300)


500000

In [4]:
w1 = nlp.vocab['pensar']
w2 = nlp.vocab['sentir']

In [5]:
def pairwise_similarity(word_list, nlp):
    word_sim_matrix = np.ones(shape = (len(word_list), len(word_list)))
    for i, w1 in enumerate(word_list):
        for j, w2 in enumerate(word_list):
            if w1 !=w2:
                word_sim_matrix[i,j] = nlp.vocab[str(w1)].similarity(nlp.vocab[str(w2)])
    return(word_sim_matrix)
        
pd.DataFrame(data = np.round(pairwise_similarity(sinonimos, nlp), 2),
             index = sinonimos,
             columns = sinonimos)

Unnamed: 0,sentir,pensar,reivindicar,representar,enfatizar,mostrar,explicar,apontar,elevar,reivindicar.1
sentir,1.0,0.66,0.31,0.34,0.41,0.55,0.49,0.43,0.38,0.31
pensar,0.66,1.0,0.43,0.42,0.58,0.62,0.67,0.57,0.38,0.43
reivindicar,0.31,0.43,1.0,0.7,0.62,0.54,0.55,0.6,0.5,1.0
representar,0.34,0.42,0.7,1.0,0.68,0.67,0.61,0.66,0.58,0.7
enfatizar,0.41,0.58,0.62,0.68,1.0,0.74,0.75,0.72,0.58,0.62
mostrar,0.55,0.62,0.54,0.67,0.74,1.0,0.79,0.76,0.56,0.54
explicar,0.49,0.67,0.55,0.61,0.75,0.79,1.0,0.75,0.47,0.55
apontar,0.43,0.57,0.6,0.66,0.72,0.76,0.75,1.0,0.51,0.6
elevar,0.38,0.38,0.5,0.58,0.58,0.56,0.47,0.51,1.0,0.5
reivindicar,0.31,0.43,1.0,0.7,0.62,0.54,0.55,0.6,0.5,1.0


In [6]:
vocab = list(nlp.vocab.strings)
print(len(vocab))
print(vocab[20000:20200])

665468
['2.049', '2.05', '2.050', '2.051', '2.052', '2.053', '2.055', '2.056', '2.057', '2.058', '2.059', '2.06', '2.060', '2.061', '2.062', '2.063', '2.064', '2.065', '2.066', '2.068', '2.069', '2.07', '2.070', '2.071', '2.072', '2.073', '2.074', '2.075', '2.076', '2.077', '2.078', '2.079', '2.08', '2.080', '2.081', '2.082', '2.083', '2.084', '2.085', '2.086', '2.087', '2.088', '2.089', '2.090', '2.091', '2.092', '2.093', '2.094', '2.095', '2.096', '2.097', '2.099', '2.0GHz', '2.0L', '2.0MP', '2.0c', '2.0ghz', '2.0l', '2.0mp', '2.1', '2.1,2', '2.1-', '2.1.0', '2.1.1', '2.1.2', '2.1.3', '2.1.4', '2.1.5', '2.1.6', '2.1/2', '2.10', '2.100', '2.100,00', '2.100.000,00', '2.101', '2.105', '2.105.441', '2.107', '2.108', '2.109', '2.109.049', '2.11', '2.110', '2.111', '2.112', '2.114', '2.115', '2.117', '2.119', '2.12', '2.120', '2.121', '2.124', '2.125', '2.126', '2.129', '2.13', '2.130', '2.131', '2.132', '2.133', '2.134', '2.135', '2.135,64', '2.136', '2.14', '2.140', '2.142', '2.145', '2.

In [7]:
%%time

target_word = 'sentir'
word_sim = []

target_word_vocab = nlp.vocab[target_word]
for w in vocab:
    w_vocab = nlp.vocab[w]
    if w_vocab.vector is not None and np.count_nonzero(w_vocab.vector) and not w_vocab.is_ascii and not w_vocab.is_punct and w != target_word:
        word_sim.append((w, target_word_vocab.similarity(w_vocab)))

CPU times: user 14.3 s, sys: 103 ms, total: 14.4 s
Wall time: 14.4 s


In [8]:
sorted(word_sim, key = lambda x : x[1], reverse=True)[:10]

[('sentirás', 0.7892873883247375),
 ('sentirá', 0.7855808138847351),
 ('sentí-la', 0.7750628590583801),
 ('sentí-lo', 0.7662668824195862),
 ('sentí', 0.7393266558647156),
 ('percebê-la', 0.7342424988746643),
 ('percebê-lo', 0.7295681238174438),
 ('sentir-se-ão', 0.7095298171043396),
 ('sentir-se-á', 0.7068419456481934),
 ('sentirão', 0.7067611217498779)]

In [9]:
w1 = nlp.vocab['sentir']
w2 = nlp.vocab['navio']

print(w2.is_ascii)
print(w2.is_currency)
print(w2.is_punct)

True
False
False


In [10]:
import numba
from numba import jit

@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    assert(u.shape[0] == v.shape[0])
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta

In [11]:
def most_similar_v1(word, topn = 5):
    word = nlp.vocab[str(word)]
    queries = [
          w for w in nlp.vocab 
          if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
  
    by_similarity = sorted(queries, key = lambda w: cosine_similarity_numba(w.vector, word.vector), reverse = True)
    return [(w.text,w.similarity(word)) for w in by_similarity[:topn+1] if w.text != word.text]

In [18]:
def most_similar_v2(word, topn=5):
    word = nlp.vocab[str(word)]
    queries = [
      w for w in nlp.vocab 
      if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text)==2
    ]

    by_similarity = sorted(queries, key = lambda w: word.similarity(w), reverse = True)
    return [(w.text,w.similarity(word)) for w in by_similarity[:topn+1] if w.text != word.text]

In [19]:
%%time
most_similar_v1("sentir", topn=3)

CPU times: user 2.1 s, sys: 6.98 ms, total: 2.11 s
Wall time: 2.1 s


[('só', 0.32476431131362915),
 ('vê', 0.28149497509002686),
 ('pé', 0.22125814855098724),
 ('fé', 0.21561820805072784)]

In [20]:
%%time
most_similar_v2("sentir", topn=3)

CPU times: user 2.1 s, sys: 4 ms, total: 2.1 s
Wall time: 2.1 s


[('só', 0.32476431131362915),
 ('vê', 0.28149497509002686),
 ('pé', 0.22125814855098724),
 ('fé', 0.21561820805072784)]

In [21]:
%%time
sinonimos_topn = dict([(w, most_similar_v1(w, topn=1000)) for w in sinonimos])

CPU times: user 21.2 s, sys: 29.6 ms, total: 21.2 s
Wall time: 21.2 s


In [22]:
sinonimos_topn[sinonimos[0]][:10]

[('só', 0.32476431131362915),
 ('vê', 0.28149497509002686),
 ('pé', 0.22125814855098724),
 ('fé', 0.21561820805072784),
 ('lá', 0.19053611159324646),
 ('aí', 0.1902761608362198),
 ('já', 0.17124590277671814),
 ('Vê', 0.1457124948501587),
 ('és', 0.1431615650653839),
 ('dá', 0.13862080872058868)]

In [25]:
sinonimos_topn_list = []
for w, s in sinonimos_topn.items():
    for s_w, s_s in s:
        sinonimos_topn_list.append((w, s_w, s_s))

In [26]:
print(sinonimos_topn_list[:10])
print(len(sinonimos_topn_list))

[('sentir', 'só', 0.32476431131362915), ('sentir', 'vê', 0.28149497509002686), ('sentir', 'pé', 0.22125814855098724), ('sentir', 'fé', 0.21561820805072784), ('sentir', 'lá', 0.19053611159324646), ('sentir', 'aí', 0.1902761608362198), ('sentir', 'já', 0.17124590277671814), ('sentir', 'Vê', 0.1457124948501587), ('sentir', 'és', 0.1431615650653839), ('sentir', 'dá', 0.13862080872058868)]
9009


In [28]:
df = pd.DataFrame(sinonimos_topn_list, columns = ['w1','w2','sim'])
df[df['sim'] > 0.6]
df

Unnamed: 0,w1,w2,sim
0,sentir,só,0.324764
1,sentir,vê,0.281495
2,sentir,pé,0.221258
3,sentir,fé,0.215618
4,sentir,lá,0.190536
...,...,...,...
9004,elevar,גם,-0.126297
9005,elevar,ín,-0.127578
9006,elevar,их,-0.127596
9007,elevar,vú,-0.127742


In [54]:
WORD_SIMILARITY_CUTOFF = 0.05
df2 = df[df['sim'] > WORD_SIMILARITY_CUTOFF]
nodes_id = list(set(list(df2['w2'].values) + list(df2['w1'].values)))

In [57]:
print(len(nodes_id))
m = len(nodes_id)
distances = np.zeros((m,m))

for i in range(m):
    for j in range(m):  
        distances[i,j] = nlp.vocab[nodes_id[i]].similarity(nlp.vocab[nodes_id[j]])

EMBEDDING_CUTOFF = 0.75

distances_flat = []

for i in range(m):
    for j in range(m):
        if distances[i,j]> EMBEDDING_CUTOFF and i != j:
            distances_flat.append((nodes_id[i], nodes_id[j], distances[i,j]))

edges_df = pd.DataFrame(distances_flat, columns=['w1','w2','sim'])
print(edges_df.shape)

473
(2936, 3)


In [67]:
edges_df = edges_df.append(df2).drop_duplicates()
print(edges_df.shape)
edges_df

(4091, 3)


Unnamed: 0,w1,w2,sim
0,لم,أن,0.922549
1,لم,پس,0.825348
2,لم,هر,0.833947
3,لم,تا,0.828287
4,لم,فى,0.904688
...,...,...,...
8076,elevar,µM,0.051421
8077,elevar,ﬁm,0.050826
8078,elevar,Yū,0.050798
8079,elevar,só,0.050298


In [70]:
import networkx as nx
from pyvis.network import Network

In [71]:
def myRescaler(x):
    x = np.array(x)
    y = np.interp(x, (x.min(), x.max()), (5, 20))
    return list(y)

In [74]:
G= nx.from_pandas_edgelist(edges_df, 'w1','w2','sim')

nodes_df = pd.DataFrame({'id':list(G.nodes),
                         'betweenness': myRescaler(list(nx.betweenness_centrality(G).values())),
                         'eigenvector': myRescaler(list(nx.eigenvector_centrality(G).values()))})
nodes_df['size']=[5 if i not in sinonimos else 10 for i in nodes_id]
nodes_df['size2']= [i if i not in sinonimos else 30 for i in nodes_df['eigenvector']]
nodes_df['group'] = ['KEY' if nodes_df.loc[i,'id'] in sinonimos else 'CONTEXT' for i in range(nodes_df.shape[0])]
nodes_df['color'] = ['lightpink' if nodes_df.loc[i,'group']=='KEY' else 'lightblue' for i in range(nodes_df.shape[0])]
nodes_df['borderWidthSelected'] = list(np.repeat(20.0, nodes_df.shape[0]))

In [75]:
Gvis = Network("768px","1600px", notebook=False,heading="Semantic Network")
edges_in = list(edges_df.to_records(index=False))

for i in range(nodes_df.shape[0]):
    Gvis.add_node(list(G.nodes)[i], value=nodes_df.loc[i,'size2'], group=nodes_df.loc[i,'group'])#, color=nodes_df.loc[i,'color'], borderWidthSelected = nodes_df.loc[i,'borderWidthSelected'])

Gvis.add_edges(edges_in)
#Gvis.show_buttons()
Gvis.set_options("""
  var options = {
    "nodes": {
      "borderWidth": 0,
      "color": {
        "highlight": {
          "border": "rgba(221,171,197,1)",
          "background": "rgba(248,178,255,1)"
        }
      },
      "shadow": {
        "enabled": true
      }
    },
    "edges": {
      "color": {
        "highlight": "rgba(255,192,200,1)",
        "inherit": false
      },
      "smooth": false
    },
    "interaction": {
      "hover": true,
      "navigationButtons": true
    },
    "manipulation": {
      "enabled": true
    },
    "physics": {
      "barnesHut": {
        "springLength": 270
      },
      "minVelocity": 0.75
    }
  }
""")


In [77]:
Gvis.show('Gvis.html')