In [2]:
import json
 
# Load the crawled records from disk.
# The context manager closes the handle even if json.load raises on malformed
# input, and the explicit UTF-8 encoding keeps decoding of the Spanish text
# independent of the platform's default locale.
with open('./CubaCrawler/cubadebate.json', encoding='utf-8') as f:
    # Parsed JSON content of the whole file.
    data = json.load(f)
social_net = 'cubadebate'

In [3]:
# Pull the 'text' field out of every loaded record, preserving record order.
text_vector = [record['text'] for record in data]

In [4]:
import progressbar
def get_progressbar(N, name = ""):
    """Build a text progress bar that has not been started yet.

    Args:
        N: value passed to the bar as ``maxval``.
        name: label placed between the bar widget and the percentage widget.

    Returns:
        The ``progressbar.ProgressBar`` instance; the caller is responsible
        for calling ``start``, ``update`` and ``finish`` on it.
    """
    bar_widget = progressbar.Bar('#', '[', ']')
    percent_widget = progressbar.Percentage()
    layout = [bar_widget, name, percent_widget]
    return progressbar.ProgressBar(maxval=N, widgets=layout)

In [5]:
import spacy

# spaCy pipeline used below for tokenisation and lemmatisation.
nlp = spacy.load("es_core_news_sm")

total_texts = len(text_vector)
bar = get_progressbar(total_texts, ' tokenizer text ')
bar.start()
for position in range(total_texts):
    # Collect each distinct lowercase lemma once, leaving out stop words
    # and punctuation tokens.
    lemmas = {
        token.lemma_.lower()
        for token in nlp(text_vector[position])
        if not (token.is_stop or token.is_punct)
    }
    # The raw string is overwritten in place by its list of unique lemmas.
    text_vector[position] = list(lemmas)
    bar.update(position + 1)
bar.finish()

[#########################################################] tokenizer text 100%


In [6]:
# Two separate shallow copies of the lemma lists; the inner lists are shared.
X = list(text_vector)
Y = list(text_vector)

In [7]:
# Inverted index: term -> {'X': positions of the texts in X that contain it,
#                          'Y': positions of the texts in Y that contain it}.
vocabulary = {}

# X is indexed completely before Y, so terms enter the dict in the order
# they are first met in X and then in Y.
for side, corpus in (('X', X), ('Y', Y)):
    for doc_id, terms in enumerate(corpus):
        for term in terms:
            postings = vocabulary.setdefault(term, { 'X': set(), 'Y': set() })
            postings[side].add(doc_id)



In [8]:
def strength(tj, ti):
    """Return the smoothed link strength between terms ``tj`` and ``ti``.

    Reads the 'X' posting set of ``tj`` and the 'Y' posting set of ``ti``
    from the module-level ``vocabulary``.

    Returns:
        The integer 0 when the two posting sets share no element; otherwise
        ``(number of shared ids + 1) / (size of ti's 'Y' set + 2)``.

    Raises:
        KeyError: if either term is not a key of ``vocabulary``.
    """
    docs_with_tj = vocabulary[tj]['X']
    docs_with_ti = vocabulary[ti]['Y']

    shared = docs_with_ti.intersection(docs_with_tj)
    if not shared:
        # No common text at all: the pair gets no smoothing, just 0.
        return 0

    return (len(shared) + 1) / (len(docs_with_ti) + 2)

In [9]:
# Vocabulary keys as a list, so each term has a fixed integer position.
_list_ = [*vocabulary]

In [22]:

# Square matrix over the vocabulary:
# _strength_[j][i] == strength(_list_[j], _list_[i]).
_strength_ = []

bar = get_progressbar(len(_list_), ' strength computer ')
bar.start()
for rows_done, term_j in enumerate(_list_, start=1):
    # One full row per source term, built in vocabulary order.
    _strength_.append([strength(term_j, term_i) for term_i in _list_])
    bar.update(rows_done)
bar.finish()


[######################################################] strength computer 100%


In [53]:
# Minimum strength (exclusive) for a link between two terms to be kept.
_top_edge_ = 0.6

# edge_dict[i] lists (strength, j) for every other term j whose
# _strength_[i][j] is strictly above _top_edge_.
edge_dict = {}
n_terms = len(_list_)
bar = get_progressbar(n_terms, ' term linked ')
bar.start()
for src in range(n_terms):
    row = _strength_[src]
    edge_dict[src] = [
        (row[dst], dst)
        for dst in range(n_terms)
        if dst != src and row[dst] > _top_edge_
    ]
    bar.update(src + 1)
bar.finish()

[############################################################] term linked 100%


In [54]:
# sj[k]: sum of _strength_[k][n] over the neighbours n that edge_dict keeps
# for term k (a neighbour equal to k itself is skipped).
sj = []
for index in range(len(_list_)):
    total = sum(
        _strength_[index][neighbour]
        for _weight, neighbour in edge_dict[index]
        if neighbour != index
    )
    sj.append(total)

def probability(index, B, distribution):
    """Score the term at position ``index`` from its own and its neighbours' weights.

    The result is ``B * distribution[index]`` plus
    ``(1 - B) / (sj[index] + 1)`` times the sum, over the neighbours ``i``
    stored in ``edge_dict[index]`` (excluding ``index`` itself), of
    ``_strength_[index][i] * distribution[i]``.

    Args:
        index: position of the term in the module-level tables.
        B: weight given to the term's own entry in ``distribution``.
        distribution: sequence of per-term weights, indexed by position.

    Reads the module-level ``sj``, ``edge_dict`` and ``_strength_``.
    """
    own_part = B * distribution[index]
    spread_factor = (1-B) / (sj[index] + 1)
    neighbour_mass = sum(
        _strength_[index][i] * distribution[i]
        for _weight, i in edge_dict[index]
        if i != index
    )
    return own_part + spread_factor * neighbour_mass



In [58]:
# Weight passed to probability() for a term's own entry in the distribution.
B = 0.8
query = "Las viandas y su precio"

# Lowercase lemmas of the query, without stop words and punctuation.
q = [
    token.lemma_.lower()
    for token in nlp(query)
    if not (token.is_stop or token.is_punct)
]

print(q)
_len_ = len(_list_)
# Starting weights: 1 for terms that appear in the query, 1/_len_ for the rest.
distribution = [1 if word in q else 1/_len_ for word in _list_]


bar = get_progressbar(len(_list_), ' probability computer ')
bar.start()
tuple_ = []
for position, term in enumerate(_list_):
    score = probability(position, B, distribution)
    tuple_.append((term, score))
    bar.update(position + 1)
bar.finish()
print(len(tuple_))
# Keep only terms scoring above 1/len(_list_), highest score first, and
# display the first 20.
tuple_ = sorted(
    [(term, score) for term, score in tuple_ if score > 1/len(_list_)],
    key=lambda pair: pair[1],
    reverse=True,
)
tuple_[0:20]

[###                                                ] probability computer   6%

['vianda', 'precio']


[###################################################] probability computer 100%


14740


[('precio', 0.8000135512630451),
 ('vianda', 0.8000131006409957),
 ('797', 0.004664763954522053),
 ('habitacional', 0.004664763954522053),
 ('reorganización', 0.004664763954522053),
 ('bienio', 0.004664763954522053),
 ('561', 0.004664763954522053),
 ('454', 0.004664763954522053),
 ('morador', 0.004664763954522053),
 ('ydael', 0.004664763954522053),
 ('despegar', 0.004664763954522053),
 ('apicultura', 0.004664763954522053),
 ('vegetal', 0.004664763954522053),
 ('15%', 0.004664763954522053),
 ('544', 0.004664763954522053),
 ('reforestación', 0.004664763954522053),
 ('vivian', 0.004664763954522053),
 ('asignado', 0.004664763954522053),
 ('micons', 0.004664763954522053),
 ('minag', 0.004664763954522053)]

In [57]:
# Collect every vocabulary term that edge_dict links to one of the query lemmas.
# A term -> position map replaces _list_.index(term): that call scans the whole
# list for each lemma and raises ValueError when the lemma is not in the
# vocabulary, which would abort the cell for any out-of-corpus query word.
term_position = {term: position for position, term in enumerate(_list_)}

s = set()
for term in q:
    position = term_position.get(term)
    if position is None:
        # Query lemma not in the vocabulary: it has no edges to contribute.
        continue
    for _weight, i in edge_dict[position]:
        s.add(_list_[i])

s

{'unicode',
 'acompañant',
 'vianda',
 'pegado',
 'repetido',
 'cerrarir',
 'sibanicú',
 'profesorar',
 'festivales',
 'tildir',
 'heberferon',
 'coro',
 'bipolar',
 'depreciación',
 'hispanoamérica',
 'saludo',
 'desconsolado',
 'jander',
 'exigencia',
 'depositado',
 'desestabilización',
 'dedicación',
 'baseball',
 '-lo',
 'pensant',
 'auckland',
 'búfalo',
 'quinquenio',
 'desaparecierir',
 'fongang',
 'habitacional',
 'afecto',
 'cerrando',
 'traductor',
 'inhabilitar él',
 'crecerno',
 'hipertensión',
 'mariano',
 'dembélé',
 'equívocamente',
 '9.2',
 'autobiográfico',
 'nóvel',
 'libanés',
 'aunar',
 'compartirno',
 '082',
 'contento',
 'curtido',
 'pogolotti',
 'beisbolistas',
 'manhattan',
 'mayoritariamente',
 'inadecuado',
 'interfax',
 'jugar él',
 'sobresalir',
 'desagüe',
 'abecedario',
 'abdominal',
 'dofleini',
 'represión',
 'communista',
 'dinámica',
 'espinosa',
 'ofender',
 'pic.twitter.com/ycgrowbgff',
 'planteándono',
 'pensar él',
 'gendarme',
 '2021-2026',
 'con