### Bibliotecas

Importação das bibliotecas usadas.

In [15]:
import pandas as pd  
from time import time  
from collections import defaultdict  


import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

Leitura das bases de dados

In [16]:
df_item_delicious2k = pd.read_csv('database/delicious2k/items.csv', index_col=0, delimiter=';')
display(df_item_delicious2k.head())

df_tags_delicious2k = pd.read_csv('database/delicious2k/tags.csv', index_col=0, delimiter=';')
display(df_tags_delicious2k.head())

Unnamed: 0_level_0,title,url,urlPrincipal
id_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Index of ftp://ftp.mozilla.org/pub/mozilla.org...,ftp://ftp.mozilla.org/pub/mozilla.org/firefox/...,ftp.mozilla.org
1,Fostering the development of critical thinking...,http://0-www.informaworld.com.alpha2.latrobe.e...,0-www.informaworld.com.alpha2.latrobe.edu.au
2,#000000book,http://000000book.com/,000000book.com
3,Loosely Assembled » Retrofitting Geo for the 4...,http://0009.org/blog/2010/07/16/retrofitting-g...,0009.org
4,(2009) Drippings are Dead,http://007485.com/2009_drippings-are-dead.php,007485.com


Unnamed: 0_level_0,name_tag
id_tag,Unnamed: 1_level_1
1,collection_development
2,library
3,collection
4,development
5,lesson_plan


Quantidade de dados nulos

In [17]:
df_item_delicious2k.isnull().sum()

title           44
url              0
urlPrincipal     0
dtype: int64

Deletando os dados nulos

In [18]:
df_item_delicious2k = df_item_delicious2k.dropna().reset_index(drop=True)
df_item_delicious2k.isnull().sum()

title           0
url             0
urlPrincipal    0
dtype: int64

Mantendo somente as colunas id e title

In [19]:
df_item_delicious2k = df_item_delicious2k.drop(['url', 'urlPrincipal'], axis=1)
display(df_item_delicious2k.head())

Unnamed: 0,title
0,Index of ftp://ftp.mozilla.org/pub/mozilla.org...
1,Fostering the development of critical thinking...
2,#000000book
3,Loosely Assembled » Retrofitting Geo for the 4...
4,(2009) Drippings are Dead


Remover os caracteres que não são alfanuméricos e deixando todas as letras minusculas

In [20]:
df_item_delicious2k['title'] = df_item_delicious2k['title'].str.replace('[^a-zA-Z0-9 \n\.]', '')
df_item_delicious2k['title'] = df_item_delicious2k['title'].str.lower()

display(df_item_delicious2k.head())

  df_item_delicious2k['title'] = df_item_delicious2k['title'].str.replace('[^a-zA-Z0-9 \n\.]', '')


Unnamed: 0,title
0,index of ftpftp.mozilla.orgpubmozilla.orgfiref...
1,fostering the development of critical thinking...
2,000000book
3,loosely assembled retrofitting geo for the 4t...
4,2009 drippings are dead


Cria um dicionario

In [21]:
from gensim.models.phrases import Phrases, Phraser

sent = [row.split() for row in df_item_delicious2k['title']]

phrases = Phrases(sent, min_count=5, progress_per=10000)

bigram = Phraser(phrases)

sentences = bigram[sent]

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

sorted(word_freq, key=word_freq.get, reverse=True)[:10]

INFO - 14:21:55: collecting all words and their counts
INFO - 14:21:55: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 14:21:55: PROGRESS: at sentence #10000, processed 66882 words and 57606 word types
INFO - 14:21:55: PROGRESS: at sentence #20000, processed 132211 words and 104156 word types
INFO - 14:21:55: PROGRESS: at sentence #30000, processed 198393 words and 148649 word types
INFO - 14:21:55: PROGRESS: at sentence #40000, processed 265253 words and 191819 word types
INFO - 14:21:56: PROGRESS: at sentence #50000, processed 334008 words and 234133 word types
INFO - 14:21:56: PROGRESS: at sentence #60000, processed 404492 words and 275062 word types
INFO - 14:21:56: collected 310072 token types (unigram + bigrams) from a corpus of 466224 words and 69182 sentences
INFO - 14:21:56: merged Phrases<310072 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
INFO - 14:21:56: Phrases lifecycle event {'msg': 'built Phrases<310072 vocab, min_count=5, threshold=

['the', 'and', 'of', 'for', 'a', 'in', 'to', 'on', 'blog', 'with']

Cria uma tabela de vocabulário

In [22]:
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() 

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 14:21:57: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2023-07-26T14:21:57.151574', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 14:21:57: collecting all words and their counts
INFO - 14:21:57: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:21:57: PROGRESS: at sentence #10000, processed 60038 words, keeping 17460 word types
INFO - 14:21:57: PROGRESS: at sentence #20000, processed 118880 words, keeping 28801 word types
INFO - 14:21:57: PROGRESS: at sentence #30000, processed 178968 words, keeping 38321 word types
INFO - 14:21:57: PROGRESS: at sentence #40000, processed 240131 words, keeping 47108 word types
INFO - 14:21:57: PROGRESS: at sentence #50000, processed 302948 words, keeping 55303 word types
INFO - 14:21:57: PROGRESS: at sentence #60000, processed 367063 words, keeping 6

Time to build vocab: 0.01 mins


Treina o modelo

In [23]:
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 14:21:57: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 2902 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-07-26T14:21:57.737032', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 14:21:58: worker thread finished; awaiting finish of 10 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 9 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 8 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 7 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 6 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 5 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 4 more threads
INFO - 14:21:58: worker thread finished; awaiting finish of 3 more threads
INFO - 14:

Time to train the model: 0.3 mins


Modificar o modelo para torná-lo mais eficiente

In [24]:
# deixa o modelo mais eficiente - pré-computar vetores normalizados por L2.
#w2v_model.init_sims(replace=True)

### Testes e métricas

In [25]:
display(w2v_model.wv.most_similar(positive=["culture"]))

display(w2v_model.wv.most_similar(positive=["film"]))

display(w2v_model.wv.most_similar(positive=["facebook"]))

display(w2v_model.wv.most_similar(positive=["before"]))

[('architects', 0.8621521592140198),
 ('the_independent', 0.8512030243873596),
 ('poverty', 0.8468489646911621),
 ('history_of', 0.8355525732040405),
 ('politics', 0.8342115879058838),
 ('newsroom', 0.8317297697067261),
 ('political', 0.8302711844444275),
 ('society', 0.8289754986763),
 ('washington', 0.8268049955368042),
 ('justice', 0.825136661529541)]

[('documentary', 0.8320078253746033),
 ('history_of', 0.8249190449714661),
 ('art', 0.8233887553215027),
 ('museum_of', 0.802161455154419),
 ('sport', 0.7951771020889282),
 ('radio', 0.7931059002876282),
 ('television', 0.7866121530532837),
 ('guardian.co.uk', 0.7839235067367554),
 ('george', 0.7827044129371643),
 ('indie', 0.7810133099555969)]

[('twitter', 0.8708085417747498),
 ('linkedin', 0.822917103767395),
 ('foursquare', 0.8207159042358398),
 ('account', 0.7916127443313599),
 ('emarketer', 0.7793486714363098),
 ('fans', 0.7734468579292297),
 ('feeds', 0.7645094990730286),
 ('gets', 0.7633030414581299),
 ('tweets', 0.7496503591537476),
 ('link', 0.7348347902297974)]

[('who', 0.9019845724105835),
 ('did', 0.9008893966674805),
 ('sleep', 0.9007086157798767),
 ('happy', 0.9002842903137207),
 ('been', 0.8974510431289673),
 ('cracked.com', 0.8974508047103882),
 ('away', 0.8963783383369446),
 ('wait', 0.8957814574241638),
 ('but', 0.8907675743103027),
 ('this_is', 0.88478022813797)]

In [26]:
display(w2v_model.wv.similarity("before", 'after'))

display(w2v_model.wv.similarity('facebook', 'networking'))

display(w2v_model.wv.similarity('facebook', 'social'))

display(w2v_model.wv.similarity('california', 'city'))

0.8416568

0.32342213

0.6876843

0.66806304

In [27]:
display(w2v_model.wv.doesnt_match(["facebook", "twitter", "music"]))

display(w2v_model.wv.doesnt_match(["news", "reviews", "city"]))

'music'

'city'

In [28]:
display(w2v_model.wv.most_similar(positive=["before", "after"], negative=["news"], topn=3))

[('everyone', 0.7783849239349365),
 ('this', 0.7690214514732361),
 ('tell', 0.7639812231063843)]

In [29]:
w2v_model.wv['facebook']

array([ 2.57714927e-01,  2.34794319e-01,  5.97753942e-01,  6.99541867e-01,
        6.91650733e-02, -3.80674064e-01,  1.88119709e-01,  2.56881803e-01,
       -2.65564531e-01,  3.02859396e-01, -1.93976350e-02, -7.19678495e-03,
        1.26380503e-01, -1.31477818e-01,  3.66246730e-01, -1.51752934e-01,
        2.94706047e-01, -1.38219940e-02,  6.05250634e-02,  1.86488017e-01,
       -6.71747029e-02,  3.84526879e-01, -3.25521529e-01,  1.26916975e-01,
       -1.90838724e-01, -3.18591088e-01, -2.42183015e-01, -8.00687671e-02,
       -5.59585929e-01, -1.69110298e-01,  1.60533160e-01, -1.46186575e-01,
       -3.52479964e-01, -7.40431905e-01,  1.45619735e-01,  8.17461789e-01,
        1.26878724e-01,  1.89727128e-01, -2.95693219e-01, -5.18182106e-02,
       -1.13012716e-01, -3.32814336e-01, -5.68728745e-02, -9.66743752e-03,
        4.85683978e-01, -6.08670235e-01, -4.80009884e-01, -8.73312168e-03,
        2.70692945e-01,  2.40520984e-02, -1.81946814e-01,  8.09262842e-02,
       -1.69101179e-01,  