### Bibliotecas

Importação das bibliotecas usadas.

In [101]:
import pandas as pd  
from time import time  
from collections import defaultdict  


import logging  # gensim reports its progress through the logging module

# Show INFO-and-above records formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(levelname)s - %(asctime)s: %(message)s",
)

Leitura da base de dados

In [102]:
# Load the semicolon-separated interaction file and preview its first rows.
df_item_delicious2k = pd.read_csv(
    'database/delicious2k/interactions.csv',
    sep=';',
)
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item,id_tag,timestamp,datetime
0,56067,47295,278,1069319563,2003-11-20 07:12:43
1,56067,13165,1511,1070249587,2003-12-01 01:33:07
2,56067,47545,2846,1070342731,2003-12-02 03:25:31
3,56067,58683,16935,1070342792,2003-12-02 03:26:32
4,56067,58683,7732,1070342792,2003-12-02 03:26:32


Quantidade de dados nulos

In [103]:
# Count the missing values in each column.
df_item_delicious2k.isna().sum()

id_user      0
id_item      0
id_tag       0
timestamp    0
datetime     0
dtype: int64

Mantendo somente as colunas id_user e id_item

In [104]:
# Keep only the (user, item) pair. Selecting the wanted columns, rather than
# dropping the unwanted ones by name, means that running this cell twice in a
# row does not raise a KeyError for columns that are already gone, and any
# unexpected extra column is discarded as well.
df_item_delicious2k = df_item_delicious2k[['id_user', 'id_item']]
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item
0,56067,47295
1,56067,13165
2,56067,47545
3,56067,58683
4,56067,58683


Descarta as interações duplicadas (mesmo par usuário–item)

In [105]:
# Print the frame size before and after removing repeated (item, user) pairs;
# the first occurrence of each pair is the one that is kept.
print(df_item_delicious2k.shape)
is_repeat = df_item_delicious2k.duplicated(subset=['id_item', 'id_user'])
df_item_delicious2k = df_item_delicious2k[~is_repeat]
print(df_item_delicious2k.shape)

(437593, 2)
(104794, 2)


Agrupa os itens consumidos por cada usuário

In [106]:
# Build one row per user whose 'id_items' value is the space-separated list of
# that user's item ids, in their original row order.
items_as_text = df_item_delicious2k.assign(
    id_item=df_item_delicious2k['id_item'].astype(str)
)
df_item_delicious2k = (
    items_as_text
    .groupby('id_user')['id_item']
    .agg(' '.join)
    .reset_index()
    .rename(columns={'id_item': 'id_items'})
)

display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_items
0,8,13589 2672 68527 32440 32439 38974 27038 4783 ...
1,32,62239 62241 62240 4976 15466 4605 4606 55074
2,57,8507 67451 11105 66754 67423 67426 48634 22642...
3,147,20113 65969 17379 59148 19160 8561 7962 50577 ...
4,233,65155 57035 37117 52587 30417 30431 59529 1094...


Salva em um arquivo csv

In [107]:
# Persist the per-user item lists; the DataFrame index is not written.
df_item_delicious2k.to_csv(
    path_or_buf='item_user_recommender.csv',
    index=False,
)

Cria um dicionário com a frequência de cada token (itens e bigramas de itens)

In [108]:
from gensim.models.phrases import Phrases, Phraser

# One "sentence" per user: the space-separated item ids split into tokens.
sent = [row.split() for row in df_item_delicious2k['id_items']]

# Fit the phrase (bigram) detector on the per-user token lists.
# NOTE(review): in the recorded run this merges adjacent item ids into single
# tokens (104794 tokens before, 85677 after), so many original item ids no
# longer exist as standalone tokens downstream -- confirm this is intended for
# an item recommender.
phrases = Phrases(sent, min_count=1, progress_per=10000)

bigram = Phraser(phrases)

# Corpus with the detected bigrams applied; this is what Word2Vec consumes.
sentences = bigram[sent]

# Count how often each token occurs. The loop variables are deliberately not
# called `sent`, so the notebook-level `sent` corpus is not overwritten by the
# last sentence once the loop finishes.
word_freq = defaultdict(int)
for user_tokens in sentences:
    for token in user_tokens:
        word_freq[token] += 1

# Show the number of distinct tokens (a bare expression in the middle of a
# cell is silently discarded), then the ten most frequent tokens.
display(len(word_freq))

sorted(word_freq, key=word_freq.get, reverse=True)[:10]

INFO - 09:17:46: collecting all words and their counts
INFO - 09:17:46: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 09:17:46: collected 154562 token types (unigram + bigrams) from a corpus of 104794 words and 1867 sentences
INFO - 09:17:46: merged Phrases<154562 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 09:17:46: Phrases lifecycle event {'msg': 'built Phrases<154562 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000> in 0.11s', 'datetime': '2023-07-31T09:17:46.751099', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 09:17:46: exporting phrases from Phrases<154562 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 09:17:46: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<17437 phrases, min_count=1, threshold=10.0> from Phrases<154562 vocab, min_count=1, threshold=10.0, max_vocab_size

['13919',
 '5995',
 '17971',
 '66086',
 '64598',
 '68666',
 '22832',
 '15592',
 '8414',
 '49804']

Cria uma tabela de vocabulário

In [109]:
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

# Leave one core free for the rest of the system, but never ask for fewer than
# one worker thread: on a single-CPU machine `cores - 1` would be 0.
n_workers = max(1, cores - 1)

# NOTE(review): with min_count=20 the recorded run kept only 6 of 64310 token
# types, so the model is trained on a 6-token vocabulary -- confirm that this
# threshold is intended for this dataset.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)

# Build the vocabulary from the phrase-transformed corpus and report how long
# it took, in minutes.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 09:17:47: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2023-07-31T09:17:47.036066', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 09:17:47: collecting all words and their counts
INFO - 09:17:47: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 09:17:47: collected 64310 word types from a corpus of 85677 raw words and 1867 sentences
INFO - 09:17:47: Creating a fresh vocabulary
INFO - 09:17:47: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 6 unique words (0.009329808738920852%% of original 64310, drops 64304)', 'datetime': '2023-07-31T09:17:47.145064', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 09:17:47: Word2Vec lifecycle event {'msg': 'effect

Time to build vocab: 0.0 mins


Treina o modelo

In [110]:
# Train for 30 epochs over the corpus and print the elapsed wall-clock time in
# minutes, rounded to two decimals.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

INFO - 09:17:47: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 6 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-07-31T09:17:47.163067', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 09:17:47: worker thread finished; awaiting finish of 10 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 9 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 8 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 7 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 6 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 5 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 4 more threads
INFO - 09:17:47: worker thread finished; awaiting finish of 3 more threads
INFO - 09:17:

Time to train the model: 0.04 mins


Modificar o modelo para torná-lo mais eficiente

In [111]:
# Would make the model more memory-efficient by pre-computing L2-normalized
# vectors. Left disabled.
# NOTE(review): init_sims() appears to be deprecated in gensim 4.x (the logs
# show gensim 4.1.2) -- confirm, then delete this cell if it is not needed.
#w2v_model.init_sims(replace=True)

### Testes e métricas

In [112]:
# Look up the (at most) ten tokens whose vectors are closest to item '13919'.
nearest_items = w2v_model.wv.most_similar('13919', topn=10)
display(nearest_items)

[('17971', 0.06797593832015991),
 ('64598', 0.009391184896230698),
 ('68666', 0.004503015894442797),
 ('5995', -0.010839187540113926),
 ('66086', -0.023671669885516167)]