### Bibliotecas

Importação das bibliotecas usadas.

In [181]:
import pandas as pd  
from time import time  
from collections import defaultdict  


import logging

# Route gensim's progress messages (INFO and above) to the notebook output,
# prefixed with the level name and a HH:MM:SS timestamp.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

Leitura da base de dados

In [182]:
# Load the raw interaction log; the file uses ';' as its field separator.
df_item_delicious2k = pd.read_csv(
    'database/delicious2k/interactions.csv',
    sep=';',
)
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item,id_tag,timestamp,datetime
0,56067,47295,278,1069319563,2003-11-20 07:12:43
1,56067,13165,1511,1070249587,2003-12-01 01:33:07
2,56067,47545,2846,1070342731,2003-12-02 03:25:31
3,56067,58683,16935,1070342792,2003-12-02 03:26:32
4,56067,58683,7732,1070342792,2003-12-02 03:26:32


Quantidade de dados nulos

In [183]:
# Number of missing values in each column.
df_item_delicious2k.isna().sum()

id_user      0
id_item      0
id_tag       0
timestamp    0
datetime     0
dtype: int64

Mantendo somente as colunas id_user e id_item

In [184]:
# Keep only the (id_user, id_item) pairs.
# Selecting the wanted columns into a fresh copy, rather than dropping the
# others in place, means running this cell again on its own output does not
# raise a KeyError for columns that are already gone.
df_item_delicious2k = df_item_delicious2k[['id_user', 'id_item']].copy()
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item
0,56067,47295
1,56067,13165
2,56067,47545
3,56067,58683
4,56067,58683


Descarta as duplicadas

In [185]:
# Shape before and after removing repeated (id_item, id_user) pairs, so the
# number of discarded rows can be read off the two printed tuples.
# NOTE(review): the recorded output shows the same shape twice, yet the grouped
# preview further down still lists repeated ids per user - confirm the outputs
# were produced by this version of the cell.
print(df_item_delicious2k.shape)
df_item_delicious2k = df_item_delicious2k.drop_duplicates(subset=['id_item', 'id_user'])
print(df_item_delicious2k.shape)

(437593, 2)
(437593, 2)


Agrupa os itens consumidos por cada usuário

In [186]:
# Collapse the frame to one row per user: that user's item ids, cast to text
# and joined with single spaces, stored in a column named 'id_items'.
# Note: this overwrites the frame with a different schema, so the cell cannot
# be re-run without first re-running the cells above it.
item_ids_as_text = df_item_delicious2k['id_item'].astype(str)
df_item_delicious2k = (
    item_ids_as_text
    .groupby(df_item_delicious2k['id_user'])
    .agg(' '.join)
    .reset_index(name='id_items')
)

display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_items
0,8,13589 2672 68527 32440 32439 32439 32439 38974...
1,32,62239 62239 62239 62241 62241 62241 62241 6224...
2,57,8507 8507 8507 67451 11105 66754 67423 67423 6...
3,147,20113 20113 20113 20113 20113 65969 65969 6596...
4,233,65155 65155 65155 57035 57035 57035 57035 5703...


Salva em um arquivo csv

In [187]:
# Write the per-user item strings to disk, leaving out the DataFrame index.
output_path = 'item_user_recommender.csv'
df_item_delicious2k.to_csv(output_path, index=False)

Cria um dicionário de frequência dos tokens (após a detecção de bigramas)

In [188]:
from gensim.models.phrases import Phrases, Phraser

# One "sentence" per user: the space-separated item ids split into tokens.
sent = [row.split() for row in df_item_delicious2k['id_items']]

# Learn which adjacent id pairs should be merged into a single "a_b" token.
phrases = Phrases(sent, min_count=1, progress_per=10000)

# Export the trained detector to its frozen form for applying to the corpus.
bigram = Phraser(phrases)

# The corpus with detected pairs merged; later cells train Word2Vec on it.
sentences = bigram[sent]

# Count occurrences of every token in the transformed corpus.
# The loop variables are deliberately not called `sent`: reusing that name
# would rebind the corpus list above to the last sentence once the loop ends.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1

# Number of distinct tokens. Printed explicitly because a bare expression in
# the middle of a cell is evaluated but never shown.
print(len(word_freq))

# The ten most frequent tokens, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

INFO - 09:37:11: collecting all words and their counts
INFO - 09:37:11: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 09:37:12: collected 216367 token types (unigram + bigrams) from a corpus of 437593 words and 1867 sentences
INFO - 09:37:12: merged Phrases<216367 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 09:37:12: Phrases lifecycle event {'msg': 'built Phrases<216367 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000> in 0.41s', 'datetime': '2023-07-31T09:37:12.284167', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 09:37:12: exporting phrases from Phrases<216367 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 09:37:12: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<70114 phrases, min_count=1, threshold=10.0> from Phrases<216367 vocab, min_count=1, threshold=10.0, max_vocab_size

['13919_13919',
 '15592_15592',
 '22632_22632',
 '22832_22832',
 '5995_5995',
 '17971_17971',
 '8414_8414',
 '50987_50987',
 '64598_64598',
 '47902_47902']

Cria uma tabela de vocabulário

In [189]:
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

# Leave one core free for the rest of the system, but never go below one
# worker: on a single-core machine `cores - 1` is 0, which would ask gensim
# for zero training threads.
w2v_model = Word2Vec(min_count=5,
                     window=2,
                     workers=max(1, cores - 1))

# Scan the corpus once to build the vocabulary, and report how long it took.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 09:37:12: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.025)', 'datetime': '2023-07-31T09:37:12.866168', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 09:37:12: collecting all words and their counts
INFO - 09:37:12: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 09:37:13: collected 106791 word types from a corpus of 245833 raw words and 1867 sentences
INFO - 09:37:13: Creating a fresh vocabulary
INFO - 09:37:13: Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 10188 unique words (9.540129786217939%% of original 106791, drops 96603)', 'datetime': '2023-07-31T09:37:13.187166', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 09:37:13: Word2Vec lifecycle event {'msg': 'ef

Time to build vocab: 0.01 mins


Treina o modelo

In [190]:
# Train for 30 epochs over the corpus seen by build_vocab, then report the
# elapsed wall-clock time in minutes.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

INFO - 09:37:13: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 10188 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=2 shrink_windows=True', 'datetime': '2023-07-31T09:37:13.341168', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 09:37:13: worker thread finished; awaiting finish of 10 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 9 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 8 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 7 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 6 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 5 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 4 more threads
INFO - 09:37:13: worker thread finished; awaiting finish of 3 more threads
INFO - 09:

Time to train the model: 0.13 mins


Modificar o modelo para torná-lo mais eficiente

In [191]:
# Makes the model more efficient - precompute L2-normalized vectors.
# NOTE(review): the call below is left disabled; gensim 4.x appears to treat
# init_sims() as obsolete - confirm against the gensim docs before re-enabling.
#w2v_model.init_sims(replace=True)

### Testes e métricas

In [192]:
# The ten vocabulary tokens most similar to token '13919', each paired with
# its similarity score.
nearest_tokens = w2v_model.wv.most_similar('13919', topn=10)
display(nearest_tokens)

[('31012_31012', 0.9726067781448364),
 ('52484_52484', 0.9718183875083923),
 ('13919_13919', 0.9716140031814575),
 ('60679_60679', 0.9708607196807861),
 ('56288_56288', 0.9656278491020203),
 ('12027_12027', 0.9512013792991638),
 ('34133_34133', 0.9424645304679871),
 ('54431_54431', 0.9393444657325745),
 ('29164_29164', 0.9300186038017273),
 ('65870_65870', 0.9254260063171387)]