### Bibliotecas

Importação das bibliotecas usadas.

In [1]:
import pandas as pd  
from time import time  
from collections import defaultdict  


import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

Leitura da base de dados

In [2]:
df_item_delicious2k = pd.read_csv('database/delicious2k/interactions.csv', delimiter=';')
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item,id_tag,timestamp,datetime
0,56067,47295,278,1069319563,2003-11-20 07:12:43
1,56067,13165,1511,1070249587,2003-12-01 01:33:07
2,56067,47545,2846,1070342731,2003-12-02 03:25:31
3,56067,58683,16935,1070342792,2003-12-02 03:26:32
4,56067,58683,7732,1070342792,2003-12-02 03:26:32


Quantidade de dados nulos

In [3]:
df_item_delicious2k.isnull().sum()

id_user      0
id_item      0
id_tag       0
timestamp    0
datetime     0
dtype: int64

Mantendo somente as colunas id_user e id_item

In [4]:
df_item_delicious2k.drop(['id_tag', 'timestamp', 'datetime'], axis=1, inplace=True)
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item
0,56067,47295
1,56067,13165
2,56067,47545
3,56067,58683
4,56067,58683


Descarta as duplicadas

In [5]:
print(df_item_delicious2k.shape)
df_item_delicious2k.drop_duplicates(subset=['id_item', 'id_user'], inplace=True)
print(df_item_delicious2k.shape)

(437593, 2)
(104794, 2)


Agrupa os item consumidos por cada usuario

In [6]:
df_item_delicious2k = df_item_delicious2k.groupby('id_user')['id_item'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
df_item_delicious2k.columns = ['id_user', 'id_items']

display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_items
0,8,13589 2672 68527 32440 32439 38974 27038 4783 ...
1,32,62239 62241 62240 4976 15466 4605 4606 55074
2,57,8507 67451 11105 66754 67423 67426 48634 22642...
3,147,20113 65969 17379 59148 19160 8561 7962 50577 ...
4,233,65155 57035 37117 52587 30417 30431 59529 1094...


Salva em um arquivo csv

In [7]:
df_item_delicious2k.to_csv('item_user_recommender.csv', index=False)

Define o id_user como índice

In [8]:
df_item_delicious2k.index = df_item_delicious2k['id_user']
df_item_delicious2k.drop('id_user', axis=1, inplace=True)

display(df_item_delicious2k.head())

Unnamed: 0_level_0,id_items
id_user,Unnamed: 1_level_1
8,13589 2672 68527 32440 32439 38974 27038 4783 ...
32,62239 62241 62240 4976 15466 4605 4606 55074
57,8507 67451 11105 66754 67423 67426 48634 22642...
147,20113 65969 17379 59148 19160 8561 7962 50577 ...
233,65155 57035 37117 52587 30417 30431 59529 1094...


Cria um dicionario

In [9]:
from gensim.models.doc2vec import TaggedDocument

df_split = df_item_delicious2k['id_items'].apply(lambda x: x.split(" "))
display(df_split)

documents = [TaggedDocument(doc, str(df_split.index[i])) for i, doc in enumerate(df_split)]
display(documents[:2])

id_user
8         [13589, 2672, 68527, 32440, 32439, 38974, 2703...
32        [62239, 62241, 62240, 4976, 15466, 4605, 4606,...
57        [8507, 67451, 11105, 66754, 67423, 67426, 4863...
147       [20113, 65969, 17379, 59148, 19160, 8561, 7962...
233       [65155, 57035, 37117, 52587, 30417, 30431, 595...
                                ...                        
107686    [45934, 33669, 27519, 12335, 16397, 4648, 6056...
107755    [8146, 8029, 37263, 51060, 37670, 22720, 37415...
107862    [58532, 60473, 65485, 10944, 50722, 33914, 265...
107984    [68191, 28727, 2852, 5568, 4649, 23714, 31235,...
108035    [36581, 14250, 46370, 57759, 37863, 51394, 272...
Name: id_items, Length: 1867, dtype: object

[TaggedDocument(words=['13589', '2672', '68527', '32440', '32439', '38974', '27038', '4783', '34825', '22496', '37854', '50280', '68858', '57763', '6245', '12759', '18682', '50286', '50287', '50288', '50279', '56454', '48768', '50277', '40292', '51953', '49122', '49121', '59084', '43443', '57759', '14953', '36883', '16654', '38174', '38175', '31467', '2104', '36035', '21947', '32695', '59905', '57762', '24683', '14516', '9320', '10749', '21092', '58755', '40093', '1848', '4299', '19659', '35615', '4926', '36129', '47071', '6950', '42573', '40169', '48060', '6567', '19302', '47947', '47945', '37187', '40091', '1367', '45951'], tags='8'),
 TaggedDocument(words=['62239', '62241', '62240', '4976', '15466', '4605', '4606', '55074'], tags='32')]

Cria uma tabela de vocabulário

In [10]:
import multiprocessing

from gensim.models import Doc2Vec

cores = multiprocessing.cpu_count() 

w2v_model_itens = Doc2Vec(min_count=5,
                    window=2,
                    workers=cores-1)

t = time()
w2v_model_itens.build_vocab(documents, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 06:36:48: Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w2,mc5,s0.001,t11)', 'datetime': '2023-08-07T06:36:48.137132', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 06:36:48: collecting all words and their counts
INFO - 06:36:48: PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO - 06:36:48: collected 69198 word types and 10 unique tags from a corpus of 1867 examples and 104794 words
INFO - 06:36:48: Creating a fresh vocabulary
INFO - 06:36:48: Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1204 unique words (1.739934680193069%% of original 69198, drops 67994)', 'datetime': '2023-08-07T06:36:48.189133', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 06:36:48: Doc2Vec lifecycle event {'msg'

Time to build vocab: 0.0 mins


Treina o modelo

In [11]:
t = time()
w2v_model_itens.train(documents, total_examples=w2v_model_itens.corpus_count, epochs=10, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 06:36:48: Doc2Vec lifecycle event {'msg': 'training model with 11 workers on 1204 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=2 shrink_windows=True', 'datetime': '2023-08-07T06:36:48.235132', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 06:36:48: worker thread finished; awaiting finish of 10 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 9 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 8 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 7 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 6 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 5 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 4 more threads
INFO - 06:36:48: worker thread finished; awaiting finish of 3 more threads
INFO - 06:36

Time to train the model: 0.02 mins


Modificar o modelo para torná-lo mais eficiente

In [12]:
# deixa o modelo mais eficiente - pré-computar vetores normalizados por L2.
#w2v_model_itens.init_sims(replace=True)

### Testes e métricas

In [13]:
display(w2v_model_itens.wv.most_similar('17971'))

[('8414', 0.9727048277854919),
 ('15592', 0.9657999873161316),
 ('60954', 0.9649906158447266),
 ('22632', 0.9647397994995117),
 ('49804', 0.9643433094024658),
 ('68666', 0.96297287940979),
 ('31773', 0.9629290103912354),
 ('29707', 0.9617167711257935),
 ('47895', 0.961419403553009),
 ('13919', 0.9610307812690735)]

In [14]:
display(w2v_model_itens.dv['8'])

array([-0.9912618 , -1.0168346 , -0.73537695,  0.22549483, -0.2761046 ,
        0.4394822 , -0.1554847 ,  0.06785894, -0.68692935, -0.01325471,
       -0.01008369,  0.30772337,  0.09171174, -0.11314163,  0.00833276,
       -0.07110127, -0.36798447,  0.439601  ,  0.13509603,  0.20094216,
        0.2984481 , -0.33703715,  0.06478927,  0.5319433 ,  0.13450624,
       -0.36009112, -1.104463  , -0.03398375, -0.59659916, -1.016345  ,
        0.9191843 ,  0.30671325, -0.0248178 ,  1.2609051 ,  0.32333955,
        0.8385996 , -0.13398811, -1.2027547 ,  0.06756682, -0.6976798 ,
        0.27716717, -0.03220363, -0.25943905, -0.6970354 , -0.59777594,
        0.20910437,  0.19760059,  0.29249316,  0.0308669 ,  0.3042777 ,
       -0.54766846, -0.47030842, -0.5530588 , -0.36577585, -0.19025783,
       -0.0887242 , -0.49776667,  0.7133602 , -0.38512224,  0.03081879,
        1.3698734 ,  0.66775954, -0.17466885,  0.3901795 ,  0.24507599,
        0.22149771,  1.0683527 ,  0.5672117 , -0.09921928,  0.24

In [18]:
vector = w2v_model_itens.infer_vector(['61703', '57298', '68422', '58262'])
display(vector)

array([-2.1843284e-03,  2.5785184e-03, -5.5895747e-05,  9.5653831e-04,
       -1.4704153e-03,  2.8150207e-03,  3.8213748e-03, -3.2877303e-03,
        3.6413081e-03, -4.5801857e-03, -3.0050578e-03, -8.1713794e-04,
       -2.9582137e-03, -1.7124966e-03,  4.7522988e-03,  3.5093045e-03,
       -3.3836097e-03,  3.7887334e-04, -6.2979606e-04,  3.0648403e-03,
       -2.0060125e-03,  4.2721331e-03, -1.7843390e-03,  2.4320937e-03,
        4.4270135e-03,  5.4298283e-04,  3.5327536e-03,  4.9043837e-04,
        1.1997462e-03,  4.5912685e-03, -4.7835121e-03, -3.6696482e-03,
       -8.8323595e-04,  2.4182992e-03,  3.4266103e-03, -2.1122354e-03,
        4.1260510e-03, -3.7371109e-03, -3.0294573e-03,  1.6954750e-03,
        8.6253765e-04, -2.9048056e-03, -3.1717140e-03, -5.7562115e-04,
       -2.7750137e-03,  3.9309086e-04, -4.4253711e-03, -3.0275697e-03,
       -4.9325670e-03,  1.8546701e-04, -1.4977459e-03,  3.5639715e-03,
        1.8631894e-03, -1.0704094e-03,  4.3460070e-03, -1.0723827e-03,
      