### Bibliotecas

Importação das bibliotecas usadas.

In [9]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import implicit
import os
import pickle
import warnings

Leitura da base de dados

In [10]:
# Read the semicolon-separated interactions file and preview the first rows.
df_item_delicious2k = pd.read_csv(
    'database/delicious2k/interactions.csv',
    sep=';',
)
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item,id_tag,timestamp,datetime
0,56067,47295,278,1069319563,2003-11-20 07:12:43
1,56067,13165,1511,1070249587,2003-12-01 01:33:07
2,56067,47545,2846,1070342731,2003-12-02 03:25:31
3,56067,58683,16935,1070342792,2003-12-02 03:26:32
4,56067,58683,7732,1070342792,2003-12-02 03:26:32


Quantidade de dados nulos

In [11]:
# Number of missing values in each column.
df_item_delicious2k.isna().sum()

id_user      0
id_item      0
id_tag       0
timestamp    0
datetime     0
dtype: int64

Mantendo somente as colunas id_user e id_item

In [12]:
# Remove the tag and time columns in place, leaving only the user/item identifiers.
df_item_delicious2k.drop(
    columns=['id_tag', 'timestamp', 'datetime'],
    inplace=True,
)
display(df_item_delicious2k.head())

Unnamed: 0,id_user,id_item
0,56067,47295
1,56067,13165
2,56067,47545
3,56067,58683
4,56067,58683


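Descarta as duplicadas (etapa desativada: as repetições de cada par usuário–item são contadas na etapa de agrupamento a seguir)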

In [13]:
# NOTE: de-duplication is currently disabled (everything below is commented out).
# The grouping cell that follows sums a unit `Quantity` per (user, item) row, so
# dropping duplicate pairs here would make every resulting count equal to 1.
# print(df_item_delicious2k.shape)
# df_item_delicious2k.drop_duplicates(subset=['id_item', 'id_user'], inplace=True)
# print(df_item_delicious2k.shape)

Agrupa os dados e conta quantas vezes cada usuário interagiu com cada item

In [14]:
# Rename the identifier columns and give every raw interaction row a unit weight.
df_item_delicious2k.rename(
    columns={'id_user': 'IdUser', 'id_item': 'IdItem'},
    inplace=True,
)
df_item_delicious2k['Quantity'] = 1

# One row per (user, item) pair; Quantity is the number of raw rows for that pair.
grouped_df = df_item_delicious2k.groupby(['IdUser', 'IdItem'], as_index=False)['Quantity'].sum()

# Promote zero totals to 1, then keep only strictly positive totals.
grouped_df.loc[grouped_df['Quantity'].eq(0), 'Quantity'] = 1
grouped_df = grouped_df.loc[grouped_df['Quantity'].gt(0)]
display(grouped_df.head())

Unnamed: 0,IdUser,IdItem,Quantity
0,8,1367,1
1,8,1848,2
2,8,2104,1
3,8,2672,1
4,8,4299,2


In [15]:

# Summary statistics of the aggregated interactions, one per output line.
print(
    f'Number of unique users: {grouped_df.IdUser.nunique()}',
    f'Number of unique items: {grouped_df.IdItem.nunique()}',
    f'Average purchase quantity per interaction: {int(grouped_df.Quantity.mean())}',
    f'Minimum purchase quantity per interaction: {grouped_df.Quantity.min()}',
    f'Maximum purchase quantity per interaction: {grouped_df.Quantity.max()}',
    sep='\n',
)

Number of unique users: 1867
Number of unique items: 69198
Average purchase quantity per interaction: 4
Minimum purchase quantity per interaction: 1
Maximum purchase quantity per interaction: 70


Cria dicionários que mapeiam cada ID original de usuário e de item para um índice inteiro sequencial

In [16]:
# Distinct raw user IDs in order of first appearance, each mapped to a 0-based int32 index.
unique_users = grouped_df['IdUser'].unique()
user_ids = {
    raw_id: dense_idx
    for raw_id, dense_idx in zip(unique_users, np.arange(len(unique_users), dtype=np.int32))
}

# Same mapping for items.
unique_items = grouped_df['IdItem'].unique()
item_ids = {
    raw_id: dense_idx
    for raw_id, dense_idx in zip(unique_items, np.arange(len(unique_items), dtype=np.int32))
}

# Attach the dense indices to every aggregated interaction row.
grouped_df['id_user'] = grouped_df['IdUser'].apply(lambda raw_id: user_ids[raw_id])
grouped_df['id_item'] = grouped_df['IdItem'].apply(lambda raw_id: item_ids[raw_id])

Cria as matrizes esparsas (item × usuário e usuário × item)

In [17]:
# Item x user matrix: rows are dense item indices, columns are dense user indices.
sparse_item_user = sparse.csr_matrix(
    (
        grouped_df.Quantity.astype('float64'),
        (grouped_df.id_item, grouped_df.id_user),
    )
)
# User x item matrix built from the same values with the two index columns swapped.
sparse_user_item = sparse.csr_matrix(
    (
        grouped_df.Quantity.astype('float64'),
        (grouped_df.id_user, grouped_df.id_item),
    )
)

Treina o modelo

In [18]:
# alpha multiplies the interaction counts before they are handed to the model.
alpha = 15
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=50,
)
# NOTE(review): the item x user matrix is passed to fit(). Which orientation fit()
# expects depends on the installed `implicit` version -- confirm against its docs.
data = (sparse_item_user * alpha).astype(np.float64)
model.fit(data)



  0%|          | 0/50 [00:00<?, ?it/s]

### Testes e métricas

In [19]:
# Show the first ten aggregated rows together with their dense indices.
display(grouped_df.head(n=10))

Unnamed: 0,IdUser,IdItem,Quantity,id_user,id_item
0,8,1367,1,0,0
1,8,1848,2,0,1
2,8,2104,1,0,2
3,8,2672,1,0,3
4,8,4299,2,0,4
5,8,4783,1,0,5
6,8,4926,3,0,6
7,8,6245,4,0,7
8,8,6567,5,0,8
9,8,6950,2,0,9


Exemplo de Recomendação — Encontrando os Itens Semelhantes

In [26]:
# Interactions recorded for the item whose dense index is 5.
grouped_df[grouped_df['id_item'].eq(5)].head()

Unnamed: 0,IdUser,IdItem,Quantity,id_user,id_item
5,8,4783,1,0,5
72044,67926,4783,1,1255,5
82816,80298,4783,5,1455,5


In [25]:
# Dense index of the query item and how many neighbours to list.
id_item = 5
n_similar = 10

item_vecs = model.item_factors
user_vecs = model.user_factors

# Euclidean norm of every item factor vector. A zero norm is replaced by 1 so an
# all-zero vector gets similarity 0 instead of NaN (argpartition would otherwise
# rank NaN as the largest value and list that item as a top neighbour).
item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))
item_norms[item_norms == 0] = 1

# Dot product with the query item, normalised by each candidate's norm.
scores = item_vecs.dot(item_vecs[id_item]) / item_norms
# Unordered indices of the n_similar best scores, then sorted best-first after
# dividing by the query item's norm (which turns the score into a cosine).
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[id_item]), key=lambda x: -x[1])

# unique_items[k] is the raw IdItem that was assigned dense index k when the
# mapping was built, so it replaces a full scan of grouped_df per printed row.
for idx, _score in similar:
    print(unique_items[idx])

4783
12759
21092
57762
50288
10749
48768
16654
68858
34825


Exemplo de Recomendação — Recomendar Itens aos Clientes


In [28]:
# The 20 heaviest interactions of the user whose dense index is 20.
(
    grouped_df[grouped_df['id_user'].eq(20)]
    .sort_values('Quantity', ascending=False)
    .loc[:, ['id_user', 'IdItem', 'Quantity']]
    .head(20)
)

Unnamed: 0,id_user,IdItem,Quantity
1275,20,68814,4
1259,20,38015,4
1246,20,28027,4
1230,20,19460,4
1223,20,15643,3
1268,20,62283,3
1263,20,48755,3
1260,20,39456,3
1258,20,37969,3
1252,20,30251,3


In [32]:
from sklearn.preprocessing import MinMaxScaler

def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10, item_labels=None):
    """Recommend the best-scoring items the user has not interacted with yet.

    Parameters
    ----------
    user_id : int
        Row index of the user in ``sparse_user_item`` and ``user_vecs``.
    sparse_user_item : scipy.sparse matrix or array-like
        User x item interaction matrix; entries greater than 0 mark seen items.
    user_vecs, item_vecs : scipy.sparse matrix or array-like
        Factor matrices with one row per user / per item. Sparse and dense
        inputs are both accepted.
    num_items : int, default 10
        Maximum number of recommendations to return.
    item_labels : sequence or mapping, optional
        ``item_labels[i]`` is the external ID of the item with index ``i``.
        When omitted, the IDs are read from the notebook-level ``grouped_df``
        (columns ``id_item`` / ``IdItem``), as the original implementation did.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID_Item`` and ``score``, best first. ``score`` is the user's
        raw score min-max scaled to [0, 1] over all items. Fewer than
        ``num_items`` rows are returned when the user has fewer unseen items;
        already-seen items are never returned.
    """

    def _to_dense(matrix):
        # Accept scipy sparse containers as well as plain arrays.
        return matrix.toarray() if sparse.issparse(matrix) else np.asarray(matrix)

    # True for every item the user already interacted with.
    seen = _to_dense(sparse_user_item[user_id]).reshape(-1) > 0

    user_factors = _to_dense(user_vecs[user_id]).reshape(-1)
    item_factors = _to_dense(item_vecs)
    raw_scores = item_factors.dot(user_factors).astype(np.float64)

    if raw_scores.size == 0:
        return pd.DataFrame({'ID_Item': [], 'score': []})

    # Min-max scale to [0, 1]; a constant score vector maps to all zeros
    # (same convention as sklearn's MinMaxScaler, which treats a zero range as 1).
    lowest = raw_scores.min()
    spread = raw_scores.max() - lowest
    scaled = (raw_scores - lowest) / (spread if spread > 0 else 1.0)

    # Rank every item best-first, then drop the ones already seen so they can
    # never be returned (zeroing their score would let them tie with the
    # lowest-scored unseen item).
    ranking = np.argsort(scaled)[::-1]
    top = ranking[~seen[ranking]][:max(num_items, 0)]

    if item_labels is None:
        # First IdItem recorded for each dense item index.
        item_labels = grouped_df.drop_duplicates('id_item').set_index('id_item')['IdItem']

    items_id = [item_labels[int(idx)] for idx in top]
    scores = [scaled[idx] for idx in top]

    recommendations = pd.DataFrame({'ID_Item': items_id, 'score': scores})

    return recommendations
    
# Wrap the learned factor matrices as CSR matrices before passing them to recommend().
item_vecs = sparse.csr_matrix(model.item_factors)
user_vecs = sparse.csr_matrix(model.user_factors)

# Build recommendations for the user whose dense index (id_user) is 20.
user_id = 20
recommendations = recommend(
    user_id=user_id,
    sparse_user_item=sparse_user_item,
    user_vecs=user_vecs,
    item_vecs=item_vecs,
)

print(recommendations)

   ID_Item     score
0    66089  0.923729
1    13949  0.892712
2    61873  0.888947
3     2001  0.881460
4    24157  0.862009
5     8656  0.859719
6    24302  0.818132
7    20840  0.804916
8     3222  0.804428
9     1702  0.799872
