## Importando os Dados

In [None]:
import pandas as pd

In [None]:
# Colab-only upload step: the saved output below shows "Online Retail.xlsx" being saved by this call.
from google.colab import files
files.upload()

Saving Online Retail.xlsx to Online Retail.xlsx


In [None]:
# Load the uploaded workbook into a DataFrame (relative path, so the file must be in the current working directory).
df = pd.read_excel("Online Retail.xlsx")

In [None]:
# Preview the first rows of the frame.
# NOTE(review): the saved output lists only CustomerID/StockCode/Description/Quantity, while the
# df.columns cell that follows reports eight columns — this cell was probably last run after the
# column selection further down. Confirm with Restart & Run All.
df.head()

Unnamed: 0,CustomerID,StockCode,Description,Quantity
0,17850.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,17850.0,71053,WHITE METAL LANTERN,6
2,17850.0,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,17850.0,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,17850.0,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [None]:
# Narrow the frame to four columns: customer id, stock code, description and quantity.
df = df.loc[:, ["CustomerID", "StockCode", "Description", "Quantity"]]

In [None]:
# Persist the reduced frame as CSV.
# index=False keeps the row index out of the file, so re-reading it with pd.read_csv
# does not produce a spurious "Unnamed: 0" column.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive — no mount call is visible in this notebook.
df.to_csv('/content/drive/MyDrive/recomendacao/retail.csv', index=False)

In [None]:
# Read a tab-separated triplets file from Drive.
# NOTE(review): this rebinds `df`, discarding the retail frame built above, and the result is not
# used by any later cell visible here — confirm whether this cell is still needed.
# NOTE(review): lineterminator='\r' splits rows on carriage returns only; verify the file's actual line endings.
df = pd.read_csv("/content/drive/MyDrive/recomendacao/train_triplets.txt", sep='\t', lineterminator='\r')

In [None]:
# Load the ratings table; the preview below shows the columns userId, movieId, rating and timestamp.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive — no mount call is visible in this notebook.
ratings = pd.read_csv('/content/drive/MyDrive/recomendacao/ratings_small.csv')

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Build the user x movie matrix: one row per userId, one column per movieId, cells hold the rating.
# Pairs with no rating become NaN; pivot_table's default aggregation (mean) applies if a pair repeats.
user_ratings_df = ratings.pivot_table(
    values="rating",
    index="userId",
    columns="movieId",
)

In [None]:
# (number of users, number of movies) in the pivoted matrix.
user_ratings_df.shape

(610, 9724)

## Medindo Esparsidade

In [None]:
# Sparsity = share of user/movie cells that have no rating.
n_rows, n_cols = user_ratings_df.shape
total = n_rows * n_cols                                       # every cell of the matrix
celulas_vazias = user_ratings_df.isna().to_numpy().sum()      # cells that are NaN
sparsity = celulas_vazias / total

In [None]:
# Fraction of empty cells computed above (0 = fully dense, 1 = fully empty).
sparsity

0.9830003169443864

## SVD

In [None]:
# Mean rating of each user, taken across that user's row (NaN cells are skipped by pandas' default).
avg_ratings = user_ratings_df.mean(axis="columns")

# Center the ratings: subtract each user's own mean from every rating in that user's row.
user_ratings_centered = user_ratings_df.sub(avg_ratings, axis="index")

In [None]:
# Centering must not change the matrix dimensions.
user_ratings_centered.shape

(610, 9724)

In [None]:
# Replace missing entries with 0; after centering, 0 corresponds to the user's own mean rating.
user_ratings_centered = user_ratings_centered.fillna(0)

In [None]:
from scipy.sparse.linalg import svds
import numpy as np

In [None]:
# Shape check once more, right before the decomposition.
user_ratings_centered.shape

In [None]:
# Truncated SVD of the centered, zero-filled ratings matrix.
# The DataFrame is converted to a plain ndarray explicitly instead of relying on svds'
# implicit input coercion, which differs across SciPy versions (newer releases validate the
# input via aslinearoperator and may reject a DataFrame).
# k=6 is svds' documented default, spelled out so the number of latent factors is visible;
# it matches the (610, 6) and (6, 9724) shapes shown in the following cells.
U, sigma, Vt = svds(user_ratings_centered.to_numpy(), k=6)

In [None]:
# Left singular vectors: one row per user, one column per latent factor.
U.shape

(610, 6)

In [None]:
# Right singular vectors (transposed): one row per latent factor, one column per movie.
Vt.shape

(6, 9724)

In [None]:
# Turn the 1-D vector of singular values into a diagonal matrix.
# np.diag maps 1-D -> 2-D but also 2-D -> 1-D (it extracts the diagonal), so without the
# ndim guard re-running this cell would silently collapse sigma back to a vector and
# change the result of the matrix products that follow.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
print(sigma)

[[36.54895519  0.          0.          0.          0.          0.        ]
 [ 0.         37.95619249  0.          0.          0.          0.        ]
 [ 0.          0.         39.37050585  0.          0.          0.        ]
 [ 0.          0.          0.         41.77917206  0.          0.        ]
 [ 0.          0.          0.          0.         43.6224036   0.        ]
 [ 0.          0.          0.          0.          0.         76.20046537]]


## Recriando as Ratings

In [None]:
# Matrix product of U with the diagonal sigma matrix.
U_sigma = U @ sigma

In [None]:
# Matrix product of (U · sigma) with Vt: the low-rank reconstruction of the centered ratings.
U_sigma_Vt = U_sigma @ Vt

# Show the reconstructed matrix.
print(U_sigma_Vt)

[[ 9.59343518e-02 -1.71945495e-02  2.27523739e-02 ... -1.34624568e-05
  -1.34624568e-05  5.89547842e-04]
 [-4.08968238e-04  4.54110162e-04 -4.93120168e-03 ... -8.36576329e-06
  -8.36576329e-06 -5.95822929e-06]
 [-2.95922628e-02  3.72046555e-03 -1.51491416e-02 ... -3.59273284e-05
  -3.59273284e-05 -1.52040897e-04]
 ...
 [ 3.12364266e-01 -2.95290977e-01 -4.52094620e-01 ... -3.59424467e-05
  -3.59424467e-05 -2.52639026e-05]
 [ 1.58856307e-02 -4.81933236e-03  1.37231199e-03 ...  1.98083300e-06
   1.98083300e-06  1.73775968e-04]
 [ 5.97439189e-01 -1.19549102e-01 -1.92823569e-01 ... -9.41640085e-04
  -9.41640085e-04  4.68139853e-03]]


In [None]:
# Undo the centering: add each user's mean rating back to that user's row.
# The means are turned into a column vector so they broadcast across the movie columns.
uncentered_ratings = U_sigma_Vt + avg_ratings.to_numpy()[:, np.newaxis]

In [None]:
# Wrap the reconstructed matrix in a DataFrame, reusing the row (userId) and
# column (movieId) labels of the original pivot table.
calc_pred_ratings_df = pd.DataFrame(
    data=uncentered_ratings,
    columns=user_ratings_df.columns,
    index=user_ratings_df.index,
)

In [None]:
# Preview the predicted ratings for the first users.
calc_pred_ratings_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.462314,4.349185,4.389132,4.355828,4.304889,4.379954,4.339853,4.353081,4.371022,4.304191,...,4.366366,4.366333,4.366399,4.366399,4.366366,4.366399,4.366366,4.366366,4.366366,4.366969
2,3.947867,3.94873,3.943345,3.948975,3.954781,3.955014,3.950488,3.94959,3.946458,3.960781,...,3.948267,3.948247,3.948288,3.948288,3.948267,3.948288,3.948267,3.948267,3.948267,3.94827
3,2.406305,2.439618,2.420748,2.440964,2.468875,2.461135,2.451152,2.439374,2.424418,2.48179,...,2.435862,2.435774,2.435949,2.435949,2.435862,2.435949,2.435862,2.435862,2.435862,2.435745
4,3.616146,3.556886,3.576326,3.55137,3.500886,3.497739,3.52139,3.555289,3.562753,3.49058,...,3.555621,3.55578,3.555462,3.555462,3.555621,3.555462,3.555621,3.555621,3.555621,3.555394
5,3.65397,3.633074,3.646911,3.631662,3.620151,3.628583,3.628683,3.635113,3.640487,3.596743,...,3.636387,3.636443,3.63633,3.63633,3.636387,3.63633,3.636387,3.636387,3.636387,3.636556


In [None]:
# Display the original ratings with missing entries shown as 0, for comparison with the predictions.
# The result is not assigned, so user_ratings_df itself keeps its NaN cells.
user_ratings_df.fillna(0)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Fazendo Recomendações com SVD

In [None]:
# Recommendations for user 4: that user's predicted ratings, sorted from highest to lowest.
# NOTE(review): the row still contains movies this user has already rated — filter them out
# if only unseen titles should be recommended.
user_4 = calc_pred_ratings_df.loc[4,:].sort_values(ascending=False)

In [None]:
# Print the sorted predictions for user 4 (movieId -> predicted rating).
print(user_4)

## Usando SVD com Surprise

FunkSVD é uma variação do algoritmo SVD (Singular Value Decomposition) que é utilizado para lidar com valores ausentes em matrizes de dados. Ele foi proposto por Simon Funk durante o Netflix Prize, um desafio de recomendação de filmes promovido pela Netflix em 2006. A ideia principal do FunkSVD é preencher os valores ausentes na matriz de dados utilizando uma equação de aproximação baseada no SVD e no gradiente descendente. Isso permite que o algoritmo tenha uma melhor capacidade de lidar com dados esparsos, o que é comum em sistemas de recomendação.

In [None]:
!pip install scikit-surprise -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/772.0 KB[0m [31m43.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone


In [None]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [None]:
# Load the built-in MovieLens 100k dataset.
# prompt=False downloads it when missing without asking the interactive [Y/n] question,
# so Restart & Run All does not block waiting for keyboard input.
data = Dataset.load_builtin('ml-100k', prompt=False)

# Define the algorithm: Surprise's SVD with its default hyper-parameters.
# A fixed random_state makes the factor initialisation, and therefore the fit, reproducible.
algo = SVD(random_state=42)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [None]:
# Split the data: 75% of the ratings for training, 25% held out for testing.
# A fixed random_state makes the shuffle, and therefore the reported RMSE, reproducible.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [None]:
# Fit on the training split, then predict every held-out rating of the test split.
predictions = algo.fit(trainset).test(testset)

# Score the predictions: rmse() prints the value and also returns it,
# which is why it appears twice in the cell output.
accuracy.rmse(predictions)

RMSE: 0.9366


0.9366236236204732

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Hyper-parameter search for Surprise's SVD on the built-in MovieLens 100k data.
data = Dataset.load_builtin("ml-100k")

# Candidate values; every combination is evaluated (2 x 2 x 2 = 8 settings).
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)

# 3-fold cross-validation, tracking both RMSE and MAE for each setting.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Best RMSE score on one line, then the parameter combination that produced it.
print(gs.best_score["rmse"], gs.best_params["rmse"], sep="\n")

0.9635054765364105
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


- n_epochs é o número de épocas (iterações) do SGD (gradiente descendente estocástico), um método iterativo de otimização usado para minimizar uma função.

- lr_all é a taxa de aprendizado para todos os parâmetros, que é um parâmetro que decide quanto os parâmetros são ajustados em cada iteração.

- reg_all é o termo de regularização para todos os parâmetros, que é um termo de penalidade adicionado para evitar o overfitting.