# Objetivo

Construir um filtro colaborativo que seja capaz de recomendar os jogos mais prováveis de serem comprados por um usuário. 
Base de dados: [steam data](https://www.kaggle.com/datasets/tamber/steam-video-games).

[Deploy](https://huggingface.co/spaces/HellSank/steam_filter)

In [None]:
import numpy as np
import pandas as pd

from fastai.collab import *
from fastai.tabular.all import *
set_seed(42)

# The Kaggle CSV ships without a header row, so header=None keeps the first
# record as data instead of consuming it as column labels. The columns are
# given proper names in a later cell.
df = pd.read_csv("/kaggle/input/steam-video-games/steam-200k.csv", header=None)
# For quick experiments, work on a subset instead: df = df[:5000]

# Sobre a base de dados

## Colunas:
- user-id
- game-title
- behavior-name: ('purchase', 'play') (65%, 35%)
- purchase-or-not (named `time-played` in this notebook): **hours played if behavior is play, 1.0 if behavior is purchase**

In [None]:
df.head(10) # The CSV came without column names and with its last column filled with zeros.

In [None]:
# Assign descriptive labels to the five CSV columns, in file order.
# 'zero' is the trailing all-zeros column; it is dropped in the next cell.
new_columns = ['user-id', 'game-title', 'behavior-name', 'time-played', 'zero']
df.columns = new_columns
df.head()

In [None]:
# Drop the placeholder 'zero' column in place, then preview the result.
df.drop(columns='zero', inplace=True) # Removing zeros column.
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Keep only 'play' records: remove every 'purchase' row in place and report
# (rows before, rows after, fraction kept).
len_before = len(df)
purchase_rows = df.index[df['behavior-name'] == 'purchase']
df.drop(index=purchase_rows, inplace=True)
(len_before, len(df), len(df) / len_before)

In [None]:
# Minimum, mean and maximum of the hours column, plus the number of rows.
hours = df['time-played']
(hours.min(), hours.mean(), hours.max(), len(df))

In [None]:
# Every remaining row is a 'play' record (purchases were removed above),
# so the behavior column carries no information anymore.
df.drop(columns='behavior-name', inplace=True)# Removing column behavior-name
df.head()

In [None]:
# Minimum, mean and maximum of the hours column.
hours = df['time-played']
(hours.min(), hours.mean(), hours.max())

In [None]:
# Build the collaborative-filtering DataLoaders from df with a batch size of 64.
# Only the item column is named explicitly; the user and rating columns are left
# to from_df's defaults (presumably the first and third columns, i.e. 'user-id'
# and 'time-played' — confirm against the fastai documentation).
dls = CollabDataLoaders.from_df(df, item_name='game-title', bs=64)
dls.show_batch()

In [None]:
# Number of latent factors per user / per game.
n_factors = 5

# Vocabulary sizes come from the DataLoaders' category maps.
n_users, n_games = (len(dls.classes[col]) for col in ('user-id', 'game-title'))

# Randomly initialised factor matrices, one row per user / per game.
user_factors = torch.randn(n_users, n_factors)
game_factors = torch.randn(n_games, n_factors)

In [None]:
class DotProduct(Module):
    """Plain dot-product collaborative-filtering model.

    Each user and each game gets an embedding of size ``n_factors``; the
    prediction for a (user, game) pair is the dot product of the two vectors.
    """

    def __init__(self, n_users, n_games, n_factors):
        # One embedding table per entity type.
        self.user_factors = Embedding(n_users, n_factors)
        self.game_factors = Embedding(n_games, n_factors)

    def forward(self, x):
        """Return one score per row of ``x``.

        Column 0 of ``x`` is used as the user index and column 1 as the
        game index.
        """
        user_idx, game_idx = x[:, 0], x[:, 1]
        user_vecs = self.user_factors(user_idx)
        game_vecs = self.game_factors(game_idx)
        # Element-wise product summed over the factor axis == dot product.
        return torch.sum(user_vecs * game_vecs, dim=1)

In [None]:
# NOTE(review): this model uses 50 latent factors, not the n_factors = 5
# defined earlier — confirm that the difference is intended.
model = DotProduct(n_users, n_games, 50)
# Regression on hours played, trained with MSELossFlat as the loss.
learn = Learner(dls, model, loss_func=MSELossFlat())

In [None]:
# IPython-only syntax: the `??` prefix displays the source of fit_one_cycle.
??learn.fit_one_cycle

In [None]:
# Train the plain dot-product model: fit_one_cycle called with 4 and 5e-3
# (presumably epochs and maximum learning rate — confirm against fastai docs).
learn.fit_one_cycle(4, 5e-3)

# DotProductBias

In [None]:
import torch
import torch.nn.functional as F
from fastai.layers import Module, Embedding, sigmoid_range

class DotProductBias(Module):
    """Dot-product collaborative-filtering model with per-user and per-game biases.

    The raw score for a (user, game) pair is the dot product of their
    embeddings plus a learned user bias and a learned game bias. The score is
    then squashed into ``y_range`` with a scaled sigmoid.

    Args:
        n_users: number of rows in the user embedding tables.
        n_games: number of rows in the game embedding tables.
        n_factors: size of each latent-factor vector.
        y_range: ``(low, high)`` bounds of the predicted hours. The default
            upper bound of 11754.0 is presumably the largest ``time-played``
            value in the dataset — TODO confirm against the data.
    """

    def __init__(self, n_users, n_games, n_factors, y_range=(0.1, 11754.0)):
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.game_factors = Embedding(n_games, n_factors)
        self.game_bias = Embedding(n_games, 1)
        self.y_range = y_range

    def forward(self, x):
        """Return predictions of shape ``(batch, 1)`` for the index pairs in ``x``.

        Column 0 of ``x`` is used as the user index and column 1 as the
        game index.
        """
        user_idx, game_idx = x[:, 0], x[:, 1]
        users = self.user_factors(user_idx)
        games = self.game_factors(game_idx)
        # keepdim=True keeps a trailing axis of size 1 so the (batch, 1)
        # bias embeddings can be added without broadcasting surprises.
        res = (users * games).sum(dim=1, keepdim=True)
        res += self.user_bias(user_idx) + self.game_bias(game_idx)
        # sigmoid_range(x, low, high) == sigmoid(x) * (high - low) + low
        return sigmoid_range(res, *self.y_range)


In [None]:
from fastai.metrics import rmse
from fastai.learner import Learner


# Train the biased dot-product model, tracking RMSE alongside the MSE loss.
bias_model = DotProductBias(n_users, n_games, n_factors)
learn = Learner(dls, bias_model, loss_func=F.mse_loss, metrics=rmse)
learn.fit_one_cycle(5, 5e-3)

#  Weight Decay

In [None]:
# Same model with 20 latent factors, trained with weight decay (wd=0.05).
n_factors = 20
bias_model = DotProductBias(n_users, n_games, n_factors)
learn = Learner(dls, bias_model, loss_func=F.mse_loss, metrics=rmse)
learn.fit_one_cycle(5, 5e-3, wd=0.05)

# Game recommendations

In [None]:
# Titles of the ten games with the lowest learned bias.
game_bias = learn.model.game_bias.weight.squeeze()
idxs = game_bias.argsort()[:10]
game_titles = dls.classes['game-title']
[game_titles[i] for i in idxs]

## highest bias:

In [None]:
# Titles of the five games with the highest learned bias.
idxs = game_bias.argsort(descending=True)[:5]
game_titles = dls.classes['game-title']
[game_titles[i] for i in idxs]

# Some information about time played by game-title

In [None]:
# Top 20 games by the longest single play time recorded.
max_hours_per_game = df.groupby('game-title')['time-played'].max()
max_hours_per_game.sort_values(ascending=False).head(20)

In [None]:
# Top 20 games by average play time.
mean_hours_per_game = df.groupby('game-title')['time-played'].mean()
mean_hours_per_game.sort_values(ascending=False).head(20)

# Game bias graphic

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# The 1000 titles with the most play records.
g = df.groupby('game-title')['time-played'].count()
top_games = g.sort_values(ascending=False).index.values[:1000]

# Map each title to its embedding row and pull out the learned factors.
title_to_idx = learn.dls.classes['game-title'].o2i
top_idxs = tensor([title_to_idx[title] for title in top_games])
game_w = learn.model.game_factors.weight[top_idxs].cpu().detach()

# Reduce the factors to three principal components.
game_pca = game_w.pca(3)
fac0, fac1, fac2 = game_pca.t()

# Scatter the first and third components for the ten most-played titles,
# labelling each point with its title in a random dark colour.
idxs = list(range(10))
X, Y = fac0[idxs], fac2[idxs]
plt.figure(figsize=(12, 12))
plt.scatter(X, Y)
for title, x, y in zip(top_games[idxs], X, Y):
    plt.text(x, y, title, color=np.random.rand(3) * 0.7, fontsize=11)
plt.show()


# Testando o modelo

In [None]:
# Independent copy holding only the model inputs (user and game).
new_df = df.loc[:, ['user-id', 'game-title']].copy()
new_df.head(10)

In [None]:
# Wrap new_df in a test DataLoader built from the learner's DataLoaders and
# run the trained model over it.
new_dl = learn.dls.test_dl(new_df)
predictions = learn.get_preds(dl=new_dl)

# Extract the predicted time-played values (first element of the result),
# dropping size-1 axes before converting to a plain Python list.
predicted_time_played = predictions[0].squeeze().tolist()

# Add the predicted values to the new_df DataFrame
new_df['time-played'] = predicted_time_played

new_df.head(10)

## Both DataFrames must have the same length (len(df)) and be sorted by (user-id, game-title) 

In [None]:
# Row counts of the original and the predicted frames; they must match.
total_rows = len(df)
(total_rows, len(new_df))

In [None]:
# Sort both frames in place by the same keys so their rows line up.
sort_keys = ['user-id', 'game-title']
for frame in (df, new_df):
    frame.sort_values(by=sort_keys, inplace=True)

# Sanity check: the first (user-id, game-title) pair must match in both frames.
df.iloc[0, :2] == new_df.iloc[0, :2]

In [None]:
def check_equal(original, predicted, tolerance=1):
    """Return whether ``predicted`` is within a relative tolerance of ``original``.

    The accepted interval is
    ``[original * (1 - tolerance), original * (1 + tolerance)]``, bounds
    included.

    Args:
        original: reference value (assumed non-negative; for a negative
            value the interval is empty and the result is always False).
        predicted: value to compare against ``original``.
        tolerance: relative half-width of the accepted interval, as a
            fraction. The default of 1 means +/-100%, i.e. anything from 0
            up to twice the original value is accepted.

    Returns:
        True when ``predicted`` lies inside the interval, False otherwise.
    """
    lower = original * (1 - tolerance)
    upper = original * (1 + tolerance)
    return lower <= predicted <= upper

#### First five hour values of each list

In [None]:
# First five values of the third column (actual hours played).
df.iloc[:5, 2].tolist()

In [None]:
# First five values of the third column (predicted hours played).
new_df.iloc[:5, 2].tolist()

#### Check

In [None]:
# Percentage of rows whose prediction check_equal accepts.
original_values = df.iloc[:, 2].tolist()
predicted_values = new_df.iloc[:, 2].tolist()
# Each True counts as 1, so summing the checks gives the number of hits.
right_predictions = sum(map(check_equal, original_values, predicted_values))
right_predictions / total_rows * 100

## Save the original and predicted data

In [None]:
# Write the actual and predicted play times to CSV, without the index column.
for frame, csv_name in ((df, 'original.csv'), (new_df, 'predicted.csv')):
    frame.to_csv(csv_name, index=False)