In [13]:
# Data wrangling
import numpy as np
import pandas as pd

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Get data
def get_steam_data(file_path:str) -> pd.DataFrame:
  """Load a header-less Steam interactions CSV into a DataFrame.

  Only the first four columns of the file are read (``usecols=range(4)``)
  and they are named ``user_id``, ``item_id``, ``behaviour`` and ``hours``.
  The first line of the file is treated as data, not as a header.

  Args:
    file_path: Path of the CSV file to read.

  Returns:
    The loaded DataFrame. If the file does not exist or cannot be parsed,
    a message is printed and an *empty* DataFrame carrying the same four
    columns is returned, so downstream code that selects those columns
    still works instead of failing on a missing column.
  """
  column_names = ['user_id', 'item_id', 'behaviour', 'hours']
  try:
    return pd.read_csv(file_path, header=None, names=column_names, usecols=range(4))
  except FileNotFoundError:
    print(f"File {file_path} not found.")
  except pd.errors.ParserError:
    print(f"Error parsing file {file_path}.")
  # Best-effort fallback: keep the schema so callers can treat "no data"
  # uniformly rather than special-casing a column-less frame.
  return pd.DataFrame(columns=column_names)
 
# Função para capturar avaliações implícitas
def get_ratings(df: pd.DataFrame) -> pd.DataFrame:
  """Turn raw play events into implicit per-user ratings.

  Only rows whose ``behaviour`` equals ``"play"`` are considered. For each
  (user, item) pair the rating is the hours the user played that item
  divided by the total hours the same user played across all items.

  Args:
    df: Frame with ``user_id``, ``item_id``, ``behaviour`` and ``hours``
      columns.

  Returns:
    A new frame with the columns ``user_id``, ``item_id`` and ``rating``.
  """
  # Keep just the play events and the columns needed for the aggregation.
  plays = df.query('behaviour == "play"')[['user_id', 'item_id', 'hours']]

  # Total hours played by each user.
  hours_per_user = (
      plays
      .groupby(['user_id'])['hours']
      .sum()
      .reset_index()
      .rename(columns={'hours': 'total_user_hours'})
  )

  # Hours played by each user on each individual item.
  hours_per_game = (
      plays
      .groupby(['user_id', 'item_id'])['hours']
      .sum()
      .reset_index()
  )

  # Attach every user's total to each of their (user, item) rows.
  merged = hours_per_game.merge(hours_per_user, on='user_id')

  # Share of the user's play time spent on the item (more played = higher rating).
  merged['rating'] = merged['hours'] / merged['total_user_hours']

  return merged.drop(columns=['hours', 'total_user_hours'])


#Classe genérica para recomendação
class ItemBasedRecommender:
  """Item-to-item recommender built on cosine similarity of item score vectors.

  The input is a long-format frame with one row per (user, item) score.
  ``fit`` pivots it into an item x user matrix and computes the pairwise
  cosine similarity between items; ``recommend`` then returns the items
  most similar to a given target item.

  Attributes set by ``fit`` (trailing underscore):
    item_sample_: List of retained items (only set when ``sample_size`` is given).
    scores_: Per-item aggregated score and score count, sorted by count.
    n_most_popular_: Index of the items with the most rows in the data.
    data_pivot_: Item x user score matrix with missing values filled with 0.
    sim_matrix_: Item x item cosine-similarity DataFrame.

  Note:
    ``fit(sample_size=...)`` overwrites ``self.data`` with the filtered
    rows, so a later ``fit`` call on the same instance only sees the items
    that were kept.
  """

  # NumPy reducers and the pandas aggregation name each one is translated to
  # before calling ``groupby.agg``. Passing the name instead of the callable
  # avoids the pandas deprecation warning about callables being replaced by
  # the built-in groupby implementation.
  _PANDAS_AGG_NAMES = {
      np.sum: 'sum',
      np.mean: 'mean',
      np.min: 'min',
      np.max: 'max',
      np.median: 'median',
      np.prod: 'prod',
  }

  def __init__(self, data, item_col, user_col, score_col, aggfunc=np.mean):
    """Store a private copy of the data and the column configuration.

    Args:
      data: Long-format DataFrame holding one score per (user, item) row.
      item_col: Name of the column identifying items.
      user_col: Name of the column identifying users.
      score_col: Name of the column holding the score.
      aggfunc: Aggregator used for the per-item score summary in
        ``scores_``; either a callable or a pandas aggregation name.
    """
    self.data = data.copy()  # copy so later filtering never touches the caller's frame
    self.item_col = item_col
    self.user_col = user_col
    self.score_col = score_col
    self.aggfunc = aggfunc

  def _resolve_aggfunc(self):
    """Return ``(column label, aggregator to hand to pandas)`` for ``self.aggfunc``."""
    if isinstance(self.aggfunc, str):
      return self.aggfunc, self.aggfunc
    label = getattr(self.aggfunc, '__name__', 'agg')
    try:
      return label, self._PANDAS_AGG_NAMES.get(self.aggfunc, self.aggfunc)
    except TypeError:
      # Unhashable callable: cannot be looked up, use it as given.
      return label, self.aggfunc

  def fit(self, sample_size=None, normalize=False, n_most_popular=10):
    """Compute the item-item similarity matrix and the popularity fallbacks.

    Args:
      sample_size: If given, keep only the ``sample_size`` items with the
        largest number of distinct users before fitting.
      normalize: If True, subtract each user's mean score from that user's
        scores before filling missing values with 0.
      n_most_popular: Number of most frequent items stored in
        ``n_most_popular_`` (returned by ``recommend`` for unknown items).

    Returns:
      The fitted recommender itself, to allow call chaining.
    """
    if sample_size is not None:
      # Rank items by how many distinct users interacted with them.
      self.item_sample_ = (
          self.data.groupby(self.item_col)[self.user_col]
          .nunique()
          .sort_values(ascending=False)
          .head(sample_size)
          .index.tolist()
      )
      self.data = self.data[self.data[self.item_col].isin(self.item_sample_)]

    # Per-item summary. The count is taken on the configured score column
    # (not a hard-coded 'rating') so any score column name works.
    agg_label, agg_arg = self._resolve_aggfunc()
    count_col = f'{self.score_col}_count'
    self.scores_ = (
        self.data.groupby(self.item_col)
        .agg(**{
            f'{self.score_col}_{agg_label}': (self.score_col, agg_arg),
            count_col: (self.score_col, 'count'),
        })
        .sort_values(count_col, ascending=False)
    )

    # Items with the most rows in the data; used as the fallback answer.
    self.n_most_popular_ = self.data[self.item_col].value_counts().nlargest(n_most_popular).index

    # Item x user matrix; requires at most one row per (item, user) pair.
    self.data_pivot_ = self.data.pivot(index=self.item_col, columns=self.user_col, values=self.score_col)

    if normalize:
      # Center every user's column on that user's mean score.
      avg_ratings = self.data_pivot_.mean(axis=0)
      self.data_pivot_ = self.data_pivot_.sub(avg_ratings, axis=1).fillna(0)
    else:
      self.data_pivot_ = self.data_pivot_.fillna(0)

    # Pairwise cosine similarity between item rows, labelled by item on both axes.
    self.sim_matrix_ = cosine_similarity(self.data_pivot_)
    self.sim_matrix_ = pd.DataFrame(self.sim_matrix_, index=self.data_pivot_.index, columns=self.data_pivot_.index)
    return self

  def recommend(self, target_item, max_recommendations=None):
    """Return the items most similar to ``target_item``.

    Args:
      target_item: Item label to look up in the similarity matrix.
      max_recommendations: Maximum number of items to return; ``None``
        returns every other item.

    Returns:
      A Series of similarity scores indexed by item, sorted descending and
      excluding ``target_item`` itself. If ``target_item`` is not in the
      similarity matrix, a message is printed and the ``n_most_popular_``
      Index is returned instead.
    """
    try:
      similarities = self.sim_matrix_.loc[target_item]
    except KeyError:
      # Report the real size of the fallback list instead of a fixed "10".
      print(f'\033[1m{target_item}\033[0;0m is not included in the recommendation matrix. Returning top {len(self.n_most_popular_)} items:\n')
      return self.n_most_popular_
    return similarities.drop(target_item).sort_values(ascending=False).head(max_recommendations)

  def fit_recommend(self, target_item, max_recommendations=None, **fit_params):
    """Fit the model and immediately recommend for ``target_item``.

    Args:
      target_item: Item label to get recommendations for.
      max_recommendations: Forwarded to ``recommend``.
      **fit_params: Keyword arguments forwarded to ``fit``
        (``sample_size``, ``normalize``, ``n_most_popular``).

    Returns:
      Whatever ``recommend`` returns for ``target_item``.
    """
    return self.fit(**fit_params).recommend(target_item, max_recommendations)


In [15]:
# Load the Steam interactions CSV into a DataFrame and display it
df = get_steam_data('../data/steam-200k.csv')
df


Unnamed: 0,user_id,item_id,behaviour,hours
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0
...,...,...,...,...
199995,128470551,Titan Souls,play,1.5
199996,128470551,Grand Theft Auto Vice City,purchase,1.0
199997,128470551,Grand Theft Auto Vice City,play,1.5
199998,128470551,RUSH,purchase,1.0


In [16]:
# Derive the implicit ratings (share of each user's play time per game) and display them
df_ratings = get_ratings(df)
df_ratings

Unnamed: 0,user_id,item_id,rating
0,5250,Alien Swarm,0.021729
1,5250,Cities Skylines,0.638581
2,5250,Deus Ex Human Revolution,0.274945
3,5250,Dota 2,0.000887
4,5250,Portal 2,0.060310
...,...,...,...
70472,309434439,Dota 2,1.000000
70473,309554670,Mitos.is The Game,1.000000
70474,309626088,Age of Empires II HD Edition,1.000000
70475,309824202,Dota 2,1.000000


In [17]:
# Instantiate the recommender on the ratings frame
recommender = ItemBasedRecommender(
    data=df_ratings,
    item_col='item_id',
    user_col='user_id',
    score_col='rating',
    aggfunc=np.sum  # per-item summary in scores_ is the sum of the item's ratings
)

In [18]:
# Print the column names the recommender was configured with
print('Coluna que identifica os itens:', recommender.item_col)
print('Coluna que identifica os usuários:', recommender.user_col)
print('Coluna que identifica as avaliações:', recommender.score_col)

Coluna que identifica os itens: item_id
Coluna que identifica os usuários: user_id
Coluna que identifica as avaliações: rating


In [19]:
# Fit with the default settings (no item sampling, no normalization, 10 popular items)
recommender.fit()

  self.scores_ = self.data.groupby(self.item_col).agg(**{     #agrupar por item_col calculando a média das avaliações por item / agg = agrupar varias funções de agg em mais de uma coluna.


<__main__.ItemBasedRecommender at 0x13e4d9b66d0>

In [20]:
# Most popular items; this is what recommend() returns for an unknown target item
recommender.n_most_popular_

Index(['Dota 2', 'Team Fortress 2', 'Counter-Strike Global Offensive',
       'Unturned', 'Left 4 Dead 2', 'Counter-Strike Source',
       'The Elder Scrolls V Skyrim', 'Garry's Mod', 'Counter-Strike',
       'Sid Meier's Civilization V'],
      dtype='object', name='item_id')

In [21]:
# Per-item aggregated rating and number of ratings, sorted by the number of ratings
recommender.scores_

Unnamed: 0_level_0,rating_sum,rating_count
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Dota 2,3904.304091,4841
Team Fortress 2,990.560459,2323
Counter-Strike Global Offensive,541.811807,1377
Unturned,321.379284,1069
Left 4 Dead 2,103.398419,801
...,...,...
Starion Tactics,0.000786,1
Gateways,0.000081,1
Community College Hero Trial by Fire,0.005879,1
Starscape,0.001837,1


In [22]:
# Item x user rating matrix (missing ratings filled with 0) that feeds the similarity computation
recommender.data_pivot_

user_id,5250,76767,86540,144736,181212,229911,298950,381543,547685,554278,...,309228590,309255941,309262440,309265377,309404240,309434439,309554670,309626088,309824202,309903146
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Second Ninja,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rymdkapsel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00039,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Item x item cosine-similarity matrix
recommender.sim_matrix_

item_id,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,1.0,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
0RBITALIS,0.0,1.000000,0.000000e+00,0.077751,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,2.367356e-05,0.000000
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),0.0,0.000000,1.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,2.948747e-07,0.000000e+00,0.000000e+00,0.000000
10 Second Ninja,0.0,0.077751,0.000000e+00,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
10000000,0.0,0.000000,0.000000e+00,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rymdkapsel,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,1.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
sZone-Online,0.0,0.000000,2.948747e-07,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.000000e+00,9.935406e-06,4.747041e-05,0.000000
the static speaks my name,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,9.935406e-06,1.000000e+00,6.734307e-07,0.000000
theHunter,0.0,0.000024,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000005,0.0,4.747041e-05,6.734307e-07,1.000000e+00,0.000138


In [24]:
# Example: the 10 items most similar to 'Batman Arkham City'
recommender.recommend('Batman Arkham City', 10)

item_id
Star Wars The Clone Wars Republic Heroes    0.708706
Chessmaster                                 0.708706
EDGE                                        0.701947
Doctor Who The Eternity Clock               0.373133
The Sims(TM) Medieval                       0.265893
Magic The Gathering  Tactics                0.261532
Crazy Taxi                                  0.239978
Dragon The Game                             0.200712
Buccaneer The Pursuit of Infamy             0.190686
Blood Bowl Dark Elves Edition               0.190686
Name: Batman Arkham City, dtype: float64