# 1. Ambiente

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100

# 2. Dados

In [3]:
# df_steam_user = pd.read_csv(
#     '../../data/raw/steam-200k.csv',
#     sep=',',
#     header=None,
#     names=['USER_ID', 'GAME_TITLE', 'BEHAVIOR_NAME', 'VALUE', '0']
# )

df_steam_games_raw = pd.read_csv(
    '../../data/raw/games.csv',
    usecols=['Name', 'Release date', 'Required age', 'Price', 'Windows', 'Genres', 'Tags', 'Positive', 'Negative']
)

In [4]:
df_steam_games_raw.Tags[0]

'Indie,Casual,Sports,Bowling'

# 3. Funções

In [5]:
def cria_identificadores_genero(df:pd.DataFrame, col:str, sep:str = ',') -> pd.DataFrame:
    """
    Função que cria identificadores de marcadores
    para jogos a partir de colunas contendo strings separadas por um delimitador comum.
    """
    df = df.copy()

    generos = np.unique(
        df[col]
        .drop_duplicates()
        .str.cat(sep=sep)
        .split(sep)
    )

    for genero in generos:
        df[f'{genero}'.upper().replace(' ','_')] = (
            df[col].str.contains(genero, regex=True)
        )

    return df

In [6]:
def remove_outlier(df:pd.DataFrame, col:str) -> pd.DataFrame:
    """
    Função que remove outliers de um DataFrame baseado no método IQR
    """
    df = df.copy()

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    df = df[
        (df[col] >= q1 - 1.5 * iqr)
        & (df[col] <= q3 + 1.5 * iqr)
    ].copy()

    return df

In [7]:
def preprocessa_steam_games(df_steam_games: pd.DataFrame) -> pd.DataFrame:
    """
    Função que realiza o pré-processamento do dataset de jogos da Steam
    """

    # Plataforma
    df_steam_games = df_steam_games[df_steam_games['Windows'] == True].copy()
    
    # Novas Features
    df_steam_games['PriceRange'] = pd.cut(df_steam_games['Price'].round(0), bins=20)
    df_steam_games['Over18'] = df_steam_games['Required age'] >= 18
    df_steam_games['Year'] = df_steam_games['Release date'].str[-4:].astype(int)
    
    # NPS
    df_steam_games['Reviews'] = df_steam_games['Positive'] + df_steam_games['Negative']
    df_steam_games['NPS'] = (
        (df_steam_games['Positive'] / df_steam_games['Reviews'])
        - (df_steam_games['Negative'] / df_steam_games['Reviews'])
    ).fillna(0)

    # Ajuste para jogos com poucas reviews
    df_steam_games['NPS'] = np.where(
        (df_steam_games['NPS'] == 1) | (df_steam_games['Reviews'] == 1000),
        0,
        df_steam_games['NPS']
    )

    # Dados duplicados
    df_steam_games = (
        df_steam_games
        .sort_values('Reviews', ascending=False)
        .drop_duplicates('Name', keep='first')
    )

    # Marcadores
    df_steam_games['Tags'] = np.where(df_steam_games['Tags'].isna(), df_steam_games['Genres'], df_steam_games['Tags'])
    df_steam_games = df_steam_games.dropna(subset=['Tags', 'Name'])
    df_steam_games = cria_identificadores_genero(df_steam_games, 'Tags')

    # Transforma booleano em numérico
    bool_cols = df_steam_games.select_dtypes('bool').columns
    df_steam_games[bool_cols] = df_steam_games[bool_cols].astype('int64')

    df_steam_games = df_steam_games.drop(
        ['Genres', 'Tags', 'Required age', 'Windows', 'Price', 'Reviews',
         'Release date', 'Positive', 'Negative'], axis=1)
    return df_steam_games

# 4. Preprocessamento

## 4.1. Steam Games

In [8]:
df = preprocessa_steam_games(df_steam_games_raw)

  df[col].str.contains(genero, regex=True)
  df[col].str.contains(genero, regex=True)


In [9]:
registros_categoria = df.drop(['Name', 'PriceRange', 'Over18', 'Year', 'NPS'], axis=1).sum()
top_categorias = registros_categoria[registros_categoria > 6000].sort_values(ascending=False).index.to_list()

In [10]:
len(top_categorias)

32

In [11]:
df_games = df[['Name', 'NPS'] + top_categorias].reset_index(drop=True)

## 4.2. Steam Users

- Tentativa de usar as horas jogadas para ordenar recomendações
- A base de dados não possui todos os jogos necessários para tal

In [12]:
# df_steam_user = df_steam_user[df_steam_user['BEHAVIOR_NAME'] == 'play']
# df_steam_user = remove_outlier(df_steam_user, 'VALUE')

# df_playtime = df_steam_user.groupby('GAME_TITLE', as_index=False)['VALUE'].mean()
# df_playtime = df_playtime.rename(columns={'GAME_TITLE': 'Name', 'VALUE': 'Playtime'})

# 5. Modelo de Recomendação

## 5.1. KNN com distância por cossenos

In [13]:
from sklearn.neighbors import NearestNeighbors

In [14]:
neigh = NearestNeighbors(n_neighbors=6, metric='cosine')
neigh.fit(df_games.drop(['Name', 'NPS'], axis=1));

## 5.2. Resultados Preliminares - POC

In [15]:
# df_games[df_games['Name'].str.contains('Battlegr')]

In [16]:
lista_jogos = ['Realm Royale', 'Paladins®', 'Overcooked', 'Among Us', 'Counter-Strike: Source', 'Hollow Knight', "Baldur's Gate II: Enhanced Edition"]
jogo_avaliado = "Hollow Knight"

jogo = df_games[df_games['Name'] == jogo_avaliado].drop(['Name', 'NPS'], axis=1)

neighbors = neigh.kneighbors(jogo)
similaridade = neighbors[0][0]
index = neighbors[1][0]

df_similaridade = pd.DataFrame({
    'Name': df_games.iloc[index]['Name'],
    'Similarity': 1 - similaridade
})

df_similaridade = df_similaridade[df_similaridade['Name'] != jogo_avaliado]

In [17]:
df_similaridade.merge(df_games[['Name', 'NPS']], on='Name').sort_values('NPS', ascending=False)

Unnamed: 0,Name,Similarity,NPS
4,Ori and the Will of the Wisps,0.916667,0.929534
2,NO Logic,0.92582,0.904762
3,WarriOrb: Prologue,0.92582,0.774278
0,Unbound: Worlds Apart,1.0,0.772512
1,WarriOrb,0.960769,0.648649


In [18]:
# df_steam_games_raw[df_steam_games_raw['Name'] == "Tom Clancy's Rainbow Six® Siege"].Tags.unique()

  ## 5.3. Desempenho do modelo

In [19]:
knn_results = neigh.kneighbors(df_games.drop(['Name', 'NPS'], axis=1))

In [20]:
# Separando resultados
similaridade = 1 - knn_results[0]
semelhantes = knn_results[1]

In [51]:
# Transformando resultados em DataFrame e removendo coluna de id do próprio jogo
df_semelhantes = pd.DataFrame(semelhantes, columns=[f'id_semelhante_{i}' for i in range(6)]).drop('id_semelhante_0', axis=1)
df_semelhantes = df_semelhantes.join(df_games[['Name']])

# Concatenando recomendações
df_recomendacoes = pd.DataFrame()
for i in range(1,6):
    _df_semelhantes = df_semelhantes[['Name', f'id_semelhante_{i}']].merge(df_games.drop(['Name', 'NPS'], axis=1), left_on=f'id_semelhante_{i}', right_index=True, how='left')
    _df_semelhantes = _df_semelhantes.drop([f'id_semelhante_{i}'], axis=1)

    df_recomendacoes = pd.concat([df_recomendacoes, _df_semelhantes])

df_recomendacoes = df_recomendacoes.merge(df_games, on='Name', how='left', suffixes=('_recom', ''))

In [58]:
# Calculando métricas para recomendações (seguindo a lógica de um problema de classificação binária)
cols = [c for c in df_games.columns if c not in ['Name', 'NPS']]

tp_cols = []
tn_cols = []
fp_cols = []
fn_cols = []
for col in cols:
    tp_col = f'tp_{col}'
    tn_col = f'tn_{col}'
    fp_col = f'fp_{col}'
    fn_col = f'fn_{col}'

    tp_cols.append(tp_col)
    tn_cols.append(tn_col)
    fp_cols.append(fp_col)
    fn_cols.append(fn_col)

    df_recomendacoes[tp_col] = (df_recomendacoes[col] == 1) & (df_recomendacoes[f'{col}_recom'] == 1)
    df_recomendacoes[tn_col] = (df_recomendacoes[col] == 0) & (df_recomendacoes[f'{col}_recom'] == 0)
    df_recomendacoes[fp_col] = (df_recomendacoes[col] == 0) & (df_recomendacoes[f'{col}_recom'] == 1)
    df_recomendacoes[fn_col] = (df_recomendacoes[col] == 1) & (df_recomendacoes[f'{col}_recom'] == 0)

df_recomendacoes['TP'] = df_recomendacoes[tp_cols].sum(axis=1)
df_recomendacoes['TN'] = df_recomendacoes[tn_cols].sum(axis=1)
df_recomendacoes['FP'] = df_recomendacoes[fp_cols].sum(axis=1)
df_recomendacoes['FN'] = df_recomendacoes[fn_cols].sum(axis=1)

In [59]:
df_recomendacoes = df_recomendacoes.groupby('Name', as_index=False)[['TP', 'TN', 'FP', 'FN']].sum()

df_recomendacoes['Precision'] = df_recomendacoes['TP'] / (df_recomendacoes['TP'] + df_recomendacoes['FP'])
df_recomendacoes['Recall'] = df_recomendacoes['TP'] / (df_recomendacoes['TP'] + df_recomendacoes['FN'])
df_recomendacoes['F1'] = 2 * (
    df_recomendacoes['Precision'] * df_recomendacoes['Recall']) / (df_recomendacoes['Precision'] + df_recomendacoes['Recall']
)

In [64]:
print(f'Precision: {df_recomendacoes["Precision"].mean():.2%}\nRecall: {df_recomendacoes["Recall"].mean():.2%}\nF1: {df_recomendacoes["F1"].mean():.2%}')

Precision: 92.85%
Recall: 94.78%
F1: 94.22%
