# Ambiente

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [8]:
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100

# Dados

In [9]:
# df_steam_user = pd.read_csv(
#     '../../data/raw/steam-200k.csv',
#     sep=',',
#     header=None,
#     names=['USER_ID', 'GAME_TITLE', 'BEHAVIOR_NAME', 'VALUE', '0']
# )

df_steam_games_raw = pd.read_csv(
    '../../data/raw/games.csv',
    usecols=['Name', 'Release date', 'Required age', 'Price', 'Windows', 'Genres', 'Tags', 'Positive', 'Negative']
)

In [10]:
df_steam_games_raw.Tags[0]

'Indie,Casual,Sports,Bowling'

# Funções

In [11]:
def cria_identificadores_genero(df:pd.DataFrame, col:str, sep:str = ',') -> pd.DataFrame:
    """
    Função que cria identificadores de marcadores
    para jogos a partir de colunas contendo strings separadas por um delimitador comum.
    """
    df = df.copy()

    generos = np.unique(
        df[col]
        .drop_duplicates()
        .str.cat(sep=sep)
        .split(sep)
    )

    for genero in generos:
        df[f'{genero}'.upper().replace(' ','_')] = (
            df[col].str.contains(genero, regex=True)
        )

    return df

In [12]:
def remove_outlier(df:pd.DataFrame, col:str) -> pd.DataFrame:
    """
    Função que remove outliers de um DataFrame baseado no método IQR
    """
    df = df.copy()

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    df = df[
        (df[col] >= q1 - 1.5 * iqr)
        & (df[col] <= q3 + 1.5 * iqr)
    ].copy()

    return df

In [13]:
def preprocessa_steam_games(df_steam_games: pd.DataFrame) -> pd.DataFrame:
    """
    Função que realiza o pré-processamento do dataset de jogos da Steam
    """

    # Plataforma
    df_steam_games = df_steam_games[df_steam_games['Windows'] == True].copy()
    
    # Novas Features
    df_steam_games['PriceRange'] = pd.cut(df_steam_games['Price'].round(0), bins=20)
    df_steam_games['Over18'] = df_steam_games['Required age'] >= 18
    df_steam_games['Year'] = df_steam_games['Release date'].str[-4:].astype(int)
    
    # NPS
    df_steam_games['Reviews'] = df_steam_games['Positive'] + df_steam_games['Negative']
    df_steam_games['NPS'] = (
        (df_steam_games['Positive'] / df_steam_games['Reviews'])
        - (df_steam_games['Negative'] / df_steam_games['Reviews'])
    ).fillna(0)

    # Ajuste para jogos com poucas reviews
    df_steam_games['NPS'] = np.where(
        df_steam_games['NPS'] == 1,
        0,
        df_steam_games['NPS']
    )

    # Dados duplicados
    df_steam_games = (
        df_steam_games
        .sort_values('Reviews', ascending=False)
        .drop_duplicates('Name', keep='first')
    )

    # Marcadores
    df_steam_games['Tags'] = np.where(df_steam_games['Tags'].isna(), df_steam_games['Genres'], df_steam_games['Tags'])
    df_steam_games = df_steam_games.dropna(subset=['Tags', 'Name'])
    df_steam_games = cria_identificadores_genero(df_steam_games, 'Tags')

    # Transforma booleano em numérico
    bool_cols = df_steam_games.select_dtypes('bool').columns
    df_steam_games[bool_cols] = df_steam_games[bool_cols].astype('int64')

    df_steam_games = df_steam_games.drop(
        ['Genres', 'Tags', 'Required age', 'Windows', 'Price', 'Reviews',
         'Release date', 'Positive', 'Negative'], axis=1)
    return df_steam_games

# Preprocessamento

## Steam Games

In [14]:
df = preprocessa_steam_games(df_steam_games_raw)

  df[col].str.contains(genero, regex=True)
  df[col].str.contains(genero, regex=True)


In [15]:
registros_categoria = df.drop(['Name', 'PriceRange', 'Over18', 'Year', 'NPS'], axis=1).sum()
top_categorias = registros_categoria[registros_categoria > 6000].sort_values(ascending=False).index.to_list()

In [16]:
len(top_categorias)

32

In [17]:
df_games = df[['Name', 'NPS'] + top_categorias].reset_index(drop=True)

## Steam Users

- Tentativa de usar as horas jogadas para ordenar recomendações
- A base de dados não possui todos os jogos necessários para tal

In [18]:
# df_steam_user = df_steam_user[df_steam_user['BEHAVIOR_NAME'] == 'play']
# df_steam_user = remove_outlier(df_steam_user, 'VALUE')

# df_playtime = df_steam_user.groupby('GAME_TITLE', as_index=False)['VALUE'].mean()
# df_playtime = df_playtime.rename(columns={'GAME_TITLE': 'Name', 'VALUE': 'Playtime'})

# Avaliando Similaridade

## KNN + Distância por cossenos

In [19]:
from sklearn.neighbors import NearestNeighbors

In [20]:
neigh = NearestNeighbors(n_neighbors=11, metric='cosine')
neigh.fit(df_games.drop(['Name', 'NPS'], axis=1));

In [21]:
# df_games[df_games['Name'].str.contains('Battlegr')]

In [29]:
lista_jogos = ['Realm Royale', 'Paladins®', 'Overcooked', 'Among Us', 'Counter-Strike: Source', 'Hollow Knight', "Baldur's Gate II: Enhanced Edition"]
jogo_avaliado = "Paladins®"

jogo = df_games[df_games['Name'] == jogo_avaliado].drop(['Name', 'NPS'], axis=1)

neighbors = neigh.kneighbors(jogo)
similaridade = neighbors[0][0]
index = neighbors[1][0]

df_similaridade = pd.DataFrame({
    'Name': df_games.iloc[index]['Name'],
    'Similarity': 1 - similaridade
})

df_similaridade = df_similaridade[df_similaridade['Name'] != jogo_avaliado]

In [30]:
df_similaridade.merge(df_games[['Name', 'NPS']], on='Name').sort_values('NPS', ascending=False)

Unnamed: 0,Name,Similarity,NPS
3,Block N Load,0.797724,0.566431
1,Black Squad,0.80403,0.527572
0,Gigantic,0.80403,0.52028
8,Retail Royale,0.76277,0.476804
6,Warface,0.797724,0.355173
2,Until Last Tomorrow,0.797724,0.333333
9,Dissolution,0.76277,0.145946
5,Z1 Battle Royale,0.797724,0.108814
4,Spacelords,0.797724,0.042675
7,Garlock Online,0.76277,-0.484375


In [24]:
# df_steam_games_raw[df_steam_games_raw['Name'] == "Tom Clancy's Rainbow Six® Siege"].Tags.unique()