# Ambiente

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [48]:
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100

# Dados

In [49]:
# df_steam_user = pd.read_csv(
#     '../../data/raw/steam-200k.csv',
#     sep=',',
#     header=None,
#     names=['USER_ID', 'GAME_TITLE', 'BEHAVIOR_NAME', 'VALUE', '0']
# )

df_steam_games_raw = pd.read_csv(
    '../../data/raw/games.csv',
    usecols=['Name', 'Release date', 'Required age', 'Price', 'Windows', 'Genres', 'Tags', 'Positive', 'Negative']
)

# Funções

In [50]:
def cria_identificadores_genero(df:pd.DataFrame, col:str, sep:str = ',') -> pd.DataFrame:
    """
    Função que cria identificadores de marcadores
    para jogos a partir de colunas contendo strings separadas por um delimitador comum.
    """
    df = df.copy()

    generos = np.unique(
        df[col]
        .drop_duplicates()
        .str.cat(sep=sep)
        .split(sep)
    )

    for genero in generos:
        df[f'{genero}'.upper().replace(' ','_')] = (
            df[col].str.contains(genero, regex=True)
        )

    return df

In [51]:
def remove_outlier(df:pd.DataFrame, col:str) -> pd.DataFrame:
    """
    Função que remove outliers de um DataFrame baseado no método IQR
    """
    df = df.copy()

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    df = df[
        (df[col] >= q1 - 1.5 * iqr)
        & (df[col] <= q3 + 1.5 * iqr)
    ].copy()

    return df

In [52]:
def preprocessa_steam_games(df_steam_games: pd.DataFrame) -> pd.DataFrame:
    """
    Função que realiza o pré-processamento do dataset de jogos da Steam
    """

    # Plataforma
    df_steam_games = df_steam_games[df_steam_games['Windows'] == True].copy()
    
    # Novas Features
    df_steam_games['PriceRange'] = pd.cut(df_steam_games['Price'].round(0), bins=20)
    df_steam_games['Over18'] = df_steam_games['Required age'] >= 18
    df_steam_games['Year'] = df_steam_games['Release date'].str[-4:].astype(int)
    
    # NPS
    df_steam_games['Reviews'] = df_steam_games['Positive'] + df_steam_games['Negative']
    df_steam_games['NPS'] = (
        (df_steam_games['Positive'] / df_steam_games['Reviews'])
        - (df_steam_games['Negative'] / df_steam_games['Reviews'])
    ).fillna(0)

    # Ajuste para jogos com poucas reviews
    df_steam_games['NPS'] = np.where(
        df_steam_games['NPS'] == 1,
        0,
        df_steam_games['NPS']
    )

    # Dados duplicados
    df_steam_games = (
        df_steam_games
        .sort_values('Reviews', ascending=False)
        .drop_duplicates('Name', keep='first')
    )

    # Marcadores
    df_steam_games['Tags'] = np.where(df_steam_games['Tags'].isna(), df_steam_games['Genres'], df_steam_games['Tags'])
    df_steam_games = df_steam_games.dropna(subset=['Tags', 'Name'])
    df_steam_games = cria_identificadores_genero(df_steam_games, 'Tags')

    # Transforma booleano em numérico
    bool_cols = df_steam_games.select_dtypes('bool').columns
    df_steam_games[bool_cols] = df_steam_games[bool_cols].astype('int64')

    df_steam_games = df_steam_games.drop(
        ['Genres', 'Tags', 'Required age', 'Windows', 'Price', 'Reviews',
         'Release date', 'Positive', 'Negative'], axis=1)
    return df_steam_games

# Preprocessamento

## Steam Games

In [53]:
df = preprocessa_steam_games(df_steam_games_raw)

  df[col].str.contains(genero, regex=True)
  df[col].str.contains(genero, regex=True)


In [149]:
registros_categoria = df.drop(['Name', 'PriceRange', 'Over18', 'Year', 'NPS'], axis=1).sum()
top_categorias = registros_categoria[registros_categoria > 6000].sort_values(ascending=False).index.to_list()

In [150]:
len(top_categorias)

33

In [151]:
df_games = df[['Name', 'NPS'] + top_categorias].reset_index(drop=True)

## Steam Users

- Tentativa de usar as horas jogadas para ordenar recomendações
- A base de dados não possui todos os jogos necessários para tal

In [152]:
# df_steam_user = df_steam_user[df_steam_user['BEHAVIOR_NAME'] == 'play']
# df_steam_user = remove_outlier(df_steam_user, 'VALUE')

# df_playtime = df_steam_user.groupby('GAME_TITLE', as_index=False)['VALUE'].mean()
# df_playtime = df_playtime.rename(columns={'GAME_TITLE': 'Name', 'VALUE': 'Playtime'})

# Avaliando Similaridade

## KNN + Distância por cossenos

In [153]:
from sklearn.neighbors import NearestNeighbors

In [154]:
neigh = NearestNeighbors(n_neighbors=11, metric='cosine')
neigh.fit(df_games.drop(['Name', 'NPS'], axis=1));

In [166]:
# df_games[df_games['Name'].str.contains('Battlegr')]

In [157]:
lista_jogos = ['Realm Royale', 'Paladins®', 'Overcooked', 'Among Us', 'Counter-Strike: Source', 'Hollow Knight']
jogo_avaliado = 'Hollow Knight'

jogo = df_games[df_games['Name'] == jogo_avaliado].drop(['Name', 'NPS'], axis=1)

neighbors = neigh.kneighbors(jogo)
similaridade = neighbors[0][0]
index = neighbors[1][0]

df_similaridade = pd.DataFrame({
    'Name': df_games.iloc[index]['Name'],
    'Similarity': 1 - similaridade
})

df_similaridade = df_similaridade[df_similaridade['Name'] != jogo_avaliado]

In [158]:
df_similaridade.merge(df_games[['Name', 'NPS']], on='Name').sort_values('NPS', ascending=False)

Unnamed: 0,Name,Similarity,NPS
4,Ori and the Will of the Wisps,0.916667,0.964767
2,NO Logic,0.92582,0.952381
3,WarriOrb: Prologue,0.92582,0.887139
0,Unbound: Worlds Apart,1.0,0.886256
1,WarriOrb,0.960769,0.824324
6,Tartapolis,0.912871,0.823529
8,Timothy and the Tower of Mu,0.880705,0.714286
7,Cantirium: God Slayer,0.894427,0.6
9,Lightform,0.880705,0.259259
5,Mage,0.916667,0.0


In [173]:
df_steam_games_raw[df_steam_games_raw['Name'] == "Tom Clancy's Rainbow Six® Siege"]#.Tags.unique()

Unnamed: 0,Name,Release date,Required age,Price,Windows,Positive,Negative,Genres,Tags
2904,Tom Clancy's Rainbow Six® Siege,"Dec 1, 2015",17,19.99,True,312232,64137,Action,"FPS,Multiplayer,Tactical,Shooter,Team-Based,Ac..."
4287,Tom Clancy's Rainbow Six® Siege,"Dec 1, 2015",17,19.99,True,312816,64201,Action,"FPS,Multiplayer,Tactical,Shooter,Team-Based,Ac..."
8256,Tom Clancy's Rainbow Six® Siege,"Dec 1, 2015",17,19.99,True,929372,138530,Action,"FPS,Hero Shooter,Multiplayer,Tactical,Shooter,..."
21190,Tom Clancy's Rainbow Six® Siege,"Dec 1, 2015",17,19.99,True,312719,64188,Action,"FPS,Multiplayer,Tactical,Shooter,Team-Based,Ac..."
38967,Tom Clancy's Rainbow Six® Siege,"Dec 1, 2015",17,19.99,True,312397,64151,Action,"FPS,Multiplayer,Tactical,Shooter,Team-Based,Ac..."
