# Recomendación Juego


In [60]:
import pandas as pd
import os

path_steam_games = os.path.join('..','data','clear','steam_games.csv.gz')


## Carga de data

In [61]:
## Cargamos la data necesaria para solucionar este problema.

steam_colums = ['id','app_name','publisher','genres','price','developer','release_year']

steam_games = pd.read_csv(path_steam_games,usecols=steam_colums)

## renombraremos la columna id por steam_id
steam_games.rename(columns={'id':'steam_id'}, inplace=True)

## Seleccion de las caracteristicas más importantes de un juego

In [62]:
from collections import Counter
from itertools import chain

def select_n_best(caracteristica,n=10):
    
    # Dividir las palabras en cada fila y contar la frecuencia de cada palabra
    conteo_palabras = Counter(chain.from_iterable(caracteristica.str.split(', ')))

    # Obtener las n palabras más comunes
    palabras_mas_comunes = conteo_palabras.most_common(n)

    print("Palabras más comunes:")
    for palabra, frecuencia in palabras_mas_comunes:
        print(f"{palabra}: {frecuencia}")
    
    return([palabras_mas_comunes[i][0] for i in range(n)])

In [63]:
len(steam_games['steam_id'].unique())

29975

In [64]:
## Vamos a extraer en una lista las n palabras más importantes

## Tomamos los 100 generos mas significativos
palabras_mas_relevantes = select_n_best(steam_games['genres'],40)

## convertir lista en texto
text_palabras = (', ').join(palabras_mas_relevantes)

## Eliminaremos de cada registro los no significativos.
def drop_not_important(genres):
  return(', ').join([palabra for palabra in genres.split(', ') if palabra.capitalize() in palabras_mas_relevantes])


steam_games['genres'] = steam_games['genres'].apply(drop_not_important)


# Concatenaremos las columnas que utilizaremos para realizar el analisis de recomendacion
cols_concatenate = ['app_name','developer','genres']
steam_games['features'] = steam_games[cols_concatenate].apply(lambda row: ', '.join(row), axis=1)

steam_games['features']


Palabras más comunes:
Indie: 16783
Action: 12320
Casual: 9471
Adventure: 9344
Simulation: 7508
Strategy: 7289
RPG: 5552
Singleplayer: 4268
Multiplayer: 2283
Great Soundtrack: 2185
Free to Play: 2183
Puzzle: 2075
2D: 1915
Atmospheric: 1901
Early Access: 1839
VR: 1707
Sports: 1530
Platformer: 1420
Story Rich: 1417
Sci-fi: 1354
Fantasy: 1338
Difficult: 1289
Open World: 1273
Horror: 1269
Pixel Graphics: 1218
Racing: 1188
Shooter: 1178
Co-op: 1174
Massively Multiplayer: 1168
Female Protagonist: 1153
Anime: 1138
Funny: 1089
First-Person: 1037
Arcade: 958
Retro: 943
Sandbox: 937
Turn-Based: 932
FPS: 928
Comedy: 873
Point & Click: 855


0        Lost summoner kitty, Kotoshiro, Strategy, Acti...
1        Ironbound, Secret level srl, Strategy, Indie, ...
2        Real pool 3d - poolians, Poolians.com, Simulat...
3                 弹炸人2222, 彼岸领域, Action, Adventure, Casual
4        Log challenge, Otros, Action, Indie, Casual, S...
                               ...                        
29970    Colony on mars, Nikita "ghost_rus", Strategy, ...
29971    Logistical: south africa, Sacada, Strategy, In...
29972    Russian roads, Laush dmitriy sergeevich, Indie...
29973    Exit 2 - directions, Xropi,stev3ns, Indie, Cas...
29974    Maze run vr, Otros, Adventure, Indie, Action, ...
Name: features, Length: 29975, dtype: object

In [65]:
steam_games.head(10)

Unnamed: 0,publisher,genres,app_name,price,steam_id,developer,release_year,features
0,Kotoshiro,"Strategy, Action, Indie, Casual, Simulation",Lost summoner kitty,4.99,761140.0,Kotoshiro,2018,"Lost summoner kitty, Kotoshiro, Strategy, Acti..."
1,"Making fun, inc.","Strategy, Indie, Fantasy, Difficult",Ironbound,0.0,643980.0,Secret level srl,2018,"Ironbound, Secret level srl, Strategy, Indie, ..."
2,Poolians.com,"Simulation, Sports, Casual, Indie, Multiplayer",Real pool 3d - poolians,0.0,670290.0,Poolians.com,2017,"Real pool 3d - poolians, Poolians.com, Simulat..."
3,彼岸领域,"Action, Adventure, Casual",弹炸人2222,0.99,767400.0,彼岸领域,2017,"弹炸人2222, 彼岸领域, Action, Adventure, Casual"
4,Otros,"Action, Indie, Casual, Sports",Log challenge,2.99,773570.0,Otros,2017,"Log challenge, Otros, Action, Indie, Casual, S..."
5,Trickjump games ltd,"Action, Adventure, Simulation, Shooter",Battle royale trainer,3.99,772540.0,Trickjump games ltd,2018,"Battle royale trainer, Trickjump games ltd, Ac..."
6,Poppermost productions,"Indie, Simulation, Sports",Snow - all access basic pass,9.99,774276.0,Poppermost productions,2018,"Snow - all access basic pass, Poppermost produ..."
7,Poppermost productions,"Indie, Simulation, Sports",Snow - all access pro pass,18.99,774277.0,Poppermost productions,2018,"Snow - all access pro pass, Poppermost product..."
8,Poppermost productions,"Indie, Simulation, Sports",Snow - all access legend pass,29.99,774278.0,Poppermost productions,2018,"Snow - all access legend pass, Poppermost prod..."
9,Otros,Casual,Icarus six sixty six,0.0,724910.0,Otros,2018,"Icarus six sixty six, Otros, Casual"


- **CountVectorizer** : Retorna la frecuencia absoluta de cada término en un texto

- **TF-IDF**  Retorna la frecuencia de cada término en un término, y normaliza por el totoal de documentos donde el termino aparece

In [66]:
steam_genres = steam_games['genres']

- Inspeccionaremos con:

In [69]:
id = 2780.0 # id del juego

n = 20 # numero de generos a considerar

In [None]:
text_1 = steam_games[steam_games['steam_id'] == id]['features'].iloc[0]

In [73]:
text_1

'Arma: combat operations, Bohemia interactive, Action, Simulation, Shooter'

In [None]:

# Calcularemos la similitud coseno
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

df = steam_games[['steam_id','features']]

import nltk
from nltk.corpus import stopwords

#Eliminaremos las stopwords

stop_words_steams = ['op','based','co','first']
stop = list(stopwords.words('english'))
stop += stop_words_steams



def score_ml(text):
  data_corpus = [text_1, text]  
  tf = TfidfVectorizer(stop_words=stop)

  tf_idf_matrix_df =tf.fit_transform(data_corpus)
  
  return(linear_kernel(tf_idf_matrix_df,tf_idf_matrix_df)[0,1])

# prueba = df.head(1000)
# prueba['correlacion'] = prueba['features'].head(1000).apply(score_ml)

df['correlacion'] = df['features'].apply(score_ml)


In [79]:
df.sort_values('correlacion',ascending=False).head(10)

Unnamed: 0,steam_id,features,correlacion
98,2780.0,"Arma: combat operations, Bohemia interactive, ...",1.0
2637,304380.0,"Arma 3 helicopters, Bohemia interactive, Simul...",0.602975
29576,33900.0,"Arma 2, Bohemia interactive, Simulation, Actio...",0.55078
27966,107410.0,"Arma 3, Bohemia interactive, Simulation, Multi...",0.55078
9606,639600.0,"Arma 3 malden, Bohemia interactive, Action, Si...",0.505606
16501,601670.0,"Arma 3 jets, Bohemia interactive, Simulation, ...",0.505606
3238,332350.0,"Arma 3 marksmen, Bohemia interactive, Simulati...",0.505606
29136,65780.0,"Arma: gold edition, Bohemia interactive, Simul...",0.47736
20981,395180.0,"Arma 3 apex, Bohemia interactive, Simulation, ...",0.47736
27229,275700.0,"Arma 3 zeus, Bohemia interactive, Action, Stra...",0.457624


**Nota**
- En este función en particular se optó por dos técnicas. En esta versión vamos a tomar un id, luego extraeremos las caracteristicas de este id en un texto, finalmente vamos a realizar la similitud de esta caracteristica con todas las otras caracteristica de forma lineal, es decir, vamos a realizar la correlación de columna por columna, y vamos a quedarnos con el top 5 de los más similares.

- Para cada registro vamos calcular la correlación con el texto del id introducido. Esta técnica nos permitirá realizar el analisis con la tabla completa de steam. Lamentablemente el tiempo de respues ronda los 2 minutos. Esto porque realiza un analisis de recomendación para cada una de las caracteristicas.

## Creación de la función.

In [80]:
import nltk
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer


def recomendacion_juego_steam(id :int):
  path_endpoint_6 = os.path.join('..','data','clear','06_recomendacion_juego.csv.gz')
  df= pd.read_csv(path_endpoint_6).head(100)
  

  df['correlacion'] = 0

  text_1 = df[df['steam_id'] == id]['features'].iloc[0]

  nltk.download('stopwords')
  
  #Eliminaremos las stopwords
  stop_words_steams = ['op','based','co','first']
  stop = list(stopwords.words('english'))
  stop += stop_words_steams

 

  def score_ml(text):
    data_corpus = [text_1, text]  
    tf = TfidfVectorizer(stop_words=stop)

    tf_idf_matrix_df =tf.fit_transform(data_corpus)
    
    return(linear_kernel(tf_idf_matrix_df,tf_idf_matrix_df)[0,1])

  df['correlacion'] = df['features'].apply(score_ml)
  
  df= df.sort_values('correlacion',ascending=False)[1:6]
  
  df['features'] = df['features'].apply(lambda x: x.split(',')[0])
  
  # texto = df.nlargest(5,'correlacion')[['features','correlacion']].to_dict(orient='records')  
  texto=df['features'].values
  return list(texto)
  

In [85]:
recomendacion_juego_steam(2780)

[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Battle royale trainer', "Garry's mod", 'Tomb raider: anniversary', 'The ship: murder party', 'Spear of destiny']