# Recomendación Juego

In [3]:
import pandas as pd
import os

path_steam_games = os.path.join('..','data','clear','steam_games.csv.gz')

path_user_review = os.path.join('..','data','clear','user_reviews.csv.gz')

path_user_items = os.path.join('..','data','clear','users_items.csv.gz')

## Carga de data

In [306]:
## Cargamos la data necesaria para solucionar este problema.

steam_games_col = ['id','app_name','developer','genres']
steam_games = pd.read_csv(path_steam_games,usecols=steam_games_col)


user_items_cols = ['item_id']
user_items = pd.read_csv(path_user_items,usecols=user_items_cols)

user_review_col = ['item_id']
user_review = pd.read_csv(path_user_review, lineterminator='\n',usecols=user_review_col)


## renombraremos la columna id por item_id
steam_games.rename(columns={'id':'item_id'}, inplace=True)


### Selección de usuarios 

- Dado que tenemos memoria limitada y espacio limitado vamos a seleccionar a los usuarios que se encuentran en steam y tambien hayan realizado algun comentario. Para ello realizaremos un merge con las tablas de información.
 
- Para una siguiente fase de este problema, se podrian considerar muchas mas variables para categorizar los juegos. Por cuestienos de tiempo y constos computacionales, vamos solo a categorizar utilizando la similitud entre palabras, (obviando el contexto). 

In [307]:
merge_1 = steam_games.merge(user_review, on='item_id',how='inner').groupby(['item_id']).first()

In [308]:
merge_1.head(10)

Unnamed: 0_level_0,genres,app_name,developer
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.0,"Action, FPS, Multiplayer, Shooter, Classic, Te...",Counter-strike,Valve
20.0,"Action, FPS, Multiplayer, Classic, Shooter, Cl...",Team fortress classic,Valve
30.0,"FPS, World War II, Multiplayer, Action, Shoote...",Day of defeat,Valve
40.0,"Action, FPS, Multiplayer, Classic, Shooter, Fi...",Deathmatch classic,Valve
50.0,"FPS, Action, Sci-fi, Singleplayer, Classic, Sh...",Half-life: opposing force,Gearbox software
...,...,...,...
421770.0,"Strategy, Massively Multiplayer, Indie, Casual...",Pool nation fx - unlock online,Cherry pop games
421890.0,"Action, Casual, Simulation",Avaris 2: the return of the empress,Studiogiw
423120.0,"Indie, RPG, Choose Your Own Adventure, Text-Ba...",Community college hero: trial by fire,Hosted games
423880.0,"Free to Play, Anime, Visual Novel, Indie, Casu...",Carpe diem,Eyzi


- Realizaremos un groupby por el id del juego. Luego en las columnas restantes nos quedaremos con el primer (único) desarrollador de cada juego. Es claro que un juego tiene un unico desarrollador y en general el mismo nombre.

In [309]:
merge_2 = merge_1.merge(user_items, on='item_id', how='inner').groupby(['item_id']).first().reset_index()

In [310]:
merge_2.head(10)

Unnamed: 0,item_id,genres,app_name,developer
0,10.0,"Action, FPS, Multiplayer, Shooter, Classic, Te...",Counter-strike,Valve
1,20.0,"Action, FPS, Multiplayer, Classic, Shooter, Cl...",Team fortress classic,Valve
2,30.0,"FPS, World War II, Multiplayer, Action, Shoote...",Day of defeat,Valve
3,40.0,"Action, FPS, Multiplayer, Classic, Shooter, Fi...",Deathmatch classic,Valve
4,50.0,"FPS, Action, Sci-fi, Singleplayer, Classic, Sh...",Half-life: opposing force,Gearbox software
...,...,...,...,...
2256,421630.0,"Indie, RPG, Steampunk, Choose Your Own Adventu...",A study in steampunk: choice by gaslight,Hosted games
2257,421890.0,"Action, Casual, Simulation",Avaris 2: the return of the empress,Studiogiw
2258,423120.0,"Indie, RPG, Choose Your Own Adventure, Text-Ba...",Community college hero: trial by fire,Hosted games
2259,423880.0,"Free to Play, Anime, Visual Novel, Indie, Casu...",Carpe diem,Eyzi


## Selección de las caracteristicas más importantes de un juego

- Inspeccionaremos cuales son los géneros mas comunes, para ello contaremos la frecuencia relativa de cada genero .

In [311]:
from collections import Counter
from itertools import chain

def select_n_best(caracteristica, n = 10 ):
    """Retorna las n caracteristicas (generos) más repetidos en todo el dataset. 

    Args:
        caracteristica (df): Columna que contiene caracteres en cada registro 
        n (int, optional): la cantidad de caracteristicas que quiero extraer. Defaults to 10.
    """
    
    # Dividir las palabras en cada fila y contar la frecuencia de cada palabra
    conteo_palabras = Counter(chain.from_iterable(caracteristica.str.split(', ')))

    # Obtener las n palabras más comunes
    palabras_mas_comunes = conteo_palabras.most_common(n)
    
    return([palabras_mas_comunes[i][0] for i in range(n)])

In [312]:
## Tomamos los n generos más significativos
palabras_mas_relevantes = select_n_best(steam_games['genres'])

## convertir lista en texto
text_palabras = (', ').join(palabras_mas_relevantes)

## Eliminaremos de cada registro los no significativos.
def drop_not_important(genres):
  return(', ').join([palabra for palabra in genres.split(', ') if palabra.capitalize() in palabras_mas_relevantes])

merge_2['genres'] = merge_2['genres'].apply(drop_not_important)


## Concatenación de texto

- A continuación vamos a concatenar los textos de genero, nombre de aplicación y desarrollador, luego con esta columna utilzaremos la técnica de conconrdacia de paralabras para identificar cuales juegos son similares entre si. 

**Ejemplo:** Mismo desarrollador nombres similares o generos similiares. 

In [313]:
merge_2.head(10)

Unnamed: 0,item_id,genres,app_name,developer
0,10.0,"Action, Multiplayer, Strategy",Counter-strike,Valve
1,20.0,"Action, Multiplayer, Adventure, Casual",Team fortress classic,Valve
2,30.0,"Multiplayer, Action, Singleplayer",Day of defeat,Valve
3,40.0,"Action, Multiplayer",Deathmatch classic,Valve
4,50.0,"Action, Singleplayer, Adventure",Half-life: opposing force,Gearbox software
...,...,...,...,...
2256,421630.0,Indie,A study in steampunk: choice by gaslight,Hosted games
2257,421890.0,"Action, Casual, Simulation",Avaris 2: the return of the empress,Studiogiw
2258,423120.0,Indie,Community college hero: trial by fire,Hosted games
2259,423880.0,"Indie, Casual, Singleplayer",Carpe diem,Eyzi


In [314]:
cols_concatenate = ['app_name','developer','genres']

merge_2['features'] = merge_2[cols_concatenate].apply(lambda row: ', '.join(row), axis=1)

merge_2

Unnamed: 0,item_id,genres,app_name,developer,features
0,10.0,"Action, Multiplayer, Strategy",Counter-strike,Valve,"Counter-strike, Valve, Action, Multiplayer, St..."
1,20.0,"Action, Multiplayer, Adventure, Casual",Team fortress classic,Valve,"Team fortress classic, Valve, Action, Multipla..."
2,30.0,"Multiplayer, Action, Singleplayer",Day of defeat,Valve,"Day of defeat, Valve, Multiplayer, Action, Sin..."
3,40.0,"Action, Multiplayer",Deathmatch classic,Valve,"Deathmatch classic, Valve, Action, Multiplayer"
4,50.0,"Action, Singleplayer, Adventure",Half-life: opposing force,Gearbox software,"Half-life: opposing force, Gearbox software, A..."
...,...,...,...,...,...
2256,421630.0,Indie,A study in steampunk: choice by gaslight,Hosted games,"A study in steampunk: choice by gaslight, Host..."
2257,421890.0,"Action, Casual, Simulation",Avaris 2: the return of the empress,Studiogiw,"Avaris 2: the return of the empress, Studiogiw..."
2258,423120.0,Indie,Community college hero: trial by fire,Hosted games,"Community college hero: trial by fire, Hosted ..."
2259,423880.0,"Indie, Casual, Singleplayer",Carpe diem,Eyzi,"Carpe diem, Eyzi, Indie, Casual, Singleplayer"


In [315]:
consulta_06 = merge_2[['item_id','features']]

## Carga de Tabla Consulta

- Cargaremos la tabla anterior, con el fin de ser consumida por la api una vez se complete el analisis. 

In [375]:
path_endpoint_06 = os.path.join('..','data','clear','06_recomendacion_juego_v2.csv.gz')
consulta_06.to_csv(path_endpoint_06, index=False ,compression='gzip')

## Lectura de Tabla Consulta

In [377]:
path_endpoint_06 = os.path.join('..','data','clear','06_recomendacion_juego_v2.csv.gz')
consulta_06 = pd.read_csv(path_endpoint_06)

In [379]:
import nltk
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

#Eliminaremos las stopwords, junto algunas que encontré
stop_words_steams = ['aaaaaa', 'ab', 'abbey','abe', 'abramenko']
stop = list(stopwords.words('english'))
stop += stop_words_steams

# Entrenaremos el modelo con las stopwords ingresadas y que solo acepte paralabras alfanumericas.
tf = TfidfVectorizer(stop_words=stop, token_pattern=r'\b[a-zA-Z]\w+\b' )

data_vector = tf.fit_transform(consulta_06['features'])

# Crearemos la matrix con filas los diferentes juegos y las columnas son las diferentes caracteristicas extraidas por el modelo
data_vector_df = pd.DataFrame(data_vector.toarray(), index=consulta_06['item_id'], columns = tf.get_feature_names_out())


In [380]:
data_vector_df.sort_values('absolute',ascending=False).head(10)

Unnamed: 0_level_0,absolute,absolution,abstraction,abyss,abyssal,academy,acceleroto,access,ace,acid,...,zombies,zomboid,zoo,zoom,zoombinis,zootfly,zoë,zucconi,zuma,zykov
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57600.0,0.513752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320140.0,0.429609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289690.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289760.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289930.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290020.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290060.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Similitud del Coseno
- A continuación vamos a realizar la correlación de la matriz anterior con sigo mismo, para ello utilizamos **cosine_similarity**, esta función simplemente calcula el producto punto matricial, (correlacion) de la matriz anterior con su transpuesta, esto con el fin de encontrar la correlación entre los vectores fila. La norma utilizada en este modelo es la discreta, es decir la similitud en componentes.

In [381]:
vector_similitud_coseno = cosine_similarity(data_vector_df.values)

cos_sim_df = pd.DataFrame(vector_similitud_coseno, 
                          index=data_vector_df.index, columns=data_vector_df.index)

In [382]:
cos_sim_df.iloc[0:10 ,0:10]

item_id,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,130.0,220.0
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10.0,1.0,0.253759,0.283039,0.29326,0.01139,0.32878,0.313393,0.729164,0.011787,0.313393
20.0,0.253759,1.0,0.243289,0.52284,0.022823,0.282607,0.288664,0.198277,0.023618,0.288664
30.0,0.283039,0.243289,1.0,0.28116,0.021135,0.315215,0.315577,0.216763,0.021871,0.315577
40.0,0.29326,0.52284,0.28116,1.0,0.011315,0.326598,0.311313,0.213834,0.011709,0.311313
50.0,0.01139,0.022823,0.021135,0.011315,1.0,0.012685,0.463096,0.02713,0.582806,0.463096
60.0,0.32878,0.282607,0.315215,0.326598,0.012685,1.0,0.349021,0.239735,0.013127,0.349021
70.0,0.313393,0.288664,0.315577,0.311313,0.463096,0.349021,1.0,0.256368,0.479225,1.0
80.0,0.729164,0.198277,0.216763,0.213834,0.02713,0.239735,0.256368,1.0,0.028075,0.256368
130.0,0.011787,0.023618,0.021871,0.011709,0.582806,0.013127,0.479225,0.028075,1.0,0.479225
220.0,0.313393,0.288664,0.315577,0.311313,0.463096,0.349021,1.0,0.256368,0.479225,1.0


## Top 5 Similares
- Dado un id,  vamos escoger de la fila de este id, el top 5 de los más similares. dado que en la diagonal siempre es 1 debemos extraer este juego de las similitudes (todo juego es similar con sigo mismo).

- Realizaremos un ejemplo con el indice 284950, para luego generar la función

In [383]:
item_id = 284950.0	 
juegos_similares = cos_sim_df.loc[item_id].nlargest(6)

top5 = juegos_similares.iloc[1:6]
top5

item_id
301750.0    0.432880
365900.0    0.222195
297350.0    0.194786
409670.0    0.181875
367240.0    0.179492
Name: 284950.0, dtype: float64

**Nota** Aunque el nombre lo tenemos en la tabla consulta, para evitar carga de columnas innecesarias, solamente cargagos feaure y item_id, en feaures podemos hacer scrapping para obtener el nombre del juego sin la necesidad de realizar otra consulta  a la base de datos.

In [390]:
## Extracción del nombre del juego
name_consulta = consulta_06.set_index('item_id').loc[item_id].values[0].split(',')[0]
name_consulta

In [392]:
consulta_06.set_index('item_id').loc[top5.index]

Unnamed: 0_level_0,features
item_id,Unnamed: 1_level_1
301750.0,"Radical roach remastered, Dl softworks, Indie,..."
365900.0,"Pixel dungeon, Retronic games, Adventure, Indi..."
297350.0,"The old city: leviathan, Postmod softworks, Ad..."
409670.0,"Pink hour, Studio pixel, Indie, Action, Single..."
367240.0,"Avenging angel, Dark amber softworks, Action, ..."


In [393]:
resultado = consulta_06.set_index('item_id').loc[top5.index]['features'].apply(lambda x: x.split(',')[0]).values

print(f"Los juegos similares a {name_consulta} son :\n")
for name in resultado:
  print("\n",name)

Los juegos similares a Pixel puzzles: japan son :


 Radical roach remastered

 Pixel dungeon

 The old city: leviathan

 Pink hour

 Avenging angel


## Creación de la función.

Crearemos la función que condensa lo anteriormente explicado

In [11]:
import nltk
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def recomendacion_juego_v2(item_id :int):
  path_endpoint_06 = os.path.join('..','data','clear','06_recomendacion_juego_v2.csv.gz')
  consulta_06 = pd.read_csv(path_endpoint_06)
  
  nombre_juego = consulta_06.set_index('item_id').loc[item_id].values[0].split(',')[0]

  #Eliminaremos las stopwords

  stop_words_steams = ['aaaaaa', 'ab', 'abbey','abe', 'abramenko']
  stop = list(stopwords.words('english'))
  stop += stop_words_steams


  tf = TfidfVectorizer(stop_words=stop, token_pattern=r'\b[a-zA-Z]\w+\b' )

  data_vector = tf.fit_transform(consulta_06['features'])

  data_vector_df = pd.DataFrame(data_vector.toarray(), index=consulta_06['item_id'], columns = tf.get_feature_names_out())
    
  vector_similitud_coseno = cosine_similarity(data_vector_df.values)
  
  cos_sim_df = pd.DataFrame(vector_similitud_coseno, index=data_vector_df.index, columns=data_vector_df.index)
  
  ##top5
  juegos_similares = cos_sim_df.loc[item_id].nlargest(6)

  top5 = juegos_similares.iloc[1:6]
  
  
  resultado = consulta_06.set_index('item_id').loc[top5.index]['features'].apply(lambda x: x.split(',')[0]).values
  # print(f"Los juegos similares a {nombre_juego} son :\n")
  # for name in resultado:
  #   print("\n",name)
    
  resultado = consulta_06.set_index('item_id').loc[top5.index]['features'].apply(lambda x: x.split(',')[0]).values
    
  return list(resultado)
  

In [17]:
recomendacion_juego_v2(284950)

Los juegos similares a Pixel puzzles: japan son :


 Radical roach remastered

 Pixel dungeon

 The old city: leviathan

 Pink hour

 Avenging angel


['Radical roach remastered',
 'Pixel dungeon',
 'The old city: leviathan',
 'Pink hour',
 'Avenging angel']