In [1]:
# Cargar las librerías a utilizar
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pyarrow.parquet as pq

In [2]:
# Load the three datasets, in order: user items, Steam games, user reviews
df_user_items, df_steam_games, df_user_reviews = map(
    pd.read_csv,
    (
        "data/user_items_3.csv",
        "data/steam_games_3.csv",
        "data/user_reviews_3.csv",
    ),
)

Para el sistema de recomendación Item-Item voy a usar la Similitud del Coseno de Scikit-Learn. A partir de los datasets que salieron del EDA, lo primero es dejar en cada dataset una fila por juego y adaptar las variables para que puedan ser consumidas por el modelo. Logrado esto, lo siguiente es unirlos mediante "inner join" para eliminar los juegos que no tengan datos en los tres datasets. Con este dataset final voy a realizar el modelado.

### df_user_items

In [3]:
# Inspect the user-items table as loaded
df_user_items

Unnamed: 0,item_id,item_name,playtime_forever,user_id
0,10,Counter-Strike,6,76561197970982479
1,30,Day of Defeat,7,76561197970982479
2,240,Counter-Strike: Source,1853,76561197970982479
3,3830,Psychonauts,333,76561197970982479
4,2630,Call of Duty 2,75,76561197970982479
...,...,...,...,...
3004627,304930,Unturned,677,76561198329548331
3004628,227940,Heroes & Generals,43,76561198329548331
3004629,388490,One Way To Die: Steam Edition,3,76561198329548331
3004630,521570,You Have 10 Seconds 2,4,76561198329548331


In [4]:
# Drop the user column, since it is not used from here on
df_user_items = df_user_items.drop(columns="user_id")

In [5]:
# Drop the game name column; the name is taken from the steam_games DataFrame instead
df_user_items = df_user_items.drop(columns=["item_name"])

In [6]:
# Rename the 'item_id' column to 'id'
df_user_items = df_user_items.rename({'item_id': 'id'}, axis="columns")

In [7]:
# Inspect the table after dropping and renaming columns
df_user_items

Unnamed: 0,id,playtime_forever
0,10,6
1,30,7
2,240,1853
3,3830,333
4,2630,75
...,...,...
3004627,304930,677
3004628,227940,43
3004629,388490,3
3004630,521570,4


In [8]:
# Total minutes played per game id (the result is a Series indexed by 'id')
df_user_items = df_user_items.groupby("id")["playtime_forever"].agg("sum")

In [9]:
# Turn the aggregated Series back into a one-column DataFrame
df_user_items = pd.DataFrame(df_user_items)

In [10]:
# Move 'id' out of the index and back into a regular column
df_user_items = df_user_items.reset_index()

In [11]:
# Instantiate the scaler; this same object is re-fitted on each column that gets scaled below
scaler = StandardScaler()

In [12]:
# Scale the total playtime; the double brackets pass a one-column DataFrame,
# since fit_transform expects 2-D input
df_user_items['playtime_forever'] = scaler.fit_transform(df_user_items[['playtime_forever']])

In [13]:
# Inspect the per-game table after scaling
df_user_items

Unnamed: 0,id,playtime_forever
0,10,1.731158
1,20,0.116053
2,30,0.031866
3,40,-0.127119
4,50,0.426963
...,...,...
10007,527570,-0.157914
10008,527810,-0.157914
10009,527890,-0.157915
10010,527900,-0.157868


In [14]:
# Check column dtypes and non-null counts
df_user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10012 entries, 0 to 10011
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10012 non-null  int64  
 1   playtime_forever  10012 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 156.6 KB


In [15]:
# Count missing values per column
df_user_items.isna().sum()

id                  0
playtime_forever    0
dtype: int64

### df_steam_games

In [16]:
# Inspect the Steam games table as loaded
df_steam_games

Unnamed: 0,app_name,release_date,price,id,developer,genre
0,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Action
1,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Casual
2,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Indie
3,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Simulation
4,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Strategy
...,...,...,...,...,...,...
67582,EXIT 2 - Directions,2017.0,4.99,658870.0,"xropi,stev3ns",Indie
67583,Maze Run VR,,4.99,681550.0,sin datos,Adventure
67584,Maze Run VR,,4.99,681550.0,sin datos,Indie
67585,Maze Run VR,,4.99,681550.0,sin datos,Action


In [17]:
# Drop 'release_date' and 'developer'; neither is used from here on
df_steam_games = df_steam_games.drop(columns=["release_date", "developer"])

In [18]:
# Remove rows that are exact duplicates across the remaining columns.
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so the result has to be assigned back for the deduplication to take effect.
df_steam_games = df_steam_games.drop_duplicates()
df_steam_games

Unnamed: 0,app_name,price,id,genre
0,Lost Summoner Kitty,4.99,761140.0,Action
1,Lost Summoner Kitty,4.99,761140.0,Casual
2,Lost Summoner Kitty,4.99,761140.0,Indie
3,Lost Summoner Kitty,4.99,761140.0,Simulation
4,Lost Summoner Kitty,4.99,761140.0,Strategy
...,...,...,...,...
67582,EXIT 2 - Directions,4.99,658870.0,Indie
67583,Maze Run VR,4.99,681550.0,Adventure
67584,Maze Run VR,4.99,681550.0,Indie
67585,Maze Run VR,4.99,681550.0,Action


In [19]:
# One-hot encode the genre column: one indicator column per distinct genre
df_encoded = pd.get_dummies(df_steam_games['genre'])
df_encoded

Unnamed: 0,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,True,False,False,False,False,False,False
1,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False
3,False,False,False,False,False,True,False
4,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...
67582,False,False,False,True,False,False,False
67583,False,True,False,False,False,False,False
67584,False,False,False,True,False,False,False
67585,True,False,False,False,False,False,False


In [20]:
# Convert the boolean indicator columns to integers (True -> 1, False -> 0).
# A dtype cast is the idiomatic way to do this; value replacement on boolean
# columns relies on implicit downcasting, which recent pandas versions deprecate.
df_encoded = df_encoded.astype("int64")
df_encoded

Unnamed: 0,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
67582,0,0,0,1,0,0,0
67583,0,1,0,0,0,0,0
67584,0,0,0,1,0,0,0
67585,1,0,0,0,0,0,0


In [21]:
# Append the encoded genre columns to the right of the original DataFrame (aligned on the row index)
df_steam_games = pd.concat(
    [df_steam_games, df_encoded],
    axis="columns",
)
df_steam_games

Unnamed: 0,app_name,price,id,genre,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,Lost Summoner Kitty,4.99,761140.0,Action,1,0,0,0,0,0,0
1,Lost Summoner Kitty,4.99,761140.0,Casual,0,0,1,0,0,0,0
2,Lost Summoner Kitty,4.99,761140.0,Indie,0,0,0,1,0,0,0
3,Lost Summoner Kitty,4.99,761140.0,Simulation,0,0,0,0,0,1,0
4,Lost Summoner Kitty,4.99,761140.0,Strategy,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
67582,EXIT 2 - Directions,4.99,658870.0,Indie,0,0,0,1,0,0,0
67583,Maze Run VR,4.99,681550.0,Adventure,0,1,0,0,0,0,0
67584,Maze Run VR,4.99,681550.0,Indie,0,0,0,1,0,0,0
67585,Maze Run VR,4.99,681550.0,Action,1,0,0,0,0,0,0


(El siguiente código fue provisto por Copilot para resolver el problema de sumar las columnas de género sin afectar price e id)

In [22]:
# One-hot genre columns to collapse into a single row per game
columnas_a_sumar = ["Action", "Adventure", "Casual", "Indie", "RPG", "Simulation", "Strategy"]


def first_entry(series):
    """Return the first value of a group (used to keep one 'price' and one 'id' per game)."""
    return series.iloc[0]


# Group by 'app_name': keep the first 'price' and 'id' of each group and combine the genre flags.
# 'max' is used instead of 'sum' so every flag stays 0/1 even when the same
# (app_name, genre) pair appears in more than one row; with 'sum' such repeats
# would produce values above 1 and distort the similarity computation.
df_steam_games = df_steam_games.groupby('app_name').agg({
    'price': first_entry,
    'id': first_entry,
    **{col: 'max' for col in columnas_a_sumar}
})

df_steam_games

Unnamed: 0_level_0,price,id,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
app_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
! That Bastard Is Trying To Steal Our Gold !,3.99,449940.0,1,1,1,1,0,0,0
"""BUTTS: The VR Experience""",0.99,439260.0,0,0,1,0,0,0,0
"""Barely Attuned Magic Thingy"" Staff",0.00,308163.0,1,0,0,1,1,0,0
"""Glow Ball"" - The billiard puzzle game",4.99,388390.0,0,0,1,1,0,0,1
"""Just Another Day"" - Seduce Me Otome CD",4.99,454790.0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
（尘沙惑设定集）Lost in Secular Love - Concept Design Works,3.99,541220.0,0,1,1,1,0,1,0
４人打ちアクション麻雀 / ACTION MAHJONG,9.99,575810.0,1,0,1,1,0,0,0
＜/reality＞,11.99,562280.0,0,1,0,1,0,0,0
＜/reality＞ Original Soundtrack,3.99,626850.0,0,1,0,1,0,0,0


In [23]:
# Move "app_name" out of the index so it becomes a regular column again
df_steam_games = df_steam_games.reset_index()

In [24]:
# Inspect the one-row-per-game table
df_steam_games

Unnamed: 0,app_name,price,id,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,! That Bastard Is Trying To Steal Our Gold !,3.99,449940.0,1,1,1,1,0,0,0
1,"""BUTTS: The VR Experience""",0.99,439260.0,0,0,1,0,0,0,0
2,"""Barely Attuned Magic Thingy"" Staff",0.00,308163.0,1,0,0,1,1,0,0
3,"""Glow Ball"" - The billiard puzzle game",4.99,388390.0,0,0,1,1,0,0,1
4,"""Just Another Day"" - Seduce Me Otome CD",4.99,454790.0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
29680,（尘沙惑设定集）Lost in Secular Love - Concept Design ...,3.99,541220.0,0,1,1,1,0,1,0
29681,４人打ちアクション麻雀 / ACTION MAHJONG,9.99,575810.0,1,0,1,1,0,0,0
29682,＜/reality＞,11.99,562280.0,0,1,0,1,0,0,0
29683,＜/reality＞ Original Soundtrack,3.99,626850.0,0,1,0,1,0,0,0


In [25]:
# Scale the price column; the shared scaler is re-fitted on this column
df_steam_games['price'] = scaler.fit_transform(df_steam_games[['price']])
df_steam_games

Unnamed: 0,app_name,price,id,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,! That Bastard Is Trying To Steal Our Gold !,-0.430874,449940.0,1,1,1,1,0,0,0
1,"""BUTTS: The VR Experience""",-0.780798,439260.0,0,0,1,0,0,0,0
2,"""Barely Attuned Magic Thingy"" Staff",-0.896273,308163.0,1,0,0,1,1,0,0
3,"""Glow Ball"" - The billiard puzzle game",-0.314232,388390.0,0,0,1,1,0,0,1
4,"""Just Another Day"" - Seduce Me Otome CD",-0.314232,454790.0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
29680,（尘沙惑设定集）Lost in Secular Love - Concept Design ...,-0.430874,541220.0,0,1,1,1,0,1,0
29681,４人打ちアクション麻雀 / ACTION MAHJONG,0.268975,575810.0,1,0,1,1,0,0,0
29682,＜/reality＞,0.502258,562280.0,0,1,0,1,0,0,0
29683,＜/reality＞ Original Soundtrack,-0.430874,626850.0,0,1,0,1,0,0,0


In [26]:
# Number of distinct game names
df_steam_games["app_name"].nunique()

29685

In [27]:
# Check column dtypes and non-null counts
df_steam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29685 entries, 0 to 29684
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   app_name    29685 non-null  object 
 1   price       29685 non-null  float64
 2   id          29685 non-null  float64
 3   Action      29685 non-null  int64  
 4   Adventure   29685 non-null  int64  
 5   Casual      29685 non-null  int64  
 6   Indie       29685 non-null  int64  
 7   RPG         29685 non-null  int64  
 8   Simulation  29685 non-null  int64  
 9   Strategy    29685 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 2.3+ MB


In [28]:
# Count missing values per column
df_steam_games.isna().sum()

app_name      0
price         0
id            0
Action        0
Adventure     0
Casual        0
Indie         0
RPG           0
Simulation    0
Strategy      0
dtype: int64

### df_user_reviews

In [29]:
# Inspect the user reviews table as loaded
df_user_reviews

Unnamed: 0,user_id,posted,id,recommend,sentiment
0,76561197970982479,2011.0,1250.0,1,2
1,76561197970982479,2011.0,22200.0,1,2
2,76561197970982479,2011.0,43110.0,1,2
3,js41637,2014.0,251610.0,1,2
4,js41637,2013.0,227300.0,1,0
...,...,...,...,...,...
58426,76561198312638244,sin fecha,70.0,1,2
58427,76561198312638244,sin fecha,362890.0,1,2
58428,LydiaMorley,sin fecha,273110.0,1,2
58429,LydiaMorley,sin fecha,730.0,1,2


In [30]:
# Drop 'user_id' (recommendations are not made per user) and 'posted'
# (the analysis is kept independent of dates)
df_user_reviews = df_user_reviews.drop(columns=["user_id", "posted"])

df_user_reviews

Unnamed: 0,id,recommend,sentiment
0,1250.0,1,2
1,22200.0,1,2
2,43110.0,1,2
3,251610.0,1,2
4,227300.0,1,0
...,...,...,...
58426,70.0,1,2
58427,362890.0,1,2
58428,273110.0,1,2
58429,730.0,1,2


In [31]:
# Per game id, add up the recommendation and sentiment values
df_user_reviews = df_user_reviews.groupby("id").agg(
    {"recommend": "sum", "sentiment": "sum"}
)

In [32]:
# Inspect the per-game totals
df_user_reviews

Unnamed: 0_level_0,recommend,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10.0,55,77
20.0,11,21
30.0,3,5
40.0,1,2
50.0,3,6
...,...,...
521340.0,2,4
521430.0,1,0
521570.0,2,1
521990.0,1,2


In [33]:
# Scale the recommendation and sentiment totals, re-fitting the scaler on one column at a time
for columna in ("recommend", "sentiment"):
    df_user_reviews[columna] = scaler.fit_transform(df_user_reviews[[columna]])

df_user_reviews

Unnamed: 0_level_0,recommend,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10.0,0.411409,0.377982
20.0,-0.030525,-0.003172
30.0,-0.110876,-0.112073
40.0,-0.130964,-0.132492
50.0,-0.110876,-0.105267
...,...,...
521340.0,-0.120920,-0.118879
521430.0,-0.130964,-0.146105
521570.0,-0.120920,-0.139298
521990.0,-0.130964,-0.132492


In [34]:
# Move 'id' out of the index and back into a regular column
df_user_reviews = df_user_reviews.reset_index()
df_user_reviews

Unnamed: 0,id,recommend,sentiment
0,10.0,0.411409,0.377982
1,20.0,-0.030525,-0.003172
2,30.0,-0.110876,-0.112073
3,40.0,-0.130964,-0.132492
4,50.0,-0.110876,-0.105267
...,...,...,...
3677,521340.0,-0.120920,-0.118879
3678,521430.0,-0.130964,-0.146105
3679,521570.0,-0.120920,-0.139298
3680,521990.0,-0.130964,-0.132492


In [35]:
# Number of distinct game ids with reviews
df_user_reviews["id"].nunique()

3682

In [36]:
# Check column dtypes and non-null counts
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3682 entries, 0 to 3681
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         3682 non-null   float64
 1   recommend  3682 non-null   float64
 2   sentiment  3682 non-null   float64
dtypes: float64(3)
memory usage: 86.4 KB


In [37]:
# Count missing values per column
df_user_reviews.isna().sum()

id           0
recommend    0
sentiment    0
dtype: int64

### UNION DE LOS DATAFRAMES

In [38]:
# Inner join of the first two DataFrames on the game id, so only games present in both are kept
df_semicompleto = df_user_items.merge(df_steam_games, how='inner', on='id')
df_semicompleto

Unnamed: 0,id,playtime_forever,app_name,price,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,10,1.731158,Counter-Strike,0.268975,1,0,0,0,0,0,0
1,20,0.116053,Team Fortress Classic,-0.314232,1,0,0,0,0,0,0
2,30,0.031866,Day of Defeat,-0.314232,1,0,0,0,0,0,0
3,40,-0.127119,Deathmatch Classic,-0.314232,1,0,0,0,0,0,0
4,50,0.426963,Half-Life: Opposing Force,-0.314232,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
8167,527510,-0.157713,The Legions of Rome,-0.547515,0,0,1,1,0,1,1
8168,527810,-0.157914,Dynamite Alex,-0.780798,1,0,0,1,0,0,0
8169,527890,-0.157915,Shop-n-Spree: Shopping Paradise,-0.080949,0,0,1,0,0,1,0
8170,527900,-0.157868,Grim Tales: The Bride Collector's Edition,0.268975,0,1,1,0,0,0,0


In [39]:
# Inner join with the reviews DataFrame on the game id, so only games with data in all three sources remain
df_ML = df_semicompleto.merge(df_user_reviews, how='inner', on='id')
df_ML

Unnamed: 0,id,playtime_forever,app_name,price,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy,recommend,sentiment
0,10,1.731158,Counter-Strike,0.268975,1,0,0,0,0,0,0,0.411409,0.377982
1,20,0.116053,Team Fortress Classic,-0.314232,1,0,0,0,0,0,0,-0.030525,-0.003172
2,30,0.031866,Day of Defeat,-0.314232,1,0,0,0,0,0,0,-0.110876,-0.112073
3,40,-0.127119,Deathmatch Classic,-0.314232,1,0,0,0,0,0,0,-0.130964,-0.132492
4,50,0.426963,Half-Life: Opposing Force,-0.314232,1,0,0,0,0,0,0,-0.110876,-0.105267
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,521340,-0.156676,True or False,-0.780798,0,0,1,1,0,0,0,-0.120920,-0.118879
2830,521430,-0.157910,Super Switch,-0.664157,0,0,0,1,0,0,0,-0.130964,-0.146105
2831,521570,-0.154337,You Have 10 Seconds 2,-0.896273,0,0,1,1,0,0,0,-0.120920,-0.139298
2832,521990,-0.157898,Galactic Storm,-0.547515,1,0,0,1,0,0,0,-0.130964,-0.132492


In [40]:
# Keep a single row per game id.
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so the result has to be assigned back for the deduplication to take effect.
# The index is rebuilt so row labels stay contiguous (0..n-1) after any rows are removed.
df_ML = df_ML.drop_duplicates(subset="id").reset_index(drop=True)
df_ML

Unnamed: 0,id,playtime_forever,app_name,price,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy,recommend,sentiment
0,10,1.731158,Counter-Strike,0.268975,1,0,0,0,0,0,0,0.411409,0.377982
1,20,0.116053,Team Fortress Classic,-0.314232,1,0,0,0,0,0,0,-0.030525,-0.003172
2,30,0.031866,Day of Defeat,-0.314232,1,0,0,0,0,0,0,-0.110876,-0.112073
3,40,-0.127119,Deathmatch Classic,-0.314232,1,0,0,0,0,0,0,-0.130964,-0.132492
4,50,0.426963,Half-Life: Opposing Force,-0.314232,1,0,0,0,0,0,0,-0.110876,-0.105267
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,521340,-0.156676,True or False,-0.780798,0,0,1,1,0,0,0,-0.120920,-0.118879
2830,521430,-0.157910,Super Switch,-0.664157,0,0,0,1,0,0,0,-0.130964,-0.146105
2831,521570,-0.154337,You Have 10 Seconds 2,-0.896273,0,0,1,1,0,0,0,-0.120920,-0.139298
2832,521990,-0.157898,Galactic Storm,-0.547515,1,0,0,1,0,0,0,-0.130964,-0.132492


In [41]:
# Export the DataFrame to .csv without writing the row index
df_ML.to_csv("data/df_ML.csv", index=False)

In [43]:
# Remove the game id and name so only the feature columns are passed to the model
df_matriz = df_ML.drop(columns=["app_name", "id"])
df_matriz

Unnamed: 0,playtime_forever,price,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy,recommend,sentiment
0,1.731158,0.268975,1,0,0,0,0,0,0,0.411409,0.377982
1,0.116053,-0.314232,1,0,0,0,0,0,0,-0.030525,-0.003172
2,0.031866,-0.314232,1,0,0,0,0,0,0,-0.110876,-0.112073
3,-0.127119,-0.314232,1,0,0,0,0,0,0,-0.130964,-0.132492
4,0.426963,-0.314232,1,0,0,0,0,0,0,-0.110876,-0.105267
...,...,...,...,...,...,...,...,...,...,...,...
2829,-0.156676,-0.780798,0,0,1,1,0,0,0,-0.120920,-0.118879
2830,-0.157910,-0.664157,0,0,0,1,0,0,0,-0.130964,-0.146105
2831,-0.154337,-0.896273,0,0,1,1,0,0,0,-0.120920,-0.139298
2832,-0.157898,-0.547515,1,0,0,1,0,0,0,-0.130964,-0.132492


# MODELIZADO

In [44]:
# Compute the pairwise cosine similarity between games (one row of df_matriz per game)
matriz_similitudes = cosine_similarity(df_matriz)

In [45]:
# Inspect the resulting similarity array
matriz_similitudes

array([[ 1.        ,  0.49928218,  0.3976403 , ..., -0.17247305,
         0.14786838,  0.17631843],
       [ 0.49928218,  1.        ,  0.98866093, ...,  0.15009274,
         0.71463158,  0.70016518],
       [ 0.3976403 ,  0.98866093,  1.        , ...,  0.1704428 ,
         0.73445968,  0.72101707],
       ...,
       [-0.17247305,  0.15009274,  0.1704428 , ...,  1.        ,
         0.59633068,  0.53929385],
       [ 0.14786838,  0.71463158,  0.73445968, ...,  0.59633068,
         1.        ,  0.98893787],
       [ 0.17631843,  0.70016518,  0.72101707, ...,  0.53929385,
         0.98893787,  1.        ]])

In [46]:
# Check the dimensions of the similarity array
matriz_similitudes.shape

(2834, 2834)

In [50]:
# Wrap the similarity array in a DataFrame so rows and columns can be labelled
df_matriz_similitudes = pd.DataFrame(matriz_similitudes)
df_matriz_similitudes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2824,2825,2826,2827,2828,2829,2830,2831,2832,2833
0,1.000000,0.499282,0.397640,0.263538,0.656401,0.293660,0.997779,0.838950,0.496136,0.914946,...,-0.213942,0.212741,0.077050,-0.218815,-0.157232,-0.168609,-0.218547,-0.172473,0.147868,0.176318
1,0.499282,1.000000,0.988661,0.962143,0.956030,0.971131,0.454633,0.772304,0.994936,0.187333,...,0.186380,0.224139,0.496703,0.169629,0.076848,0.134307,0.150552,0.150093,0.714632,0.700165
2,0.397640,0.988661,1.000000,0.988585,0.938290,0.986610,0.351117,0.733255,0.993241,0.078117,...,0.212035,0.239207,0.509364,0.197723,0.100521,0.154339,0.180325,0.170443,0.734460,0.721017
3,0.263538,0.962143,0.988585,1.000000,0.876175,0.995470,0.212724,0.649274,0.964506,-0.072037,...,0.230002,0.249135,0.515097,0.217709,0.118366,0.169664,0.201712,0.185017,0.744914,0.732233
4,0.656401,0.956030,0.938290,0.876175,1.000000,0.875015,0.624623,0.861070,0.970740,0.405895,...,0.157675,0.197743,0.451351,0.140693,0.056363,0.109659,0.122191,0.126205,0.645889,0.632295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,-0.168609,0.134307,0.154339,0.169664,0.109659,0.161813,-0.164207,-0.150604,0.139521,-0.131903,...,0.788400,-0.017264,0.444531,0.790042,0.569581,1.000000,0.787619,0.998152,0.591970,0.542889
2830,-0.218547,0.150552,0.180325,0.201712,0.122191,0.189287,-0.214049,-0.178831,0.159915,-0.179188,...,0.989602,0.039256,0.558021,0.997122,0.733206,0.787619,1.000000,0.798073,0.756372,0.704751
2831,-0.172473,0.150093,0.170443,0.185017,0.126205,0.177155,-0.167111,-0.159306,0.155877,-0.129814,...,0.806507,-0.059308,0.454732,0.804591,0.570051,0.998152,0.798073,1.000000,0.596331,0.539294
2832,0.147868,0.714632,0.734460,0.744914,0.645889,0.742251,0.114012,0.409164,0.715173,-0.073668,...,0.739474,0.366638,0.685819,0.749407,0.562879,0.591970,0.756372,0.596331,1.000000,0.988938


In [54]:
# Build a DataFrame holding the id and name of each game, in the same row order as df_ML
df_nombres_juegos = df_ML[["id", "app_name"]]

# Label the similarity matrix: game ids as the row index, game names as the column labels
df_matriz_similitudes.index = df_nombres_juegos["id"]
df_matriz_similitudes.columns = df_nombres_juegos["app_name"]

df_matriz_similitudes


app_name,Counter-Strike,Team Fortress Classic,Day of Defeat,Deathmatch Classic,Half-Life: Opposing Force,Ricochet,Half-Life,Counter-Strike: Condition Zero,Half-Life: Blue Shift,Half-Life 2,...,You Have 10 Seconds,Unbox: Newbie's Adventure,The Pirate: Caribbean Hunt,Sparky's Hunt,Everything is Peachy,True or False,Super Switch,You Have 10 Seconds 2,Galactic Storm,What The Box?
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.000000,0.499282,0.397640,0.263538,0.656401,0.293660,0.997779,0.838950,0.496136,0.914946,...,-0.213942,0.212741,0.077050,-0.218815,-0.157232,-0.168609,-0.218547,-0.172473,0.147868,0.176318
20,0.499282,1.000000,0.988661,0.962143,0.956030,0.971131,0.454633,0.772304,0.994936,0.187333,...,0.186380,0.224139,0.496703,0.169629,0.076848,0.134307,0.150552,0.150093,0.714632,0.700165
30,0.397640,0.988661,1.000000,0.988585,0.938290,0.986610,0.351117,0.733255,0.993241,0.078117,...,0.212035,0.239207,0.509364,0.197723,0.100521,0.154339,0.180325,0.170443,0.734460,0.721017
40,0.263538,0.962143,0.988585,1.000000,0.876175,0.995470,0.212724,0.649274,0.964506,-0.072037,...,0.230002,0.249135,0.515097,0.217709,0.118366,0.169664,0.201712,0.185017,0.744914,0.732233
50,0.656401,0.956030,0.938290,0.876175,1.000000,0.875015,0.624623,0.861070,0.970740,0.405895,...,0.157675,0.197743,0.451351,0.140693,0.056363,0.109659,0.122191,0.126205,0.645889,0.632295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521340,-0.168609,0.134307,0.154339,0.169664,0.109659,0.161813,-0.164207,-0.150604,0.139521,-0.131903,...,0.788400,-0.017264,0.444531,0.790042,0.569581,1.000000,0.787619,0.998152,0.591970,0.542889
521430,-0.218547,0.150552,0.180325,0.201712,0.122191,0.189287,-0.214049,-0.178831,0.159915,-0.179188,...,0.989602,0.039256,0.558021,0.997122,0.733206,0.787619,1.000000,0.798073,0.756372,0.704751
521570,-0.172473,0.150093,0.170443,0.185017,0.126205,0.177155,-0.167111,-0.159306,0.155877,-0.129814,...,0.806507,-0.059308,0.454732,0.804591,0.570051,0.998152,0.798073,1.000000,0.596331,0.539294
521990,0.147868,0.714632,0.734460,0.744914,0.645889,0.742251,0.114012,0.409164,0.715173,-0.073668,...,0.739474,0.366638,0.685819,0.749407,0.562879,0.591970,0.756372,0.596331,1.000000,0.988938


### El siguiente código provisto por Copilot me ayuda a buscar los juegos recomendados.

In [60]:
# id of the game to find recommendations for; it must be present in the row index
# of 'df_matriz_similitudes'
juego_referencia = 10

# Number of recommendations to show
N = 5

# Position of the reference game among the rows of the matrix
indice_juego_referencia = df_matriz_similitudes.index.get_loc(juego_referencia)

# Similarity of the reference game against every game, labelled by the column names
fila_similitud = df_matriz_similitudes.iloc[indice_juego_referencia]

# Column label at the same position as the reference row. The matrix is square and
# symmetric, so this is the reference game's own label; printing it gives a clean
# title instead of the repr of a one-element Series.
juego_seleccionado = fila_similitud.index[indice_juego_referencia]

# Exclude the reference game by position instead of assuming it sorts first:
# another game with similarity 1.0 could otherwise take its place and the
# reference game would end up in its own recommendations.
es_otro_juego = pd.RangeIndex(len(fila_similitud)) != indice_juego_referencia
juegos_similares = fila_similitud[es_otro_juego].sort_values(ascending=False)

# Keep the N most similar games
juegos_recomendados = juegos_similares.head(N)

print(f"Juegos similares a '{juego_seleccionado}':")
print(juegos_recomendados)


Juegos similares a 'app_name
Counter-Strike    1.0
Name: 10, dtype: float64':
app_name
Half-Life                                   0.997779
Day of Defeat: Source                       0.988878
Saints Row 2                                0.987442
Tom Clancy's Rainbow Six® Siege             0.976660
Star Wars: Battlefront 2 (Classic, 2005)    0.961046
Name: 10, dtype: float64


In [93]:
# Disabled: export the similarity matrix DataFrame to .csv without writing the index
# df_matriz_similitudes.to_csv("df_matriz_similitudes.csv", index=False)

In [61]:
# Export the similarity matrix DataFrame to .parquet.
# NOTE(review): the original comment said the index is dropped, but no index=False is
# passed here, so the index is handled by the to_parquet default — confirm which
# behaviour the consumer of this file expects.
df_matriz_similitudes.to_parquet("df_matriz_similitudes.parquet")

In [2]:
# Read the exported parquet file back into a DataFrame
df_matriz_similitudes = pd.read_parquet("df_matriz_similitudes.parquet")

In [3]:
# Inspect the last rows of the reloaded matrix
df_matriz_similitudes.tail()

Unnamed: 0,Counter-Strike,Team Fortress Classic,Day of Defeat,Deathmatch Classic,Half-Life: Opposing Force,Ricochet,Half-Life,Counter-Strike: Condition Zero,Half-Life: Blue Shift,Half-Life 2,...,You Have 10 Seconds,Unbox: Newbie's Adventure,The Pirate: Caribbean Hunt,Sparky's Hunt,Everything is Peachy,True or False,Super Switch,You Have 10 Seconds 2,Galactic Storm,What The Box?
2829,-0.168609,0.134307,0.154339,0.169664,0.109659,0.161813,-0.164207,-0.150604,0.139521,-0.131903,...,0.7884,-0.017264,0.444531,0.790042,0.569581,1.0,0.787619,0.998152,0.59197,0.542889
2830,-0.218547,0.150552,0.180325,0.201712,0.122191,0.189287,-0.214049,-0.178831,0.159915,-0.179188,...,0.989602,0.039256,0.558021,0.997122,0.733206,0.787619,1.0,0.798073,0.756372,0.704751
2831,-0.172473,0.150093,0.170443,0.185017,0.126205,0.177155,-0.167111,-0.159306,0.155877,-0.129814,...,0.806507,-0.059308,0.454732,0.804591,0.570051,0.998152,0.798073,1.0,0.596331,0.539294
2832,0.147868,0.714632,0.73446,0.744914,0.645889,0.742251,0.114012,0.409164,0.715173,-0.073668,...,0.739474,0.366638,0.685819,0.749407,0.562879,0.59197,0.756372,0.596331,1.0,0.988938
2833,0.176318,0.700165,0.721017,0.732233,0.632295,0.728903,0.138586,0.463808,0.701278,-0.071223,...,0.668815,0.484018,0.658293,0.687602,0.542775,0.542889,0.704751,0.539294,0.988938,1.0
