In [53]:
# Cargar las librerías a utilizar
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [9]:
# Cargar los Datasets
df_user_items = pd.read_csv("data/user_items_3.csv")
df_steam_games = pd.read_csv("data/steam_games_3.csv")
df_user_reviews = pd.read_csv("data/user_reviews_3.csv")

Para el sistema de recomendación Item-Item voy a usar Similaridad del Coseno de Scikit Learn. A partir de los datasets que salieron del EDA, lo primero a realizar, es dejar en cada dataset una fila para cada juego, y adaptar las variables para que puedan ser consumidas por el modelo. Logrado esto lo siguiente es unirlos y eliminar los juegos que no tengan datos de los tres datasets. Con este dataset final voy a realizar el modelado.

### df_user_items

In [85]:
df_user_items

Unnamed: 0,item_id,item_name,playtime_forever,user_id
0,10,Counter-Strike,6,76561197970982479
1,30,Day of Defeat,7,76561197970982479
2,240,Counter-Strike: Source,1853,76561197970982479
3,3830,Psychonauts,333,76561197970982479
4,2630,Call of Duty 2,75,76561197970982479
...,...,...,...,...
3004627,304930,Unturned,677,76561198329548331
3004628,227940,Heroes & Generals,43,76561198329548331
3004629,388490,One Way To Die: Steam Edition,3,76561198329548331
3004630,521570,You Have 10 Seconds 2,4,76561198329548331


In [86]:
# Eliminar la columna del usuario ya que no la voy a utilizar
df_user_items.drop("user_id", axis=1, inplace=True)

In [87]:
# Eliminar la columna con el nombre del juego, ya que lo voy a sacar del dataframe de steam_games
df_user_items.drop("item_name", axis=1, inplace=True)

In [88]:
# Renombrar la columna de 'item_id'
df_user_items = df_user_items.rename(columns={'item_id': 'id'})

In [89]:
df_user_items

Unnamed: 0,id,playtime_forever
0,10,6
1,30,7
2,240,1853
3,3830,333
4,2630,75
...,...,...
3004627,304930,677
3004628,227940,43
3004629,388490,3
3004630,521570,4


In [90]:
# Sumar los minutos jugados para cada juego
df_user_items = df_user_items.groupby(['id'])['playtime_forever'].sum()

In [91]:
# Volver a generar el DataFrame a partir de la serie resultante
df_user_items = df_user_items.to_frame()

In [92]:
# Resetear el índice para que no sea la columna id
df_user_items.reset_index(inplace=True)

In [93]:
# Instanciar el normalizador
scaler = StandardScaler()

In [94]:
# Normalizar los datos
df_user_items['playtime_forever'] = scaler.fit_transform(df_user_items[['playtime_forever']])

In [95]:
df_user_items

Unnamed: 0,id,playtime_forever
0,10,1.731158
1,20,0.116053
2,30,0.031866
3,40,-0.127119
4,50,0.426963
...,...,...
10007,527570,-0.157914
10008,527810,-0.157914
10009,527890,-0.157915
10010,527900,-0.157868


In [97]:
df_user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10012 entries, 0 to 10011
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10012 non-null  int64  
 1   playtime_forever  10012 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 156.6 KB


### df_steam_games

In [52]:
df_steam_games

Unnamed: 0,app_name,release_date,price,id,developer,genre
0,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Action
1,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Casual
2,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Indie
3,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Simulation
4,Lost Summoner Kitty,2018.0,4.99,761140.0,Kotoshiro,Strategy
...,...,...,...,...,...,...
67582,EXIT 2 - Directions,2017.0,4.99,658870.0,"xropi,stev3ns",Indie
67583,Maze Run VR,,4.99,681550.0,sin datos,Adventure
67584,Maze Run VR,,4.99,681550.0,sin datos,Indie
67585,Maze Run VR,,4.99,681550.0,sin datos,Action


In [58]:
# Eliminar la columna 'release_date' ya que no la voy a utilizar
df_steam_games.drop("release_date", axis=1, inplace=True)

# Eliminar la columna 'developer' ya que no la voy a utilizar
df_steam_games.drop("developer", axis=1, inplace=True)

In [59]:
df_steam_games.drop_duplicates()

Unnamed: 0,app_name,price,id,genre
0,Lost Summoner Kitty,4.99,761140.0,Action
1,Lost Summoner Kitty,4.99,761140.0,Casual
2,Lost Summoner Kitty,4.99,761140.0,Indie
3,Lost Summoner Kitty,4.99,761140.0,Simulation
4,Lost Summoner Kitty,4.99,761140.0,Strategy
...,...,...,...,...
67582,EXIT 2 - Directions,4.99,658870.0,Indie
67583,Maze Run VR,4.99,681550.0,Adventure
67584,Maze Run VR,4.99,681550.0,Indie
67585,Maze Run VR,4.99,681550.0,Action


In [71]:
# Realizar un one-hot encoding para los géneros
df_encoded = pd.get_dummies(df_steam_games['genre'])
df_encoded

Unnamed: 0,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,True,False,False,False,False,False,False
1,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False
3,False,False,False,False,False,True,False
4,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...
67582,False,False,False,True,False,False,False
67583,False,True,False,False,False,False,False
67584,False,False,False,True,False,False,False
67585,True,False,False,False,False,False,False


In [72]:
# Convertir los valores True y False en 1 y 0 respectivamente
df_encoded = df_encoded.replace({True: 1, False: 0})
df_encoded

Unnamed: 0,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
67582,0,0,0,1,0,0,0
67583,0,1,0,0,0,0,0
67584,0,0,0,1,0,0,0
67585,1,0,0,0,0,0,0


In [73]:
# Concatenar el DataFrame original con las columnas codificadas
df_steam_games = pd.concat([df_steam_games, df_encoded], axis=1)
df_steam_games

Unnamed: 0,app_name,price,id,genre,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,Lost Summoner Kitty,4.99,761140.0,Action,1,0,0,0,0,0,0
1,Lost Summoner Kitty,4.99,761140.0,Casual,0,0,1,0,0,0,0
2,Lost Summoner Kitty,4.99,761140.0,Indie,0,0,0,1,0,0,0
3,Lost Summoner Kitty,4.99,761140.0,Simulation,0,0,0,0,0,1,0
4,Lost Summoner Kitty,4.99,761140.0,Strategy,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
67582,EXIT 2 - Directions,4.99,658870.0,Indie,0,0,0,1,0,0,0
67583,Maze Run VR,4.99,681550.0,Adventure,0,1,0,0,0,0,0
67584,Maze Run VR,4.99,681550.0,Indie,0,0,0,1,0,0,0
67585,Maze Run VR,4.99,681550.0,Action,1,0,0,0,0,0,0


In [74]:
# Agrupa por la columna que identifica cada juego (por ejemplo, el nombre del juego)
df_steam_games = df_steam_games.groupby('app_name').sum()
df_steam_games

Unnamed: 0_level_0,price,id,genre,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
app_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
! That Bastard Is Trying To Steal Our Gold !,15.96,1799760.0,ActionAdventureCasualIndie,1,1,1,1,0,0,0
"""BUTTS: The VR Experience""",0.99,439260.0,Casual,0,0,1,0,0,0,0
"""Barely Attuned Magic Thingy"" Staff",0.00,924489.0,ActionIndieRPG,1,0,0,1,1,0,0
"""Glow Ball"" - The billiard puzzle game",14.97,1165170.0,CasualIndieStrategy,0,0,1,1,0,0,1
"""Just Another Day"" - Seduce Me Otome CD",9.98,909580.0,CasualSimulation,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
（尘沙惑设定集）Lost in Secular Love - Concept Design Works,15.96,2164880.0,AdventureCasualIndieSimulation,0,1,1,1,0,1,0
４人打ちアクション麻雀 / ACTION MAHJONG,29.97,1727430.0,ActionCasualIndie,1,0,1,1,0,0,0
＜/reality＞,23.98,1124560.0,AdventureIndie,0,1,0,1,0,0,0
＜/reality＞ Original Soundtrack,7.98,1253700.0,AdventureIndie,0,1,0,1,0,0,0


In [75]:
# Resetear el índice para mantener la columna "app_name" como una columna en lugar de índice
df_steam_games.reset_index(inplace=True)

In [76]:
# Eliminar la columna 'genre' que ya no es necesaria
df_steam_games.drop("genre", axis=1, inplace=True)
df_steam_games

Unnamed: 0,app_name,price,id,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,! That Bastard Is Trying To Steal Our Gold !,15.96,1799760.0,1,1,1,1,0,0,0
1,"""BUTTS: The VR Experience""",0.99,439260.0,0,0,1,0,0,0,0
2,"""Barely Attuned Magic Thingy"" Staff",0.00,924489.0,1,0,0,1,1,0,0
3,"""Glow Ball"" - The billiard puzzle game",14.97,1165170.0,0,0,1,1,0,0,1
4,"""Just Another Day"" - Seduce Me Otome CD",9.98,909580.0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
29680,（尘沙惑设定集）Lost in Secular Love - Concept Design ...,15.96,2164880.0,0,1,1,1,0,1,0
29681,４人打ちアクション麻雀 / ACTION MAHJONG,29.97,1727430.0,1,0,1,1,0,0,0
29682,＜/reality＞,23.98,1124560.0,0,1,0,1,0,0,0
29683,＜/reality＞ Original Soundtrack,7.98,1253700.0,0,1,0,1,0,0,0


In [77]:
# Normalizar los datos de precio
df_steam_games['price'] = scaler.fit_transform(df_steam_games[['price']])
df_steam_games

Unnamed: 0,app_name,price,id,Action,Adventure,Casual,Indie,RPG,Simulation,Strategy
0,! That Bastard Is Trying To Steal Our Gold !,-0.033980,1799760.0,1,1,1,1,0,0,0
1,"""BUTTS: The VR Experience""",-0.767147,439260.0,0,0,1,0,0,0,0
2,"""Barely Attuned Magic Thingy"" Staff",-0.815633,924489.0,1,0,0,1,1,0,0
3,"""Glow Ball"" - The billiard puzzle game",-0.082466,1165170.0,0,0,1,1,0,0,1
4,"""Just Another Day"" - Seduce Me Otome CD",-0.326855,909580.0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
29680,（尘沙惑设定集）Lost in Secular Love - Concept Design ...,-0.033980,2164880.0,0,1,1,1,0,1,0
29681,４人打ちアクション麻雀 / ACTION MAHJONG,0.652170,1727430.0,1,0,1,1,0,0,0
29682,＜/reality＞,0.358805,1124560.0,0,1,0,1,0,0,0
29683,＜/reality＞ Original Soundtrack,-0.424806,1253700.0,0,1,0,1,0,0,0


In [78]:
df_steam_games["app_name"].nunique()

29685

In [98]:
df_steam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29685 entries, 0 to 29684
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   app_name    29685 non-null  object 
 1   price       29685 non-null  float64
 2   id          29685 non-null  float64
 3   Action      29685 non-null  int64  
 4   Adventure   29685 non-null  int64  
 5   Casual      29685 non-null  int64  
 6   Indie       29685 non-null  int64  
 7   RPG         29685 non-null  int64  
 8   Simulation  29685 non-null  int64  
 9   Strategy    29685 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 2.3+ MB


### df_user_reviews

In [None]:
df_user_reviews = pd.read_csv("data/user_reviews_3.csv")

In [79]:
df_user_reviews

Unnamed: 0,user_id,posted,id,recommend,sentiment
0,76561197970982479,2011.0,1250.0,1,2
1,76561197970982479,2011.0,22200.0,1,2
2,76561197970982479,2011.0,43110.0,1,2
3,js41637,2014.0,251610.0,1,2
4,js41637,2013.0,227300.0,1,0
...,...,...,...,...,...
58426,76561198312638244,sin fecha,70.0,1,2
58427,76561198312638244,sin fecha,362890.0,1,2
58428,LydiaMorley,sin fecha,273110.0,1,2
58429,LydiaMorley,sin fecha,730.0,1,2


In [80]:
# Eliminar la columna de user_id ya que no voy a recomendar por usuario
df_user_reviews.drop("user_id", axis=1, inplace=True)

# Eliminar la columna de de la fecha de posteo, porque voy a independizar mi análisis de las fechas
df_user_reviews.drop("posted", axis=1, inplace=True)

df_user_reviews

Unnamed: 0,id,recommend,sentiment
0,1250.0,1,2
1,22200.0,1,2
2,43110.0,1,2
3,251610.0,1,2
4,227300.0,1,0
...,...,...,...
58426,70.0,1,2
58427,362890.0,1,2
58428,273110.0,1,2
58429,730.0,1,2


In [81]:
# La columna 'recommend' ya está normalizada
# Normalizar la columna 'sentiment' convirtiendo los valores 0, 1 y 2 en 0, 0.5 y 1 respectivamente
df_user_reviews["sentiment"] = df_user_reviews["sentiment"].replace({1: 0.5, 2: 1})
df_user_reviews

Unnamed: 0,id,recommend,sentiment
0,1250.0,1,1.0
1,22200.0,1,1.0
2,43110.0,1,1.0
3,251610.0,1,1.0
4,227300.0,1,0.0
...,...,...,...
58426,70.0,1,1.0
58427,362890.0,1,1.0
58428,273110.0,1,1.0
58429,730.0,1,1.0


In [100]:
df_user_reviews["id"].nunique()

3682

In [99]:
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58431 entries, 0 to 58430
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         58431 non-null  float64
 1   recommend  58431 non-null  int64  
 2   sentiment  58431 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.3 MB


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Supongamos que tienes una matriz de calificaciones de usuarios e ítems
ratings = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

# Calcula la similitud del coseno entre los ítems
item_similarity = cosine_similarity(ratings.T)

# Ahora puedes usar esta matriz de similitud del ítem para recomendar ítems similares


# Homework
matriz = pd.DataFrame(cosine_similarity(df_esc, df_esc))

sentiment: Esta variable podría tratarse como categórica ordinal, ya que sus valores representan un orden (negativo < neutro < positivo). Podrías dejarla como está, o si prefieres, podrías normalizarla para que los valores estén entre 0 y 1.  