In [1]:
import datetime

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import joblib

import nltk
nltk.download('vader_lexicon')
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.sentiment import SentimentIntensityAnalyzer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ozi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Assuming you have a pre-trained sentiment analysis model
sentiment_analyzer_model = SentimentIntensityAnalyzer()

df = pd.read_csv("../datasets/processed/datasets.csv")

columns_to_drop = ['Unnamed: 0', 'user_id', 'user_reviews_item_id', 'users_items_item_id']
df = df.drop(columns=columns_to_drop)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47675 entries, 0 to 47674
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres_str        47675 non-null  object 
 1   app_name          47675 non-null  object 
 2   release_date      47675 non-null  object 
 3   price             47675 non-null  float64
 4   game_id           47675 non-null  int64  
 5   developer         47675 non-null  object 
 6   year              47675 non-null  int64  
 7   recommend         47675 non-null  int64  
 8   review            47675 non-null  object 
 9   posted_date       47675 non-null  object 
 10  items_count       47675 non-null  int64  
 11  steam_id          47675 non-null  int64  
 12  item_name         47675 non-null  object 
 13  playtime_forever  47675 non-null  int64  
 14  playtime_2weeks   47675 non-null  int64  
dtypes: float64(1), int64(7), object(7)
memory usage: 5.5+ MB
None


In [3]:
df.head(20)

Unnamed: 0,genres_str,app_name,release_date,price,game_id,developer,year,recommend,review,posted_date,items_count,steam_id,item_name,playtime_forever,playtime_2weeks
0,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,og cs is god,2015-05-23,11,76561197971591953,Counter-Strike,0,0
1,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,GoODGAmP!!!,2024-04-01,11,76561197971591953,Counter-Strike,0,0
2,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,Best game ever can't believe it's still alive.,2015-05-30,11,76561197971591953,Counter-Strike,0,0
3,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,echamos unas partidas,2013-11-26,11,76561197971591953,Counter-Strike,0,0
4,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,dA-beSt<3,2013-10-06,11,76561197971591953,Counter-Strike,0,0
5,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,Counter-Strike makes you work to enjoy the con...,2015-12-12,11,76561197971591953,Counter-Strike,0,0
6,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,o cs e um otimo jogo de fps ele quase sempre,2014-12-14,11,76561197971591953,Counter-Strike,0,0
7,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,,2014-01-05,11,76561197971591953,Counter-Strike,0,0
8,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,A good fps but i would get Source or GO first,2015-03-28,11,76561197971591953,Counter-Strike,0,0
9,action,Counter-Strike,2000-11-01,9.99,10,Valve,2000,1,"Hola, acabo de descargar recien, y al iniciar ...",2014-05-18,11,76561197971591953,Counter-Strike,0,0


In [6]:
df_filter = df[df['playtime_forever']!=0]
df_filter.head(25) 

Unnamed: 0,genres_str,app_name,release_date,price,game_id,developer,year,recommend,review,posted_date,items_count,steam_id,item_name,playtime_forever,playtime_2weeks
57,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,Best Cod Ever,2013-12-18,45,76561198059261781,Call of Duty: World at War,2571,0
58,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,Campaign has variety and is fun to play. Zombi...,2014-03-10,45,76561198059261781,Call of Duty: World at War,2571,0
59,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,"Multiplayers a little scetchy, but zombies is ...",2015-03-14,45,76561198059261781,Call of Duty: World at War,2571,0
60,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,Nice game I like it,2014-02-28,45,76561198059261781,Call of Duty: World at War,2571,0
61,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,amazing game,2014-01-06,45,76561198059261781,Call of Duty: World at War,2571,0
62,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,Play with me on zombies and we can get to roun...,2013-06-27,45,76561198059261781,Call of Duty: World at War,2571,0
63,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,good game 10/10 ign #YOLOSWAG,2014-12-10,45,76561198059261781,Call of Duty: World at War,2571,0
64,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,Zombies.,2014-06-25,45,76561198059261781,Call of Duty: World at War,2571,0
65,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,World at war.. Probably the best Cod I have pl...,2014-03-09,45,76561198059261781,Call of Duty: World at War,2571,0
66,action,Call of Duty: World at War,2008-11-18,19.99,10090,Treyarch,2008,1,huge zombies fan.this is the game that started...,2014-09-26,45,76561198059261781,Call of Duty: World at War,2571,0


In [136]:


# Dividir los datos en conjuntos de entrenamiento y prueba
data_train, data_test = train_test_split(df, test_size=0.3, random_state=42)
filename = '../datasets/test/'
data_train.to_csv(filename + 'data_train.csv', sep=',', index=False)
data_test.to_csv(filename + 'data_test.csv', sep=',', index=False)

In [137]:
df = pd.read_csv("../datasets/test/data_train.csv")

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33372 entries, 0 to 33371
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres_str        33372 non-null  object 
 1   app_name          33372 non-null  object 
 2   release_date      33372 non-null  object 
 3   price             33372 non-null  float64
 4   game_id           33372 non-null  int64  
 5   developer         33372 non-null  object 
 6   year              33372 non-null  int64  
 7   recommend         33372 non-null  int64  
 8   review            33372 non-null  object 
 9   posted_date       33372 non-null  object 
 10  items_count       33372 non-null  int64  
 11  steam_id          33372 non-null  int64  
 12  item_name         33372 non-null  object 
 13  playtime_forever  33372 non-null  int64  
 14  playtime_2weeks   33372 non-null  int64  
dtypes: float64(1), int64(7), object(7)
memory usage: 3.8+ MB


In [139]:
df2= df.copy()

In [140]:
df2

Unnamed: 0,genres_str,app_name,release_date,price,game_id,developer,year,recommend,review,posted_date,items_count,steam_id,item_name,playtime_forever,playtime_2weeks
0,simulation,Farming Simulator 15,2014-10-30,19.990000,313160,Giants Software,2014,1,i like this game but i think it needs more mes...,2015-11-04,101,76561198059332886,Farming Simulator 15,10141,0
1,sports,Super MNC,2012-04-19,-1.000000,104700,Uber Entertainment,2012,1,sexy,2012-05-18,39,76561198059388718,Super Monday Night Combat,0,0
2,Unknown,PAYDAY 2,1970-01-01,4.990000,218620,Unknown,1970,1,"So since Overkill now fully owns Payday 2, I f...",2015-08-25,66,76561198059260834,PAYDAY 2,175,0
3,rpg,Saturday Morning RPG,2014-01-29,6.990000,263320,Mighty Rabbit Studios,2014,1,This game is pretty much a love letter to almo...,2014-05-12,61,76561198061126419,Saturday Morning RPG,20,0
4,strategy,"Invisible, Inc.",2015-05-12,19.990000,243970,Klei Entertainment,2015,1,A great game if you enjoy Turn Based Strategy ...,2014-11-18,158,76561198060131355,"Invisible, Inc.",762,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33367,freetoplay,Dirty Bomb®,1970-01-01,-1.000000,333930,Splash Damage,1970,0,I havent played much of this game but ive alre...,2015-10-09,332,76561197971851704,Dirty Bomb,73,0
33368,rpg,Mount & Blade: Warband,2010-03-31,19.990000,48700,TaleWorlds Entertainment,2010,1,"Luv this game, like skyrim but worst but still...",2015-06-01,29,76561198059323447,Mount & Blade: Warband,13317,0
33369,strategy,The Long Dark,2017-08-01,34.990002,305620,Hinterland Studio Inc.,2017,1,,2014-11-28,115,76561198059637978,The Long Dark,90,0
33370,rpg,Realm of the Mad God,2012-02-20,-1.000000,200210,Wild Shadow Studios,2012,1,"i hope that when u die,you will not lose your ...",2014-09-23,24,76561198059261008,Realm of the Mad God,0,0


In [141]:
app_name_column = df['app_name'].copy()
game_id_column = df['game_id'].copy()
columns_to_drop = ['app_name', 'game_id','steam_id', 'developer','posted_date', 'item_name']
df = df.drop(columns=columns_to_drop)

In [142]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33372 entries, 0 to 33371
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres_str        33372 non-null  object 
 1   release_date      33372 non-null  object 
 2   price             33372 non-null  float64
 3   year              33372 non-null  int64  
 4   recommend         33372 non-null  int64  
 5   review            33372 non-null  object 
 6   items_count       33372 non-null  int64  
 7   playtime_forever  33372 non-null  int64  
 8   playtime_2weeks   33372 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 2.3+ MB


In [155]:
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.sentiment_column_name = 'sentiment_score'
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, column='review', y=None):
        if X is None or X.empty: 
            return X
        
        X[self.sentiment_column_name] = 0
        
        if column in X.columns:
            analyzer = SentimentIntensityAnalyzer()
            X[self.sentiment_column_name] = X[column].astype(str).apply(lambda x: analyzer.polarity_scores(x)['compound'])
        
        return X

    def get_feature_names_out(self, input_features=None):
        return input_features
sentiment_analyzer = SentimentAnalyzer()

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
        return self

    def transform(self, X):
        for col in self.columns:
            le = self.label_encoders[col]
            X[col] = le.transform(X[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return input_features

# Obtener las columnas numéricas y categóricas
numeric_columns = df.select_dtypes(include=['int64', 'float64', 'int32']).columns
ordinal_var = ['recommend']
categorical_columns = ["genres_str"]
# Crear una nueva variable excluyendo las columnas en ordinal_var
categorical_col_excluded_ordinal = [col for col in categorical_columns if col not in ordinal_var]
numeric_col_excluded_ordinal = [col for col in numeric_columns if col not in ordinal_var]

# Convertir las columnas categóricas a tipo str
df[categorical_col_excluded_ordinal] = df[categorical_col_excluded_ordinal].astype(str)

# Definir las transformaciones para las columnas numéricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Definir las transformaciones para las columnas ordinales
ordinal_transformer = MultiColumnLabelEncoder(columns=ordinal_var)

# Definir las transformaciones para las columnas categóricas restantes
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Crear la columna transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_col_excluded_ordinal),
        ('ord', ordinal_transformer, ordinal_var),
        ('cat', categorical_transformer, categorical_col_excluded_ordinal)
    ])


# Añadir el transformador a la pipeline
pipeline = Pipeline(steps=[
    ('sentiment_analyzer', sentiment_analyzer),
    ('preprocessor', preprocessor)
])


transformed_df = pipeline.fit_transform(df)


In [156]:
transformed_df

array([[ 0.34550946,  0.37419767, -0.25386219, ...,  0.        ,
         0.        ,  0.        ],
       [-0.99472378,  0.15607338, -0.41734584, ...,  0.        ,
         0.        ,  0.        ],
       [-0.61225609, -4.42453671, -0.34615134, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.30327512,  0.70138411, -0.21694652, ...,  0.        ,
         0.        ,  0.        ],
       [-0.99472378,  0.15607338, -0.45689833, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02625428,  0.15607338, -0.46480883, ...,  0.        ,
         0.        ,  0.        ]])

In [157]:
transformed_df_dense = pd.DataFrame(transformed_df, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())
transformed_df_dense['app_name'] = app_name_column
transformed_df_dense['game_id'] = game_id_column

# Mostrar las primeras filas del DataFrame
transformed_df_dense.head(5)

Unnamed: 0,num__price,num__year,num__items_count,num__playtime_forever,num__playtime_2weeks,num__sentiment_score,ord__recommend,cat__genres_str_Unknown,cat__genres_str_action,cat__genres_str_adventure,...,cat__genres_str_rpg,cat__genres_str_simulation,cat__genres_str_softwaretraining,cat__genres_str_sports,cat__genres_str_strategy,cat__genres_str_utilities,cat__genres_str_videoproduction,cat__genres_str_webpublishing,app_name,game_id
0,0.345509,0.374198,-0.253862,0.636688,-0.091774,-0.225712,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Farming Simulator 15,313160
1,-0.994724,0.156073,-0.417346,-0.364297,-0.091774,0.351779,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Super MNC,104700
2,-0.612256,-4.424537,-0.346151,-0.347023,-0.091774,1.2432,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PAYDAY 2,218620
3,-0.484554,0.374198,-0.359336,-0.362323,-0.091774,0.940997,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Saturday Morning RPG,263320
4,0.345509,0.48326,-0.103563,-0.289082,-0.091774,0.962144,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,"Invisible, Inc.",243970


In [158]:
# Mostrar las primeras filas del DataFrame
transformed_df_dense.columns

Index(['num__price', 'num__year', 'num__items_count', 'num__playtime_forever',
       'num__playtime_2weeks', 'num__sentiment_score', 'ord__recommend',
       'cat__genres_str_Unknown', 'cat__genres_str_action',
       'cat__genres_str_adventure', 'cat__genres_str_casual',
       'cat__genres_str_earlyaccess', 'cat__genres_str_freetoplay',
       'cat__genres_str_indie', 'cat__genres_str_massivelymultiplayer',
       'cat__genres_str_racing', 'cat__genres_str_rpg',
       'cat__genres_str_simulation', 'cat__genres_str_softwaretraining',
       'cat__genres_str_sports', 'cat__genres_str_strategy',
       'cat__genres_str_utilities', 'cat__genres_str_videoproduction',
       'cat__genres_str_webpublishing', 'app_name', 'game_id'],
      dtype='object')

In [159]:
# Eliminar las columnas no numéricas
non_numeric_columns = ['cat__genres_str_massivelymultiplayer',
                       'cat__genres_str_Unknown','cat__genres_str_casual','cat__genres_str_racing',
                       'cat__genres_str_sports',
                       'num__playtime_2weeks',
                       'cat__genres_str_earlyaccess',
                       'cat__genres_str_adventure','cat__genres_str_simulation','cat__genres_str_webpublishing','app_name', 'game_id']
transformed_df_dense = transformed_df_dense.drop(columns=non_numeric_columns)

In [160]:
transformed_df_dense['num__sentiment_score'] = transformed_df_dense['num__sentiment_score'].apply(lambda x: 0 if x < -1 else (2 if x > 1 else 1))

In [161]:
transformed_df_dense

Unnamed: 0,num__price,num__year,num__items_count,num__playtime_forever,num__sentiment_score,ord__recommend,cat__genres_str_action,cat__genres_str_freetoplay,cat__genres_str_indie,cat__genres_str_rpg,cat__genres_str_softwaretraining,cat__genres_str_strategy,cat__genres_str_utilities,cat__genres_str_videoproduction
0,0.345509,0.374198,-0.253862,0.636688,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.994724,0.156073,-0.417346,-0.364297,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.612256,-4.424537,-0.346151,-0.347023,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.484554,0.374198,-0.359336,-0.362323,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.345509,0.483260,-0.103563,-0.289082,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33367,-0.994724,-4.424537,0.355246,-0.357091,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
33368,0.345509,-0.062051,-0.443714,0.950180,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
33369,1.303275,0.701384,-0.216947,-0.355413,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
33370,-0.994724,0.156073,-0.456898,-0.364297,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [162]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample
sample_size = 2000  # ajusta este tamaño según sea necesario
sampled_data = resample(transformed_df_dense, n_samples=sample_size, random_state=42)
# Calcular la similitud del coseno
similarity_matrix = cosine_similarity(sampled_data)

['../datasets/test/similarity_matrix.joblib']

In [163]:
# Seleccionar un juego de referencia (por ejemplo, el primer juego en el conjunto de entrenamiento)
juego_referencia_index = 10

# Obtener la fila de similitud para el juego de referencia
similarity_scores = similarity_matrix[juego_referencia_index]
# Obtener los índices ordenados por similitud
sorted_indices = similarity_scores.argsort()[::-1]
# Elegir la cantidad de juegos similares que deseas obtener
num_juegos_similares = 5

# Obtener los índices de los juegos más similares (excluyendo el juego de referencia)
juegos_similares_indices = sorted_indices[1:num_juegos_similares + 1]

# Obtener los nombres de los juegos similares
juegos_similares = df2['game_id'].iloc[juegos_similares_indices]
juegos_similares

1208       730
10      287700
242     400800
1351    271590
1940    301520
Name: game_id, dtype: int64

In [165]:
# Seleccionar un juego de referencia (por ejemplo, el primer juego en el conjunto de entrenamiento)


# Seleccionar un juego de referencia (por ejemplo, el primer juego en el conjunto de entrenamiento)
juego_referencia_index = 10

# Obtener la fila de similitud para el juego de referencia
similarity_scores = similarity_matrix[juego_referencia_index]
# Obtener los índices ordenados por similitud
sorted_indices = similarity_scores.argsort()[::-1]
# Elegir la cantidad de juegos similares que deseas obtener
num_juegos_similares = 5

# Obtener los índices de los juegos más similares (excluyendo el juego de referencia)
juegos_similares_indices = sorted_indices[1:num_juegos_similares + 1]


# Obtener los nombres de los juegos similares
juegos_similares = df2['app_name'].iloc[juegos_similares_indices]



In [166]:
juegos_similares

1208            Counter-Strike: Global Offensive
10          METAL GEAR SOLID V: THE PHANTOM PAIN
242     MXGP2 - The Official Motocross Videogame
1351                          Grand Theft Auto V
1940                                   Robocraft
Name: app_name, dtype: object

In [None]:
import joblib
# Save the similarity matrix to a file using joblib
joblib.dump(similarity_matrix, '../datasets/test/similarity_matrix.joblib')