In [119]:
import pandas as pd
import os

path_reviews = os.path.join('..','data','clear','user_reviews.csv.gz')
path_steam_games = os.path.join('..','data','clear','steam_games.csv.gz')

In [120]:
## Cargamos la data necesaria para solucionar este problema.

steam_colums = ['id','app_name','publisher','genres','price','developer','release_year']
# steam_colums = ['id','app_name','release_year']
steam_games = pd.read_csv(path_steam_games,usecols=steam_colums)

## renombraremos la columna id por steam_id
steam_games.rename(columns={'id':'steam_id'}, inplace=True)


In [121]:
steam_games.head(4)

Unnamed: 0,publisher,genres,app_name,price,steam_id,developer,release_year
0,Kotoshiro,"Strategy, Action, Indie, Casual, Simulation",Lost summoner kitty,4.99,761140.0,Kotoshiro,2018
1,"Making fun, inc.","Free to Play, Strategy, Indie, RPG, Card Game,...",Ironbound,0.0,643980.0,Secret level srl,2018
2,Poolians.com,"Free to Play, Simulation, Sports, Casual, Indi...",Real pool 3d - poolians,0.0,670290.0,Poolians.com,2017
3,彼岸领域,"Action, Adventure, Casual",弹炸人2222,0.99,767400.0,彼岸领域,2017


In [122]:
import pandas as pd
from collections import Counter
from itertools import chain

def select_n_best(caracteristica, n):
    
    # Dividir las palabras en cada fila y contar la frecuencia de cada palabra
    conteo_palabras = Counter(chain.from_iterable(caracteristica.str.split(', ')))

    # Obtener las n palabras más comunes
    palabras_mas_comunes = conteo_palabras.most_common(n)

    print("Palabras más comunes:")
    for palabra, frecuencia in palabras_mas_comunes:
        print(f"{palabra}: {frecuencia}")
    
    return([palabras_mas_comunes[i][0] for i in range(n)])

In [123]:
steam_games['genres']

0              Strategy, Action, Indie, Casual, Simulation
1        Free to Play, Strategy, Indie, RPG, Card Game,...
2        Free to Play, Simulation, Sports, Casual, Indi...
3                                Action, Adventure, Casual
4                            Action, Indie, Casual, Sports
                               ...                        
29970                  Strategy, Indie, Casual, Simulation
29971                              Strategy, Indie, Casual
29972                            Indie, Simulation, Racing
29973    Indie, Casual, Puzzle, Singleplayer, Atmospher...
29974    Early Access, Adventure, Indie, Action, Simula...
Name: genres, Length: 29975, dtype: object

In [124]:
palabras_mas_relevantes = select_n_best(steam_games['genres'], 10)

Palabras más comunes:
Indie: 16783
Action: 12320
Casual: 9471
Adventure: 9344
Simulation: 7508
Strategy: 7289
RPG: 5552
Singleplayer: 4268
Multiplayer: 2283
Great Soundtrack: 2185


In [125]:
def drop_not_important(genres):
  return(', ').join([palabra for palabra in genres.split(', ') if palabra.capitalize() in palabras_mas_relevantes])


In [126]:
steam_games['genres'] = steam_games['genres'].apply(drop_not_important)

In [127]:

cols_concatenate = ['app_name','developer','genres']

steam_games['features'] = steam_games[cols_concatenate].apply(lambda row: ', '.join(row), axis=1)

In [128]:
consulta = steam_games[['steam_id','features']].head(5000)

In [129]:
import nltk
from nltk.corpus import stopwords

#Eliminaremos las stopwords

stop_words_steams = ['op','based','co','first','000', '101', '102', '18', '2008']
stop = list(stopwords.words('english'))
stop += stop_words_steams

In [130]:
import os 
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [131]:
tf = TfidfVectorizer(stop_words=stop)

In [132]:
tf_idf_matrix_prueba =tf.fit_transform(consulta['features'])

In [99]:
# tf.get_feature_names_out()

In [133]:
tf_idf_matrix_prueba.toarray().shape

(5000, 7883)

In [134]:
data_vector_df = pd.DataFrame(tf_idf_matrix_prueba.toarray(),
                              columns = tf.get_feature_names_out(),
                              index = consulta['steam_id'])

In [136]:
# data_vector_df

In [139]:
vector_similitud_coseno = cosine_similarity(data_vector_df.values)  ## crea la matriz

cos_sim_df = pd.DataFrame(vector_similitud_coseno, index=data_vector_df.index, columns=data_vector_df.index)

juego_simil = cos_sim_df.loc[2780] ##  se trae la linea 2780 (indice) 

In [142]:
import joblib

joblib.dump(vector_similitud_coseno, 'prueba06.pkl',compress=1)

['prueba06.pkl']

## carga de datos

In [143]:
with open('prueba06.pkl','rb') as file:
  modelo = joblib.load(file)


In [145]:
modelo

array([[1.        , 0.03351191, 0.03482668, ..., 0.03909307, 0.02202796,
        0.03446117],
       [0.03351191, 1.        , 0.00836192, ..., 0.        , 0.01126954,
        0.        ],
       [0.03482668, 0.00836192, 1.        , ..., 0.02697959, 0.00732202,
        0.02378295],
       ...,
       [0.03909307, 0.        , 0.02697959, ..., 1.        , 0.        ,
        0.65132936],
       [0.02202796, 0.01126954, 0.00732202, ..., 0.        , 1.        ,
        0.        ],
       [0.03446117, 0.        , 0.02378295, ..., 0.65132936, 0.        ,
        1.        ]])

In [114]:

modelo = pd.DataFrame(modelo, index=data_vector_df.index, columns=data_vector_df.index)

array([[1.        , 0.02744998, 0.02978931, ..., 0.03321324, 0.02254848,
        0.06258432],
       [0.02744998, 1.        , 0.00594145, ..., 0.01642366, 0.02185189,
        0.01248239],
       [0.02978931, 0.00594145, 1.        , ..., 0.01105389, 0.        ,
        0.04253039],
       ...,
       [0.03321324, 0.01642366, 0.01105389, ..., 1.        , 0.01979625,
        0.02322311],
       [0.02254848, 0.02185189, 0.        , ..., 0.01979625, 1.        ,
        0.        ],
       [0.06258432, 0.01248239, 0.04253039, ..., 0.02322311, 0.        ,
        1.        ]])

In [104]:
juego_simil

steam_id
761140.0    0.029803
643980.0    0.083293
670290.0    0.012240
767400.0    0.014541
773570.0    0.016057
              ...   
684110.0    0.000000
349790.0    0.000000
670700.0    0.015813
665310.0    0.000000
563420.0    0.025715
Name: 2780.0, Length: 10000, dtype: float64

In [105]:
simil_ordenada = juego_simil.sort_values(ascending=False)

In [106]:
simil_ordenada

steam_id
2780.0      1.000000
304380.0    0.639207
304400.0    0.606144
332350.0    0.526931
639600.0    0.526931
              ...   
533600.0    0.000000
319560.0    0.000000
533090.0    0.000000
421110.0    0.000000
380710.0    0.000000
Name: 2780.0, Length: 10000, dtype: float64

In [107]:
resultado = simil_ordenada.head(6).reset_index()


In [108]:
resultado

Unnamed: 0,steam_id,2780.0
0,2780.0,1.0
1,304380.0,0.639207
2,304400.0,0.606144
3,332350.0,0.526931
4,639600.0,0.526931
5,288520.0,0.487864


In [109]:
ids = resultado['steam_id']

list(consulta.merge(resultado).sort_values(2780, ascending=False)['features'].apply(lambda x: x.split(',')[0])[1:6])


  list(consulta.merge(resultado).sort_values(2780, ascending=False)['features'].apply(lambda x: x.split(',')[0])[1:6])


['Arma 3 helicopters',
 'Arma 3 dlc bundle 1',
 'Arma 3 marksmen',
 'Arma 3 malden',
 'Arma 3 karts']