In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [318]:
data_review = pd.read_parquet("dataSet/user_review_clean.parquet")

In [319]:
data_item = pd.read_parquet("dataSet/user_items_clean.parquet")

In [320]:
data_review = data_review.sample(n=1000, random_state=40)

In [321]:
data_item = data_item.sample(n=1000, random_state=40)

In [322]:
data_review.shape

(1000, 4)

In [323]:
data_item.shape

(1000, 7)

In [324]:
data_item.head()

Unnamed: 0,item_id,item_name,playtime_forever,steam_id,items_count,user_id,user_url
1905169,219150,Hotline Miami,212,76561198080157237,98,76561198080157237,http://steamcommunity.com/profiles/76561198080...
1235994,244630,NEOTOKYO°,0,76561197989723505,194,76561197989723505,http://steamcommunity.com/profiles/76561197989...
3145882,302610,Boson X,529,76561198049582412,73,pi_pie,http://steamcommunity.com/id/pi_pie
497888,220200,Kerbal Space Program,955,76561198068985504,47,76561198068985504,http://steamcommunity.com/profiles/76561198068...
2236541,24240,PAYDAY: The Heist,233,76561198087379866,31,76561198087379866,http://steamcommunity.com/profiles/76561198087...


In [325]:
data_review.head()

Unnamed: 0,review,item_id,recommend,sentimentanalysis
1042,played 5 mins and looking forward to more love...,50300,True,0.8303
34158,Sick of boring old FPS (like CoD) then pick th...,440,True,-0.4278
38951,This game is amazing it's got one of the best ...,203160,True,0.9631
4283,How to play:Step 1: Design the perfect version...,47890,True,0.802
10311,"It is good indeed, but most of the resident ev...",221040,True,-0.9437


In [326]:
data_item_tmp = data_item[['item_id', 'item_name', 'user_id']]

In [327]:
data_item_tmp.dtypes

item_id      object
item_name    object
user_id      object
dtype: object

In [328]:
data_review = pd.merge(data_review, data_item_tmp, on = 'item_id', how= 'inner', indicator=True)

In [329]:
data_review.head(5)

Unnamed: 0,review,item_id,recommend,sentimentanalysis,item_name,user_id,_merge
0,played 5 mins and looking forward to more love...,50300,True,0.8303,Spec Ops: The Line,76561198028552934,both
1,This game is amazing it's got one of the best ...,203160,True,0.9631,Tomb Raider,diet_cola,both
2,This game is amazing it's got one of the best ...,203160,True,0.9631,Tomb Raider,76561197965263006,both
3,This game is amazing it's got one of the best ...,203160,True,0.9631,Tomb Raider,botakboy,both
4,Great game,252490,True,0.6249,Rust,foolsfoolsfools,both


In [330]:
data_review.drop(['_merge'], inplace=True, axis=1)

In [331]:
data_review.isnull().sum()

review               0
item_id              0
recommend            0
sentimentanalysis    0
item_name            0
user_id              0
dtype: int64

In [332]:
data_review.duplicated().sum()

0

In [333]:
data_review.drop_duplicates(inplace=True) 

In [334]:
data_review = data_review.sample(n=1000, random_state=38)

In [335]:
#si el archivo base_modelo_clean.parquet NO EXISTE debe de omitirse este paso. Continuar en el vectorizer
data_review = pd.read_parquet("./dataSet/base_modelo_clean.parquet")

In [336]:
data_review.head(5)

Unnamed: 0,review,item_id,recommend,sentimentanalysis,item_name,user_id
0,Real Boss once you get the hang of it.,230410,True,0.0,Warframe,me2118
1,this is the best game EVER,4000,True,0.6369,Garry's Mod,76561198053502339
2,OK. So let me explain. Borderlands 2 has you s...,49520,True,-0.681,Borderlands 2,76561198077032945
3,this game is so much fun,4000,True,0.5542,Garry's Mod,TuttyGT
4,Not gonna lie runs like ♥♥♥ on the fastest,252490,False,0.3612,Rust,CuzySato


In [337]:
data_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review             1000 non-null   object 
 1   item_id            1000 non-null   int32  
 2   recommend          1000 non-null   bool   
 3   sentimentanalysis  1000 non-null   float64
 4   item_name          1000 non-null   object 
 5   user_id            1000 non-null   object 
dtypes: bool(1), float64(1), int32(1), object(3)
memory usage: 36.3+ KB


In [338]:
# Convertir 'item_id' de tipo 'object' a 'int'
data_review['item_id'] = data_review['item_id'].astype(int)

In [339]:
data_review.shape

(1000, 6)

In [340]:
# Primero, vamos a convertir la columna 'nombre' a una representación numérica usando TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data_review['review'])

# Luego, vamos a añadir las columnas numéricas a nuestra matriz de características
features = np.column_stack([tfidf_matrix.toarray(), data_review['recommend'], data_review['sentimentanalysis']])


In [341]:
features

array([[ 0.    ,  0.    ,  0.    , ...,  0.    ,  1.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  1.    ,  0.6369],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  1.    , -0.681 ],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  1.    ,  0.5106],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  1.    ,  0.9601],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.4075]])

In [342]:
# Reindexamos el DataFrame
data_review = data_review.reset_index(drop=True)



In [343]:
# Ahora, calculamos la matriz de similitud de coseno
similarity_matrix = cosine_similarity(features)



In [344]:
# Para hacer recomendaciones, puedes buscar los productos más similares a un producto dado
id_game = 1250  # El nombre del producto para el que quieres hacer recomendaciones
producto = data_review[data_review['item_id'] == id_game]

if not producto.empty:
    product_index = producto.index[0]
    product_similarities = similarity_matrix[product_index]
    most_similar_products_indices = np.argsort(-product_similarities)
    most_similar_products = data_review.loc[most_similar_products_indices, 'item_name']
    print("Los productos más similares al producto", id_game, "son:")
    print(most_similar_products)
else:
    print("Producto no encontrado")


Los productos más similares al producto 1250 son:
485                       Killing Floor
336    Counter-Strike: Global Offensive
501    Counter-Strike: Global Offensive
409    Counter-Strike: Global Offensive
445                       Left 4 Dead 2
                     ...               
440                         Garry's Mod
658                            PAYDAY 2
173                            PAYDAY 2
718                   Heroes & Generals
73                    Heroes & Generals
Name: item_name, Length: 1000, dtype: object


In [345]:
# Para hacer recomendaciones, puedes buscar los productos más similares a un producto dado
id_user = 'diet_cola'  # El nombre del producto para el que quieres hacer recomendaciones
producto = data_review[data_review['user_id'] == id_user]

if not producto.empty:
    product_index = producto.index[0]
    product_similarities = similarity_matrix[product_index]
    most_similar_products_indices = np.argsort(-product_similarities)
    most_similar_products = data_review.loc[most_similar_products_indices, 'item_name']
    print("Los productos más similares al producto", id_game, "son:")
    print(most_similar_products)
else:
    print("Producto no encontrado")


Los productos más similares al producto 1250 son:
806              Tomb Raider
5                Tomb Raider
520              Tomb Raider
309    Saints Row: The Third
305        BioShock Infinite
               ...          
853              Garry's Mod
173                 PAYDAY 2
658                 PAYDAY 2
73         Heroes & Generals
718        Heroes & Generals
Name: item_name, Length: 1000, dtype: object


In [352]:
data_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review             1000 non-null   object 
 1   item_id            1000 non-null   int32  
 2   recommend          1000 non-null   bool   
 3   sentimentanalysis  1000 non-null   float64
 4   item_name          1000 non-null   object 
 5   user_id            1000 non-null   object 
dtypes: bool(1), float64(1), int32(1), object(3)
memory usage: 36.3+ KB


In [350]:
modelo = './dataSet/base_modelo_clean.parquet'
data_review.to_parquet(modelo, index=False)
print(f'Se guardó el archivo {modelo}')

Se guardó el archivo ./dataSet/base_modelo_clean.parquet


In [347]:
most_similar_products[:5]

806              Tomb Raider
5                Tomb Raider
520              Tomb Raider
309    Saints Row: The Third
305        BioShock Infinite
Name: item_name, dtype: object

In [18]:
def recomendacion_juego(id_de_producto:int):
    modelo = pd.read_parquet('base_modelo_clean.parquet')
    #Primero, vamos a convertir la columna 'title' a una representación numérica usando TF-IDF (Frecuencia de Terminos - Freciencia Inversa de Terminos)

    
    id_de_producto = str(id_de_producto)
    modelo['item_id'] = modelo['item_id'].astype(str)


    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(modelo['item_id'])

    #se buscan los que son similares al id del juego
    features = np.column_stack([tfidf_matrix.toarray(), modelo['recommend'], modelo['sentimentanalysis']])

    #Verificamos la existencia del Id del juego a establecer simulitud de juegos
    result = modelo[modelo['item_id'] == id_de_producto]
    nombre_del_juego=result.iloc[0]['item_name']
    if result.empty:
        return "No hay datos para el datos para el juego seleccionado."
    #Reindexamos el DataFrame
    data_aplicativo = modelo.reset_index(drop=True)
    
    #Ahora, calculamos la matriz de similitud de coseno
    similarity_matrix = cosine_similarity(features)

    # Para hacer recomendaciones, puedes buscar los juegos más similares a un juego dado
    juego = data_aplicativo[data_aplicativo['item_name'] == nombre_del_juego].index[0]
    score = list(enumerate(similarity_matrix[juego]))
    score= sorted(score, key=lambda x: x[1],reverse=True)
    resultado = score[1:6]
    total = modelo['item_name'].iloc[[i[0] for i in resultado]].tolist()
    
    return{'Juego Recomendado ': total}

    
recomendacion_juego(49520)

{'Juego Recomendado ': ['Borderlands 2',
  'Borderlands 2',
  'Borderlands 2',
  'Borderlands 2',
  'Borderlands 2']}

In [8]:
modelo = pd.read_parquet('base_modelo_clean.parquet')
modelo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review             1000 non-null   object 
 1   item_id            1000 non-null   int32  
 2   recommend          1000 non-null   bool   
 3   sentimentanalysis  1000 non-null   float64
 4   item_name          1000 non-null   object 
 5   user_id            1000 non-null   object 
dtypes: bool(1), float64(1), int32(1), object(3)
memory usage: 36.3+ KB


In [14]:
modelo.head(5)

Unnamed: 0,review,item_id,recommend,sentimentanalysis,item_name,user_id
0,Real Boss once you get the hang of it.,230410,True,0.0,Warframe,me2118
1,this is the best game EVER,4000,True,0.6369,Garry's Mod,76561198053502339
2,OK. So let me explain. Borderlands 2 has you s...,49520,True,-0.681,Borderlands 2,76561198077032945
3,this game is so much fun,4000,True,0.5542,Garry's Mod,TuttyGT
4,Not gonna lie runs like ♥♥♥ on the fastest,252490,False,0.3612,Rust,CuzySato
