In [1]:
import pandas as pd
import numpy as np

import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator

import pyarrow as pa
import pyarrow.parquet as pq

Traemos la data necesaria

In [2]:
df_reviews = pd.read_parquet("../Datasets/reviews_analisis_sentiment.parquet")
df_items = pd.read_parquet("../Datasets/australian_items_limpio.parquet")

In [3]:
df_reviews.head(2)

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_recommend,reviews_date,year,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,2011-11-05,2011,2
1,js41637,http://steamcommunity.com/id/js41637,251610,True,2014-06-24,2014,2


In [4]:
df_items.head(2)

Unnamed: 0,user_id,item_id,item_name,playtime_forever
0,76561197970982479,10,Counter-Strike,6
1,76561197970982479,20,Team Fortress Classic,0


Para el modelo de recomendación se busca obtener una lista de 5 juegos a partir del id de un producto, o de un usuario  

Con la intencion de crear un puntaje que nos permita clasificar los juegos de una manera eficiente, vamos a crear una clasificacion tomando en cuenta el analisis de sentimiento y los reviews_recommend, para poder obtener esta escala:

1 si el análisis de sentimiento es negativo ya sea que este recomendado o no (True o False)

2 si el análisis de sentimiento es neutral y no es recomendado (False)

3 si el análisis de sentimiento es neutral pero es recomendado (True)

4 si el análisis de sentimiento es positivo y no es recomendado (False)

5 si el análisis de sentimiento es positivo y es recomendado (True)

Creamos la siguiente funcion

In [5]:
def calcula_rating(row):
   
    if row["sentiment_analysis"] == 0 and not row["reviews_recommend"]:
        return 1
    elif row["sentiment_analysis"] == 0 and row["reviews_recommend"]:
        return 1
    elif row["sentiment_analysis"] == 1 and not row["reviews_recommend"]:
        return 2
    elif row["sentiment_analysis"] == 1 and row["reviews_recommend"]:
        return 3
    elif row["sentiment_analysis"] == 2 and not row["reviews_recommend"]:
        return 4
    elif row["sentiment_analysis"] == 2 and row["reviews_recommend"]:
        return 5
    else:
        return None

Aplicamos la funcion a un anueva columna en el dataset

In [6]:
df_reviews["rating"] = df_reviews.apply(calcula_rating, axis=1)
df_reviews

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_recommend,reviews_date,year,sentiment_analysis,rating
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,2011-11-05,2011,2,5
1,js41637,http://steamcommunity.com/id/js41637,251610,True,2014-06-24,2014,2,5
3,doctr,http://steamcommunity.com/id/doctr,250320,True,2013-10-14,2013,2,5
4,maplemage,http://steamcommunity.com/id/maplemage,211420,True,2014-04-15,2014,1,3
5,Wackky,http://steamcommunity.com/id/Wackky,249130,True,2014-05-05,2014,1,3
...,...,...,...,...,...,...,...,...
231273,kushikushigani,http://steamcommunity.com/id/kushikushigani,332310,True,2015-12-30,2015,1,3
231291,How51,http://steamcommunity.com/id/How51,440,True,2014-08-15,2014,0,1
231293,76561198111410893,http://steamcommunity.com/profiles/76561198111...,304930,True,2014-08-02,2014,2,5
231419,zaza147,http://steamcommunity.com/id/zaza147,265630,True,2015-07-31,2015,2,5


Seleccionamos oslo las columnas que necesitaremos

In [7]:
df_reviews = df_reviews[["user_id", "reviews_item_id", "rating"]]
df_reviews

Unnamed: 0,user_id,reviews_item_id,rating
0,76561197970982479,1250,5
1,js41637,251610,5
3,doctr,250320,5
4,maplemage,211420,3
5,Wackky,249130,3
...,...,...,...
231273,kushikushigani,332310,3
231291,How51,440,1
231293,76561198111410893,304930,5
231419,zaza147,265630,5


Seleccionamos los columnas necesarias de items

In [8]:
df_items = df_items[["item_id", "item_name"]]
df_items

Unnamed: 0,item_id,item_name
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force
...,...,...
999995,333930,Dirty Bomb
999996,366844,Call of Duty: Black Ops III - Awakening DLC Pack
999997,377160,Fallout 4
999998,384190,ABZU


In [9]:
# se beorran los datos duplicados
df_items = df_items.drop_duplicates()

Unimops los datasets

In [11]:
df = df_reviews.merge(df_items, left_on="reviews_item_id", right_on="item_id", how="left")
df

Unnamed: 0,user_id,reviews_item_id,rating,item_id,item_name
0,76561197970982479,1250,5,1250,Killing Floor
1,js41637,251610,5,251610,Barbie™ Dreamhouse Party™
2,doctr,250320,5,250320,The Wolf Among Us
3,maplemage,211420,3,211420,Dark Souls: Prepare to Die Edition
4,Wackky,249130,3,249130,LEGO® MARVEL Super Heroes
...,...,...,...,...,...
47594,kushikushigani,332310,3,332310,LEGO® Worlds
47595,How51,440,1,,
47596,76561198111410893,304930,5,304930,Unturned
47597,zaza147,265630,5,265630,Fistful of Frags


In [12]:
df.isnull().sum()

user_id               0
reviews_item_id       0
rating                0
item_id            5619
item_name          5619
dtype: int64

Podemos observar que existe una cantidad de datos nulos en las columnas item, esto puede deberse a que algunos juego que recibieron recomendaciones, no existen en el dataframe

In [13]:
df = df.dropna(subset=['item_id'])

In [14]:
df.isnull().any()

user_id            False
reviews_item_id    False
rating             False
item_id            False
item_name          False
dtype: bool

Podemos verificar que eliminamos los registros nulos 

Seleccionamos solo las columnas que cumplen una funcion en nuestro analisis

In [16]:
df = df[["user_id", "item_name", "rating"]]
df

Unnamed: 0,user_id,item_name,rating
0,76561197970982479,Killing Floor,5
1,js41637,Barbie™ Dreamhouse Party™,5
2,doctr,The Wolf Among Us,5
3,maplemage,Dark Souls: Prepare to Die Edition,3
4,Wackky,LEGO® MARVEL Super Heroes,3
...,...,...,...
47593,76561198107177722,BattleBlock Theater,5
47594,kushikushigani,LEGO® Worlds,3
47596,76561198111410893,Unturned,5
47597,zaza147,Fistful of Frags,5


Procedemos a guardar la data en formato parquet

In [17]:
df.to_parquet("../Funciones/data/modelo_recomendacion.parquet")

Ya tenemos los datos seleccionados procedemos a crear la funcion y condiciones necesaria para cumplir con el modelo recomendado

In [18]:
# leemos la data
df_recomend = pd.read_parquet("../Funciones/data/modelo_recomendacion.parquet")
df_recomend

Unnamed: 0,user_id,item_name,rating
0,76561197970982479,Killing Floor,5
1,js41637,Barbie™ Dreamhouse Party™,5
2,doctr,The Wolf Among Us,5
3,maplemage,Dark Souls: Prepare to Die Edition,3
4,Wackky,LEGO® MARVEL Super Heroes,3
...,...,...,...
47593,76561198107177722,BattleBlock Theater,5
47594,kushikushigani,LEGO® Worlds,3
47596,76561198111410893,Unturned,5
47597,zaza147,Fistful of Frags,5


Colocamos los juegos como columnas, y sus valores serian el rating

In [19]:
piv = df.pivot_table(index=['user_id'], columns=['item_name'], values='rating')
piv

item_name,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,123 Slaughter Me Street,140,16 Bit Arena,200% Mixed Juice!,...,ibb & obb,inMomentum,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theHunter,theHunter: Primal
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,,,,,,,,,,,...,,,,,,,,,,
--ionex--,,,,,,,,,,,...,,,,,,,,,,
-2SV-vuLB-Kg,,,,,,,,,,,...,,,,,,,,,,
-Beave-,,,,,,,,,,,...,,,,,,,,,,
-GM-Dragon,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zv_odd,,,,,,,,,,,...,,,,,,,,,,
zvanik,,,,,,,,,,,...,,,,,,,,,,
zwanzigdrei,,,,,,,,,,,...,,,,,,,,,,
zyr0n1c,,,,,,,,,,,...,,,,,,,,,,


Vamos a restar la media de las calificaciones de un usuario y luego dividir por la diferencia entre el valor máximo y mínimo de las calificaciones

In [20]:
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm

user_id,-Beave-,-GM-Dragon,-I_AM_EPIC-,00000000000000000001227,00690069006900,022899,03092002,04061993,0468313256,05041129,...,zimran,ziqan,zjaerya135,zombi_anon,zomgCoBfAce,zoom-the-flash,zsharoarkbr,zukuta,zvanik,zzoptimuszz
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100% Orange Juice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Spikes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Labours of Hercules,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
resident evil 4 / biohazard 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


A los datos de esta matriz normalizada se los convierte a un formato de matriz dispersa (sparse matrix) para reducir la memoria utilizada y mejorar la eficiencia en el manejo de grandes conjuntos de datos, especialmente cuando la mayoría de los valores en la matriz son ceros

In [21]:
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)
piv_sparse

<2617x6070 sparse matrix of type '<class 'numpy.float64'>'
	with 22583 stored elements in Compressed Sparse Row format>

Utilizaremos la similitud del coseno para evaluar los vectores 

In [22]:
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

Ingresamos las matrices en dataframes para poder lograr un mejor manejo

In [23]:
#item similarity dataframe
item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)
#user similarity dataframe
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)

Funcion recomendacion_juego

In [33]:
def top_game(game):
        # Obtener la lista de juegos similares ordenados
    similar_games = item_sim_df.sort_values(by=game, ascending=False).iloc[1:6]

    count = 1
    contador = 1
    recomendaciones = {}
    
    for item in similar_games:
        if contador <= 5:
            item = str(item)
            recomendaciones[count] = item
            count += 1
            contador += 1 
        else:
            break
    return recomendaciones

Probamos la funcion

In [35]:
top_game('Killing Floor')

{1: '0RBITALIS',
 2: '10,000,000',
 3: '100% Orange Juice',
 4: '1001 Spikes',
 5: '12 Labours of Hercules'}

In [37]:
top_game('Barbie™ Dreamhouse Party™')

{1: '0RBITALIS',
 2: '10,000,000',
 3: '100% Orange Juice',
 4: '1001 Spikes',
 5: '12 Labours of Hercules'}

Funcion recomendacion_usuario

In [28]:
def similar_user_recs(user):

    # Verifica si el usuario está presente en las columnas de piv_norm (si no está, devuelve un mensaje)
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))
    
    # Obtiene los usuarios más similares al usuario dado
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    
    best = []  # Lista para almacenar los juegos mejor calificados por usuarios similares
    most_common = {}  # Diccionario para contar cuántas veces se recomienda cada juego
    
    # Para cada usuario similar, encuentra el juego mejor calificado y lo agrega a la lista 'best'
    for i in sim_users:
        max_score = piv_norm.loc[:, i].max()
        best.append(piv_norm[piv_norm.loc[:, i]==max_score].index.tolist())
    
    # Cuenta cuántas veces se recomienda cada juego
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    
    # Ordena los juegos por la frecuencia de recomendación en orden descendente
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    
    # Devuelve los 5 juegos más recomendados
    return sorted_list[:5]

Probando la funcion

In [29]:
similar_user_recs('zvanik')

[('Borderlands 2', 2),
 ('Call of Duty: World at War', 2),
 ('Counter-Strike: Global Offensive', 1),
 ('Crysis 2 Maximum Edition', 1),
 ('Goat Simulator', 1)]

In [30]:
similar_user_recs('76561197970982479')

[('Killing Floor', 8),
 ('Broforce', 1),
 ('Hotline Miami', 1),
 ('Metro 2033', 1)]

Procedemos a guadar las matrices para que puedan ser consumias por las funciones para la API

In [32]:
piv_norm.to_parquet("../Funciones/data/piv_norm.parquet")
user_sim_df.to_parquet("../Funciones/data/user_sim_df.parquet")
item_sim_df.to_parquet("../Funciones/data/item_sim_df.parquet")