In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import PolynomialFeatures
from modelo_cuadratico import modelo_cuadratico

In [None]:
# Notebook switches and checkpoint locations.

# When True, run an initial smoke test on synthetic data.
run_test = False
# When True, skip training and load the weights stored in load_weights_file.
load_model = True
# When True, write the weights to save_weights_file after training.
save_model = False
# File holding previously saved weights.
load_weights_file = "model.pt"
# File the weights are saved to.
save_weights_file = "model.pt"

### Proto Recomendador
Esta notebook está basada en el paper "A Recommendation Model Based on Deep Neural Network", del autor Libo Zhang.

#### Prueba con datos sintéticos

In [None]:
# Tell the reader that the synthetic-data test cells below will be skipped.
if not run_test:
    print("run_test = False -> No se va a correr el test")

In [None]:
if run_test:
    # Problem size and model hyper-parameters for the synthetic smoke test.
    n_users, n_items = 50, 100
    a, b = 10, 20
    sparce_rate = .9  # Fraction of entries masked out, i.e. how sparse the example matrix is

    # Random 0/1 interaction matrix. Entries are then overwritten with -1, which
    # stands for interactions that are unknown or never happened
    # (e.g. movies the user has not watched).
    ratings = np.random.randint(0, 2, n_users*n_items).reshape(n_users, n_items)
    aux = np.random.rand(*ratings.shape)
    ratings[aux < sparce_rate] = -1

    # Same matrix as a DataFrame, with rows and columns relabelled to start at 1.
    ratings_df = pd.DataFrame(ratings)
    ratings_df.columns = list(ratings_df.columns + 1)
    ratings_df.index = list(ratings_df.index + 1)

    # Tensor form of the ratings, used by the training cell.
    ratings = torch.from_numpy(ratings)

    model = modelo_cuadratico(a, b, n_users, n_items, ratings_df)

In [None]:
# Train on the synthetic ratings (only when run_test is set) and report the wall time.
%time if run_test: R = model.entrenar(ratings)

#### Prueba con datos de MovieLens

Uso el dataset de MovieLens 1M (https://grouplens.org/datasets/movielens/1m/).
Para correr esta notebook hay que bajarlo, descomprimirlo y apuntar los siguientes paths a los archivos correspondientes:

In [None]:
# Locations of the three MovieLens 1M files, relative to this notebook.
# NOTE: ratings_data holds a path only until the loading cell runs; that cell
# rebinds the same name to the loaded DataFrame.
user_data = '../../MovieLens/data/users.dat'
movie_data = '../../MovieLens/data/movies.dat'
ratings_data = '../../MovieLens/data/ratings.dat'

In [None]:
# The MovieLens .dat files have no header row and use '::' as field separator.
# A multi-character separator is only handled by pandas' python parser engine, so
# it is requested explicitly instead of relying on the automatic fallback (which
# emits a ParserWarning).
users = pd.read_csv(user_data, sep = '::', header = None, names = ['UserId', 'Gender', 'AgeGroup', 'Occupation', 'Zip'], encoding = 'latin-1', engine = 'python')
movies = pd.read_csv(movie_data, sep = '::', header = None, names = ['MovieId', 'Title', 'Genre'], encoding = 'latin-1', engine = 'python')
# NOTE: this rebinds ratings_data from the file path to the loaded DataFrame, so the
# cell that defines the paths has to be re-run before this cell can run a second time.
ratings_data = pd.read_csv(ratings_data, sep = '::', header = None, names = ['UserId', 'MovieId', 'Rating', 'Timestamp'], encoding = 'latin-1', engine = 'python')

In [None]:
# Preview the first rows of the loaded ratings.
ratings_data.head()

In [None]:
# Binary target: a rating of 4 or more counts as a match.
# (A fairly crude threshold, used only to try the model out.)
ratings_data['Match'] = ratings_data['Rating'].ge(4).astype(int)

In [None]:
# UserId x MovieId table of Match flags; pairs with no rating are filled with -1.
ratings_df = (
    ratings_data
    .pivot(index='UserId', columns='MovieId', values='Match')
    .fillna(-1)
)

In [None]:
# Preview the pivoted user x movie table.
ratings_df.head()

In [None]:
# NumPy version of the pivoted table (rows = UserId, columns = MovieId).
ratings_matrix = ratings_df.to_numpy()

In [None]:
# Matrix dimensions, taken from the number of distinct ids in the raw ratings.
n_users = ratings_data['UserId'].nunique()
n_items = ratings_data['MovieId'].nunique()

# Model hyper-parameters passed to modelo_cuadratico, and their sum.
a, b = 16, 20
l = a + b

# Tensor built on a copy, so ratings_matrix does not share memory with it.
ratings = torch.from_numpy(ratings_matrix.copy())

In [None]:
# Instantiate the model with the hyper-parameters above and the pivoted ratings frame.
model = modelo_cuadratico(a, b, n_users, n_items, ratings_df)

### Entrenamiento Regresión Cuadrática

In [None]:
# Tell the reader that training is skipped and where the weights come from.
if load_model:
    print("No se va a entrenar el modelo. Se va a cargar de {}".format(load_weights_file))

In [None]:
if load_model:
    model.load_weights(load_weights_file)
else:
    %time R = model.entrenar(ratings, lr = 0.1)
    if save_weights:
        model.save_weights(save_weights_file)

### Datos ZPAR

In [None]:
import pickle5 as pickle

In [None]:
# Load the user-segment frame from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
# NOTE(review): the file name ends in ".pkl.pkl" - confirm this is the intended file.
with open('data_ZPAR.pkl.pkl', "rb") as segmentos_file:
    segmentos_df = pickle.load(segmentos_file)

In [None]:
# Dictionary view of the segment label per user. With DataFrame.to_dict()'s default
# orientation the result is nested by column: {'pred_label_km': {idusuario: label}}.
segmentos = segmentos_df[['idusuario', 'pred_label_km']].set_index('idusuario').to_dict()

In [None]:
# Load the ZPAR leads frame. This rebinds ratings_df, which until now held the
# MovieLens pivot table.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('leads_ZPAR_2022-05-10.pkl', "rb") as leads_file:
    ratings_df = pickle.load(leads_file)

In [None]:
# Distinct idusuario and distinct idaviso counts for each pred_label_km value.
ratings_df.groupby('pred_label_km').idusuario.nunique(), ratings_df.groupby('pred_label_km').idaviso.nunique() 

In [None]:
# Distinct users with label "4" and distinct idaviso values with label "2".
# NOTE(review): the two halves filter on different labels - confirm this is intended.
ratings_df[ratings_df['pred_label_km']=="4"].idusuario.nunique(), ratings_df[ratings_df['pred_label_km']=="2"].idaviso.nunique()

In [None]:
# Overall distinct idusuario and idaviso counts in the leads frame.
ratings_df.idusuario.nunique(), ratings_df.idaviso.nunique() 

In [None]:
# Histogram (45 bins) of the number of rows each idusuario has in the leads frame.
ratings_df.idusuario.value_counts().hist(bins=45)

In [None]:
# Distinct users across the whole leads frame.
ratings_df.idusuario.nunique()

In [None]:
# Keep only the rows whose segment label is '4' (the label is compared as a string).
cluster = ratings_df[ratings_df['pred_label_km']=='4']

In [None]:
# Distinct users in the selected segment.
cluster.idusuario.nunique()

In [None]:
# idusuario x idaviso table of Match values for the selected segment; pairs that
# do not appear in the data are filled with -1.
cluster = (
    cluster[['idusuario', 'idaviso', 'Match']]
    .pivot(index='idusuario', columns='idaviso', values='Match')
    .fillna(-1)
)

In [None]:
# Percentage of cells in the pivot equal to -1 (i.e. filled in as missing).
(cluster==-1).sum().sum()*100/(cluster.shape[0]*cluster.shape[1])

In [None]:
# Flag roughly 95% of the rows (users) for training; the rest becomes the hold-out set.
# A seeded generator keeps the split reproducible across kernel restarts (the
# previous np.random.rand call produced a different split on every run).
split_rng = np.random.default_rng(42)
cluster['Train'] = split_rng.random(cluster.shape[0]) < .95

In [None]:
# Hold-out rows: those not flagged for training (copied so later edits do not touch cluster).
test = cluster.loc[~cluster['Train']].copy()

In [None]:
# Remove the helper flag so only idaviso columns remain.
test = test.drop(columns='Train')

In [None]:
# Keep only the rows flagged for training.
cluster = cluster.loc[cluster['Train']]

In [None]:
# Remove the helper flag so only idaviso columns remain.
cluster = cluster.drop(columns='Train')

In [None]:
# NumPy matrix of the training split (rows = idusuario, columns = idaviso).
ratings_matrix = cluster.to_numpy()

In [None]:
# Number of rows (users) in the training matrix.
ratings_matrix.shape[0]

In [None]:
# Full (rows, columns) shape of the training matrix.
ratings_matrix.shape

In [None]:
# Matrix dimensions come straight from the training matrix.
n_users, n_items = ratings_matrix.shape

# Model hyper-parameters passed to modelo_cuadratico, and their sum.
a = b = 15
l = a + b

# Tensor built on a copy, so ratings_matrix does not share memory with it.
ratings = torch.from_numpy(ratings_matrix.copy())

In [None]:
# Build a new model for the training split; this rebinds `model`, replacing the
# MovieLens instance created earlier.
model = modelo_cuadratico(a, b, n_users, n_items, cluster)

In [None]:
# Sanity check: shape of the training tensor.
ratings.shape

In [None]:
# Train on the ZPAR training tensor (lr = 0.2, 9000 epochs) and report the wall time.
%time R = model.entrenar(ratings, lr = 0.2, track_every = 10, epochs = 9000)

In [None]:
# Display the training pivot table.
cluster

In [None]:
# Apply a sigmoid element-wise to R (the value returned by model.entrenar), move it
# to the CPU and display it as a DataFrame.
s = torch.nn.Sigmoid()
pd.DataFrame(s(R).cpu().numpy())

In [None]:
# Rebind ratings_matrix / ratings to the hold-out split.
ratings_matrix = test.to_numpy()
# Tensor built on a copy, so ratings_matrix does not share memory with it.
ratings = torch.from_numpy(ratings_matrix.copy())

In [None]:
# Shape of the hold-out tensor.
ratings.shape

In [None]:
#n_users = ratings_matrix.shape[0]
#n_items = ratings_matrix.shape[1]

In [None]:
# Expand the model's p, q and z tensors to matrix shape.
# NOTE(review): n_users / n_items still hold the training-split sizes here, while
# `ratings` has just been rebound to the hold-out split - confirm this is intended.
p_broad = model.p.expand(-1, n_items)
q_broad = model.q.expand(-1, n_users).transpose(0, 1)
z_broad = model.z.expand(n_users, n_items)

In [None]:
# Move model.U to the CPU and convert it to a NumPy array
# (presumably one embedding row per user - confirm against modelo_cuadratico).
U = model.U.cpu().numpy()

In [None]:
from annoy import AnnoyIndex
#### Annoy

# Vector length of the index, read from the embeddings themselves so it can never
# disagree with the width of U (it used to be hard-coded to 15).
f = U.shape[1]

c = AnnoyIndex(f, 'euclidean')
n_embeddings = U.shape[0]

# Each row of U is added under its row position, so Annoy item ids are positions in U.
for i, emb in enumerate(U):
    c.add_item(i, emb)
    print('Progress: {}/{}'.format(i,n_embeddings), end = '\r')
print('Progress: {}/{} ... ฅ^•ﻌ•^ฅ OK!'.format(n_embeddings, n_embeddings), end = '\r')

c.build(10) # Number of trees in the ANN index

In [None]:
# Load the avisos table; the first CSV column is used as the index.
avisos = pd.read_csv('avisos.csv', index_col=0)

In [None]:
def usuarios_similares(idusuario, cantidad=5):
    """Return the nearest neighbours of a user in the Annoy index.

    The user id is translated to its position with ``model.get_user_position`` and
    the module-level index ``c`` is asked for ``cantidad + 1`` neighbours of that
    position (one extra, presumably because the query item is usually returned
    among its own neighbours - confirm).

    Args:
        idusuario: user id understood by ``model.get_user_position``.
        cantidad: number of neighbours wanted besides the extra one.

    Returns:
        Whatever ``c.get_nns_by_item`` returns: index positions, not user ids.
    """
    n_neighbours = cantidad + 1
    position = model.get_user_position(idusuario)
    return c.get_nns_by_item(position, n_neighbours)

In [None]:
# Pick one random user from the training split and look up its nearest neighbours,
# translating index positions back to user ids.
user = cluster.sample(1).index.item()
similar = [model.get_userid(x) for x in usuarios_similares(user, 50)]
# Drop the query user when present. list.remove() raised ValueError whenever the
# approximate search did not return the user itself, so a filter is used instead.
similar = [uid for uid in similar if uid != user]
# City breakdown of the avisos this user matched with.
ratings_df[(ratings_df['idusuario'].isin([user]))&(ratings_df['Match']==1)].merge(avisos, on = 'idaviso', how = 'left').ciudad.value_counts()

In [None]:
# Same city breakdown, now over the matched avisos of the similar users.
ratings_df[(ratings_df['idusuario'].isin(similar))&(ratings_df['Match']==1)].merge(avisos, on = 'idaviso', how = 'left').ciudad.value_counts()

In [None]:
# Names of the user-level features of interest
# (presumably columns of segmentos_df - confirm).
user_features = [
    'barrio_favorito',
    'barrio_fav_count',
    'operacion_favorita',
    'operacion_fav_count',
    'hits_count',
    'leads_count',
    'Alquiler_rel',
    'Alquiler temporal_rel',
    'Venta_rel',
    'Alquiler_precio_mediana',
    'Alquiler temporal_precio_mediana',
    'Venta_precio_mediana',
    'Alquiler_precio_media',
    'Alquiler temporal_precio_media',
    'Venta_precio_media',
    'precision_barriofav',
    'precision_operacionfav',
    'precision_precio',
    'precision_busqueda',
    'ctr',
    'tsne_x',
    'tsne_y',
]
# Show how many features are listed.
len(user_features)

In [None]:
# List the columns available in the segments frame.
segmentos_df.columns