# Analisis de un modelo de recomendacion

Se realizara una recomendacion del tipo USUARIOXITEM, se usara ALS de la libreria implicit, debido a sus ventajas de escalamiento, descubrir patrones ocultos y manejo de datos dispersos.

In [41]:
#Importamos librerias
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split,ParameterGrid
import implicit #Porque usaremos datos implicitos, no se tiene el rating
from implicit.evaluation import AUC_at_k

## 1. Feature engineering

In [65]:
#Cargamos la datos que vamos a utilizar
data_train = pd.read_csv("./data/order_products__train.csv") # Usamos el train por la cantidad de registros es menor (menor costo computacional)
orders = pd.read_csv("./data/orders.csv")
products = pd.read_csv("./data/products.csv")

In [43]:
data_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [44]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [45]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [66]:
# Filtramos solo las ordenes relacionadas con train
orders_train = orders[orders["eval_set"] == "train"]

In [67]:
# Guardo la data procesada porque github no me permite subir toda la data
orders_train.to_csv("./data/orders.csv", index=False)

In [47]:
# Unimos las ordenes con la data para obtener al usuario id, producto id juntos
data = pd.merge(data_train,orders_train,on="order_id")

In [48]:
# Mientras mas alto el add_to_cart_order, le doy un menor score
data["score"] = (1 + data["reordered"]) / data["add_to_cart_order"] #Heuristica

# Luego normaliza los valores a un rango de 0 a 1
data["score"] = (data["score"] - data["score"].min()) / (data["score"].max() - data["score"].min())


In [49]:
data

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,score
0,1,49302,1,1,112108,train,4,4,10,9.0,1.000000
1,1,11109,2,1,112108,train,4,4,10,9.0,0.496855
2,1,10246,3,0,112108,train,4,4,10,9.0,0.161426
3,1,49683,4,0,112108,train,4,4,10,9.0,0.119497
4,1,43633,5,1,112108,train,4,4,10,9.0,0.194969
...,...,...,...,...,...,...,...,...,...,...,...
1384612,3421063,14233,3,1,169679,train,30,0,10,4.0,0.329140
1384613,3421063,35548,4,1,169679,train,30,0,10,4.0,0.245283
1384614,3421070,35951,1,1,139822,train,15,6,10,8.0,1.000000
1384615,3421070,16953,2,1,139822,train,15,6,10,8.0,0.496855


In [50]:
# Obtenemos la ultima compra de los usuarios
last_purchases = orders_train.sort_values(['user_id', 'order_number'], ascending=[True, False]).drop_duplicates('user_id', keep='first')
last_purchases = data[data_train['order_id'].isin(last_purchases['order_id'])]
last_purchases = last_purchases.merge(products, on='product_id', how='left')
last_purchases=last_purchases[['product_id', 'user_id', 'product_name']]

last_purchases

Unnamed: 0,product_id,user_id,product_name
0,49302,112108,Bulgarian Yogurt
1,11109,112108,Organic 4% Milk Fat Whole Milk Cottage Cheese
2,10246,112108,Organic Celery Hearts
3,49683,112108,Cucumber Kirby
4,43633,112108,Lightly Smoked Sardines in Olive Oil
...,...,...,...
1384612,14233,169679,Natural Artesian Water
1384613,35548,169679,Twice Baked Potatoes
1384614,35951,139822,Organic Unsweetened Almond Milk
1384615,16953,139822,Creamy Peanut Butter


In [51]:
# Si hubiera datos duplicados, sumamos el score
df = data.groupby(["user_id", "product_id"], as_index=False)["score"].sum()
df

Unnamed: 0,user_id,product_id,score
0,1,196,1.000000
1,1,10258,0.161426
2,1,13032,0.137466
3,1,25133,0.496855
4,1,26088,0.119497
...,...,...,...
1384612,206209,24852,0.329140
1384613,206209,37966,0.065588
1384614,206209,39216,0.119497
1384615,206209,40603,0.119497


## 2. Modelo

In [52]:
# -------- split --------
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [53]:
# reindex solo con TRAIN
u_codes_train, u_uni_train = pd.factorize(train_df['user_id'], sort=True)
i_codes_train, i_uni_train = pd.factorize(train_df['product_id'], sort=True)

In [54]:
# matriz USUARIOXITEM para implicit
U, I = u_uni_train.size, i_uni_train.size
train_mat_ui = csr_matrix(
    (train_df['score'].astype(np.float32).values, ( u_codes_train, i_codes_train)),
    shape=(U,I), dtype=np.float32
)

In [55]:
# -------- fit --------
model = implicit.als.AlternatingLeastSquares(
    factors=64, regularization=0.1, iterations=20, dtype=np.float32, random_state=42
)
model.fit(train_mat_ui)


100%|██████████| 20/20 [00:10<00:00,  1.82it/s]


## 3. Evaluacion

In [56]:
# Número de productos recomendados a evaluar
k = 10

# Ejecutar la evaluación AUC at K
train_auc = AUC_at_k(model, train_mat_ui, train_mat_ui, K=k, show_progress=True)

print(f"AUC@10 train: {train_auc}")


100%|██████████| 129473/129473 [00:22<00:00, 5862.84it/s]

AUC@10 train: 0.49986593536601626





## 4. Optimizacion

In [57]:
# Definir los valores para el grid search
param_grid = {
    'factors': [20, 50, 100],
    'regularization': [0.01, 0.1, 1.0],
    'iterations': [10, 20, 30]
}

# Inicializar las variables para almacenar el mejor modelo
best_auc = 0
best_params = {}

# Iterar sobre las combinaciones de hiperparámetros
for params in ParameterGrid(param_grid):
    print(f"Evaluando combinación: {params}")
    
    # Crear y entrenar el modelo con los hiperparámetros actuales
    model = implicit.als.AlternatingLeastSquares(
        factors=params['factors'],
        regularization=params['regularization'],
        iterations=params['iterations'],
        dtype=np.float32,
        random_state=42
    )
    model.fit(train_mat_ui)
    
    # Evaluar el modelo en el conjunto de prueba usando AUC
    auc = AUC_at_k(
        model, train_mat_ui, train_mat_ui, K=10, show_progress=False
    )
    
    # Mostrar la métrica AUC para la combinación actual
    print(f"AUC@10: {auc}")
    
    # Actualizar el mejor modelo si encontramos uno con mejor AUC
    if auc > best_auc:
        best_auc = auc
        best_params = params
        best_model = model
# Mostrar los mejores hiperparámetros encontrados y la AUC correspondiente
print(f"\nMejores hiperparámetros: {best_params}")
print(f"Mejor AUC@10: {best_auc}")

Evaluando combinación: {'factors': 20, 'iterations': 10, 'regularization': 0.01}


100%|██████████| 10/10 [00:05<00:00,  1.90it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 10, 'regularization': 0.1}


100%|██████████| 10/10 [00:05<00:00,  1.98it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 10, 'regularization': 1.0}


100%|██████████| 10/10 [00:04<00:00,  2.03it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 20, 'regularization': 0.01}


100%|██████████| 20/20 [00:09<00:00,  2.00it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 20, 'regularization': 0.1}


100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 20, 'regularization': 1.0}


100%|██████████| 20/20 [00:10<00:00,  1.99it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 30, 'regularization': 0.01}


100%|██████████| 30/30 [00:15<00:00,  1.92it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 30, 'regularization': 0.1}


100%|██████████| 30/30 [00:15<00:00,  1.91it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 20, 'iterations': 30, 'regularization': 1.0}


100%|██████████| 30/30 [00:16<00:00,  1.84it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 10, 'regularization': 0.01}


100%|██████████| 10/10 [00:06<00:00,  1.65it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 10, 'regularization': 0.1}


100%|██████████| 10/10 [00:06<00:00,  1.67it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 10, 'regularization': 1.0}


100%|██████████| 10/10 [00:06<00:00,  1.65it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 20, 'regularization': 0.01}


100%|██████████| 20/20 [00:12<00:00,  1.64it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 20, 'regularization': 0.1}


100%|██████████| 20/20 [00:12<00:00,  1.64it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 20, 'regularization': 1.0}


100%|██████████| 20/20 [00:12<00:00,  1.64it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 30, 'regularization': 0.01}


100%|██████████| 30/30 [00:18<00:00,  1.66it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 30, 'regularization': 0.1}


100%|██████████| 30/30 [00:17<00:00,  1.67it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 50, 'iterations': 30, 'regularization': 1.0}


100%|██████████| 30/30 [00:18<00:00,  1.63it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 10, 'regularization': 0.01}


100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 10, 'regularization': 0.1}


100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 10, 'regularization': 1.0}


100%|██████████| 10/10 [00:07<00:00,  1.40it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 20, 'regularization': 0.01}


100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 20, 'regularization': 0.1}


100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 20, 'regularization': 1.0}


100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 30, 'regularization': 0.01}


100%|██████████| 30/30 [00:21<00:00,  1.40it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 30, 'regularization': 0.1}


100%|██████████| 30/30 [00:21<00:00,  1.39it/s]


AUC@10: 0.49986593536601626
Evaluando combinación: {'factors': 100, 'iterations': 30, 'regularization': 1.0}


100%|██████████| 30/30 [00:21<00:00,  1.39it/s]


AUC@10: 0.49986593536601626

Mejores hiperparámetros: {'factors': 20, 'iterations': 10, 'regularization': 0.01}
Mejor AUC@10: 0.49986593536601626


No se observa que mejora a pesar de que cambian los parametros, entonces se escoge el menor para ahorrar costo computacional y tambien evaluar con datos de testeo.

## 5. Probamos modelo

In [58]:
# reindex solo con TRAIN
u_codes, u_uni = pd.factorize(df['user_id'], sort=True)
i_codes, i_uni = pd.factorize(df['product_id'], sort=True)

In [59]:
# Helpers
user_to_num = dict(zip(u_uni, range(len(u_uni))))
num_to_user = dict(zip(range(len(u_uni)), u_uni))

product_to_num = dict(zip(i_uni, range(len(i_uni))))
num_to_product = dict(zip(range(len(i_uni)), i_uni))

product_id_to_name = dict(zip(products['product_id'], products['product_name']))


In [60]:
# matriz USUARIOXITEM para implicit
U, I = u_uni.size, i_uni.size
mat_ui = csr_matrix(
    (df['score'].astype(np.float32).values, (u_codes, i_codes)),
    shape=(U, I),
    dtype=np.float32
)

In [61]:
# Entrenar con los mejores parametros
model = implicit.als.AlternatingLeastSquares(
    factors=20, regularization=0.1, iterations=10, dtype=np.float32, random_state=42
)
model.fit(mat_ui)

100%|██████████| 10/10 [00:05<00:00,  1.73it/s]


In [62]:
def recommend_user(u_orig, N=10, filter_already=True, recalc=True):
    # Verificar si el usuario está registrado
    if u_orig not in u_uni:
        raise KeyError(f"user_id {u_orig} no es un cliente registrado")
    
    u_ix = user_to_num.get(u_orig)

    # Obtener las recomendaciones del modelo
    ids, scores = model.recommend(
        u_ix,
        user_items=mat_ui[u_ix],
        N=N,
        filter_already_liked_items=filter_already,
    )

    # Mapear las recomendaciones a product_id y su score
    return [(num_to_product[i], float(s)) for i, s in zip(ids, scores)]

def report_user(u_orig, N=10):
    try:
        recs = recommend_user(u_orig, N=N)
    except KeyError:
        return f"El usuario {u_orig} no está registrado"
    
    # Filtrar las últimas compras del usuario
    last_purchases_user = last_purchases[last_purchases['user_id'] == u_orig]

    # Mostrar las últimas compras
    print("Última compra del usuario:")
    if not last_purchases_user.empty:
        print(last_purchases_user[['product_id', 'product_name']])
    else:
        print("El usuario no tiene compras registradas.")

    # Mostrar las recomendaciones
    print("\nRecomendaciones:")
    recommendations = [(pid, product_id_to_name.get(pid, 'Desconocido'), score) for pid, score in recs]

    # Mostrar las recomendaciones con los nombres
    for pid, name, score in recommendations:
        print(f"Producto: {name}, Puntuación: {score:.4f}")

In [63]:
print(report_user(1, N=10))

Última compra del usuario:
        product_id                      product_name
484420         196                              Soda
484421       25133             Organic String Cheese
484422       38928          0% Greek Strained Yogurt
484423       26405  XL Pick-A-Size Paper Towel Rolls
484424       39657            Milk Chocolate Almonds
484425       10258                        Pistachios
484426       13032             Cinnamon Toast Crunch
484427       26088        Aged White Cheddar Popcorn
484428       27845                Organic Whole Milk
484429       49235               Organic Half & Half
484430       46149                 Zero Calorie Cola

Recomendaciones:
Producto: Clementines, Puntuación: 0.0376
Producto: Organic Unsweetened Almond Milk, Puntuación: 0.0370
Producto: Half & Half, Puntuación: 0.0316
Producto: Trail Mix, Puntuación: 0.0294
Producto: Sparkling Mineral Water, Puntuación: 0.0234
Producto: Mineral Water, Puntuación: 0.0210
Producto: Sparkling Natural Mineral

## 6. Guardamos modelo

In [64]:
import joblib

# Guardar el modelo ALS
joblib.dump(model, './artifacts/modelo_als.pkl')

# Guardar la matriz dispersa de usuario-producto (mat_iu)
joblib.dump(mat_ui, './artifacts/matriz_ui.pkl')

# Guardar los diccionarios necesarios
joblib.dump(product_id_to_name, './artifacts/product_id_to_name.pkl')
joblib.dump(user_to_num, './artifacts/user_to_num.pkl')
joblib.dump(num_to_product, './artifacts/num_to_product.pkl')
joblib.dump(last_purchases, './artifacts/last_purchases.pkl')


['./artifacts/last_purchases.pkl']