In [1]:
import pandas as pd
import sys
sys.path.append('../supercenter-product-recommender')  # Adjust the path as necessary
from db_utilities import read_table
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = read_table('processed_orders_data')
df.dtypes

row_id                   int64
order_id                 int64
product_id              object
cart_inclusion_order     int64
reordered                int64
product_name            object
aisle_id                object
department_id           object
department              object
aisle                   object
dtype: object

In [4]:
df.head(10)

Unnamed: 0,row_id,order_id,product_id,cart_inclusion_order,reordered,product_name,aisle_id,department_id,department,aisle
0,0,1,49302,1,1,EL YOGUR DE BULGARIA,120,16,HUEVOS Y LÁCTEOS,YOGUR
1,1,1,11109,2,1,4% ORGÁNICO QUESO GRASA DE LECHE LECHE ENTERA ...,108,16,HUEVOS Y LÁCTEOS,OTROS QUESOS CREMAS
2,2,1,10246,3,0,CORAZONES DE APIO ORGÁNICOS,83,4,FRUTAS Y VERDURAS,VEGETALES FRESCOS
3,3,1,49683,4,0,KIRBY PEPINO,83,4,FRUTAS Y VERDURAS,VEGETALES FRESCOS
4,4,1,43633,5,1,LAS SARDINAS LIGERAMENTE AHUMADO EN ACEITE DE ...,95,15,PRODUCTOS ENLATADOS,MARISCOS CARNE ENLATADA
5,5,1,13176,6,0,BOLSA DE BANANO ORGÁNICO,24,4,FRUTAS Y VERDURAS,FRUTAS FRESCAS
6,6,1,47209,7,0,Missing,Missing,Missing,Missing,Missing
7,7,1,22035,8,1,QUESO ORGÁNICO TOTAL CADENA,21,16,HUEVOS Y LÁCTEOS,QUESOS ENVASADOS
8,8,36,39612,1,0,QUESO RALLADO PECORINO ROMANO,2,16,HUEVOS Y LÁCTEOS,QUESOS DE ESPECIALIDAD
9,9,36,19660,2,1,AGUA DE MANANTIAL,115,7,BEBIDAS,SELTZER AGUA Y AGUA CON GAS


## Setting the Baseline

KNN with just product co-occurence

In [3]:
# Create the interaction interaction matrix
interaction_matrix = df.pivot_table(index='order_id', columns='product_id', aggfunc='size', fill_value=0)
interaction_matrix

  interaction_matrix = df.pivot_table(index='order_id', columns='product_id', aggfunc='size', fill_value=0)


product_id,1,10,100,1000,10000,10001,10005,10006,10008,10009,...,9985,9986,9989,999,9990,9993,9995,9996,9997,9998
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3421056,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3421058,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3421063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_orders, test_orders = train_test_split(interaction_matrix, train_size=0.8, random_state=42)

# Function to hide some items in the test set
def leave_k_out(interaction_matrix, k=1):
    test_matrix = interaction_matrix.copy()
    hidden_items = {}
    for order_id in interaction_matrix.index:
        nonzero_indices = interaction_matrix.loc[order_id].to_numpy().nonzero()[0]
        if len(nonzero_indices) > k:
            hidden_indices = np.random.choice(nonzero_indices, size=k, replace=False)
            hidden_items[order_id] = hidden_indices
            test_matrix.loc[order_id, hidden_indices] = 0
    return test_matrix, hidden_items

# Apply leave-k-out on the test set
test_orders_hidden, hidden_items = leave_k_out(test_orders, k=1)

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_orders, test_orders = train_test_split(interaction_matrix, train_size=0.8, random_state=42)

# Function to hide some items in the test set
def leave_k_out(interaction_matrix, k=1):
    test_matrix = interaction_matrix.copy()
    
    # Get the positions of all non-zero entries
    nonzero_positions = np.nonzero(test_matrix.values)
    
    # Create a DataFrame with these positions
    nonzero_df = pd.DataFrame({
        'order_index': nonzero_positions[0],
        'product_index': nonzero_positions[1]
    })
    
    # Group by order index to simulate the leave-k-out per order
    hidden_items = nonzero_df.groupby('order_index').apply(lambda x: x.sample(k) if len(x) > k else x)
    
    # Flatten the MultiIndex
    hidden_items = hidden_items.reset_index(drop=True)
    
    # Hide the selected items in the test matrix
    test_matrix.values[hidden_items['order_index'], hidden_items['product_index']] = 0
    
    # Convert order and product indices back to their original labels
    hidden_items['order_id'] = interaction_matrix.index[hidden_items['order_index']]
    hidden_items['product_id'] = interaction_matrix.columns[hidden_items['product_index']]
    
    # Create a dictionary for hidden items
    hidden_items_dict = hidden_items.groupby('order_id')['product_id'].apply(list).to_dict()
    
    return test_matrix, hidden_items_dict

# Apply leave-k-out on the test set
test_orders_hidden, hidden_items = leave_k_out(test_orders, k=1)


  hidden_items = nonzero_df.groupby('order_index').apply(lambda x: x.sample(k) if len(x) > k else x)


In [8]:
test_orders_hidden.head()

product_id,1,10,100,1000,10000,10001,10005,10006,10008,10009,...,9985,9986,9989,999,9990,9993,9995,9996,9997,9998
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3007878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1727889,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2829462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3261021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1711491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
test_orders.head()

product_id,1,10,100,1000,10000,10001,10005,10006,10008,10009,...,9985,9986,9989,999,9990,9993,9995,9996,9997,9998
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3007878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1727889,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2829462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3261021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1711491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.neighbors import NearestNeighbors

# Initialize the k-NN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model on the training data
knn.fit(train_orders)