In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the data
events_df = pd.read_csv('./data/events.csv')

In [3]:
# Convert timestamps to datetime for better readability
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'], unit='ms')

In [4]:
# Only sample 10,000 interactions
sampled_events_df = events_df.sample(n=100000, random_state=42)

In [5]:
# Define a split date (example: 80% of the data before the split date for training, 20% after for testing)
split_date = sampled_events_df['timestamp'].quantile(0.8)
print("Split date:", split_date)

# Split the data into training and testing sets based on the split date
train_df = sampled_events_df[sampled_events_df['timestamp'] < split_date]
test_df = sampled_events_df[sampled_events_df['timestamp'] >= split_date]

print("Number of interactions in the training set:", len(train_df))
print("Number of interactions in the test set:", len(test_df))

Split date: 2015-08-18 15:23:50.822800128
Number of interactions in the training set: 80000
Number of interactions in the test set: 20000


In [172]:
# Prepare train set
train_user_item_matrix = train_df.set_index('timestamp') # sets the index
train_user_item_matrix = train_user_item_matrix.groupby(['visitorid']).count()
# Converts the index back to columns, making timestamp and visitorid regular columns again. (reset_index)
# Create a pivot table where rows are visitorid and columns are itemid, with the values being the count of event.
train_user_item_matrix = train_user_item_matrix.reset_index().pivot_table(index='visitorid', columns='itemid', values='event', fill_value=0)
# Create the user-item matrix with binary indicators
train_user_item_matrix = train_user_item_matrix.apply(lambda x: x.apply(lambda y: 1 if y > 0 else 0))

In [173]:
# Prepare test set
test_df = sampled_events_df[(sampled_events_df['timestamp'] >= split_date) & (sampled_events_df['visitorid'].isin(train_df['visitorid']))]
test_user_item_matrix = test_df.set_index('timestamp') # sets the index
test_user_item_matrix = test_user_item_matrix.groupby(['visitorid']).count()
test_user_item_matrix = test_user_item_matrix.reset_index().pivot_table(index='visitorid', columns='itemid', values='event', fill_value=0)
test_user_item_matrix = test_user_item_matrix.apply(lambda x: x.apply(lambda y: 1 if y > 0 else 0))

In [174]:
# Build and train the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')
train_sparse_matrix = csr_matrix(train_user_item_matrix.values)
knn.fit(train_sparse_matrix)

In [175]:
# Function to obtain KNN recommendations
def get_knn_recommendations(user_id, user_item_matrix, knn_model, n_recommendations=5):
    if user_id not in user_item_matrix.index:
        return []
    distances, indices = knn_model.kneighbors(user_item_matrix.loc[user_id].values.reshape(1, -1), n_neighbors=n_recommendations)
    recommended_items = []
    for idx in indices.flatten():
        items = user_item_matrix.columns[user_item_matrix.iloc[idx].to_numpy().nonzero()]
        recommended_items.extend(items)
    return recommended_items[:n_recommendations]

In [176]:
# Model evaluation using hits, precision and recall
def evaluate_model(test_user_item_matrix, user_item_matrix, knn_model, k=5):
    hits = 0
    total_relevant = 0
    total_recommended = 0

    for user_id in test_user_item_matrix.index:
        if user_id in user_item_matrix.index:
            actual_items = set(test_user_item_matrix.loc[user_id][test_user_item_matrix.loc[user_id] > 0].index)
            recommended_items = set(get_knn_recommendations(user_id, user_item_matrix, knn_model, n_recommendations=k))
            hits += len(actual_items.intersection(recommended_items))
            total_relevant += len(actual_items)
            total_recommended += len(recommended_items)
            
            # Verificação intermediária
            print(f'User ID: {user_id}')
            print(f'Actual items: {actual_items}')
            print(f'Recommended items: {recommended_items}')
            print(f'Intersection: {actual_items.intersection(recommended_items)}\n')

    precision = hits / total_recommended if total_recommended > 0 else 0
    recall = hits / total_relevant if total_relevant > 0 else 0

    return precision, recall, hits

In [177]:
# Calculate precision, recall and number of hits
precision, recall, hits = evaluate_model(test_user_item_matrix, train_user_item_matrix, knn, k=5)

User ID: 8043
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 8927
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 12356
Actual items: {2}
Recommended items: {1}
Intersection: set()

User ID: 17676
Actual items: {11}
Recommended items: {4}
Intersection: set()

User ID: 18131
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 31174
Actual items: {1}
Recommended items: {2}
Intersection: set()

User ID: 42552
Actual items: {1}
Recommended items: {3}
Intersection: set()

User ID: 43853
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 49302
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 54791
Actual items: {2}
Recommended items: {7}
Intersection: set()

User ID: 55997
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 60502
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 61715
Actual items: {1}
Recommended items: {1}
Intersection: {1}

User ID: 61731
A

In [178]:
print(f'Precision@5: {precision}')
print(f'Recall@5: {recall}')
print(f'Hits@5: {hits}')

Precision@5: 0.5460829493087558
Recall@5: 0.5697115384615384
Hits@5: 237
