In [9]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

In [10]:
# Load the data
events_df = pd.read_csv('./data/events.csv')

In [11]:
# Convert timestamps to datetime for better readability
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'], unit='ms')

In [12]:
# Map event types to numeric values
event_type_mapping = {'view': 1, 'addtocart': 2, 'transaction': 3}
events_df['event'] = events_df['event'].map(event_type_mapping)

In [13]:
# Only sample 10,000 interactions
sampled_events_df = events_df.sample(n=100000, random_state=42)

In [14]:
# Define a split date (example: 80% of the data before the split date for training, 20% after for testing)
split_date = sampled_events_df['timestamp'].quantile(0.8)
print("Split date:", split_date)

Split date: 2015-08-18 15:23:50.822800128


In [15]:
# Split the data into training and testing sets based on the split date
train_df = sampled_events_df[sampled_events_df['timestamp'] < split_date]
test_df = sampled_events_df[sampled_events_df['timestamp'] >= split_date]

print("Number of interactions in the training set:", len(train_df))
print("Number of interactions in the test set:", len(test_df))

Number of interactions in the training set: 80000
Number of interactions in the test set: 20000


In [16]:
# Prepare train set
train_user_item_matrix = train_df.set_index('timestamp') # sets the index
train_user_item_matrix = train_user_item_matrix.groupby(['visitorid']).count()
# Converts the index back to columns, making timestamp and visitorid regular columns again. (reset_index)
# Create a pivot table where rows are visitorid and columns are itemid, with the values being the count of event.
train_user_item_matrix = train_user_item_matrix.reset_index().pivot_table(index='visitorid', columns='itemid', values='event', fill_value=0)
# Create the user-item matrix with binary indicators
train_user_item_matrix = train_user_item_matrix.apply(lambda x: x.apply(lambda y: 1 if y > 0 else 0))

In [17]:
# Prepare test set
test_df = sampled_events_df[(sampled_events_df['timestamp'] >= split_date) & (sampled_events_df['visitorid'].isin(train_df['visitorid']))]
test_user_item_matrix = test_df.set_index('timestamp') # sets the index
test_user_item_matrix = test_user_item_matrix.groupby(['visitorid']).count()
test_user_item_matrix = test_user_item_matrix.reset_index().pivot_table(index='visitorid', columns='itemid', values='event', fill_value=0)
test_user_item_matrix = test_user_item_matrix.apply(lambda x: x.apply(lambda y: 1 if y > 0 else 0))

In [18]:
# Prepare the data for the SVD model
train_data = train_df[['visitorid', 'itemid', 'event']]
test_data = test_df[['visitorid', 'itemid', 'event']]

In [19]:
# Define the reader and load the data into the Surprise format
reader = Reader(rating_scale=(1, 3))

In [20]:
# Create the Dataset from the DataFrame
train_dataset = Dataset.load_from_df(train_data, reader)
test_dataset = Dataset.load_from_df(test_data, reader)

In [21]:
# Build the full trainset and testset
trainset = train_dataset.build_full_trainset()
testset = test_dataset.build_full_trainset().build_testset()

In [22]:
# Build the SVD model
svd = SVD()
# Train the model
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26fa1c00b10>

In [23]:
# Make predictions on the test set
predictions = svd.test(testset)

In [24]:
# Evaluate the model
print(f'Root Mean Squared Error: {rmse(predictions)}')

RMSE: 0.4177
Root Mean Squared Error: 0.4177068827128145


In [25]:
# Function to obtain recommendations using SVD
def get_svd_recommendations(user_id, model, user_item_matrix, n_recommendations=5):
    if user_id not in user_item_matrix.index:
        return []
    all_items = user_item_matrix.columns
    rated_items = set(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index)
    unrated_items = [item for item in all_items if item not in rated_items]
    
    # Predict the rating for all unrated items
    predictions = [model.predict(user_id, item) for item in unrated_items]
    
    # Sort the predictions by the highest estimated rating
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n_recommendations]
    
    recommended_items = [pred.iid for pred in recommendations]
    return recommended_items

In [26]:
# Evaluate the SVD model using hits, precision, and recall
def evaluate_svd_model(test_user_item_matrix, user_item_matrix, svd_model, k=5):
    hits = 0
    total_relevant = 0
    total_recommended = 0

    for user_id in test_user_item_matrix.index:
        if user_id in user_item_matrix.index:
            actual_items = set(test_user_item_matrix.loc[user_id][test_user_item_matrix.loc[user_id] > 0].index)
            recommended_items = set(get_svd_recommendations(user_id, svd_model, user_item_matrix, n_recommendations=k))
            hits += len(actual_items.intersection(recommended_items))
            total_relevant += len(actual_items)
            total_recommended += len(recommended_items)
            
            # Intermediate verification
            print(f'User ID: {user_id}')
            print(f'Actual items: {actual_items}')
            print(f'Recommended items: {recommended_items}')
            print(f'Intersection: {actual_items.intersection(recommended_items)}\n')

    precision = hits / total_recommended if total_recommended > 0 else 0
    recall = hits / total_relevant if total_relevant > 0 else 0

    return precision, recall, hits

In [27]:
# Calculate precision, recall, and number of hits
precision, recall, hits = evaluate_svd_model(test_user_item_matrix, train_user_item_matrix, svd, k=5)

User ID: 8043
Actual items: {1}
Recommended items: {32, 37, 6, 15, 16}
Intersection: set()

User ID: 8927
Actual items: {1}
Recommended items: {32, 37, 9, 15, 16}
Intersection: set()

User ID: 12356
Actual items: {2}
Recommended items: {2, 3, 37, 15, 16}
Intersection: {2}

User ID: 17676
Actual items: {11}
Recommended items: {32, 1, 2, 15, 16}
Intersection: set()

User ID: 18131
Actual items: {1}
Recommended items: {2, 3, 37, 6, 9}
Intersection: set()

User ID: 31174
Actual items: {1}
Recommended items: {1, 3, 4, 9, 16}
Intersection: {1}

User ID: 40140
Actual items: {1}
Recommended items: {32, 2, 3, 4, 6}
Intersection: set()

User ID: 42552
Actual items: {1}
Recommended items: {1, 2, 4, 6, 16}
Intersection: {1}

User ID: 43853
Actual items: {1}
Recommended items: {32, 37, 6, 9, 15}
Intersection: set()

User ID: 49302
Actual items: {1}
Recommended items: {2, 37, 9, 15, 16}
Intersection: set()

User ID: 54791
Actual items: {2}
Recommended items: {32, 1, 37, 15, 16}
Intersection: set()



In [28]:
print(f'Precision@5: {precision}')
print(f'Recall@5: {recall}')
print(f'Hits@5: {hits}')

Precision@5: 0.0585956416464891
Recall@5: 0.2929782082324455
Hits@5: 121
