In [2]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the dataset
data = pd.read_csv('archive/events.csv')

# Keep only the relevant columns (visitorid, itemid, and event type)
data = data[['visitorid', 'itemid', 'event']]

# Convert the event column to binary ratings (1 for transaction, 0 otherwise)
data['rating'] = data['event'].apply(lambda x: 1 if x == 'transaction' else 0)
print(1)
# Create a Surprise Reader object
reader = Reader(rating_scale=(0, 1))

# Load the data into a Surprise Dataset
dataset = Dataset.load_from_df(data[['visitorid', 'itemid', 'rating']], reader)
print(2)
# Split the dataset into train and test sets for collaborative filtering
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42)

# Build the collaborative filtering model (SVD is used here as an example)
model_collab = SVD()
model_collab.fit(trainset)
print(3)
# Function to get collaborative filtering recommendations for a given user
def get_collab_recommendations(user_id, model, top_n=10):
    items_to_predict = data['itemid'].unique()
    user_items = [(user_id, item_id, 0) for item_id in items_to_predict]
    predictions = model.test(user_items)
    recommended_items = [(pred.iid, pred.est) for pred in predictions]
    recommended_items.sort(key=lambda x: x[1], reverse=True)
    return [item[0] for item in recommended_items[:top_n]]


from sklearn.metrics import recall_score, precision_score, f1_score, mean_squared_error

# Define thresholds
positive_prediction_threshold = 0.5
relevant_item_threshold = 1

# Make predictions on the test data
predictions = model_collab.test(testset)

# Initialize variables for evaluation metrics
true_positives, false_positives, true_negatives, false_negatives = 0, 0, 0, 0
predicted_ratings = []
actual_ratings = []
for uid, iid, true_rating, est, _ in predictions:
  predicted_rating = 1 if est > positive_prediction_threshold else 0
  actual_rating = 1 if true_rating > relevant_item_threshold else 0
  predicted_ratings.append(est)
  actual_ratings.append(true_rating)
  if predicted_rating == actual_rating:
    if predicted_rating == 1:
      true_positives += 1
    else:
      true_negatives += 1
  else:
    if predicted_rating == 1:
      false_positives += 1
    else:
      false_negatives += 1

# Calculate and print the evaluation metrics
# recall = recall_score(actual_ratings, predicted_ratings)
# precision = precision_score(actual_ratings, predicted_ratings)
# f1 = f1_score(actual_ratings, predicted_ratings)
# rmse = mean_squared_error(actual_ratings, predicted_ratings)**0.5


from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(actual_ratings, predicted_ratings)
mae = mean_absolute_error(actual_ratings, predicted_ratings)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# print(f"Recall: {recall:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"F1 score: {f1:.4f}")
# print(f"RMSE: {rmse:.4f}")


1
2
3
RMSE: 0.0096
MAE: 0.0309
