In [1]:
import math

import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split

# Load the raw interaction log.
data = pd.read_csv('archive/events.csv')

# Keep only the relevant columns (visitorid, itemid, and event type)
data = data[['visitorid', 'itemid', 'event']]

# Build one text "document" per item: the space-joined event types of every
# row recorded for that item. Despite the column name, this is derived from the
# `event` column, not from a real item-category attribute.
# The join is computed once per item and then mapped back onto the rows, which
# yields the same per-row strings as a groupby-transform with far less work.
events_per_item = data.groupby('itemid')['event'].agg(' '.join)
data['category'] = data['itemid'].map(events_per_item)
print(5)  # progress marker (appears in the recorded cell output)

# Vectorise the per-item documents with TF-IDF. The matrix has one row per row
# of `data` (not one row per item), so row i of `tfidf_matrix` describes the
# item referenced by row i of `data`.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['category'])

print(6)  # progress marker (appears in the recorded cell output)
# The full pairwise similarity matrix is deliberately left disabled: it would
# have one entry for every pair of rows in `data`. Similarities are instead
# computed on demand, one query row at a time, by the recommendation function.
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(7)  # progress marker (appears in the recorded cell output)
# Function to get content-based recommendations for a given item
def get_content_based_recommendations(item_id, tfidf_matrix=tfidf_matrix, cosine_sim=None, top_n=10):
    """Return the item ids most similar to ``item_id``.

    Similarity is taken between the first row of ``data`` that references
    ``item_id`` and every row of ``data``. Rows belonging to ``item_id`` itself
    are excluded and each other item is reported at most once, so the result
    holds up to ``top_n`` distinct item ids, best match first.

    Args:
        item_id: Item to find neighbours for; must occur in ``data['itemid']``.
        tfidf_matrix: Feature matrix whose rows are assumed to align
            positionally with the rows of ``data``.
        cosine_sim: Optional precomputed similarity matrix (rows and columns
            aligned with ``data``). When ``None`` only the single row needed
            for ``item_id`` is computed with ``linear_kernel``.
        top_n: Maximum number of items to return.

    Returns:
        A slice of ``data['itemid']`` (a pandas Series) holding the recommended
        item ids in descending similarity order.

    Raises:
        IndexError: If ``item_id`` does not occur in ``data['itemid']``.
    """
    is_query_row = (data['itemid'] == item_id).to_numpy()
    if not is_query_row.any():
        raise IndexError(f"item_id {item_id!r} does not occur in data['itemid']")
    # argmax on a boolean array gives the position of the first True.
    query_row = int(is_query_row.argmax())

    if cosine_sim is None:
        scores = linear_kernel(tfidf_matrix[query_row], tfidf_matrix).flatten()
    else:
        scores = cosine_sim[query_row]

    # Drop the query item explicitly instead of assuming it sorts first: many
    # rows can tie at the top score, so "skip element 0" does not reliably
    # remove it. A stable sort keeps tied rows in their original order.
    ranked = pd.Series(scores)[~is_query_row].sort_values(ascending=False, kind='mergesort')

    # `ranked.index` holds row positions; several rows may reference the same
    # item, so keep only the best-ranked row of each item.
    ranked_item_ids = data['itemid'].iloc[ranked.index.to_numpy()]
    return ranked_item_ids[~ranked_item_ids.duplicated()].iloc[:top_n]



5
6
7


In [2]:
# Map every visitor id to the list of item ids they interacted with, in the
# order the rows appear in `data` (repeat interactions are kept).
# A single grouped pass replaces the row-by-row `iterrows()` loop, which is
# very slow on large frames. `sort=False` keeps visitors in order of first
# appearance, matching the insertion order the loop produced.
user_item_interactions = (
    data.groupby('visitorid', sort=False)['itemid']
    .apply(list)
    .to_dict()
)


In [None]:
# Recommend for each user based on the first item they interacted with.
predicted_recommendations = {}

# The recommendations depend only on the seed item, so compute them once per
# distinct seed instead of once per user. Users that share a seed item share
# the same (read-only) result object.
recommendations_by_seed = {}
for user_id, interacted_items in user_item_interactions.items():
    seed_item = interacted_items[0]
    if seed_item not in recommendations_by_seed:
        # Get top N recommendations
        recommendations_by_seed[seed_item] = get_content_based_recommendations(
            seed_item, tfidf_matrix, cosine_sim=None, top_n=10
        )

    # Store recommendations for this user
    predicted_recommendations[user_id] = recommendations_by_seed[seed_item]


In [None]:
# Evaluate the recommendations: for every (interacted item, recommended item)
# pair of a user, measure the cosine distance between their TF-IDF vectors and
# treat 0 as the target value.
# NOTE(review): this reports how far recommendations are from the items a user
# touched in feature space; it is not an error against held-out ground truth.

# Position of the first row of `data` that references each item id.
item_ids_by_row = data['itemid'].reset_index(drop=True)
first_rows = item_ids_by_row.drop_duplicates()
first_row_of_item = dict(zip(first_rows.tolist(), first_rows.index.tolist()))

# Dense 1-D vectors, built lazily and reused across users.
dense_vector_cache = {}


def _dense_item_vector(item_id):
    """Return the 1-D dense TF-IDF vector of the first row recorded for item_id."""
    if item_id not in dense_vector_cache:
        row = first_row_of_item[item_id]
        # SciPy's cosine() requires 1-D arrays, so the sparse matrix row is
        # converted to a flat dense array.
        dense_vector_cache[item_id] = tfidf_matrix[row].toarray().ravel()
    return dense_vector_cache[item_id]


squared_error_sum = 0.0
absolute_error_sum = 0.0
total_interactions = 0

for user_id, interacted_items in user_item_interactions.items():
    # The recommended vectors do not depend on the interacted item, so they are
    # resolved once per user rather than once per pair.
    recommended_vectors = [
        _dense_item_vector(recommended_item)
        for recommended_item in predicted_recommendations[user_id]
    ]
    for item_id in interacted_items:
        interacted_item_vector = _dense_item_vector(item_id)
        for item_vector in recommended_vectors:
            distance = cosine(item_vector, interacted_item_vector)

            squared_error_sum += (distance - 0)**2
            absolute_error_sum += abs(distance - 0)

            total_interactions += 1

if total_interactions == 0:
    raise ValueError("No (interacted item, recommended item) pairs to evaluate")

rmse = math.sqrt(squared_error_sum / total_interactions)
mae = absolute_error_sum / total_interactions

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
