In [14]:
import pandas as pd
import numpy as np

In [15]:
# Load preference data: one row per (user_id, activity_id, preference) triple
preference_df = pd.read_csv("synthetic-dataset/preference.csv")

# Reshape into a users x activities matrix and mark unknown preferences as NaN.
# -1 is the dataset's "unknown" sentinel. Pairs that are absent from the CSV
# are left as NaN by pivot() and are deliberately NOT filled with 0: 0 would be
# read as a genuine (lowest) preference, and the modelling cell further down
# also keeps absent pairs as NaN.
user_item_matrix = (
    preference_df
    .pivot(index='user_id', columns='activity_id', values='preference')
    .replace(-1, np.nan)
)

user_item_matrix.head()


activity_id,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,0.055834,,...,,,0.117999,,0.346329,,,0.437896,,
1,,,,,,,,,,,...,,,,,,0.410648,,,,
2,,,0.039474,0.119892,,,,0.100369,,0.295491,...,,0.163593,,0.489428,,,,,,
3,,0.113285,,,,,,,,,...,,,,,0.484622,,,,,
4,,0.203818,,0.17372,,0.162265,,,,,...,,0.338047,,0.288253,,,,,,


In [16]:
# see how cosine similarity works
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
import numpy as np

# Two toy rating vectors
a, b = (np.array(raw) for raw in ([0, 2, 4], [1, 4, 2]))

# Centre each vector on its own mean, then scale it to unit length
a, b = (vec - vec.mean() for vec in (a, b))
a, b = (vec / np.linalg.norm(vec) for vec in (a, b))
mat = np.vstack((a, b))

# scipy reports the cosine *distance* (1 - similarity) between the two vectors,
# whereas sklearn returns the full pairwise *similarity* matrix of the rows.
print(spatial.distance.cosine(a, b))
print(cosine_similarity(mat))

0.6726731646460115
[[1.         0.32732684]
 [0.32732684 1.        ]]


In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# Read the raw preferences and reshape them into a users x activities matrix
preference_df = pd.read_csv("synthetic-dataset/preference.csv")
user_item_matrix = preference_df.pivot(index='user_id', columns='activity_id', values='preference')

# -1 stands for "no preference recorded": turn it into NaN
user_item_matrix = user_item_matrix.replace(-1, np.nan)

# For the similarity computation only, impute every missing cell with the mean
# of that user's known preferences. A user with no known preference at all
# gets the average of the other users' means instead.
row_means = user_item_matrix.mean(axis=1)
mean_user_preferences = row_means.fillna(row_means.mean())
# DataFrame.fillna(Series) matches the Series index against column labels, so
# the matrix is transposed (users become columns) for the fill and then
# transposed back.
user_item_matrix_filled = (
    user_item_matrix
    .transpose()
    .fillna(mean_user_preferences)
    .transpose()
)

# Pairwise cosine similarity between the (imputed) user rows
user_similarity_matrix = cosine_similarity(user_item_matrix_filled)

# Quick shape report
for label, value in (
    ("Dimensions of user similarity matrix:", user_similarity_matrix.shape),
    ("Number of users:", user_similarity_matrix.shape[0]),
    ("Number of items:", user_item_matrix_filled.shape[1]),
    ("length of one row of user item matrix:", len(user_item_matrix_filled.iloc[0])),
):
    print(label, value)

# Function to predict preferences
def predict_preferences(user_id):
    """Predict one user's preference for every activity.

    Each prediction is the similarity-weighted average of all users' (imputed)
    ratings, normalised by the sum of absolute similarities. Activities the
    user has already rated keep their original value.

    Parameters
    ----------
    user_id : int
        Row *position* of the user in ``user_item_matrix`` /
        ``user_similarity_matrix`` (both are indexed positionally here). This
        equals the user's label only while the matrix index is 0..n-1.

    Returns
    -------
    numpy.ndarray
        One value per activity column of ``user_item_matrix``.

    Notes
    -----
    The user's own row takes part in the weighted average (its similarity to
    itself is part of ``weights``), as in the original implementation.
    """
    # Weights are the similarities with every user (including the user itself)
    weights = user_similarity_matrix[user_id]
    # Imputed ratings of all users
    ratings = user_item_matrix_filled.values

    # Raw (non-imputed) row of the target user and the mask of rated activities
    own_ratings = user_item_matrix.iloc[user_id]
    known_mask = own_ratings.notna().values

    # The normaliser is one scalar shared by every activity column
    sum_of_weights = np.abs(weights).sum()
    if sum_of_weights > 0:
        predicted_preferences = np.dot(weights, ratings) / sum_of_weights
    else:
        # No usable neighbour information (would be 0/0): fall back to the
        # user's own imputed row, i.e. the mean used for imputation.
        # astype() copies, so the shared filled matrix is never modified below.
        predicted_preferences = ratings[user_id].astype(float)

    # Replace known values with the originals so they are not "predicted"
    predicted_preferences[known_mask] = own_ratings.values[known_mask]
    return predicted_preferences

# Smoke test: run the predictor for the user stored in row 0 and show the result
predicted_preferences_user_0 = predict_preferences(user_id=0)
print("Predicted preferences for user 0:", predicted_preferences_user_0)


Dimensions of user similarity matrix: (50, 50)
Number of users: 50
Number of items: 72
length of one row of user item matrix: 72
Predicted preferences for user 0: [0.40818407 0.40226747 0.39986448 0.41868597 0.4299372  0.40618758
 0.41737391 0.42974141 0.05583405 0.43376164 0.43924129 0.4394808
 0.39420667 0.40745125 0.13751427 0.41695946 0.43362695 0.42882697
 0.41886005 0.43492098 0.39719428 0.41688493 0.40758056 0.40058912
 0.4207035  0.4155041  0.418903   0.42922414 0.45358475 0.40384028
 0.42422356 0.41763295 0.4158484  0.44244377 0.42782088 0.41295477
 0.43778945 0.43803012 0.42791533 0.42017057 0.42285523 0.4014258
 0.43542672 0.421923   0.41349299 0.42615532 0.03176449 0.43118313
 0.43793809 0.79596852 0.41739868 0.43420925 0.43092539 0.43087611
 0.41823188 0.43673672 0.40107678 0.44593398 0.44374848 0.43739748
 0.43483454 0.00518205 0.44129229 0.41337002 0.11799863 0.41845607
 0.34632881 0.42438964 0.43144422 0.43789649 0.4258599  0.40242161]


In [18]:
from sklearn.metrics import mean_squared_error

# Build the full prediction matrix: row k holds the predictions for the user
# stored at row position k of user_item_matrix
n_users = user_item_matrix.shape[0]
all_predictions = np.vstack([predict_preferences(row) for row in range(n_users)])

# Mean squared error against the mean-imputed matrix.
# NOTE(review): predict_preferences copies known ratings verbatim, and unknown
# cells are compared with imputed means rather than observed values, so this
# number is not a held-out prediction error.
mse = mean_squared_error(user_item_matrix_filled.values.ravel(), all_predictions.ravel())
print("Mean Squared Error:", mse)


Mean Squared Error: 0.008795839952161585


### Analyze the predictions

In [28]:
users = pd.read_csv("synthetic-dataset/user.csv")
activities = pd.read_csv("synthetic-dataset/activity.csv", sep='|')

# Draw one user at random (no seed is set, so the pick varies between runs)
user_id = np.random.choice(users['user_id'])
print("User ID:", user_id)

# The condition is taken from the second column of that user's row
matching_rows = users.loc[users['user_id'] == user_id]
true_user_condition = matching_rows.values[0][1]
print("True user conditions:", true_user_condition)

predicted_preferences = predict_preferences(user_id)
print("Predicted preferences:", predicted_preferences)

# Five best-scoring activities, best first: take the tail of the ascending
# argsort and reverse it
top_activities = np.argsort(predicted_preferences)[-5:][::-1]
print("Top 5 activities:", top_activities)

# Look up which condition each of those activities targets
target_conditions = activities['target_condition'].loc[top_activities].values
print("Target conditions of top activities:", target_conditions)

User ID: 0
True user conditions: 0
Predicted preferences: [0.40818407 0.40226747 0.39986448 0.41868597 0.4299372  0.40618758
 0.41737391 0.42974141 0.05583405 0.43376164 0.43924129 0.4394808
 0.39420667 0.40745125 0.13751427 0.41695946 0.43362695 0.42882697
 0.41886005 0.43492098 0.39719428 0.41688493 0.40758056 0.40058912
 0.4207035  0.4155041  0.418903   0.42922414 0.45358475 0.40384028
 0.42422356 0.41763295 0.4158484  0.44244377 0.42782088 0.41295477
 0.43778945 0.43803012 0.42791533 0.42017057 0.42285523 0.4014258
 0.43542672 0.421923   0.41349299 0.42615532 0.03176449 0.43118313
 0.43793809 0.79596852 0.41739868 0.43420925 0.43092539 0.43087611
 0.41823188 0.43673672 0.40107678 0.44593398 0.44374848 0.43739748
 0.43483454 0.00518205 0.44129229 0.41337002 0.11799863 0.41845607
 0.34632881 0.42438964 0.43144422 0.43789649 0.4258599  0.40242161]
Top 5 activities: [49 28 57 58 33]
Target conditions of top activities: [0 0 1 1 1]


In [20]:
# for each possible condition, we want to measure the accuracy of the predictions
# for users with that condition

def custom_accuracy(user, prediction, top_k=5):
    """Rank-weighted share of top suggestions that target the user's condition.

    The ``top_k`` highest-scoring activities are looked up in ``activities``;
    the suggestion at rank ``r`` (0-based) contributes ``0.5 / 2**r`` when its
    ``target_condition`` equals the user's ``condition_id``.

    Parameters
    ----------
    user : int
        Value of ``user_id`` identifying the user in the ``users`` table.
    prediction : array-like
        Predicted preference per activity; positions are used as labels into
        ``activities``.
    top_k : int, default 5
        Number of best-ranked activities to score (5 reproduces the original
        hard-coded behaviour).

    Returns
    -------
    float
        Score in ``[0, 1 - 2**-top_k]``; 0.0 when no suggestion matches.

    Raises
    ------
    IndexError
        If ``user`` does not appear in ``users['user_id']``.
    """
    # Get the user's condition
    user_condition = users[users['user_id'] == user]['condition_id'].values[0]
    # Get the target condition of the top activities, best first
    top_activities = np.argsort(prediction)[::-1][:top_k]
    target_conditions = activities.loc[top_activities, 'target_condition'].values
    # Coefficient per rank, halving at every step: 0.5, 0.25, 0.125, ...
    rank_weights = 0.5 / 2.0 ** np.arange(len(target_conditions))
    # Sum the coefficients of the suggestions that hit the user's condition
    return float(rank_weights[target_conditions == user_condition].sum())

# Get the unique target conditions
unique_conditions = activities['target_condition'].unique()

# Per-condition results. NaN (rather than a misleading 0) marks a condition
# for which no user could be evaluated.
mse_by_condition = {condition: float('nan') for condition in unique_conditions}
accuracy_by_condition = {condition: float('nan') for condition in unique_conditions}

# Rows of all_predictions are addressed by position, whereas the users table
# holds user_id labels: translate labels to row positions through the index of
# the preference matrix instead of using the labels as positions directly.
matrix_user_ids = user_item_matrix_filled.index

# Iterate over each condition
for condition in unique_conditions:
    # Get the user ids with the target condition that also have preference data
    user_ids = users.loc[users['condition_id'] == condition, 'user_id'].values
    user_ids = user_ids[np.isin(user_ids, matrix_user_ids)]
    if len(user_ids) == 0:
        # Nothing to evaluate: avoids an empty-input error in
        # mean_squared_error and a division by zero below
        continue
    row_positions = matrix_user_ids.get_indexer(user_ids)

    # Calculate the MSE for the users with the target condition
    mse_by_condition[condition] = mean_squared_error(
        user_item_matrix_filled.values[row_positions].flatten(),
        all_predictions[row_positions].flatten(),
    )

    # Calculate the accuracy for the users with the target condition,
    # normalised by the number of evaluated users
    accuracy_total = 0
    for user_id, row_position in zip(user_ids, row_positions):
        accuracy_total += custom_accuracy(user_id, all_predictions[row_position])
    accuracy_by_condition[condition] = accuracy_total / len(user_ids)

print("MSE by condition:", mse_by_condition)
print("Accuracy by condition:", accuracy_by_condition)

MSE by condition: {0: 0.007484768984445411, 1: 0.011842625221082198, 2: 0.003506661714960012, 3: 0.010472903297912131}
Accuracy by condition: {0: 0.7139423076923077, 1: 0.8871527777777778, 2: 0.85, 3: 0.6458333333333334}
