In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [222]:
# Base directory of the input CSVs; the three file paths below are derived from it.
dir_path = "~/movielens/ralf"
user_features_path, movie_features_path, ratings_path = (
    "/".join((dir_path, csv_name))
    for csv_name in ("user_features.csv", "movie_features.csv", "ratings.csv")
)

In [223]:
def get_features(file_path):
    """Load a feature table from CSV into a dict keyed by row id.

    Parameters
    ----------
    file_path : str
        Path of a CSV file with an ``id`` column and a ``features`` column.
        Each ``features`` cell must be the text of a Python literal sequence
        of numbers, e.g. ``"[0.1, 0.2]"``.

    Returns
    -------
    dict
        Maps every ``id`` to a ``numpy.ndarray`` built from the parsed
        ``features`` cell. If an id occurs more than once the last row wins.

    Raises
    ------
    ValueError, SyntaxError
        If a ``features`` cell is not a plain Python literal.
    """
    import ast  # function-scope import: only the literal parser is needed

    df = pd.read_csv(file_path)
    # literal_eval accepts literals only, so text stored in the CSV can never
    # be executed as code (the previous eval() call would run any expression).
    return {
        row_id: np.array(ast.literal_eval(raw_features))
        for row_id, raw_features in zip(df["id"], df["features"])
    }

In [224]:
def get_feature_update(features, row):
    """Overwrite one user's feature vector in ``features`` from an update row.

    Parameters
    ----------
    features : dict
        Mapping of user id to feature array; it is modified in place.
    row : object
        Record exposing ``user_id`` and ``user_features`` attributes, where
        ``user_features`` is the text of a Python literal sequence of numbers.

    Returns
    -------
    None

    Raises
    ------
    ValueError, SyntaxError
        If ``row.user_features`` is not a plain Python literal.
    """
    import ast  # function-scope import: only the literal parser is needed

    # literal_eval parses literals only; eval() would execute any expression
    # embedded in the string.
    features[row.user_id] = np.array(ast.literal_eval(row.user_features))

In [225]:
# Initial feature tables, each a dict mapping an id to its numpy feature vector.
user_features = get_features(file_path=user_features_path)
movie_features = get_features(file_path=movie_features_path)

In [226]:
# Ratings table; later cells read its user_id, movie_id, rating and timestamp columns.
test_data = pd.read_csv(filepath_or_buffer=ratings_path)

In [227]:
# Results CSV of one experiment configuration; later cells read its
# timestamp, user_id and user_features columns.
experiment_path = "~/experiment_results/17"
feature_updates = pd.read_csv(
    experiment_path
    + "/results_workers_1_fifo_learningrate_0.02_userfeaturereg_0.01.csv"
)

In [228]:
# Number of rating rows loaded.
test_data.shape[0]

49271

In [238]:
# Number of ratings stamped with timestamp 49.
test_data.loc[test_data['timestamp'] == 49].shape[0]

200

In [237]:
# Number of feature updates stamped with timestamp 60.
feature_updates.loc[feature_updates['timestamp'] == 60].shape[0]

18

In [239]:
# Largest timestamp present in the feature updates.
max(feature_updates['timestamp'].tolist())

246

In [235]:
def predict_rating(user_feature, movie_feature):
    """Score a (user, movie) pair as the dot product of their feature arrays.

    ``movie_feature`` is transposed before the product is taken, so the
    result equals ``user_feature.dot(movie_feature.T)``.
    """
    transposed_movie = movie_feature.T
    return user_feature.dot(transposed_movie)

def compute_test_mse(user_features, movie_features, data=None):
    """Mean squared error of dot-product predictions over a ratings table.

    Parameters
    ----------
    user_features : dict
        Maps a user id to that user's feature array.
    movie_features : dict
        Maps a movie id to that movie's feature array.
    data : pandas.DataFrame, optional
        Rows to score; must provide ``user_id``, ``movie_id`` and ``rating``
        columns. When omitted, the notebook-level ``test_data`` is scored.

    Returns
    -------
    The value of ``mean_squared_error(ratings, predictions)`` over all
    scored rows.

    Raises
    ------
    KeyError
        If a row references an id that is missing from either feature dict.
    """
    rows = test_data if data is None else data
    predictions = []
    ratings = []
    for row in rows.itertuples():
        predictions.append(
            predict_rating(user_features[row.user_id], movie_features[row.movie_id])
        )
        ratings.append(row.rating)
    return mean_squared_error(ratings, predictions)

def compute_baseline():
    """MSE of the notebook-level feature tables over every row of ``test_data``.

    Scores all ratings with the current contents of the global
    ``user_features`` and ``movie_features`` dicts. The scoring loop lives in
    ``compute_test_mse``; this function only supplies the global tables so the
    two cannot drift apart.

    Returns
    -------
    The mean squared error between the stored ratings and the dot-product
    predictions.
    """
    return compute_test_mse(user_features, movie_features)

def compute_improvement(verbose=True):
    """Replay feature updates in timestamp order and compare MSE to the baseline.

    For each integer timestamp, starting at 0 and ending at the last timestamp
    present in either ``feature_updates`` or ``test_data``, the function first
    applies the user-feature updates stamped with it and then scores the
    ratings stamped with it using the features as of that moment.

    The notebook-level ``user_features`` dict is not modified; the replay runs
    on a copy, so calling the function repeatedly gives the same result.

    Parameters
    ----------
    verbose : bool, default True
        When true, print one summary line (last update timestamp, number of
        updates, number of ratings) followed by the update count and the
        query count of every timestamp. Pass False to silence the output.

    Returns
    -------
    tuple
        ``(baseline, new_mse)``: the MSE using the initial features for every
        rating, and the MSE using the progressively updated features.

    Raises
    ------
    ValueError, SyntaxError
        If a ``user_features`` cell of ``feature_updates`` is not a plain
        Python literal.
    KeyError
        If a rating references an id missing from the feature tables.
    """
    import ast  # function-scope import: only the literal parser is needed

    # Replay on a private copy. Writing into the notebook-level dict would make
    # a second call start from already-updated features and report a baseline
    # that is no longer the pre-update one.
    live_user_features = dict(user_features)

    max_timestamp = max(feature_updates['timestamp'])
    if verbose:
        print(max_timestamp, len(feature_updates), len(test_data))
    baseline = compute_baseline()

    # The baseline scores every rating, so new_mse must as well: ratings
    # stamped after the final update are scored with the latest features
    # instead of being silently left out.
    last_timestamp = max(max_timestamp, max(test_data['timestamp']))

    predictions = []
    ratings = []
    timestamp = 0
    while timestamp <= last_timestamp:
        updates_now = feature_updates[feature_updates['timestamp'] == timestamp]
        if verbose:
            print(len(updates_now))
        for row in updates_now.itertuples():
            # literal_eval parses literals only; eval() would execute whatever
            # expression the results file happens to contain.
            live_user_features[row.user_id] = np.array(
                ast.literal_eval(row.user_features)
            )

        queries_now = test_data[test_data['timestamp'] == timestamp]
        if verbose:
            print(len(queries_now))
        for row in queries_now.itertuples():
            predictions.append(
                predict_rating(
                    live_user_features[row.user_id], movie_features[row.movie_id]
                )
            )
            ratings.append(row.rating)
        timestamp += 1

    new_mse = mean_squared_error(ratings, predictions)
    return baseline, new_mse

In [236]:
# compute_improvement() returns the pair (baseline, new_mse), so `mse` holds a
# 2-tuple here rather than a single number.
mse = compute_improvement()
print(mse)

246 49199 49271
0
199
5
200
18
200
16
200
12
200
10
200
14
200
15
200
12
200
16
200
15
200
18
200
14
200
14
200
17
200
17
200
14
200
21
200
16
200
17
200
17
200
18
200
17
200
20
200
18
200
16
200
19
200
21
200
19
200
19
200
19
200
20
200
20
200
22
200
19
200
20
200
20
200
20
200
20
200
20
200
20
200
20
200
20
200
20
200
13
200
18
200
22
200
19
200
21
200
20
200
20
200
21
200
21
200
17
200
20
200
21
200
17
200
20
200
21
200
19
200
18
200
19
200
19
200
20
200
19
200
20
200
22
200
21
200
20
200
19
200
20
200
19
200
20
200
15
200
18
200
14
200
19
200
20
200
16
200
17
200
21
200
22
200
21
200
21
200
21
200
17
200
19
200
21
200
21
200
18
200
19
200
20
200
21
200
18
200
18
200
21
200
19
200
22
200
21
200
20
200
19
200
20
200
19
200
23
200
23
200
22
200
17
200
19
200
21
200
22
200
20
200
22
200
10
200
19
200
39
200
21
200
21
200
17
200
15
200
14
200
20
200
18
200
17
200
18
200
20
200
18
200
20
200
18
200
18
200
19
200
20
200
17
200
19
200
20
200
19
200
13
200
19
200
20
200
19
200
21
200
19
200

In [None]:
def plot_learning_curve(mse):
    """Draw ``mse`` as one labelled line on the current matplotlib axes.

    The line is labelled 'Ralf' with width 3; the axes are titled
    'iterations' (x) and 'MSE' (y) and a legend is placed automatically.
    """
    line_style = {'label': 'Ralf', 'linewidth': 3}
    plt.plot(mse, **line_style)
    plt.ylabel('MSE')
    plt.xlabel('iterations')
    plt.legend(loc='best')

In [None]:
# `mse` is the (baseline, new_mse) pair stored by the earlier cell, so this
# draws a two-point line rather than a per-iteration curve.
plot_learning_curve(mse)