In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

import pandas as pd, numpy as np, matplotlib.pyplot as plt

import sys 
sys.path.insert(1, "../")
from workloads.util import use_results, use_dataset, read_config, log_dataset

%load_ext autoreload
%autoreload 2

In [None]:
dataset_dir = use_dataset("ml-latest-small")

In [None]:
ratings_path = f"{dataset_dir}/ratings.csv"
tags_path = f"{dataset_dir}/tags.csv" 
movies_path = f"{dataset_dir}/movies.csv"

In [None]:
# Read the three MovieLens tables, then give each one snake_case column names.
tags = pd.read_csv(tags_path)
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

for _frame, _column_names in (
    (tags, ['user_id', 'movie_id', 'tag', 'timestamp']),
    (ratings, ['user_id', 'movie_id', 'rating', 'timestamp']),
    (movies, ['movie_id', 'title', 'genres']),
):
    _frame.columns = _column_names

In [None]:
# Preview the freshly loaded ratings. (`test_df` is only created by the
# train/test split further down, so displaying it here raised NameError
# on a fresh kernel.)
ratings.head()

# Preprocessing Dataset 
* Re-assign timestamps
* Split train/test

## Re-assign timestamps
Shift each user's timestamps so that every user's first rating happens at the same time (0).

In [None]:
# Re-base timestamps so that every user's earliest rating is at time 0.
# One groupby yields each user's first timestamp; the subtraction is then
# vectorised instead of scanning the whole frame once per user and calling a
# Python lambda once per row. The column also keeps its integer dtype.
user_start_ts = ratings.groupby("user_id")["timestamp"].min().to_dict()
users = list(user_start_ts)
ratings["timestamp"] = ratings["timestamp"] - ratings["user_id"].map(user_start_ts)

Squash rating timestamps by a factor of 100 (TODO: potentially make sure timestamps don't overlap)

In [None]:
# Compress the time axis: divide each timestamp by 100 and truncate to an int.
# NOTE: re-running this cell divides again, so it is not idempotent.
ratings["timestamp"] = ratings["timestamp"].map(lambda ts: int(ts / 100))

In [None]:
ratings.movie_id.value_counts()

In [None]:
import torch

# Dense (max user id + 1) x (max movie id + 1) integer grids, so raw ids can
# be used directly as row/column indices.
dense_shape = (ratings.user_id.max() + 1, ratings.movie_id.max() + 1)
A = torch.zeros(dense_shape, dtype=int)
R = torch.zeros(dense_shape, dtype=int)
A.shape

In [None]:
# Pack (user_id, movie_id, rating) into a single integer array; columns 0 and 1
# are used as matrix indices by the next cell.
# NOTE(review): astype(int) truncates fractional values, so a half-star rating
# such as 3.5 would be stored as 3 — confirm this is intended.
idx = ratings[["user_id", "movie_id", "rating"]].to_numpy().astype(int)
idx

In [None]:
# Scatter the values into the dense grids at [user_id, movie_id]:
# A receives the (integer) rating, R receives the `rating > 0` flag.
# NOTE(review): the second line hands a boolean array to torch.LongTensor —
# verify that the installed torch version accepts that conversion.
A[idx[:,0], idx[:,1]] = torch.LongTensor(idx[:,2])
R[idx[:,0], idx[:,1]] = torch.LongTensor(idx[:,2] > 0)

In [None]:
A

In [None]:
R

## Train/Test Split 
Split by median timestamp 

In [None]:
# The median of the (re-based, squashed) timestamps is the train/test boundary.
cutoff = ratings.timestamp.median()
# Boolean row mask for ratings strictly before the cutoff. In the cells shown
# it is only referenced by the commented-out experiment that follows.
mask = ratings["timestamp"] < cutoff
cutoff

In [None]:
#train_df = pd.read_csv(ratings_path)
#train_df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
#train_df["rating"] = train_df["rating"].where(mask, np.NaN)

In [None]:
# Chronological split around the median timestamp, each half sorted by time.
# The test side uses `>=` so ratings that fall exactly on the cutoff are kept;
# with `<` / `>` those rows were silently dropped from both splits.
train_df = ratings[ratings["timestamp"] < cutoff].sort_values("timestamp")
test_df = ratings[ratings["timestamp"] >= cutoff].sort_values("timestamp")

In [None]:
# Movie ids that occur in the training split (duplicates kept, one per rating).
train_movies = train_df.movie_id.tolist()
# Show only a small sample instead of dumping the whole list into the output.
train_movies[:10]

In [None]:
# Keep only test ratings whose movie also appears in the training split.
# `isin` does a hashed membership test, whereas `x in train_movies` scanned
# the whole Python list once for every test row.
test_df = test_df[test_df.movie_id.isin(train_movies)]

In [None]:
test_df

In [None]:
train_df

In [None]:
train_df

In [None]:
train_df[train_df["user_id"] == 1]

## Create Matrix 

In [None]:
# Attach movie metadata and tags to every training rating.
# DataFrame.join(other, on=...) matches the caller's column against the INDEX
# of `other`. `movies` and `tags` carry a default positional index, so joining
# them as-is paired each movie_id with an unrelated row number. Both lookup
# tables are therefore keyed by movie_id first.
movies_by_id = movies.set_index("movie_id")
# Collapse the tags to one list per movie so the join keeps exactly one row
# per rating.
tags_by_movie = tags.groupby("movie_id")["tag"].agg(list).rename("tags")
combined_df = train_df.join(movies_by_id, on="movie_id").join(tags_by_movie, on="movie_id")
combined_df

In [None]:
# Users x movies grid of ratings; cells without a training rating become 0.
pivot_table = pd.pivot_table(
    combined_df,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
)
A = pivot_table.fillna(0).to_numpy()

In [None]:
# Map raw ids to their column (movie) / row (user) position in the rating grid.
movie_to_index = {movie: col for col, movie in enumerate(pivot_table.columns.values)}
user_to_index = {user: row for row, user in enumerate(pivot_table.index.values)}

In [None]:
# Observation mask: 1.0 where a training rating exists, 0.0 elsewhere.
# Unrated cells were filled with 0 when A was built, so `> 0` identifies the
# rated cells. The previous `> 0.5` threshold also masked out ratings of
# exactly 0.5 (the lowest MovieLens rating), and the two in-place assignments
# on the boolean array changed nothing.
R = (A > 0).astype(np.float64)

In [None]:
R

In [None]:
R.shape

In [None]:
A

In [None]:
A.shape

# Train ALS Model 

In [None]:
train_df[train_df["user_id"] == 42]

In [None]:
spark = SparkSession.builder.master('local').appName('als').getOrCreate()

In [None]:
spark_als_df = spark.createDataFrame(train_df) 

In [None]:
# Configure Spark's ALS matrix-factorisation estimator over the
# (user_id, movie_id, rating) columns of the training frame.
als = ALS(
         userCol="user_id", 
         itemCol="movie_id",
         ratingCol="rating", 
         nonnegative = True,  # constrain the factors to be non-negative
         implicitPrefs = False,  # treat ratings as explicit feedback
         # presumably drops rows whose prediction is NaN (unseen user/movie)
         # at transform time — see the Spark ALS docs
         coldStartStrategy="drop",
         rank=150,  # number of latent factors per user/movie
         maxIter=10,
         regParam=.1
)

In [None]:
model=als.fit(spark_als_df)

In [None]:
model.itemFactors.toPandas()

In [None]:
movie_table = model.itemFactors.toPandas().sort_values("id").set_index("id")
user_table = model.userFactors.toPandas().sort_values("id").set_index("id")
movie_table.loc[1]

In [None]:
user_table.loc[42]

In [None]:
user_matrix = model.userFactors.toPandas().sort_values("id").set_index("id").features.to_list()

In [None]:
movie_matrix = model.itemFactors.toPandas().sort_values("id").set_index("id").features.to_list()

In [None]:
user_matrix = np.array(user_matrix)

In [None]:
movie_matrix = np.array(movie_matrix)

In [None]:
movie_matrix.shape

In [None]:
user_matrix.shape

## Evaluate on Test Data 

In [None]:
# ALS Evaluation
spark_als_test_df = spark.createDataFrame(test_df) 
predictions = model.transform(spark_als_test_df)

In [None]:
evaluator = RegressionEvaluator().setMetricName("rmse").setLabelCol("rating").setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)
rmse

In [None]:
evaluator = RegressionEvaluator().setMetricName("rmse").setLabelCol("rating").setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)
rmse

In [None]:
rmse*rmse

In [None]:
def evaluate(df, curr_user_matrix, curr_movie_matrix, user_index=None, movie_index=None):
    """Mean squared error of dot-product rating predictions over ``df``.

    Each row's prediction is the dot product of the user's row in
    ``curr_user_matrix`` and the movie's row in ``curr_movie_matrix``.

    Args:
        df: frame with ``user_id``, ``movie_id`` and ``rating`` columns.
        curr_user_matrix: user factors, indexable by user position.
        curr_movie_matrix: movie factors, indexable by movie position.
        user_index: optional mapping from user id to row position; defaults
            to the notebook-level ``user_to_index``.
        movie_index: optional mapping from movie id to row position;
            defaults to the notebook-level ``movie_to_index``.

    Returns:
        The mean of the squared differences between true and predicted
        ratings, as a float.

    Raises:
        KeyError: if a user or movie id is missing from its mapping.
        ValueError: if ``df`` has no rows.
    """
    if user_index is None:
        user_index = user_to_index
    if movie_index is None:
        movie_index = movie_to_index

    def _position(mapping, raw_id):
        # The mappings are keyed by int when built in memory but by str once
        # they have been reloaded from JSON, so accept either form.
        key = int(raw_id)
        return mapping[key] if key in mapping else mapping[str(key)]

    y_pred = []
    y_true = []
    # Iterate the three needed columns directly rather than materialising a
    # Series per row with iterrows().
    for user_id, movie_id, rating in zip(df["user_id"], df["movie_id"], df["rating"]):
        user_features = curr_user_matrix[_position(user_index, user_id)]
        movie_features = curr_movie_matrix[_position(movie_index, movie_id)]
        y_pred.append(np.dot(user_features, movie_features))
        y_true.append(rating)

    if not y_true:
        raise ValueError("evaluate() needs at least one rating row")

    # Computed with numpy so the function does not rely on an import that
    # only happens in a later cell.
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(errors ** 2))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
evaluate(test_df, user_matrix, movie_matrix)

In [None]:
evaluate(train_df, user_matrix, movie_matrix)

In [None]:
# Score the factor matrices stored under dataset_dir on the test split.
# pickle is imported here because the notebook's own `import pickle` cell comes
# after this one; `with` makes sure both files are closed again.
import pickle

with open(f"{dataset_dir}/user_matrix.pkl", "rb") as user_file:
    saved_user_matrix = pickle.load(user_file)
with open(f"{dataset_dir}/movie_matrix.pkl", "rb") as movie_file:
    saved_movie_matrix = pickle.load(movie_file)

evaluate(test_df, saved_user_matrix, saved_movie_matrix)

In [None]:
# Score the factor matrices stored under dataset_dir on the training split.
# pickle is imported here because the notebook's own `import pickle` cell comes
# after this one; `with` makes sure both files are closed again.
import pickle

with open(f"{dataset_dir}/user_matrix.pkl", "rb") as user_file:
    saved_user_matrix = pickle.load(user_file)
with open(f"{dataset_dir}/movie_matrix.pkl", "rb") as movie_file:
    saved_movie_matrix = pickle.load(movie_file)

evaluate(train_df, saved_user_matrix, saved_movie_matrix)

# Write Batch ALS Features 

In [None]:
import pickle
import json

In [None]:
# Persist the Spark-trained factor matrices. `with` flushes and closes each
# file even if pickling fails part-way; the bare open() calls leaked handles.
with open(f"{dataset_dir}/spark_user_matrix.pkl", "wb") as user_file:
    pickle.dump(user_matrix, user_file)
with open(f"{dataset_dir}/spark_movie_matrix.pkl", "wb") as movie_file:
    pickle.dump(movie_matrix, movie_file)

In [None]:
# Replace the in-memory factors with the ones stored as trained_users.pkl /
# trained_items.pkl (the spark_*_matrix.pkl files are the alternative source).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(f"{dataset_dir}/trained_users.pkl", "rb") as user_file:
    user_matrix = pickle.load(user_file)
with open(f"{dataset_dir}/trained_items.pkl", "rb") as movie_file:
    movie_matrix = pickle.load(movie_file)

In [None]:
# Reload the id->position mappings, the rating grid A, the observation mask R
# and the test split from dataset_dir. Every file is opened with `with` so the
# handle is closed again.
# NOTE: JSON object keys are always strings, so after this cell the mappings
# must be queried with str(id) rather than int ids.
with open(f"{dataset_dir}/movie_to_index.json", "r") as movie_index_file:
    movie_to_index = json.load(movie_index_file)
with open(f"{dataset_dir}/user_to_index.json", "r") as user_index_file:
    user_to_index = json.load(user_index_file)
with open(f"{dataset_dir}/A.pkl", "rb") as a_file:
    A = pickle.load(a_file)
with open(f"{dataset_dir}/R.pkl", "rb") as r_file:
    R = pickle.load(r_file)

test_df = pd.read_csv(f"{dataset_dir}/test.csv")

# Test Streaming Updates 

In [None]:
from tqdm import tqdm

In [None]:
# Drop state left behind by an earlier run of the streaming cells.
# pop(name, None) does nothing when the name is absent, so this cell also works
# on a fresh kernel, where a bare `del` raised NameError.
for _stale_name in ("streaming_user_matrix", "A_matrix", "R_matrix"):
    globals().pop(_stale_name, None)

In [None]:
limit = 100

In [None]:
from workloads.recsys.als import runALS

In [None]:
# Streaming experiment: replay the first `limit` test ratings one at a time and
# call runALS after each one, starting from copies of the trained factors.
# Work on copies so user_matrix / movie_matrix / A / R stay untouched.
streaming_user_matrix_batch = np.array(user_matrix, copy=True) 
streaming_movie_matrix_batch = np.array(movie_matrix, copy=True) 
A_matrix_batch = np.array(A, copy=True)
R_matrix_batch = np.array(R, copy=True)

# Per-rating records: true rating, prediction from the original factors, and
# prediction from the streaming factors as they stood BEFORE this rating's refit.
y = []
origin_pred = []
updated_pred = []

for index, row in tqdm(test_df.iloc[:limit].iterrows()):
    
    
    # The mappings are queried with string keys here.
    ui = user_to_index[str(int(row["user_id"]))]
    mi = movie_to_index[str(int(row["movie_id"]))]
    
    print(index, ui, mi, movie_matrix.shape)
    
    # Record the new observation in the working rating grid and mask.
    A_matrix_batch[ui][mi] = row["rating"]
    R_matrix_batch[ui][mi] = 1
    
    # Both predictions are taken before runALS is called for this rating.
    origin_pred.append(user_matrix[ui].dot(movie_matrix[mi].T))
    updated_pred.append(streaming_user_matrix_batch[ui].dot(streaming_movie_matrix_batch[mi].T))
    y.append(row["rating"])
    
    # Residuals of the original vs. the streaming factors for this rating.
    print(row["rating"] - origin_pred[-1], row["rating"] - updated_pred[-1])
    
    n_factors = len(user_matrix[ui])
    n_iter = 2
    reg = 0.1
    # runALS comes from workloads.recsys.als; its implementation is not visible
    # here. Presumably it runs n_iter ALS sweeps from the given factors and
    # returns the updated (user, movie) matrices — TODO confirm.
    streaming_user_matrix_batch, streaming_movie_matrix_batch = runALS(
        A_matrix_batch, 
        R_matrix_batch, 
        n_factors, 
        n_iter, 
        reg, 
        streaming_user_matrix_batch, 
        streaming_movie_matrix_batch, 
        users=None #, [ui]
    )



In [None]:
# Streaming experiment: after each of the first `limit` test ratings, re-solve
# only that user's factor vector (regularised least squares against the fixed
# movie factors). Copies keep user_matrix / A / R untouched.
streaming_user_matrix = np.array(user_matrix, copy=True)
A_matrix = np.array(A, copy=True)
R_matrix = np.array(R, copy=True)

user_feature_reg = 10  # L2 regularisation strength of the per-user solve
n_factors = np.shape(user_matrix)[1]
reg_term = user_feature_reg * np.eye(n_factors)  # loop-invariant

y = []
origin_pred = []
updated_pred = []
replayed = test_df.iloc[:limit]
for index, row in tqdm(replayed.iterrows(), total=len(replayed)):
    # String keys, as in the runALS loop: mappings reloaded from JSON are
    # keyed by str, so the previous int lookups raised KeyError.
    ui = user_to_index[str(int(row["user_id"]))]
    mi = movie_to_index[str(int(row["movie_id"]))]

    # Record the new observation.
    A_matrix[ui, mi] = row["rating"]
    R_matrix[ui, mi] = 1

    # Predict BEFORE refitting, so updated_pred reflects only earlier ratings.
    origin_pred.append(user_matrix[ui].dot(movie_matrix[mi].T))
    updated_pred.append(streaming_user_matrix[ui].dot(movie_matrix[mi].T))
    y.append(row["rating"])

    Ri = R_matrix[ui]
    # Ri[:, None] * M equals np.diag(Ri) @ M, and Ri * a equals np.diag(Ri) @ a,
    # without allocating a (n_movies x n_movies) dense matrix per rating.
    weighted_movies = Ri[:, None] * movie_matrix
    streaming_user_matrix[ui] = np.linalg.solve(
        movie_matrix.T @ weighted_movies + reg_term,
        movie_matrix.T @ (Ri * A_matrix[ui]),
    )

In [None]:
# 0.01
print(mean_squared_error(y, origin_pred))
print(mean_squared_error(y, updated_pred))

In [None]:
# 0.1
print(mean_squared_error(y, origin_pred))
print(mean_squared_error(y, updated_pred))

In [None]:
# 10 
print(mean_squared_error(y, origin_pred))
print(mean_squared_error(y, updated_pred))

In [None]:
# 5
print(mean_squared_error(y, origin_pred))
print(mean_squared_error(y, updated_pred))

In [None]:
# 1
print(mean_squared_error(y, origin_pred))
print(mean_squared_error(y, updated_pred))

In [None]:
mean_squared_error(y, origin_pred)

In [None]:
mean_squared_error(y, updated_pred)

In [None]:
A[199]

In [None]:
A_matrix[199]

In [None]:
# Streaming experiment: one SGD step on the user's factor vector per test rating.
# (The former leading `del` raised NameError on a fresh kernel and was redundant,
# because the name is rebound right here.)
streaming_sgd_user_matrix = np.array(user_matrix, copy=True)

user_feature_reg = 0.01  # L2 penalty on the user vector
learning_rate = 0.02

sgd_updated_pred = []
y = []
origin_pred = []
for index, row in tqdm(test_df[:limit].iterrows()):
    # String keys, as in the runALS loop: mappings reloaded from JSON are
    # keyed by str, so the previous int lookups raised KeyError.
    ui = user_to_index[str(int(row["user_id"]))]
    mi = movie_to_index[str(int(row["movie_id"]))]

    user_features = streaming_sgd_user_matrix[ui]

    # Predict with the current streaming vector BEFORE applying this rating.
    prediction = user_features.dot(movie_matrix[mi].T)

    sgd_updated_pred.append(prediction)
    origin_pred.append(user_matrix[ui].dot(movie_matrix[mi].T))
    y.append(row["rating"])

    # Gradient step on the regularised squared error for this single rating.
    error = row["rating"] - prediction
    streaming_sgd_user_matrix[ui] = user_features + learning_rate * (
        error * movie_matrix[mi] - user_feature_reg * user_features
    )
    

In [None]:
# reg = 10 
print(mean_squared_error(y, origin_pred)) # original user matrix (ALS)
print(mean_squared_error(y, updated_pred)) # update entire user vector (ALS) - expensive 
print(mean_squared_error(y, sgd_updated_pred)) # sgd update - cheap 

In [None]:
# reg = 5 
print(mean_squared_error(y, origin_pred)) # original user matrix (ALS)
print(mean_squared_error(y, updated_pred)) # update entire user vector (ALS) - expensive 
print(mean_squared_error(y, sgd_updated_pred)) # sgd update - cheap 

In [None]:
# reg = 5 
print(mean_squared_error(y, origin_pred)) # original user matrix (ALS)
#print(mean_squared_error(y, updated_pred)) # update entire user vector (ALS) - expensive 
print(mean_squared_error(y, sgd_updated_pred)) # sgd update - cheap 

In [None]:
# reg = 5 
print(mean_squared_error(y, origin_pred)) # original user matrix (ALS)
print(mean_squared_error(y, updated_pred)) # update entire user vector (ALS) - expensive 
print(mean_squared_error(y, sgd_updated_pred)) # sgd update - cheap 

In [None]:
# original user matrix (ALS)
# Scored on the same held-out remainder (`limit:`) as the two streaming variants
# in the following cells; the previous `limit:limit+1` slice covered only a
# single rating, so the three numbers were not comparable.
evaluate(test_df.iloc[limit:], user_matrix, movie_matrix)

In [None]:
# update entire user vector (ALS) - expensive 
evaluate(test_df.iloc[limit:], streaming_user_matrix_batch, streaming_movie_matrix_batch)

In [None]:
# sgd update - cheap 
evaluate(test_df.iloc[limit:], streaming_sgd_user_matrix, movie_matrix)

In [None]:
import time

In [None]:
# Time a single regularised least-squares re-solve of one user's factor vector.
# NOTE(review): user_id is used directly as a row position in R / A /
# user_matrix, not translated through user_to_index — confirm that is intended.
user_id = 41
movie_id = 2
rating = 5

Ri = np.asarray(R[user_id])
user_features = user_matrix[user_id]
user_feature_reg = 0.1
n_factors = len(user_features)

# TODO: update A and R?

# perf_counter is the monotonic clock intended for measuring short intervals.
st = time.perf_counter()
# Ri[:, None] * M equals np.diag(Ri) @ M, and Ri * a equals np.diag(Ri) @ a,
# without building a (n_movies x n_movies) dense matrix.
user_i = np.linalg.solve(
    movie_matrix.T @ (Ri[:, None] * movie_matrix) + user_feature_reg * np.eye(n_factors),
    movie_matrix.T @ (Ri * np.asarray(A[user_id])),
)
print(time.perf_counter() - st)

In [None]:
np.dot(user_i, movie_matrix.T)

In [None]:
# Ratings row of the user solved for above (`i` was never defined in this notebook).
A[user_id]

In [None]:
np.dot(user_features, movie_matrix.T)

In [None]:
# Predicted ratings for the user solved for above (`i` was never defined in this notebook).
np.dot(user_matrix[user_id], movie_matrix.T)

In [None]:
# Same row taken from the full prediction matrix (`i` was never defined in this notebook).
np.dot(user_matrix, movie_matrix.T)[user_id]

In [None]:
user_to_index[217]

In [None]:
R_matrix[268].sum()

In [None]:
movie_to_index

In [None]:
u = 509

In [None]:
R[u-1]

In [None]:
(R_matrix[u-1] - R[u-1]).sum()

In [None]:
df = test_df.iloc[limit:]

In [None]:
# Per-user error with the streaming ALS user factors; evaluate() also requires
# the movie factors, which were missing from this call (TypeError).
evaluate(df[df["user_id"] == u], streaming_user_matrix, movie_matrix)

In [None]:
# Per-user error with the original user factors; evaluate() also requires the
# movie factors, which were missing from this call (TypeError).
evaluate(df[df["user_id"] == u], user_matrix, movie_matrix)

In [None]:
# Per-user error with the streaming SGD user factors; evaluate() also requires
# the movie factors, which were missing from this call (TypeError).
evaluate(df[df["user_id"] == u], streaming_sgd_user_matrix, movie_matrix)

In [None]:
train_df[train_df["user_id"] == u]

In [None]:
test_df[test_df["user_id"] == u].movie_id.tolist()

In [None]:
train_df[train_df["user_id"] == u]