In [None]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from numpy.linalg import inv
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sys 
sys.path.insert(1, "../")
from workloads.util import use_results, use_dataset, read_config, log_dataset
from tqdm import tqdm 

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
# Resolve where the "ml-latest-small" dataset lives via the project helper.
# Every CSV path in this notebook is built from this value (presumably a
# local directory path -- confirm in workloads.util).
dataset_dir = use_dataset("ml-latest-small")

In [None]:
# Load the raw ratings table.
ratings_path = f"{dataset_dir}/ratings.csv"
ratings_df = pd.read_csv(ratings_path)
# Rename the columns positionally to the names used by the rest of the
# notebook (the file is expected to have exactly these four columns).
ratings_df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

In [None]:
def split_data(df):
    """Split ratings chronologically into a training set and a stream set.

    Rows whose ``timestamp`` is at or before the 25th-percentile timestamp
    form the training set; strictly later rows form the stream set.  Stream
    rows for movies that never occur in the training set are discarded.
    Both sets are written to ``{dataset_dir}/train.csv`` and
    ``{dataset_dir}/stream.csv`` (``dataset_dir`` is a notebook-level
    variable), and the sizes before filtering are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with at least ``timestamp`` and ``movie_id`` columns.

    Returns
    -------
    tuple
        ``(start_ts, split_ts, end_ts)``: the earliest timestamp, the
        25th-percentile cut-off and the latest timestamp.
    """
    start_ts = df['timestamp'].min()
    end_ts = df['timestamp'].max()
    # NOTE: the cut-off is the first quartile of the timestamps, not the
    # median, so roughly 25% of the rows end up in the training set.
    split_ts = df['timestamp'].quantile(.25)

    train_df = df[df['timestamp'] <= split_ts]
    stream_df = df[df['timestamp'] > split_ts]

    seen_movies = set(train_df['movie_id'])
    # Number of training movies, number of stream movies and number of
    # stream rows, all measured before unseen movies are filtered out.
    print(len(seen_movies), len(set(stream_df['movie_id'])), len(stream_df))

    # Keep only stream rows for movies the training set has seen.  A boolean
    # mask is used instead of dropping by index label, so rows are selected
    # individually even if index labels repeat.
    stream_df = stream_df[stream_df['movie_id'].isin(seen_movies)]

    train_df.to_csv(f'{dataset_dir}/train.csv', header=True, index=False)
    stream_df.to_csv(f'{dataset_dir}/stream.csv', header=True, index=False)
    return start_ts, split_ts, end_ts

In [None]:
# split_data writes train.csv / stream.csv; both are then read back from
# disk.  Note that med_ts is the 25th-percentile timestamp used as the
# train/stream cut-off (see the quantile call in split_data).
start_ts, med_ts, end_ts = split_data(ratings_df)
train_df = pd.read_csv(f'{dataset_dir}/train.csv')
test_df = pd.read_csv(f'{dataset_dir}/stream.csv')
test_df

In [None]:
class CustomALS(object):
    """Matrix factorisation by alternating ridge regressions.

    The rating matrix ``R`` (rows x columns) is approximated by
    ``U @ V.T`` where ``U`` has one ``k``-dimensional row per matrix row and
    ``V`` one ``k``-dimensional row per matrix column.  Each iteration
    re-fits every row of ``U`` with ``V`` fixed, then every row of ``V``
    with ``U`` fixed, using ``sklearn.linear_model.Ridge``.

    Parameters
    ----------
    k : int
        Number of latent factors.
    n_iter : int
        Number of alternating passes.
    lambda_u, lambda_v : float
        Ridge ``alpha`` used when fitting rows of ``U`` / ``V``.

    Attributes (set by ``fit``)
    ---------------------------
    R : numpy.ndarray
        Copy of the input with NaN replaced by 0.
    U, V : numpy.ndarray
        Factor matrices of shape ``(m, k)`` and ``(n, k)``.
    R_hat : numpy.ndarray
        ``U @ V.T``.
    """

    def __init__(self, k=20, n_iter=20, lambda_u=0.001,
                 lambda_v=0.001):
        self.k = k
        self.n_iter = n_iter
        self.lambda_u = lambda_u
        self.lambda_v = lambda_v

    def predict(self, uid, mid):
        """Return the predicted rating for row ``uid`` and column ``mid``.

        Only valid after ``fit``; ``uid`` / ``mid`` are positional indices
        into the matrix passed to ``fit``.
        """
        return np.dot(self.U[uid, :], self.V[mid, :])

    def fit(self, R):
        """Fit ``U`` and ``V`` to ``R``; missing (NaN) entries are treated as 0.

        Parameters
        ----------
        R : numpy.ndarray
            2-D array; it is not modified.
        """
        # nan_to_num returns a new array, so the caller's matrix is untouched.
        self.R = np.nan_to_num(R)

        m, n = self.R.shape

        # Initialize
        self.U = np.random.normal(loc=0., scale=0.01, size=(m, self.k))
        self.V = np.random.normal(loc=0., scale=0.01, size=(n, self.k))

        # NOTE(review): both regressions fit an intercept, but only coef_ is
        # kept, so R_hat = U V^T does not include the fitted intercepts.
        model_u = Ridge(alpha=self.lambda_u, fit_intercept=True)
        model_v = Ridge(alpha=self.lambda_v, fit_intercept=True)

        for _ in tqdm(range(self.n_iter)):
            # NOTE: This can be parallelized
            for i in range(m):
                # Regress row i of R on the current column factors.
                model_u.fit(X=self.V, y=self.R[i, :])
                self.U[i, :] = model_u.coef_

            # NOTE: This can be parallelized
            for j in range(n):
                # Regress column j of R on the freshly updated row factors.
                model_v.fit(X=self.U, y=self.R[:, j])
                self.V[j, :] = model_v.coef_

        self.R_hat = self.U.dot(self.V.T)

In [None]:
# Build the user x movie rating matrix in this cell so that it runs on a
# fresh kernel: the fit below needs R_df, which is otherwise only defined
# further down the notebook.
R_df = train_df.pivot(index='user_id',
                      columns='movie_id',
                      values='rating')

model = CustomALS(n_iter=5)
model.fit(R_df.values)

In [None]:
# Re-read the stream split written by split_data (the same file that was
# already loaded into test_df right after the split).
test_df = pd.read_csv(f'{dataset_dir}/stream.csv')

In [None]:
def predict(uid, mid):
    """Print the factor vectors of the notebook-level ``model`` for row
    ``uid`` and column ``mid`` and return their dot product.
    """
    user_vec = model.U[uid, :]
    print(user_vec)
    item_vec = model.V[mid, :].T
    print(item_vec)
    return np.dot(user_vec, item_vec)

model.R_hat

In [None]:
from sklearn.metrics import mean_squared_error

# Reconstruction error against model.R, i.e. the zero-filled matrix stored
# by fit(): entries that were missing in the input count as a rating of 0.
mean_squared_error(model.R, model.R_hat)

In [None]:
# Get or create the SparkSession (master "local", app name "test") that
# SparkALS.fit uses to build its ratings DataFrame.
spark = SparkSession.builder \
        .master("local") \
        .appName("test") \
        .getOrCreate()
        
class SparkALS(object):
    """Wrapper that fits ``pyspark.ml.recommendation.ALS`` on a dense matrix
    and exposes the factors as NumPy arrays (``U``, ``V``, ``R_hat``).
    """

    def __init__(self, k=20, n_iter=20, lambda_=0.001):
        self.als = ALS(rank=k, maxIter=n_iter, regParam=lambda_)

    def fit(self, R):
        """Fit ALS on every cell of ``R``; NaN cells are passed as rating 0.

        Uses the notebook-level ``spark`` session.
        """
        dense = np.nan_to_num(R)
        n_rows, n_cols = dense.shape[0], dense.shape[1]

        # One (row, column, value) triple per matrix cell, in row-major order.
        triples = [
            (row, col, float(dense[row, col]))
            for row in range(n_rows)
            for col in range(n_cols)
        ]
        frame = spark.createDataFrame(triples, ["user", "item", "rating"])

        fitted = self.als.fit(frame)

        def _as_matrix(factors):
            # Order by id so that array row r holds the factors of id r.
            collected = factors.orderBy("id").collect()
            return np.array([entry.features for entry in collected])

        self.U = _as_matrix(fitted.userFactors)
        self.V = _as_matrix(fitted.itemFactors)
        self.R_hat = self.U.dot(self.V.T)

In [None]:
# User x movie rating matrix: one row per user_id, one column per movie_id.
# (user, movie) pairs absent from train_df are left as NaN by pivot, so no
# additional fill step is required.
R_df = train_df.pivot(index='user_id',
                      columns='movie_id',
                      values='rating')
R_df

In [None]:
# Display the training split for inspection.
train_df

In [None]:
def cross_val_ndcg(model, X, n_splits=2, k=20, random_state=None):
    """Score ``model`` with NDCG@k using K-fold cross-validation over users.

    For every user (row) in a fold's test set, a random half of the observed
    ratings is hidden (set to NaN) before ``model.fit`` is called on the
    full matrix.  Those hidden ratings, together with a random half of the
    user's unobserved items (given relevance 0), are then ranked by the
    model's ``R_hat`` predictions and scored with ``ndcg_at_k``.

    Parameters
    ----------
    model : object
        Must provide ``fit(R)`` and, after fitting, an ``R_hat`` array
        indexable as ``R_hat[user, items]``.
    X : numpy.ndarray
        2-D float array (users x items) with NaN for missing ratings.  It is
        not modified.
    n_splits : int, default 2
        Number of folds.
    k : int, default 20
        Rank cut-off passed to ``ndcg_at_k``.
    random_state : int or None, default None
        Seed for both the fold assignment and the per-user shuffles.  With
        ``None`` the global NumPy random state is used, as before.

    Returns
    -------
    list
        One NDCG@k value per test user, over all folds.
    """
    n_users, n_items = X.shape

    # Column (item) positions; boolean masks are turned into index arrays
    # by indexing into this.
    item_positions = np.arange(n_items)

    # Both np.random and RandomState expose .shuffle, so either can be used.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Now, split into K folds (by users)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    for _train, test in kf.split(np.arange(n_users)):

        # Training matrix: a copy of X with test users' held-out ratings hidden.
        X_train = X.copy()

        held_out = dict()

        # Prepare training set
        for i in test:
            missing = np.isnan(X[i, :])
            pos_indx = item_positions[~missing]
            neg_indx = item_positions[missing]

            # Shuffle indices (boolean indexing produced fresh arrays, so
            # item_positions itself is not reordered).
            rng.shuffle(pos_indx)
            rng.shuffle(neg_indx)

            pos_test, _ = np.array_split(pos_indx, 2)
            neg_test, _ = np.array_split(neg_indx, 2)

            # "Hide" entries for this person
            X_train[i, pos_test] = np.nan

            # Remember what indices to use during testing
            held_out[i] = np.append(pos_test, neg_test)

        # Train
        model.fit(X_train)

        R_hat = model.R_hat

        for i in test:
            test_indx = held_out[i]

            # True relevance of the held-out items.  Fancy indexing returns
            # a copy, so zero-filling does not touch X.
            values = X[i, test_indx]
            values[np.isnan(values)] = 0.

            # Order the true values by descending predicted score.
            ranking = np.argsort(R_hat[i, test_indx])[::-1]

            scores.append(ndcg_at_k(values[ranking], k=k))

    return scores

In [None]:
def ndcg_at_k(x, k):
    """Normalised discounted cumulative gain of a ranking, cut off at ``k``.

    Parameters
    ----------
    x : array-like of float
        True relevance values in the order produced by the ranking being
        evaluated (best-ranked item first).
    k : int
        Number of top positions to score.  If ``x`` has fewer than ``k``
        items, all of them are scored.

    Returns
    -------
    float
        ``DCG@k / IDCG@k`` with linear gain and ``1 / log2(rank + 1)``
        discount.  ``0.0`` when ``k`` is 0, ``x`` is empty, or the ideal
        DCG is 0 (no relevant items), instead of a NaN from 0/0.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError('k cannot be negative')

    x = np.asarray(x, dtype=float)

    # Never score more positions than there are items; otherwise the
    # discount vector and the sliced relevances would differ in length.
    k = min(k, len(x))
    if k == 0:
        return .0

    # Discount factor for ranks 1, 2, ..., k
    ranks = np.arange(1, k + 1)
    discounts = 1. / np.log2(ranks + 1)

    # Sorted for best possible scores
    x_best = np.sort(x)[::-1]

    ideal_dcg = np.sum(discounts * x_best[:k])
    if ideal_dcg == 0:
        return .0

    dcg = np.sum(discounts * x[:k])
    return dcg / ideal_dcg

In [None]:
# 2-fold cross-validated NDCG@20 for CustomALS with its default
# hyperparameters; the result is a list with one score per test user.
als_scores = cross_val_ndcg(CustomALS(), R_df.values, n_splits=2)

In [None]:
# Inspect the per-user scores returned above.
als_scores

In [None]:
# Same evaluation for the Spark-backed model with its default
# hyperparameters.
spark_als_scores = cross_val_ndcg(SparkALS(), R_df.values, n_splits=2)

In [None]:
# Results to compare.  `scores` was not defined anywhere in the notebook, so
# it is assembled here from the two cross-validation runs above.
scores = [
    {'name': 'CustomALS', 'scores': als_scores, 'color': 'tab:blue'},
    {'name': 'SparkALS', 'scores': spark_als_scores, 'color': 'tab:orange'},
]

for v in scores:
    mu = round(np.mean(v['scores']), 2)
    std = round(np.std(v['scores']), 2)

    # Raw string: '\m' and '\s' are not valid escape sequences in a normal
    # string literal; the rendered label text is unchanged.
    plt.hist(v['scores'], color=v['color'],
             bins=50, alpha=0.4,
             label=r'{}, $\mu$={}, $\sigma$={}'.format(v['name'], mu, std))

plt.ylabel('Count')
plt.xlabel('NDCG@20')
plt.title('2 Fold Cross Validation')
plt.legend(loc='upper right');

In [None]:
# Fresh CustomALS with default hyperparameters; this rebinds `model`.
model = CustomALS()

In [None]:
# Fit on the full training matrix (no entries held out).
model.fit(R_df.values)