In [170]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, item_knn as knn
from lenskit import topn
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import random

In [171]:
# Read the TRAIN and TEST frames from their parquet part files.
# Rows containing missing values are dropped from TEST only.
TRAIN = pd.read_parquet(
    'part-00000-7df55dc3-a829-47cc-8efd-4dc73f5a20f5-c000.snappy.parquet'
)
TEST = pd.read_parquet(
    'part-00000-07807155-469d-43a3-8913-4da643d5504a-c000.snappy.parquet'
).dropna()

In [172]:
def sub_sample(train, test, sample_size, random_state=10):
    """Restrict ``train`` and ``test`` to a random subset of the training users.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``user_id`` column; its unique ids form the sampling pool.
    test : pandas.DataFrame
        Frame with a ``user_id`` column; filtered to the same sampled users.
    sample_size : float
        Fraction of the unique training users to keep. Despite the name this
        is a fraction, not a count: it is passed to ``DataFrame.sample`` as
        ``frac``.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. Defaults to 10, the value that
        was previously hard-coded, so existing calls draw the same sample.

    Returns
    -------
    (final_train, final_test) : tuple of pandas.DataFrame
        Rows of ``train`` and ``test`` belonging to the sampled users, with
        ``user_id`` as the first column. Both are produced with the default
        (inner) merge, so test rows of users outside the sample are dropped.
    """
    # One row per distinct training user, already carrying the merge key name.
    sampled_users = pd.DataFrame({"user_id": train["user_id"].unique()}).sample(
        frac=sample_size, random_state=random_state
    )

    # Merging from the sampled ids keeps only the interactions of those users.
    final_train = sampled_users.merge(train, on="user_id")
    final_test = sampled_users.merge(test, on="user_id")

    return final_train, final_test

In [173]:
# ALS model configured with the best parameters from our hyperparameter search.
algo_als = als.ImplicitMF(
    features=205,
    iterations=10,
    reg=0.01,
    weight=100,
)

# Define model evaluation function
def eval(aname, algo, train, test, n_recs=500):
    """Fit a clone of ``algo`` on ``train`` and recommend for the users in ``test``.

    NOTE(review): this name shadows Python's built-in ``eval`` for the rest of
    the notebook. It is kept because other cells call the function by this name.

    Parameters
    ----------
    aname : str
        Label written to the ``Algorithm`` column of the returned frame.
    algo
        Algorithm object. It is cloned with ``util.clone`` and the clone,
        wrapped by ``Recommender.adapt``, is what gets fitted.
    train : pandas.DataFrame
        Data passed to ``fit``.
    test : pandas.DataFrame
        Frame with a ``user`` column; recommendations are generated for its
        unique users.
    n_recs : int, optional
        Number of recommendations requested per user. Defaults to 500, the
        value that was previously hard-coded.

    Returns
    -------
    The frame returned by ``batch.recommend`` with an added ``Algorithm`` column.
    """
    fittable = Recommender.adapt(util.clone(algo))
    fittable.fit(train)

    users = test['user'].unique()
    recs = batch.recommend(fittable, users, n_recs)
    recs['Algorithm'] = aname
    return recs

In [185]:
# Fractions of the training users to evaluate on (0.1 and 0.2).
# i / 10 gives the same values as .1*i here and avoids float artifacts such as
# 0.30000000000000004 if the range is extended later.
ss = [i / 10 for i in range(1, 3)]

# Rename the raw columns to user/item/rating; eval() reads the `user` column.
column_names = {'user_id': 'user', 'track_id': 'item', 'count': 'rating'}

for s in ss:

    df_train, df_test = sub_sample(TRAIN, TEST, s)
    df_train = df_train.rename(columns=column_names)
    df_test = df_test.rename(columns=column_names)

    # The timed section covers fitting, recommending and metric computation;
    # sub-sampling and renaming above are excluded. perf_counter is a monotonic
    # clock intended for measuring intervals.
    start_time = time.perf_counter()

    # Run LensKit model. A single algorithm is evaluated per run, so the
    # recommendation and truth frames are used directly.
    all_recs = eval('ALS', algo_als, df_train, df_test)
    test_data = df_test

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, test_data)

    # Print nDCG and elapsed time for the run
    print('-----------------------------------------')
    # The grouped mean is indexed by algorithm label, so take the first entry
    # by position with .iloc; a bare [0] on a label-indexed Series is deprecated.
    nDCG = results.groupby('Algorithm').ndcg.mean().iloc[0]
    print('nDCG = {}'.format(nDCG))

    end_time = time.perf_counter()
    elapsed = end_time - start_time
    print('Time taken for subsample {}: {}'.format(s, elapsed))

-----------------------------------------
nDCG = 0.06918827412331575
Time taken for subsample 0.1: 7.799734830856323
-----------------------------------------
nDCG = 0.08260906161500198
Time taken for subsample 0.2: 10.573343992233276


In [184]:
# Show the list of subsample fractions.
# NOTE(review): this cell's execution count (184) is lower than the loop cell's
# (185), so the output recorded under it may not match the current definition
# of `ss`. Re-run after Restart & Run All — TODO confirm.
ss

[0.0, 0.1]