In [19]:
from lenskit import batch, topn, util
from lenskit.algorithms import Recommender, als
import pandas as pd
import time
import numpy as np

In [14]:
def sub_sample(train, test, sample_size, random_state=10):
    """Restrict the train and test frames to a random subset of users.

    A fraction ``sample_size`` of the distinct ``user_id`` values found in
    ``train`` is drawn, and both frames are inner-joined against that draw.
    Users that occur only in ``test`` therefore never appear in the result.

    Parameters
    ----------
    train : pandas.DataFrame
        Training interactions; must contain a ``user_id`` column.
    test : pandas.DataFrame
        Held-out interactions; must contain a ``user_id`` column.
    sample_size : float
        Fraction of the distinct training users to keep (forwarded to
        ``DataFrame.sample`` as ``frac``).
    random_state : int, optional
        Seed for the user draw. Defaults to 10, the value that was
        previously hard-coded, so existing calls select the same users.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(final_train, final_test)``: the rows of ``train`` and ``test``
        belonging to the sampled users, with ``user_id`` as first column.
    """
    # Build the one-column frame of distinct users with its final name
    # directly, then draw the requested fraction of them.
    sampled_users = pd.DataFrame({"user_id": train["user_id"].unique()}).sample(
        frac=sample_size, random_state=random_state
    )

    # Inner joins keep only the interactions of the sampled users.
    final_train = sampled_users.merge(train, on="user_id")
    final_test = sampled_users.merge(test, on="user_id")

    return final_train, final_test

In [15]:
def eval(aname, algo, train, test, n_recs=500):
    """Fit a copy of ``algo`` on ``train`` and recommend for the test users.

    NOTE: this function shadows Python's builtin ``eval`` for the rest of
    the notebook; the name is kept so existing calls continue to work.

    Parameters
    ----------
    aname : str
        Label written to the ``Algorithm`` column of the returned frame.
    algo : object
        Algorithm to evaluate. ``fit`` is called on the object returned by
        ``util.clone(algo)`` (after ``Recommender.adapt``), not on ``algo``.
    train : pandas.DataFrame
        Training data handed to ``fit``.
    test : pandas.DataFrame
        Test data; only its ``user`` column is read, to decide which users
        receive recommendations.
    n_recs : int, optional
        Number of recommendations requested per user. Defaults to 500, the
        value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The result of ``batch.recommend`` with an added ``Algorithm``
        column set to ``aname``.
    """
    model = Recommender.adapt(util.clone(algo))
    model.fit(train)

    # Recommend once per distinct user present in the test data.
    test_users = test["user"].unique()
    recs = batch.recommend(model, test_users, n_recs)
    recs['Algorithm'] = aname
    return recs

In [16]:
# Load the pre-built train/test subsamples (paths are relative to the
# notebook's working directory).
train = pd.read_parquet('Train_Subsample.parquet')

# Test rows containing any missing value are discarded right after loading.
test = pd.read_parquet('Test_Subsample.parquet').dropna()

In [31]:
# Sweep the user-sample fraction from 0.1 to 0.8 and, for each fraction,
# fit ALS on the sub-sampled training data, then print the mean nDCG of its
# recommendations and the time the fit + recommend + scoring took.

# Initialize the ALS model according to the best parameters from our hyperparameter search
als_model = als.ImplicitMF(features=205, iterations=10, reg=0.01, weight=100)

sampleSize = [.1*i for i in range(1,9)]

# Loop-invariant rename to the user/item/rating column names used
# downstream (eval() reads the ``user`` column of the test frame).
column_names = {'user_id':'user', 'track_id':'item', 'count':'rating'}

for s in sampleSize:

    df_train, df_test = sub_sample(train,test,s)
    df_train = df_train.rename(columns=column_names)
    df_test = df_test.rename(columns=column_names)

    # perf_counter() is monotonic, so the elapsed time cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Run LensKit model. There is a single algorithm and a single test
    # frame per iteration, so no list/concat scaffolding is needed; the
    # index reset keeps the clean RangeIndex the former concat produced.
    all_recs = eval('ALS', als_model, df_train, df_test).reset_index(drop=True)
    test_data = df_test

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, test_data)

    # Print nDCG and elapsed time for the run
    print('-----------------------------------------')
    nDCG = results.ndcg.mean()
    print('nDCG = {}'.format(nDCG))

    end_time = time.perf_counter()
    elapsed = end_time - start_time
    # s carries float noise (e.g. 0.30000000000000004); round for display.
    print('Time taken for subsample {}: {}'.format(np.round(s,1), elapsed))

-----------------------------------------
nDCG = 0.05990125097411818
Time taken for subsample 0.1: 15.020716190338135
-----------------------------------------
nDCG = 0.08666989196191503
Time taken for subsample 0.2: 35.15086579322815
-----------------------------------------
nDCG = 0.10080064126964208
Time taken for subsample 0.3: 53.53020429611206
-----------------------------------------
nDCG = 0.10904326156800516
Time taken for subsample 0.4: 65.25157237052917
-----------------------------------------
nDCG = 0.11678215431925072
Time taken for subsample 0.5: 67.28724408149719
-----------------------------------------
nDCG = 0.1263111869483915
Time taken for subsample 0.6: 43.47783327102661
-----------------------------------------
nDCG = 0.136684219618948
Time taken for subsample 0.7: 80.91423344612122
-----------------------------------------
nDCG = 0.14081499331877956
Time taken for subsample 0.8: 95.11232948303223


In [32]:
# Sweep the user-sample fraction from 0.1 to 0.8 and, for each fraction,
# fit ALS on the sub-sampled training data, then print the mean precision
# of its recommendations and the time the fit + recommend + scoring took.

# Initialize the ALS model according to the best parameters from our hyperparameter search
als_model = als.ImplicitMF(features=205, iterations=10, reg=0.01, weight=100)

sampleSize = [.1*i for i in range(1,9)]

# Loop-invariant rename to the user/item/rating column names used
# downstream (eval() reads the ``user`` column of the test frame).
column_names = {'user_id':'user', 'track_id':'item', 'count':'rating'}

for s in sampleSize:

    df_train, df_test = sub_sample(train,test,s)
    df_train = df_train.rename(columns=column_names)
    df_test = df_test.rename(columns=column_names)

    # perf_counter() is monotonic, so the elapsed time cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Run LensKit model. There is a single algorithm and a single test
    # frame per iteration, so no list/concat scaffolding is needed; the
    # index reset keeps the clean RangeIndex the former concat produced.
    all_recs = eval('ALS', als_model, df_train, df_test).reset_index(drop=True)
    test_data = df_test

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    results = rla.compute(all_recs, test_data)

    # Print precision and elapsed time for the run
    print('-----------------------------------------')
    precision = results.precision.mean()
    print('Precision = {}'.format(precision))

    end_time = time.perf_counter()
    elapsed = end_time - start_time
    # s carries float noise (e.g. 0.30000000000000004); round for display.
    print('Time taken for subsample {}: {}'.format(np.round(s,1), elapsed))

-----------------------------------------
Precision = 0.0035111111111111124
Time taken for subsample 0.1: 25.794238805770874
-----------------------------------------
Precision = 0.005410526315789478
Time taken for subsample 0.2: 37.340272188186646
-----------------------------------------
Precision = 0.005912408759124091
Time taken for subsample 0.3: 47.62299180030823
-----------------------------------------
Precision = 0.006761904761904767
Time taken for subsample 0.4: 59.304797887802124
-----------------------------------------
Precision = 0.00704347826086957
Time taken for subsample 0.5: 67.10406970977783
-----------------------------------------
Precision = 0.007423357664233579
Time taken for subsample 0.6: 77.4145119190216
-----------------------------------------
Precision = 0.007866242038216543
Time taken for subsample 0.7: 84.49690341949463
-----------------------------------------
Precision = 0.008016483516483491
Time taken for subsample 0.8: 97.42567467689514


In [33]:
# Sweep the user-sample fraction from 0.1 to 0.8 and, for each fraction,
# fit ALS on the sub-sampled training data, then print the mean precision
# of its recommendations and the time the fit + recommend + scoring took.

# Initialize the ALS model according to the best parameters from our hyperparameter search
# NOTE(review): this run uses reg=10, whereas the earlier sweeps in this
# notebook use reg=0.01 under the same "best parameters" description.
# Confirm which value the hyperparameter search actually selected.
als_model = als.ImplicitMF(features=205, iterations=10, reg=10, weight=100)

sampleSize = [.1*i for i in range(1,9)]

# Loop-invariant rename to the user/item/rating column names used
# downstream (eval() reads the ``user`` column of the test frame).
column_names = {'user_id':'user', 'track_id':'item', 'count':'rating'}

for s in sampleSize:

    df_train, df_test = sub_sample(train,test,s)
    df_train = df_train.rename(columns=column_names)
    df_test = df_test.rename(columns=column_names)

    # perf_counter() is monotonic, so the elapsed time cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Run LensKit model. There is a single algorithm and a single test
    # frame per iteration, so no list/concat scaffolding is needed; the
    # index reset keeps the clean RangeIndex the former concat produced.
    all_recs = eval('ALS', als_model, df_train, df_test).reset_index(drop=True)
    test_data = df_test

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    results = rla.compute(all_recs, test_data)

    # Print precision and elapsed time for the run
    print('-----------------------------------------')
    precision = results.precision.mean()
    print('Precision = {}'.format(precision))

    end_time = time.perf_counter()
    elapsed = end_time - start_time
    # s carries float noise (e.g. 0.30000000000000004); round for display.
    print('Time taken for subsample {}: {}'.format(np.round(s,1), elapsed))

-----------------------------------------
Precision = 0.0036888888888888896
Time taken for subsample 0.1: 22.61983895301819
-----------------------------------------
Precision = 0.005578947368421057
Time taken for subsample 0.2: 35.450098752975464
-----------------------------------------
Precision = 0.006686131386861318
Time taken for subsample 0.3: 45.420161724090576
-----------------------------------------
Precision = 0.007121693121693127
Time taken for subsample 0.4: 55.59146046638489
-----------------------------------------
Precision = 0.007365217391304352
Time taken for subsample 0.5: 64.81741833686829
-----------------------------------------
Precision = 0.007861313868613131
Time taken for subsample 0.6: 73.16239047050476
-----------------------------------------
Precision = 0.008261146496815263
Time taken for subsample 0.7: 82.50107455253601
-----------------------------------------
Precision = 0.00849999999999997
Time taken for subsample 0.8: 93.51557922363281
