In [1]:
# Lint every cell as it is executed: load the pycodestyle_magic extension, then switch on its
# flake8 checker with a 120-character line limit.
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120

In [2]:
import findspark
# Keep this ahead of the project imports: per the findspark docs, init() locates the Spark
# installation and makes pyspark importable. Presumably source.utils imports pyspark - TODO confirm.
findspark.init()

In [3]:
import pickle
import timeit

import numpy as np
import pandas as pd

In [4]:
from source.als_recommender import AlternatingLeastSquares
from source import utils

In [5]:
# Spark session handed to utils.prepare_data in the experiment loop; created by the project helper.
spark = utils.create_spark_session()

In [6]:
# Cut-off passed to utils.top_k_precision_distribution in the experiment loop.
k = 5
# Recommender under test, built with its defaults; hyperparameters are supplied later via grid_search.
als = AlternatingLeastSquares()

In [11]:
# ALS benchmark over growing sample sizes.
#
# For each sample size the loop tunes regParam with a cross-validated grid search, then records
# RMSE, top-k precision and coverage on the train and test splits. Per-sample distributions and
# the hyperparameter sweep are pickled under data/results/; one summary dict per sample size is
# appended to `results`.
def _dump_result(experiment_title, payload):
    """Pickle `payload` to data/results/<experiment_title>.pkl."""
    with open(f'data/results/{experiment_title}.pkl', 'wb') as f:
        pickle.dump(payload, f)


def _top_k_metrics(pred_rankings, pd_truth, k):
    """Return (per-user top-k precision, coverage) for one set of predicted rankings.

    pred_rankings: rankings frame returned as the second element of als.predict; converted with
        .toPandas() and read through its userId / userRanking / predictedRanking columns.
    pd_truth: pandas frame of the split that was predicted on; merged with the rankings on 'userId'.
    k: cut-off applied to both ranking columns and forwarded to utils.top_k_precision_distribution.
    """
    pd_rankings = pred_rankings.toPandas()
    # Truncate with the same k the metric uses (this was a hard-coded 5 that could drift from k).
    for column in ('userRanking', 'predictedRanking'):
        pd_rankings[column] = pd_rankings[column].apply(lambda ranking: ranking[:k])
    precision_by_user = utils.top_k_precision_distribution(pd_rankings, k)
    coverage = utils.calculate_coverage(pd.merge(pd_truth, pd_rankings, on='userId'))
    return precision_by_user, coverage


results = []

for i in [50, 100, 150]:  # sample size in thousands; scaled by 1000 for prepare_data
    print(f'Running {i*1000}...')

    train, test = utils.prepare_data(spark, i*1000)
    pd_train, pd_test = train.toPandas(), test.toPandas()

    # Only the hyperparameter search is timed; evaluation below is excluded from `runtime`.
    start = timeit.default_timer()
    als.grid_search(train, rank=[10], max_iter=[5],
                    reg_param=np.linspace(0.05, 0.3, 10), num_folds=2,
                    metric='rmse', parallelism=2)
    runtime = timeit.default_timer() - start
    print(f'{runtime / 60:.2f} min')

    # --- Test split ---
    pred_ratings, pred_rankings = als.predict(test)
    # NOTE(review): rmse() takes no arguments, so it presumably scores the latest predict() call;
    # keep it directly after predict().
    test_rmse = als.rmse()

    pd_pred_ratings = pred_ratings.toPandas()
    rmse_by_user = utils.rmse_distribution(pd_pred_ratings, 'userId')
    rmse_by_movie = utils.rmse_distribution(pd_pred_ratings, 'movieId')

    test_precision_by_user, test_coverage = _top_k_metrics(pred_rankings, pd_test, k)
    test_precision = test_precision_by_user.mean()

    # Each distribution file stores [distribution, title].
    # NOTE(review): 'Dist_Precision_Movie' actually holds the per-user precision distribution; the
    # file name is kept unchanged because other notebooks may load it by this name.
    distributions = {
        f'ALS_{i}_Dist_RMSE_User': rmse_by_user,
        f'ALS_{i}_Dist_RMSE_Movie': rmse_by_movie,
        f'ALS_{i}_Dist_Precision_Movie': test_precision_by_user,
    }
    for experiment_title, distribution in distributions.items():
        _dump_result(experiment_title, [distribution, experiment_title])

    # Hyperparameter sweep: stored as [title, regParam series, rmse series].
    experiment_title = f'ALS_{i}_Hyper_RMSE'
    reg_params = pd.Series([r['regParam'] for r in als.results])
    rmses = pd.Series([r['rmse'] for r in als.results])
    _dump_result(experiment_title, [experiment_title, reg_params, rmses])

    # --- Train split ---
    _, train_pred_rankings = als.predict(train)
    train_rmse = als.rmse()

    train_precision_by_user, train_coverage = _top_k_metrics(train_pred_rankings, pd_train, k)
    train_precision = train_precision_by_user.mean()

    result = {'sample_size': i,
              'runtime': runtime,
              'rmse_train': train_rmse,
              'rmse_test': test_rmse,
              'top_k_precision_train': train_precision,
              'top_k_precision_test': test_precision,
              'coverage_train': train_coverage,
              'coverage_test': test_coverage}
    results.append(result)
    print(result)
    print()

Running 50000...
10.58 min
{'sample_size': 50, 'runtime': 634.6269052030002, 'rmse_train': 0.7727328968582378, 'rmse_test': 0.9614838862055153, 'top_k_precision_train': 0.33837837837837903, 'top_k_precision_test': 0.503783783783784, 'coverage_train': 0.17253398396653885, 'coverage_test': 0.24398745207389333}

Running 100000...
11.85 min
{'sample_size': 100, 'runtime': 710.8382729289997, 'rmse_train': 0.7930586364424529, 'rmse_test': 0.9262465358383585, 'top_k_precision_train': 0.29700427960057013, 'top_k_precision_test': 0.4987161198288162, 'coverage_train': 0.18527607361963191, 'coverage_test': 0.258159509202454}

Running 150000...
12.66 min


29:1: W293 blank line contains whitespace
32:1: W293 blank line contains whitespace
62:1: W293 blank line contains whitespace
65:1: W293 blank line contains whitespace
66:1: W293 blank line contains whitespace
67:5: E303 too many blank lines (2)


{'sample_size': 150, 'runtime': 759.6038995669996, 'rmse_train': 0.786878858418734, 'rmse_test': 0.9150003754361845, 'top_k_precision_train': 0.29083585095669556, 'top_k_precision_test': 0.5057401812688828, 'coverage_train': 0.17997293640054127, 'coverage_test': 0.2526580320896965}



In [12]:
# Persist the summary dicts collected in `results` (one per sample size) as a single pickled DataFrame.
pd.DataFrame(results).to_pickle('data/results/ALS_result.pkl')