In [1]:
# Load the pycodestyle_magic extension and switch flake8 checking on with the
# line-length limit raised to 120 (presumably applies to every cell run after
# this one — confirm against the extension's documentation).
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120

In [2]:
# Runs in its own cell ahead of the project imports below.
# NOTE(review): presumably findspark.init() makes the local Spark install
# importable and so must run before anything imports pyspark — confirm.
import findspark
findspark.init()

In [3]:
import pickle
import timeit

import numpy as np
import pandas as pd

In [4]:
from source.als_recommender import AlternatingLeastSquares
from source import utils

In [5]:
# Session handle passed to utils.prepare_data() in the experiment loop below.
spark = utils.create_spark_session()

In [6]:
# Cut-off passed to als.precision_at_k() and utils.top_k_precision_distribution().
k = 5
# A single recommender instance is reused for every sample size in the loop below.
als = AlternatingLeastSquares()

In [None]:
# Benchmark ALS on three sample sizes. For each size: time a grid search over
# the regularisation parameter, score the model on the test and train splits,
# pickle the per-user / per-movie distributions and the grid-search curve, and
# append one summary record to `results`.
results = []

for i in [50, 100, 150]:
    # `i` is the sample size in thousands; prepare_data receives i * 1000.
    n_samples = i * 1000
    print(f'Running {n_samples}...')

    train, test = utils.prepare_data(spark, n_samples)

    # Only the grid search is timed; data preparation and evaluation are not.
    start = timeit.default_timer()
    als.grid_search(train, rank=[10], max_iter=[5],
                    reg_param=np.linspace(0.05, 0.3, 10), num_folds=2,
                    metric='rmse', parallelism=2)
    stop = timeit.default_timer()
    runtime = stop - start
    print(f'\t{runtime/60 :.2f} min')

    # --- Test-split evaluation -------------------------------------------
    # NOTE(review): als.rmse() / als.precision_at_k() take no data argument,
    # so they presumably score the most recent als.predict() call — keep them
    # directly after the matching predict().
    pred_ratings, pred_rankings = als.predict(test)
    test_rmse = als.rmse()
    test_precision = als.precision_at_k(k)

    pd_pred_ratings = pred_ratings.toPandas()
    pd_pred_rankings = pred_rankings.toPandas()

    rmse_by_user = utils.rmse_distribution(pd_pred_ratings, 'userId')
    rmse_by_movie = utils.rmse_distribution(pd_pred_ratings, 'movieId')

    top_k_precision_by_user = utils.top_k_precision_distribution(pd_pred_rankings, k)
    test_coverage = utils.calculate_coverage(pd_pred_rankings)

    # Each distribution is stored as [data, label] in
    # data/results/als_<i>_<suffix>.pkl with label 'als_<suffix>'.
    distributions = [
        ('dist_rmse_user', rmse_by_user),
        ('dist_rmse_movie', rmse_by_movie),
        ('dist_topk_user', top_k_precision_by_user),
    ]
    for suffix, distribution in distributions:
        with open(f'data/results/als_{i}_{suffix}.pkl', 'wb') as f:
            pickle.dump([distribution, f'als_{suffix}'], f)

    # --- Grid-search curve: RMSE per regularisation value ----------------
    # f-string so the stored title carries the actual sample size.
    experiment_title = f'als_{i}_hyper_rmse'
    reg_params = [r['regParam'] for r in als.results]
    rmses = [r['rmse'] for r in als.results]

    result = [experiment_title, pd.Series(reg_params), pd.Series(rmses)]
    # NOTE(review): the trailing underscore in this file name is kept as-is;
    # confirm whether downstream readers expect it.
    with open(f'data/results/als_{i}_hyper_rmse_.pkl', 'wb') as f:
        pickle.dump(result, f)

    # --- Train-split evaluation ------------------------------------------
    _, pred_rankings = als.predict(train)
    train_rmse = als.rmse()
    train_precision = als.precision_at_k(k)
    pd_pred_rankings = pred_rankings.toPandas()
    # Stored under its own name so the test coverage computed above is not
    # overwritten and 'coverage_train' below has a value to read.
    train_coverage = utils.calculate_coverage(pd_pred_rankings)

    results.append({'sample_size': i,
                    'runtime': runtime,
                    'rmse_train': train_rmse,
                    'rmse_test': test_rmse,
                    'top_k_precision_train': train_precision,
                    'top_k_precision_test': test_precision,
                    'coverage_train': train_coverage,
                    'coverage_test': test_coverage})

Running 50000...
	13.66 min


In [None]:
# Turn the per-sample records into a table and pickle it. Only the columns
# listed here are kept; the coverage fields in `results` are not included.
summary_columns = [
    'sample_size',
    'runtime',
    'rmse_train',
    'rmse_test',
    'top_k_precision_train',
    'top_k_precision_test',
]
result = pd.DataFrame(results)[summary_columns]

experiment_title = 'result'
with open(f'data/results/{experiment_title}.pkl', 'wb') as out_file:
    pickle.dump(result, out_file)