In [1]:
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120

In [2]:
import findspark
# Run before the project imports below; presumably this makes pyspark importable
# in the kernel -- confirm against the findspark documentation.
findspark.init()

In [3]:
import os
import pickle
import timeit

import numpy as np
import pandas as pd

In [4]:
from source.als_recommender import AlternatingLeastSquares
from source import utils

In [5]:
# Spark session built by the project helper; handed to utils.prepare_data in the benchmark loop.
spark = utils.create_spark_session()

In [6]:
k = 5  # cut-off passed to precision_at_k and top_k_precision_distribution
als = AlternatingLeastSquares()  # single recommender instance reused for every sample size

In [10]:
# Benchmark ALS over increasing sample sizes. For each size: grid-search the
# regularisation parameter, score the model on the test and train splits,
# pickle the per-experiment distributions, and append one summary row to `results`.
RESULTS_DIR = 'data/results'

# Create the output directory up front so that a missing folder cannot throw
# away a multi-minute grid search at the first pickle write.
os.makedirs(RESULTS_DIR, exist_ok=True)


def _save_pickle(payload, title):
    """Pickle ``payload`` to ``<RESULTS_DIR>/<title>.pkl``."""
    with open(f'{RESULTS_DIR}/{title}.pkl', 'wb') as f:
        pickle.dump(payload, f)


def _top_k_coverage(pd_truth, pd_rankings, top_k):
    """Return utils.calculate_coverage for rankings cut to ``top_k`` items.

    ``pd_rankings`` is not modified: the truncated ``predictedRanking`` column
    lives on a copy, which is merged with ``pd_truth`` on ``userId`` and passed
    to ``utils.calculate_coverage``.
    """
    truncated = pd_rankings.assign(
        predictedRanking=pd_rankings.predictedRanking.apply(lambda x: x[:top_k]))
    return utils.calculate_coverage(pd.merge(pd_truth, truncated, on='userId'))


results = []

for i in [50, 100, 150]:  # sample size in thousands (scaled by 1000 for prepare_data)
    n_samples = i * 1000
    print(f'Running {n_samples}...')

    train, test = utils.prepare_data(spark, n_samples)
    pd_train, pd_test = train.toPandas(), test.toPandas()

    # Time only the hyper-parameter search: fixed rank and iteration count,
    # ten evenly spaced regParam candidates between 0.05 and 0.3.
    start = timeit.default_timer()
    als.grid_search(train, rank=[10], max_iter=[5],
                    reg_param=np.linspace(0.05, 0.3, 10), num_folds=2,
                    metric='rmse', parallelism=2)
    runtime = timeit.default_timer() - start
    print(f'{runtime/60 :.2f} min')

    # --- Test split ---------------------------------------------------------
    # rmse() / precision_at_k() take no data argument, so they presumably score
    # the most recent predict() call -- keep each predict/metric group together.
    pred_ratings, pred_rankings = als.predict(test)
    test_rmse = als.rmse()
    test_precision = als.precision_at_k(k)

    pd_pred_ratings = pred_ratings.toPandas()
    pd_pred_rankings = pred_rankings.toPandas()

    rmse_by_user = utils.rmse_distribution(pd_pred_ratings, 'userId')
    rmse_by_movie = utils.rmse_distribution(pd_pred_ratings, 'movieId')
    top_k_precision_by_user = utils.top_k_precision_distribution(pd_pred_rankings, k)

    # Coverage uses the same cut-off `k` as the precision metrics (was a hard-coded 5).
    test_coverage = _top_k_coverage(pd_test, pd_pred_rankings, k)

    # NOTE(review): 'Dist_Precision_Movie' stores the per-*user* precision
    # distribution; the label is kept as-is because downstream readers may
    # depend on the existing file name.
    distributions = [
        ('Dist_RMSE_User', rmse_by_user),
        ('Dist_RMSE_Movie', rmse_by_movie),
        ('Dist_Precision_Movie', top_k_precision_by_user),
    ]
    for suffix, distribution in distributions:
        experiment_title = f'ALS_{i}_{suffix}'
        _save_pickle([distribution, experiment_title], experiment_title)

    # Grid-search trace: regParam candidates and the rmse recorded for each.
    experiment_title = f'ALS_{i}_Hyper_RMSE'
    reg_params = [r['regParam'] for r in als.results]
    rmses = [r['rmse'] for r in als.results]
    _save_pickle([experiment_title, pd.Series(reg_params), pd.Series(rmses)], experiment_title)

    # --- Train split --------------------------------------------------------
    _, pred_rankings = als.predict(train)
    train_rmse = als.rmse()
    train_precision = als.precision_at_k(k)
    pd_pred_rankings = pred_rankings.toPandas()

    train_coverage = _top_k_coverage(pd_train, pd_pred_rankings, k)

    result = {'sample_size': i,
              'runtime': runtime,
              'rmse_train': train_rmse,
              'rmse_test': test_rmse,
              'top_k_precision_train': train_precision,
              'top_k_precision_test': test_precision,
              'coverage_train': train_coverage,
              'coverage_test': test_coverage}
    results.append(result)
    print(result)
    print()

Running 50000...
11.00 min
{'sample_size': 50, 'runtime': 660.1536158849999, 'rmse_train': 0.7679406645923914, 'rmse_test': 0.9610156344288839, 'top_k_precision_train': 1.0, 'top_k_precision_test': 0.9562162162162166, 'coverage_train': 0.17706517950505402, 'coverage_test': 0.24398745207389333}

Running 100000...
12.17 min
{'sample_size': 100, 'runtime': 729.9904157599999, 'rmse_train': 0.7870209361644016, 'rmse_test': 0.9212674183860323, 'top_k_precision_train': 1.0, 'top_k_precision_test': 0.9566333808844506, 'coverage_train': 0.18650306748466258, 'coverage_test': 0.2591411042944785}

Running 150000...
13.15 min
{'sample_size': 150, 'runtime': 788.9776838819998, 'rmse_train': 0.7894360085317811, 'rmse_test': 0.917336765301196, 'top_k_precision_train': 1.0, 'top_k_precision_test': 0.9611278952668676, 'coverage_train': 0.18190605064759327, 'coverage_test': 0.2522714092402861}



8:1: W293 blank line contains whitespace
28:1: W293 blank line contains whitespace
56:1: W293 blank line contains whitespace
59:1: W293 blank line contains whitespace
63:38: W291 trailing whitespace


In [15]:
# Gather the per-sample-size summary rows into one table and pickle it.
summary = pd.DataFrame(results)
summary.to_pickle('data/results/ALS_result.pkl')