In [1]:
# Enable flake8 linting of executed cells (pycodestyle_magic), allowing lines up to 120 characters.
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120

In [2]:
# Initialise findspark before the project imports below, which presumably pull in
# pyspark and need it to be importable — TODO confirm.
import findspark
findspark.init()

In [3]:
import pickle
import timeit

import numpy as np
import pandas as pd

In [4]:
from source.als_recommender import AlternatingLeastSquares
from source import utils

In [5]:
# Spark session from the project helper; passed to utils.prepare_data when sampling the data.
spark = utils.create_spark_session()

In [6]:
# Cut-off used for precision_at_k and the per-user top-k precision distribution.
k = 5
# One recommender instance, reused (re-fitted via grid_search) for every sample size.
als = AlternatingLeastSquares()

In [12]:
def _dump_pickle(payload, path):
    """Serialise ``payload`` to ``path`` with pickle, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(payload, f)


# One summary dict per sample size (runtime plus train/test metrics).
results = []

for i in [50000, 100000, 150000]:
    print(f'Running {i}...')

    train, test = utils.prepare_data(spark, i)

    # Time only the grid search: rank and max_iter are fixed, reg_param is swept over
    # 10 evenly spaced values in [0.05, 0.3] with 2-fold cross-validation on RMSE.
    start = timeit.default_timer()
    als.grid_search(train, rank=[10], max_iter=[5],
                    reg_param=np.linspace(0.05, 0.3, 10), num_folds=2,
                    metric='rmse', parallelism=2)
    stop = timeit.default_timer()
    runtime = stop - start
    print(f'\t{runtime/60 :.2f} min')

    # rmse() and precision_at_k() take no data argument, so they presumably score the
    # most recent predict() call — keep each predict() directly before its metrics.
    pred_ratings, pred_rankings = als.predict(test)
    test_rmse = als.rmse()
    test_precision = als.precision_at_k(k)

    # Bring the test predictions to the driver as pandas frames for the distribution helpers.
    pd_pred_ratings = pred_ratings.toPandas()
    pd_pred_rankings = pred_rankings.toPandas()

    rmse_by_user = utils.rmse_distribution(pd_pred_ratings, 'userId')
    rmse_by_movie = utils.rmse_distribution(pd_pred_ratings, 'movieId')

    top_k_precision_by_user = utils.top_k_precision_distribution(pd_pred_rankings, k)

    # Each pickle stores [data, label]; the label strings intentionally omit the sample size.
    _dump_pickle([rmse_by_user, 'als_distribution_rmse_user'],
                 f'data/results/als_{i}_distribution_rmse_user.pkl')
    _dump_pickle([rmse_by_movie, 'als_distribution_rmse_movie'],
                 f'data/results/als_{i}_distribution_rmse_movie.pkl')
    _dump_pickle([top_k_precision_by_user, 'als_distribution_topk_user'],
                 f'data/results/als_{i}_distribution_topk_user.pkl')

    # Fixed: the title needs the f-prefix, otherwise the literal text "{i}" is stored
    # instead of the sample size.
    experiment_title = f'als_{i}_hyperparameter_tuning_for_reg_param_rmse'
    reg_params = [r['regParam'] for r in als.results]
    rmses = [r['rmse'] for r in als.results]

    result = [experiment_title, pd.Series(reg_params), pd.Series(rmses)]
    # NOTE(review): the trailing underscore in this file name is kept as-is because
    # other code may load the file by this exact path.
    _dump_pickle(result, f'data/results/als_{i}_hyperparameter_tuning_for_reg_param_rmse_.pkl')

    # Re-score on the training split to get the train-side metrics for comparison.
    als.predict(train)
    train_rmse = als.rmse()
    train_precision = als.precision_at_k(k)

    results.append({'sample_size': i, 'runtime': runtime,
                    'train_rmse': train_rmse, 'test_rmse': test_rmse,
                    'train_precision': train_precision, 'test_precision': test_precision})

Running 50000...
	13.56 min
Running 100000...
	13.17 min
Running 150000...
	13.73 min


5:1: W293 blank line contains whitespace
7:1: W293 blank line contains whitespace
19:1: W293 blank line contains whitespace
44:1: W293 blank line contains whitespace
48:1: W293 blank line contains whitespace
50:70: W291 trailing whitespace
51:89: E231 missing whitespace after ','


In [13]:
# Show the per-sample-size runtime (seconds) and train/test metrics as a table.
pd.DataFrame(results)

Unnamed: 0,sample_size,runtime,train_rmse,test_rmse,train_precision,test_precision
0,50000,813.707301,0.774072,0.963989,1.0,0.956216
1,100000,790.146779,0.759134,0.916718,1.0,0.956633
2,150000,824.023821,0.784345,0.912994,1.0,0.961128
