import graphlab as gl
# Point GraphLab Canvas at the IPython notebook so SFrame/SGraph views render inline.
gl.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
%matplotlib inline
[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1495272374.log
This non-commercial license of GraphLab Create for academic use is assigned to fduwangyue@126.com and will expire on May 12, 2018.
# The input is a tab-separated file with no header row, so the columns
# arrive auto-named X1, X2, X3.
train_file = 'http://s3.amazonaws.com/dato-datasets/millionsong/10000.txt'
sf = gl.SFrame.read_csv(
    train_file,
    delimiter='\t',
    header=False,
    verbose=False,
)

# Give the auto-named columns meaningful names (the rename is applied to
# `sf` itself; later cells read sf['user_id']), then open the Canvas view.
sf.rename({
    'X1': 'user_id',
    'X2': 'music_id',
    'X3': 'rating',
})
sf.show()

# Seeded random split so the train/test partition is repeatable;
# the first part receives roughly 80% of the rows.
train_set, test_set = sf.random_split(0.8, seed=1)

# Baseline recommender built from item popularity on the training rows.
popularity_model = gl.popularity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
)
Recsys training: model = popularity
Preparing data set.
Data has 1599753 observations with 76085 users and 10000 items.
Data prepared in: 2.36449s
1599753 observations to process; with 10000 unique items.
# Item-similarity recommender trained on the same split, comparing items
# with cosine similarity over the 'rating' target.
item_sim_model = gl.item_similarity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
    similarity_type='cosine',
)
Recsys training: model = item_similarity
Preparing data set.
Data has 1599753 observations with 76085 users and 10000 items.
Data prepared in: 2.54953s
Training model from provided data.
Gathering per-item and per-user statistics.
+--------------------------------+------------+
| Elapsed Time (Item Statistics) | % Complete |
+--------------------------------+------------+
| 4.548ms | 1.25 |
| 108.944ms | 100 |
+--------------------------------+------------+
Setting up lookup tables.
Processing data in one pass using dense lookup tables.
+-------------------------------------+------------------+-----------------+
| Elapsed Time (Constructing Lookups) | Total % Complete | Items Processed |
+-------------------------------------+------------------+-----------------+
| 453.613ms | 0 | 0 |
| 1.45s | 76 | 7604 |
| 2.78s | 100 | 10000 |
+-------------------------------------+------------------+-----------------+
Finalizing lookup tables.
Generating candidate set for working with new users.
Finished training in 3.93128s
# Factorization recommender fitted to predict 'rating' on the training
# split; all hyper-parameters are left at the library defaults.
factorization_machine_model = gl.recommender.factorization_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
)
Recsys training: model = factorization_recommender
Preparing data set.
Data has 1599753 observations with 76085 users and 10000 items.
Data prepared in: 2.75321s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter | Description | Value |
+--------------------------------+--------------------------------------------------+----------+
| num_factors | Factor Dimension | 8 |
| regularization | L2 Regularization on Factors | 1e-08 |
| solver | Solver used for training | sgd |
| linear_regularization | L2 Regularization on Linear Coefficients | 1e-10 |
| max_iterations | Maximum Number of Iterations | 50 |
+--------------------------------+--------------------------------------------------+----------+
Optimizing model using SGD; tuning step size.
Using 199969 / 1599753 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value |
+---------+-------------------+------------------------------------------+
| 0 | 25 | No Decrease (225.972 >= 36.3851) |
| 1 | 6.25 | No Decrease (215.578 >= 36.3851) |
| 2 | 1.5625 | No Decrease (185.778 >= 36.3851) |
| 3 | 0.390625 | No Decrease (85.4938 >= 36.3851) |
| 4 | 0.0976562 | 11.4284 |
| 5 | 0.0488281 | 8.57698 |
| 6 | 0.0244141 | 21.9241 |
+---------+-------------------+------------------------------------------+
| Final | 0.0488281 | 8.57698 |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 127us | 43.795 | 6.61778 | |
+---------+--------------+-------------------+-----------------------+-------------+
| 1 | 386.939ms | 43.5329 | 6.59755 | 0.0488281 |
| 2 | 720.361ms | 40.8116 | 6.38804 | 0.0290334 |
| 3 | 1.07s | 37.7879 | 6.14682 | 0.0214205 |
| 4 | 1.43s | 35.0832 | 5.92271 | 0.0172633 |
| 5 | 1.77s | 32.6783 | 5.71605 | 0.014603 |
| 6 | 2.12s | 30.6436 | 5.53518 | 0.0127367 |
| 10 | 3.41s | 24.7246 | 4.97173 | 0.008683 |
| 11 | 3.70s | 23.6795 | 4.86547 | 0.00808399 |
| 15 | 5.22s | 20.3931 | 4.51507 | 0.00640622 |
| 20 | 6.59s | 17.6597 | 4.20142 | 0.00516295 |
| 25 | 8.00s | 15.7913 | 3.97279 | 0.00436732 |
| 30 | 9.34s | 14.5805 | 3.81732 | 0.00380916 |
| 35 | 10.70s | 13.4527 | 3.66659 | 0.00339327 |
| 40 | 12.05s | 12.4671 | 3.5296 | 0.00306991 |
| 45 | 13.36s | 10.3489 | 3.21553 | 0.00167105 |
| 50 | 14.71s | 9.4961 | 3.08003 | 0.00154408 |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
Final objective value: 8.58098
Final training RMSE: 2.92771
# Evaluate the three recommenders side by side on the held-out test split.
#
# model_names labels each model in the printed report; without it the
# output only says "M0", "M1", "M2" and has to be matched back to the
# list order by hand. The names are given in the same order as the models.
#
# user_sample=.1 evaluates on a 10% sample of the users to keep the run
# short. skip_set=train_set is forwarded to the evaluation routines;
# presumably it keeps items a user already has in the training data out
# of the recommendations being scored -- TODO confirm against the
# GraphLab Create evaluate_precision_recall documentation.
result = gl.recommender.util.compare_models(
    test_set,
    [popularity_model, item_sim_model, factorization_machine_model],
    model_names=['popularity', 'item_similarity', 'factorization'],
    user_sample=.1,
    skip_set=train_set,
)
compare_models: using 6871 users to estimate model performance PROGRESS: Evaluate model M0
recommendations finished on 1000/6871 queries. users per second: 2786.85
recommendations finished on 2000/6871 queries. users per second: 2799.73
recommendations finished on 3000/6871 queries. users per second: 2826.79
recommendations finished on 4000/6871 queries. users per second: 2697.5
recommendations finished on 5000/6871 queries. users per second: 2632.5
recommendations finished on 6000/6871 queries. users per second: 2571
Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 1 | 0.000291078445641 | 4.30002249243e-05 | | 2 | 0.000363848057051 | 0.000138871300274 | | 3 | 0.000339591519915 | 0.000208175692094 | | 4 | 0.000291078445641 | 0.000224346716851 | | 5 | 0.000378401979333 | 0.000471763395646 | | 6 | 0.000339591519915 | 0.000544533007057 | | 7 | 0.000353452398278 | 0.000626196682084 | | 8 | 0.000363848057051 | 0.000666181054129 | | 9 | 0.000452788693219 | 0.000843663301179 | | 10 | 0.00042206374618 | 0.000880048106884 | +--------+-------------------+-------------------+ [10 rows x 3 columns] ('\nOverall RMSE: ', 6.179866268976481) Per User RMSE (best) +-------------------------------+-------+------+ | user_id | count | rmse | +-------------------------------+-------+------+ | c1fe152a39495e06fbe5b11523... | 1 | 0.0 | +-------------------------------+-------+------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 30a35306be06d61aaa49be6a5b... | 3 | 164.761381672 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+------+ | music_id | count | rmse | +--------------------+-------+------+ | SOKEFFW12AB017F370 | 1 | 0.0 | +--------------------+-------+------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOXRHKP12A58A7F404 | 2 | 176.534827782 | +--------------------+-------+---------------+ [1 rows x 3 columns] PROGRESS: Evaluate model M1
recommendations finished on 1000/6871 queries. users per second: 2935.09
recommendations finished on 2000/6871 queries. users per second: 2891.64
recommendations finished on 3000/6871 queries. users per second: 2915.66
recommendations finished on 4000/6871 queries. users per second: 2924.84
recommendations finished on 5000/6871 queries. users per second: 2920.94
recommendations finished on 6000/6871 queries. users per second: 2889
Precision and recall summary statistics by cutoff +--------+-----------------+-----------------+ | cutoff | mean_precision | mean_recall | +--------+-----------------+-----------------+ | 1 | 0.0525396594382 | 0.0153154955288 | | 2 | 0.0647649541551 | 0.0336995149653 | | 3 | 0.0735458205987 | 0.0536246092139 | | 4 | 0.0757531654781 | 0.0698533685656 | | 5 | 0.0755057487993 | 0.0855811446142 | | 6 | 0.0721146849076 | 0.0959987353779 | | 7 | 0.069713287731 | 0.106131564343 | | 8 | 0.0673300829574 | 0.114397455289 | | 9 | 0.0651692297741 | 0.122622577034 | | 10 | 0.0629311599476 | 0.129816343793 | +--------+-----------------+-----------------+ [10 rows x 3 columns] ('\nOverall RMSE: ', 6.920528945094911) Per User RMSE (best) +-------------------------------+-------+-----------------+ | user_id | count | rmse | +-------------------------------+-------+-----------------+ | 832d6b062acf658394675bf08c... | 1 | 0.0214611490568 | +-------------------------------+-------+-----------------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 30a35306be06d61aaa49be6a5b... | 3 | 169.092256553 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+-----------------+ | music_id | count | rmse | +--------------------+-------+-----------------+ | SOZZLTY12A67AE0AD0 | 1 | 0.0224747419357 | +--------------------+-------+-----------------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOXRHKP12A58A7F404 | 2 | 178.192311843 | +--------------------+-------+---------------+ [1 rows x 3 columns] PROGRESS: Evaluate model M2
recommendations finished on 1000/6871 queries. users per second: 2694.73
recommendations finished on 2000/6871 queries. users per second: 2543.9
recommendations finished on 3000/6871 queries. users per second: 2540.85
recommendations finished on 4000/6871 queries. users per second: 2550.03
recommendations finished on 5000/6871 queries. users per second: 2548.94
recommendations finished on 6000/6871 queries. users per second: 2514.5
Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 1 | 0.000291078445641 | 1.98462576573e-05 | | 2 | 0.000509387279872 | 0.000403908095656 | | 3 | 0.000388104594188 | 0.000408066359165 | | 4 | 0.000400232862757 | 0.000474771836291 | | 5 | 0.000407509823898 | 0.000538243108466 | | 6 | 0.000388104594188 | 0.000572548782416 | | 7 | 0.000415826350916 | 0.000819156909973 | | 8 | 0.000473002474167 | 0.0011654189776 | | 9 | 0.000501301767493 | 0.00134976865984 | | 10 | 0.000523941202154 | 0.00161312534875 | +--------+-------------------+-------------------+ [10 rows x 3 columns] ('\nOverall RMSE: ', 8.223231010188494) Per User RMSE (best) +-------------------------------+-------+------------------+ | user_id | count | rmse | +-------------------------------+-------+------------------+ | 29d269f4fdc75580b1bb96d6f0... | 1 | 0.00137363099447 | +-------------------------------+-------+------------------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | e48e5aeb5a3d9e1425ea541d65... | 3 | 148.315295088 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+-------------------+ | music_id | count | rmse | +--------------------+-------+-------------------+ | SODQSFF12A67020461 | 1 | 0.000333939725071 | +--------------------+-------+-------------------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOXRHKP12A58A7F404 | 2 | 167.122554479 | +--------------------+-------+---------------+ [1 rows x 3 columns]
# Number of recommendations to request per user.
K = 10

# Take 100 of the distinct user ids found in the full data set
# (train and test rows alike).
users = gl.SArray(
    sf['user_id'].unique().head(100)
)

# Top-K recommendations from the item-similarity model for those users.
recs = item_sim_model.recommend(
    users=users,
    k=K,
)

# Left as the last expression so the notebook renders the first rows.
recs.head()
| user_id | music_id | score | rank |
|---|---|---|---|
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOXUQNR12AF72A69D6 | 0.302626844715 | 1 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOFISNS12A67ADE5FF | 0.129972689292 | 2 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOGXSWA12A6D4FBC99 | 0.126114996041 | 3 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOHOTTD12A6D4F7035 | 0.115846942453 | 4 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SODZBJH12AF72A9CF7 | 0.111501108198 | 5 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SONYKOW12AB01849C9 | 0.104462311548 | 6 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOLFXKT12AB017E3E0 | 0.104228854179 | 7 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOAXGDH12A8C13F8A1 | 0.094238214633 | 8 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOPDIDL12A58A7ABF0 | 0.093481474063 | 9 |
| c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ... | SOTEZXJ12A8C1365AA | 0.0933470936383 | 10 |