In [113]:
import scipy
import scipy.sparse
import cPickle
import logging
import implicit
reload(implicit)
import implicit.approximate_als
reload(implicit.approximate_als)
import implicit.bpr
reload(implicit.bpr)
import sklearn
import numpy as np
import itertools
import random
import pandas as pd
import time
import lightfm
import lightfm.evaluation
from tqdm import tqdm as mytqdm
%matplotlib inline
import seaborn as sns
import numpy as np
import util as C
reload(C)
import pymongo

# Load Data

In [2]:
logging.basicConfig(level = logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
channel_helper = C.ChannelHelper()

In [4]:
channel_helper.load_meta_data()
channel_helper.load_index_data()

In [5]:
channel_cid2idx, channel_idx2cid = channel_helper.channel_cid2idx, channel_helper.channel_idx2cid
channel_data, channel_pid_data = channel_helper.channel_data, channel_helper.channel_pid_data

In [27]:
%%time
mat_csr = C.load_mat(C.USER_ITEM_MAT_PATH)

CPU times: user 392 ms, sys: 200 ms, total: 592 ms
Wall time: 593 ms


In [28]:
%%time
mat_coo = mat_csr.tocoo()

CPU times: user 320 ms, sys: 288 ms, total: 608 ms
Wall time: 605 ms


In [29]:
%%time
#mat_coo2 = implicit.nearest_neighbours.bm25_weight(mat_coo)
#mat_coo2 = implicit.nearest_neighbours.tfidf_weight(mat_coo)
mat_coo2 = implicit.nearest_neighbours.bm25_weight(mat_coo, K1 = 2.0, B = 0.8)
mat_csr2 = mat_coo2.tocsr()

CPU times: user 35.3 s, sys: 1.98 s, total: 37.3 s
Wall time: 37.2 s


In [55]:
mat_csr

<4609357x137344 sparse matrix of type '<type 'numpy.float32'>'
	with 42015431 stored elements in Compressed Sparse Row format>

In [10]:
def item_recomm(model, cid, N=20,  verbose = True):
    idx = channel_cid2idx[cid]
    if model.__module__.startswith('implicit'):
        related = model.similar_items(idx, N=N)
        res = []
        for rid, score in related:
            pid= channel_idx2cid[rid]
            title = channel_pid_data[pid]['title']
            res.append((pid, title, score))
            
    else: # lightfm model.
        item_embeddings = model.item_embeddings
        scores = item_embeddings.dot(item_embeddings[idx])
        norm_scores = -scores / np.linalg.norm(item_embeddings, axis = -1)
        scores_idx = norm_scores.argpartition(N)
        scores_idx = scores_idx[:N]
        scores_idx_value = list(zip(scores_idx, [norm_scores[x] for x in scores_idx]))
        scores_idx_value = sorted(scores_idx_value, key = lambda x: x[1])
        res = []
        for idx, value in scores_idx_value:
            pid= channel_idx2cid[idx]
            title = channel_pid_data[pid]['title']
            res.append((pid, title, value))

    if verbose:
        for pid, title, score in res:
            print('pid={}, title={}, score={:.2f}'.format(pid, title, score))
    return res

# LightFM Model

In [30]:
lfm_model = C.MyLightFM(no_components=100, loss = 'warp', random_state=42)

In [31]:
%%time
lfm_model.fit_partial(mat_csr2, epochs = 20, num_threads = 8)

Epoch 0 in 80.35s
Epoch 1 in 52.32s
Epoch 2 in 51.38s
Epoch 3 in 51.86s
Epoch 4 in 53.61s
Epoch 5 in 54.27s
Epoch 6 in 54.26s
Epoch 7 in 54.33s
Epoch 8 in 52.68s
Epoch 9 in 55.29s
Epoch 10 in 53.02s
Epoch 11 in 51.38s
Epoch 12 in 52.61s
Epoch 13 in 51.39s
Epoch 14 in 52.52s
Epoch 15 in 50.04s
Epoch 16 in 48.88s
Epoch 17 in 49.16s
Epoch 18 in 48.90s
Epoch 19 in 49.21s
CPU times: user 1h 50min 11s, sys: 15.6 s, total: 1h 50min 27s
Wall time: 18min 7s


<util.MyLightFM at 0x7f139d31b110>

In [32]:
lfm_model.item_embeddings.shape

(137344, 100)

In [33]:
ref_pids = [486852, 1241617, 285553, 1215926]
for pid in ref_pids:
    _ = item_recomm(lfm_model, pid, N = 50)
    print('\n')

pid=486852, title=Deep Learning: Zero to One, score=-3.38
pid=538016, title=Machine Learning – Software Engineering Daily, score=-2.51
pid=494706, title=RARE PERSPECTIVES: The AI and Machine Learning Podcast, score=-2.47
pid=650841, title=Machine Learning Guide, score=-2.45
pid=430829, title=Machine Learning, score=-2.44
pid=571199, title=NLP Highlights, score=-2.37
pid=410810, title=Python Bytes, score=-2.37
pid=1031182, title=Machine Learning for Physicists 2017 (Audio), score=-2.35
pid=1062110, title=Brightest Singularity - Blockchain and Machine Learning, score=-2.32
pid=934188, title=Data Science Imposters Podcast, score=-2.30
pid=402010, title=This Week in Machine Learning & Artificial Intelligence (AI) Podcast, score=-2.30
pid=2348, title=Learning Machines 101, score=-2.28
pid=1516, title=O'Reilly Data Show - O'Reilly Media Podcast, score=-2.27
pid=934168, title=Startup Data Science, score=-2.25
pid=2315, title=Talk Python To Me - Python conversations for passionate developers, 

# ALS Model

In [34]:
np.random.seed(42)
als_model = implicit.als.AlternatingLeastSquares(
    factors= 100, 
    regularization = 0.01,
    iterations = 10)

In [35]:
%%time
als_model.fit(mat_csr2.T)

DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 2.975s
DEBUG:implicit:Calculated transpose in 5.930s
DEBUG:implicit:initialize factors in 9.09478712082
DEBUG:implicit:finished iteration 0 in 85.829s
DEBUG:implicit:finished iteration 1 in 87.765s
DEBUG:implicit:finished iteration 2 in 87.213s
DEBUG:implicit:finished iteration 3 in 88.893s
DEBUG:implicit:finished iteration 4 in 85.435s
DEBUG:implicit:finished iteration 5 in 87.168s
DEBUG:implicit:finished iteration 6 in 87.334s
DEBUG:implicit:finished iteration 7 in 88.568s
DEBUG:implicit:finished iteration 8 in 86.934s
DEBUG:implicit:finished iteration 9 in 87.034s


CPU times: user 1h 36min 34s, sys: 2h 12min 10s, total: 3h 48min 44s
Wall time: 14min 50s


In [36]:
ref_pids = [486852, 1241617, 285553, 1215926]
for pid in ref_pids:
    _ = item_recomm(als_model, pid, N = 50)
    print('\n')

pid=486852, title=Deep Learning: Zero to One, score=1.00
pid=430829, title=Machine Learning, score=1.00
pid=2348, title=Learning Machines 101, score=0.99
pid=538016, title=Machine Learning – Software Engineering Daily, score=0.99
pid=650841, title=Machine Learning Guide, score=0.99
pid=485455, title=This Week in Machine Learning & AI, score=0.99
pid=421762, title=Data Science Storytime, score=0.99
pid=402010, title=This Week in Machine Learning & Artificial Intelligence (AI) Podcast, score=0.99
pid=378094, title=The O'Reilly Data Show Podcast, score=0.99
pid=933690, title=Data Science, score=0.99
pid=934188, title=Data Science Imposters Podcast, score=0.99
pid=532150, title="Data Science" - Google News, score=0.99
pid=1136953, title=Data Science at Home, score=0.99
pid=571199, title=NLP Highlights, score=0.99
pid=1093155, title=Power Generation and Storage, score=0.99
pid=1062110, title=Brightest Singularity - Blockchain and Machine Learning, score=0.99
pid=650739, title=Talking Machin

In [118]:
_ = item_recomm(als_model, 19705, N = 50)

pid=19705, title=万有引力, score=1.00
pid=20239, title=一席, score=1.00
pid=13156, title=糖蒜夜话, score=1.00
pid=963249, title=故事 FM, score=1.00
pid=258168, title=观复嘟嘟, score=1.00
pid=486714, title=小说连播, score=1.00
pid=361664, title=晓松奇谈, score=1.00
pid=19416, title=轻阅读, score=1.00
pid=325713, title=锵锵三人行, score=1.00
pid=1103385, title=岳云鹏相声2010～2018（清晰版）, score=1.00
pid=974434, title=网易轻松一刻, score=1.00
pid=1073415, title=德云社清晰相声--睡眠专用, score=1.00
pid=19427, title=环球故事会, score=1.00
pid=1003329, title=涡轮说车2017, score=1.00
pid=483140, title=天朝史趣【乐乐呵呵学历史】, score=1.00
pid=973839, title=大内密谈, score=1.00
pid=1154420, title=过三情感脱口秀, score=1.00
pid=180699, title=张召忠开讲, score=1.00
pid=436305, title=郭德纲于谦精选相声, score=1.00
pid=385877, title=汪诘：科学有故事, score=1.00
pid=441088, title=雷音寺 梁宏达, score=1.00
pid=122776, title=狗熊月读(Video), score=1.00
pid=444644, title=晓松说, score=1.00
pid=1220635, title=蒋勋细说红楼梦, score=1.00
pid=1001122, title=江一燕：小江的时间缝隙, score=1.00
pid=104287, title=内涵段子, score=1.00
pid=983170, title=

# BPR Model

优点是训练非常快，similar_items效果还不错，但是在recomm_user里面则非常差

现在看起来这个参数还算不错，在多迭代几轮效果就变差了，稳定性很差

但是观察效果好像比ALS,LFM都要好，最好是确保输出的时候prob < 1.0

In [57]:
np.random.seed(42)
bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors = 100, iterations = 30, regularization = 0.01)

In [58]:
%%time
bpr_model.fit(mat_csr2.T)

DEBUG:implicit:Converting input to COO format
DEBUG:implicit:Converted input to COO in 0.703s
DEBUG:implicit:fit epoch 0 in 4.025s (50.65% ranked correctly)
DEBUG:implicit:fit epoch 1 in 4.087s (51.59% ranked correctly)
DEBUG:implicit:fit epoch 2 in 4.006s (52.42% ranked correctly)
DEBUG:implicit:fit epoch 3 in 4.077s (53.43% ranked correctly)
DEBUG:implicit:fit epoch 4 in 4.017s (54.86% ranked correctly)
DEBUG:implicit:fit epoch 5 in 4.085s (56.91% ranked correctly)
DEBUG:implicit:fit epoch 6 in 4.001s (59.53% ranked correctly)
DEBUG:implicit:fit epoch 7 in 4.092s (62.46% ranked correctly)
DEBUG:implicit:fit epoch 8 in 4.000s (65.48% ranked correctly)
DEBUG:implicit:fit epoch 9 in 4.063s (68.36% ranked correctly)
DEBUG:implicit:fit epoch 10 in 4.003s (71.03% ranked correctly)
DEBUG:implicit:fit epoch 11 in 4.066s (73.42% ranked correctly)
DEBUG:implicit:fit epoch 12 in 4.005s (75.53% ranked correctly)
DEBUG:implicit:fit epoch 13 in 4.056s (77.38% ranked correctly)
DEBUG:implicit:fit e

CPU times: user 32min 57s, sys: 4.07 s, total: 33min 1s
Wall time: 2min 53s


In [59]:
ref_pids = [486852, 1241617, 285553, 1215926]
for pid in ref_pids:
    _ = item_recomm(bpr_model, pid, N = 50)
    print('\n')

pid=486852, title=Deep Learning: Zero to One, score=1.00
pid=2348, title=Learning Machines 101, score=0.86
pid=538016, title=Machine Learning – Software Engineering Daily, score=0.85
pid=402010, title=This Week in Machine Learning & Artificial Intelligence (AI) Podcast, score=0.84
pid=650841, title=Machine Learning Guide, score=0.82
pid=25629, title=Artificial Intelligence in Industry with Dan Faggella, score=0.81
pid=1516, title=O'Reilly Data Show - O'Reilly Media Podcast, score=0.81
pid=396480, title=SuperDataScience, score=0.81
pid=430829, title=Machine Learning, score=0.80
pid=26120, title=Linear Digressions, score=0.80
pid=433488, title=The AI Podcast, score=0.79
pid=650739, title=Talking Machines, score=0.78
pid=1136953, title=Data Science at Home, score=0.77
pid=1152077, title=Data Crunch | Artificial Intelligence | AI | Machine Learning | Big Data | Data Science, score=0.77
pid=104511, title=Becoming A Data Scientist Podcast, score=0.76
pid=2315, title=Talk Python To Me - Pytho

# 输出推荐结果

In [100]:
def pid_to_key(pid):
    assert(pid in channel_pid_data)
    data = channel_pid_data[pid]
    return data['key']

def get_track_keys(feeds):
    tracks = []
    for x in feeds:
        track = channel_data.get(x, {}).get('track')
        if not track: continue
        tracks.append('{}-{}'.format(x, track))
    return tracks

def deploy_to_db(table, recomm_data):
    ops = []
    for (f, feeds, tracks) in mytqdm(recomm_data):
        ops.append(pymongo.UpdateOne({'_id': f}, {'$set': {'feeds': feeds, 'tracks': tracks}}, upsert=True))
        if len(ops) == 100:
            table.bulk_write(ops, ordered=False)
            ops = []
    if ops:
        table.bulk_write(ops, ordered=False)

In [None]:
recomm_data = []
for pid in mytqdm(channel_idx2cid):
    key = pid_to_key(pid)
    res= item_recomm(bpr_model, pid, N = 200, verbose = False)
    for x in res:
        assert(x[-1] < 1.01)            
    recomm_keys = [pid_to_key(x[0]) for x in res]
    recomm_keys = recomm_keys[1:]
    feed_keys = recomm_keys
    track_keys = get_track_keys(feed_keys)
    recomm_data.append((key, feed_keys, track_keys))

In [None]:
from recomm import app
table = app.CBRecommDB['sim_feeds']
deploy_to_db(table, recomm_data)