In [21]:
import scipy
import scipy.sparse
import cPickle
import logging
import implicit
reload(implicit)
import implicit.approximate_als
reload(implicit.approximate_als)
import implicit.bpr
reload(implicit.bpr)
import sklearn
import numpy as np
import itertools
import random
import pandas as pd
import time
import lightfm
import lightfm.evaluation
from tqdm import tqdm as mytqdm
import seaborn as sns
%matplotlib inline
import util as C
reload(C)
import functools
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pymongo

# Load Data

In [2]:
# Root logger at DEBUG so progress messages emitted by the libraries
# (e.g. implicit's per-iteration timings) are shown in the notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Helper objects from the local `util` module (imported as C).
# The user helper is constructed around the channel helper instance.
channel_helper = C.ChannelHelper()
user_helper = C.UserHelper(channel_helper)

In [4]:
# Load the channel index data, then the channel metadata.
# Presumably this fills the channel_* attributes read in the next cell — confirm in util.
channel_helper.load_index_data()
channel_helper.load_meta_data()

In [5]:
# Module-level shortcuts to the lookup tables held by the channel helper.
channel_cid2idx = channel_helper.channel_cid2idx    # channel id -> matrix column index
channel_idx2cid = channel_helper.channel_idx2cid    # matrix column index -> channel id
channel_data = channel_helper.channel_data          # looked up with channel keys in get_track_keys
channel_pid_data = channel_helper.channel_pid_data  # looked up with channel ids in pid_to_key

In [6]:
%%time
# Load the user-item interaction matrix stored at C.EVST_USER_ITEM_PATH
# and keep a COO copy, which is the input to the BM25 weighting below.
mat_csr = C.load_mat(C.EVST_USER_ITEM_PATH)
mat_coo = mat_csr.tocoo()

CPU times: user 252 ms, sys: 204 ms, total: 456 ms
Wall time: 453 ms


In [7]:
# Re-weight the raw interactions with implicit's BM25 weighting (K1 = 2.0, B = 0.8).
mat_coo2 = implicit.nearest_neighbours.bm25_weight(mat_coo, K1= 2.0, B = 0.8)

In [8]:
# CSR copy of the weighted matrix; rows are sliced per user later on.
mat_csr2 = mat_coo2.tocsr()

In [9]:
mat_csr2

<1884585x137344 sparse matrix of type '<type 'numpy.float64'>'
	with 15979221 stored elements in Compressed Sparse Row format>

In [10]:
# No train/test split: the full weighted matrix is used as the training set.
train_mat_coo2 = mat_coo2
train_mat_csr2 = mat_csr2
# One id per matrix column; passed as `item_ids` to non-implicit models in compute_pred.
lfm_item_ids = np.arange(train_mat_csr2.shape[1])

In [11]:
# user_helper.dump_evst_user_email()

In [12]:
# Frame with `uid` and `email` columns, used to build the email <-> uid lookups.
evst_email_df = user_helper.load_evst_user_email()

In [13]:
evst_email_df.head(3)

Unnamed: 0,uid,email
0,84443254a35142ae8172065eac95b1ea,shty8@126.com
1,1fcad777842f4707bc47dc5604f8ef07,tommycoue100@gmail.com
2,98962c5791ec43e4af56aba9e654f992,w20163@gmail.com


In [14]:
# Frame with a `uid` column; its row order defines the user index built in the next cell.
# Presumably aligned with the rows of the interaction matrix — confirm in util.
evst_df = user_helper.load_evst_subs_idx_df()

In [15]:
# idx2uid: the uid column itself; uid2idx: uid -> position of that uid in the column.
idx2uid = evst_df['uid']
uid2idx = dict((uid, pos) for pos, uid in enumerate(idx2uid))

In [16]:
# Two-way lookup between e-mail address and uid, built from the e-mail frame.
email2uid = {email: uid for email, uid in zip(evst_email_df['email'], evst_email_df['uid'])}
uid2email = {uid: email for uid, email in zip(evst_email_df['uid'], evst_email_df['email'])}

In [17]:
def ensure_uid(t):
    """Resolve `t` (an e-mail address or a uid) to a uid and print both.

    A value containing '@' is treated as an e-mail and looked up in
    `email2uid` (KeyError if unknown). Anything else is returned as-is,
    with its e-mail taken from `uid2email` (empty string if unknown).
    """
    is_email = '@' in t
    if is_email:
        email, uid = t, email2uid[t]
    else:
        email, uid = uid2email.get(t, ''), t
    print('email = {}, uid = {}'.format(email, uid))
    return uid
def get_user_idx(t):
    """Return the user index for `t`, which may be a uid or an e-mail address."""
    return uid2idx[ensure_uid(t)]

In [22]:
uid2email['c42d3c9fee07479fb4ac9713ceb7e999']

'xuan.yang@castbox.fm'

# Evaluation

In [18]:
def compute_pred(model, user_id, N = 10):
    """Return the N best-scoring items for one user, best first.

    Parameters
    ----------
    model : a model whose module name starts with 'implicit' (scored as
        item_factors . user_factor), or any other model exposing
        `predict(user_ids=..., item_ids=...)`.
    user_id : row index of the user in `train_mat_csr2`.
    N : number of items wanted. When N is at least the number of items,
        every item is returned.

    Returns
    -------
    list of (item_idx, score) tuples sorted by descending score.
    """
    if model.__module__.startswith('implicit'):
        user = model._user_factor(user_id, user_items = train_mat_csr2)
        scores = model.item_factors.dot(user)
    else:
        scores = model.predict(user_ids = user_id, item_ids = lfm_item_ids)
    # Partition on the negated scores so the N largest end up in front.
    neg_scores = -scores
    n_items = len(neg_scores)
    if N < n_items:
        pred_idxs = np.argpartition(neg_scores, N)[:N]
    else:
        # argpartition rejects kth >= n_items; here every item qualifies anyway.
        pred_idxs = np.arange(n_items)
    pred = [(x, scores[x]) for x in pred_idxs]
    pred.sort(key = lambda x: -x[1])
    return pred
    
def eval_user_recomm(model, user_id, N = 20, verbose = True):
    """Recommend up to N channels the user has no interaction with yet.

    Parameters
    ----------
    model : model accepted by `compute_pred`.
    user_id : when `verbose` is True, a uid or e-mail that is resolved with
        `get_user_idx`; when `verbose` is False, the user's row index in
        `train_mat_csr2`, used as-is.
    N : maximum number of recommendations.
    verbose : True prints the user's existing channels and the
        recommendations and returns None; False prints nothing and
        returns the recommended channel ids, best first.
    """
    if verbose:
        user_id = get_user_idx(user_id)
        print('user_idx = {}'.format(user_id))
    base_idxs = set(train_mat_csr2[user_id].indices)
    # Request N + len(base_idxs) candidates so that N items are still left
    # after the user's own items are removed. The cap at n_items - 1 keeps
    # the request a valid partition position inside compute_pred.
    n_items = train_mat_csr2.shape[1]
    n_candidates = max(0, min(N + len(base_idxs), n_items - 1))
    pred = compute_pred(model, user_id, N = n_candidates)
    pred = [x for x in pred if x[0] not in base_idxs][:N]
    pred_idxs = [x[0] for x in pred]
    if verbose:
        print('base ...')
        base_pids = [channel_idx2cid[x] for x in base_idxs]
        for pid in base_pids:
            print('- {} {}'.format(pid, channel_pid_data[pid]['title']))

        print('pred ...')
        pred_pids = [channel_idx2cid[x] for x in pred_idxs]
        for idx, pid in enumerate(pred_pids):
            print('- {} {} {} {}'.format(pid, channel_pid_data[pid]['title'], pred_idxs[idx], pred[idx][1]))
    else:
        return [channel_idx2cid[x] for x in pred_idxs]
            
def item_recomm(model, cid, N=20,  verbose = True):
    """Return the N channels most similar to channel `cid`, best first.

    Parameters
    ----------
    model : a model whose module name starts with 'implicit' (its
        `similar_items` result is used directly), or any other model
        exposing an `item_embeddings` array (ranked by cosine similarity).
    cid : channel id of the query channel; must be a key of `channel_cid2idx`.
    N : number of channels wanted; when N is at least the number of items,
        all items are returned.
    verbose : print one line per result when True.

    Returns
    -------
    list of (channel_id, title, score) tuples. In both branches a higher
    score means more similar.
    """
    idx = channel_cid2idx[cid]
    if model.__module__.startswith('implicit'):
        related = model.similar_items(idx, N=N)
        res = []
        for rid, score in related:
            pid = channel_idx2cid[rid]
            title = channel_pid_data[pid]['title']
            res.append((pid, title, score))

    else: # lightfm model.
        item_embeddings = model.item_embeddings
        norms = np.linalg.norm(item_embeddings, axis = -1)
        # Cosine similarity: normalise by the candidate norm AND the query norm.
        # A zero-norm vector gets similarity 0 instead of a division by zero.
        denom = norms * norms[idx]
        denom = np.where(denom > 0, denom, 1.0)
        cosine = item_embeddings.dot(item_embeddings[idx]) / denom
        # Partition on the negated similarity so the N largest come first;
        # argpartition rejects kth >= number of items, so fall back to all items.
        neg_cosine = -cosine
        n_items = len(neg_cosine)
        if N < n_items:
            top_idx = np.argpartition(neg_cosine, N)[:N]
        else:
            top_idx = np.arange(n_items)
        top_idx = top_idx[np.argsort(neg_cosine[top_idx], kind = 'mergesort')]
        res = []
        for item_idx in top_idx:
            pid = channel_idx2cid[item_idx]
            title = channel_pid_data[pid]['title']
            res.append((pid, title, cosine[item_idx]))

    if verbose:
        for pid, title, score in res:
            print('pid={}, title={}, score={:.2f}'.format(pid, title, score))
    return res

# Train ALS Model

In [19]:
# Fix NumPy's global seed before building the model.
# Presumably this makes the factor initialisation repeatable — confirm implicit draws from np.random.
np.random.seed(42)
# ALS model: 100 latent factors, regularization 0.01, 10 iterations.
als_model = implicit.als.AlternatingLeastSquares(
    factors= 100, 
    regularization = 0.01,
    iterations = 10)

In [20]:
%%time
# The training matrix is users x items; it is transposed so fit() receives items x users.
# Presumably the orientation this implicit version expects — confirm against its docs.
als_model.fit(train_mat_csr2.T)

DEBUG:implicit:Converting input to CSR format
DEBUG:implicit:Converted input to CSR in 0.830s
DEBUG:implicit:Calculated transpose in 1.556s
DEBUG:implicit:initialize factors in 3.87056994438
DEBUG:implicit:finished iteration 0 in 35.213s
DEBUG:implicit:finished iteration 1 in 35.084s
DEBUG:implicit:finished iteration 2 in 33.976s
DEBUG:implicit:finished iteration 3 in 34.115s
DEBUG:implicit:finished iteration 4 in 34.092s
DEBUG:implicit:finished iteration 5 in 33.504s
DEBUG:implicit:finished iteration 6 in 34.998s
DEBUG:implicit:finished iteration 7 in 34.863s
DEBUG:implicit:finished iteration 8 in 34.244s
DEBUG:implicit:finished iteration 9 in 34.191s


CPU times: user 39min 7s, sys: 51min 23s, total: 1h 30min 30s
Wall time: 5min 50s


# ALS Validation

In [41]:
# Spot-check the recommendations of one user (uid or e-mail address).
# Alternative users are kept commented out; only the active assignment is used.
# uid = 'afc3a7f884d441ecbc4fbd34a208e657'
# uid = 'yan.zhang@castbox.fm'
# uid = 'yuanyuan.sun@castbox.fm'
# uid = 'wenchao.du@castbox.fm'
uid = '6c6e4d6f97904e018b06643032d4e85d'
eval_user_recomm(als_model, uid, N = 40)

email = , uid = 6c6e4d6f97904e018b06643032d4e85d
user_idx = 1
base ...
- 3175 Serial
- 378324 Albert Breer, The MMQB Podcast
- 19416 轻阅读
- 973326 Dkey
- 4382 Raise Your Hand Say Yes with Tiffany Han
- 472318 NESBROS: A Positive Gaming & Nintendo Switch Community
- 499 The Yogahealer Podcast l Ayurveda l Yoga l Healthy Foods | Yoga teachers with Cate Stillman
- 479560 Adult Sound
- 340105 TPOK Live!
- 486338 Price Of A Mile
- 476304 Unshakeable by Tony Robbins
- 7826 Nightmare Magazine - Horror and Dark Fantasy Story Podcast (Audiobook | Short Stories)
- 112984 Unscrewed
pred ...
- 492411 S-Town 124780 0.413534998894
- 262988 The Tony Robbins Podcast 3855 0.293162405491
- 257325 The GaryVee Audio Experience 135326 0.20967143774
- 14983 酷狗音乐新歌榜 11057 0.208269014955
- 15554 段子来了 11414 0.179603695869
- 570881 晓说2018 31554 0.153581976891
- 973334 看电影，听音乐 79384 0.147670000792
- 1086228 潘吉Jenny告诉你|学英语聊美国|开言英语 · Podcast 25720 0.138017266989
- 935281 罗辑思维/得到App 26108 0.134647160769
- 1814 This 

In [25]:
# Reference channel ids for spot-checking item-to-item similarity.
ref_pids = [486852, 1241617, 285553, 1215926]
# Print the 20 nearest channels of the second reference channel; the return value is discarded.
_ = item_recomm(als_model, ref_pids[1], N = 20)

pid=1241617, title=Blockchain Inside, score=1.00
pid=1240826, title=TFECON 2018: Blockchain Technology and Future Economy, score=0.97
pid=1174008, title=Humans of Bitcoin, score=0.94
pid=442885, title=Blockchain Billions Podcast, score=0.92
pid=954020, title=The Let's Talk Bitcoin Network, score=0.92
pid=1162122, title=Cryptos Weekly, score=0.92
pid=1180347, title=The Coin Pod, score=0.92
pid=1081950, title=The Trust Technology TTT #blockchain, score=0.92
pid=1195023, title=The Doug Polk Podcast, score=0.92
pid=1270670, title=The Bitcoin Game, score=0.92
pid=573252, title=Blockchain Innovation: Interviewing The Brightest Minds In Blockchain, score=0.92
pid=1209713, title=Blockchain, score=0.92
pid=1080143, title=The Blockcast Show: Everything about Blockchain, Bitcoin, Ethereum, and Cryptocurrency, score=0.92
pid=1057334, title=Middle Market Growth Conversations, score=0.91
pid=575583, title=Dash: Detailed (Digital Cash), score=0.91
pid=484165, title=Blockgeekslab Podcast, score=0.91
p

# Export Recommendation Results

In [28]:
def pid_to_key(pid):
    """Return the 'key' field stored for channel id `pid` in `channel_pid_data`.

    Fails an assertion when `pid` is not a known channel id.
    """
    assert pid in channel_pid_data
    return channel_pid_data[pid]['key']

def get_track_keys(feeds):
    """Build '<feed>-<track>' strings for the given feed keys.

    The track comes from `channel_data[feed]['track']`. Feeds that are
    unknown, or whose track is missing or empty, are left out. The input
    order is preserved.
    """
    feed_tracks = ((feed, channel_data.get(feed, {}).get('track')) for feed in feeds)
    return ['{}-{}'.format(feed, track) for feed, track in feed_tracks if track]

def deploy_to_db(table, recomm_data, batch_size = 100):
    """Upsert recommendation records into a MongoDB collection in batches.

    Parameters
    ----------
    table : collection object exposing `bulk_write(ops, ordered=...)`.
    recomm_data : iterable of (id, feeds, tracks) triples. Each triple is
        upserted as the document with `_id` == id, setting its `feeds` and
        `tracks` fields.
    batch_size : number of operations per `bulk_write` call (default 100,
        the value that used to be hard-coded).

    Raises
    ------
    ValueError : if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))
    ops = []
    for (f, feeds, tracks) in mytqdm(recomm_data):
        ops.append(pymongo.UpdateOne({'_id': f}, {'$set': {'feeds': feeds, 'tracks': tracks}}, upsert=True))
        if len(ops) >= batch_size:
            table.bulk_write(ops, ordered=False)
            ops = []
    # Flush the last, partially filled batch.
    if ops:
        table.bulk_write(ops, ordered=False)

In [None]:
# Build one (uid, feed_keys, track_keys) record per user for deployment.
recomm_data = []
uid_size = len(uid2idx)
for uid_idx in mytqdm(xrange(uid_size)):
    # verbose = False: uid_idx is used directly as the matrix row, and the
    # recommended channel ids are returned best-first with the user's own
    # channels already removed.
    res = eval_user_recomm(als_model, uid_idx, N = 200, verbose = False)
    # Keep every recommendation. The first entry is the best match, not the
    # user's own item, so it must not be sliced off.
    feed_keys = [pid_to_key(x) for x in res]
    track_keys = get_track_keys(feed_keys)
    # uid2idx was built by position (enumerate), so the inverse lookup is positional too.
    uid = idx2uid.iloc[uid_idx]
    recomm_data.append((uid, feed_keys, track_keys))

  0%|          | 1936/1884585 [00:08<2:18:31, 226.50it/s]Exception KeyError: KeyError(<weakref at 0x7f8618162c58; to 'tqdm' at 0x7f85d314f450>,) in <bound method tqdm.__del__ of   0%|          | 1175/1884585 [00:40<17:54:40, 29.21it/s]> ignored
  8%|▊         | 144755/1884585 [10:48<2:09:56, 223.16it/s]

In [None]:
# Write the per-user recommendations to the 'user_id' collection of the recommendation DB.
from recomm import app
table = app.CBRecommDB['user_id']
# NOTE(review): MongoDB already maintains an index on `_id`, so this call is
# likely a no-op — confirm before removing.
table.create_index('_id')
deploy_to_db(table, recomm_data)

In [57]:
'OK'

'OK'

In [58]:
email2uid['wenchao.du@castbox.fm']

'838fa37e63d34783baeed268fc9de9f4'