In [1]:
import scipy
import scipy.sparse
import logging
import implicit
import implicit.bpr
import numpy as np
import itertools
import random
import pandas as pd
import time
import lightfm
%matplotlib inline
from collections import Counter

In [2]:
logging.basicConfig(level = logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
df = pd.read_csv('49b02dcb-d47e-40e3-a1da-e230dc1928d2.csv')

In [4]:
df.rename_axis({'key_word': 'kw'}, axis = 1, inplace = True)
df['kw'] = df['kw'].str.lower()

In [5]:
kw_dist = df['kw'].value_counts()

In [6]:
kw_dist[kw_dist > 10].shape

(6411,)

In [7]:
keyword_counter = Counter()
for x in df.itertuples():
    uid = x.uid
    kw = x.kw
    keyword_counter[kw] += 1

In [8]:
keyword_counter.most_common(10)

[('joe rogan', 3751),
 ('true crime', 3103),
 ('npr', 2555),
 ('this american life', 1887),
 ('bbc', 1566),
 ('hardcore history', 1498),
 ('ted', 1474),
 ('serial', 1395),
 ('serial killers', 1377),
 ('history', 1345)]

In [9]:
def organize_data(data):
    keyword_index = {}
    user_index = {}
    odata = []
    uid_index = 0
    item_index = 0
    for x in df.itertuples():
        uid = x.uid
        keyword = x.kw
        if keyword_counter.get(keyword, 0) < 10: continue
        if uid not in user_index:
            ouid = uid_index
            uid_index += 1
            user_index[uid] = ouid
        else:
            ouid = user_index[uid]
        if keyword not in keyword_index:
            kid = item_index
            item_index += 1
            keyword_index[keyword] = kid
        else:
            kid = keyword_index[keyword]
        odata.append((ouid, kid))
    return user_index, keyword_index, odata

In [10]:
ui, ki, odata = organize_data(df)

In [11]:
len(ui.items()), len(ki.items())

(115954, 7046)

In [12]:
def build_coo_matrix(uid_count, item_count, organized_data):
    uids = []
    itemids = []
    data = []
    for uid, kid in organized_data:
        data.append(1)
        uids.append(uid)
        itemids.append(kid)
    coo_matrix = scipy.sparse.coo_matrix((data, (uids, itemids)), shape=(uid_count, item_count), dtype=np.float32)
    return coo_matrix

In [13]:
coo_matrix = build_coo_matrix(len(ui), len(ki), odata)
# coo_matrix = implicit.nearest_neighbours.bm25_weight(coo_matrix, K1 = 2.0, B = 0.8)

In [14]:
coo_matrix

<115954x7046 sparse matrix of type '<type 'numpy.float32'>'
	with 304678 stored elements in COOrdinate format>

In [15]:
ki_recerse = {}
for keyword, kid in ki.items():
    ki_recerse[kid] = keyword

In [16]:
def recomm_keyword(model, word):
    word = word.lower().strip()
    word_id = ki.get(word)
    if word_id:
        if model.__module__.startswith('implicit'):
            recs = model.similar_items(word_id, 20)
            for rec_id, prob in recs:
                print("%s\t\t%.2f" % (ki_recerse.get(rec_id), prob))
        else:
            item_embeddings = model.item_embeddings
            scores = item_embeddings.dot(item_embeddings[word_id])
            norm_scores = -scores / np.linalg.norm(item_embeddings, axis = -1)
            N = 20
            scores_idx = norm_scores.argpartition(N)
            scores_idx = scores_idx[:N]
            scores_idx_value = list(zip(scores_idx, [norm_scores[x] for x in scores_idx]))
            scores_idx_value = sorted(scores_idx_value, key = lambda x: x[1])
            for rec_id, prob in scores_idx_value:
                print("%s\t\t%.2f" % (ki_recerse.get(rec_id), prob))
    else:
        print("keyword not supported")

In [27]:
np.random.seed(42)
bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors = 30, iterations = 10)

In [28]:
bpr_model.fit(coo_matrix.T)

DEBUG:implicit:fit epoch 0 in 0.017s (50.56% ranked correctly)
DEBUG:implicit:fit epoch 1 in 0.016s (51.85% ranked correctly)
DEBUG:implicit:fit epoch 2 in 0.012s (52.92% ranked correctly)
DEBUG:implicit:fit epoch 3 in 0.010s (54.31% ranked correctly)
DEBUG:implicit:fit epoch 4 in 0.010s (55.40% ranked correctly)
DEBUG:implicit:fit epoch 5 in 0.010s (56.37% ranked correctly)
DEBUG:implicit:fit epoch 6 in 0.010s (57.75% ranked correctly)
DEBUG:implicit:fit epoch 7 in 0.015s (58.53% ranked correctly)
DEBUG:implicit:fit epoch 8 in 0.012s (59.42% ranked correctly)
DEBUG:implicit:fit epoch 9 in 0.016s (60.35% ranked correctly)


In [19]:
lfm_model = lightfm.LightFM(no_components=30, loss = 'warp', random_state=42)

In [20]:
lfm_model.fit_partial(coo_matrix, epochs = 20, num_threads = 8, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x7f501865e210>

In [29]:
kw = 'joe rogan'
# kw = 'Legion of Skanks Podcast'
# kw = 'this sounds serious'
# kw = '99% invisible'
print('BPR Model Recomm')
recomm_keyword(bpr_model, kw)
print('\n\nALS Model Recomm')
recomm_keyword(lfm_model, kw)

BPR Model Recomm
joe rogan		1.00
criminal		0.97
npr		0.96
couples therapy with candice and casey		0.96
serial killers		0.96
hardcore history		0.96
true crime		0.96
bill burr		0.96
serial		0.96
in the dark		0.96
ted		0.96
radiolab		0.96
lore		0.95
this american life		0.95
reply all		0.95
critical role		0.95
sword and scale		0.95
conspiracy theories		0.95
gimlet		0.95
revisionist history		0.95


ALS Model Recomm
joe rogan		-1.69
ufc unfiltered with jim norton and matt serra		-1.19
ari shaffir's skeptic tank		-1.15
atp science		-1.11
mma beat		-1.08
the fighter		-1.08
my wife hates me		-1.07
the fighter & the kid		-1.07
brendan schaub, bryan callen		-1.07
military		-1.05
fitzdog radio		-1.03
brendan schaub		-1.03
bisping		-1.02
the bill simmons podcast		-1.02
mma hour		-1.01
mike tyson: bite the mic with peter rosenberg		-1.01
your mom's house with christina p. and tom segura		-1.01
below the belt		-1.00
conspiracy farm		-1.00
tim dillon is going to hell		-0.99
