<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Implicit" data-toc-modified-id="Implicit-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Implicit</a></span><ul class="toc-item"><li><span><a href="#Movielens-100K" data-toc-modified-id="Movielens-100K-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Movielens 100K</a></span></li><li><span><a href="#Model-Interpretation" data-toc-modified-id="Model-Interpretation-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Model Interpretation</a></span></li><li><span><a href="#LastFM" data-toc-modified-id="LastFM-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>LastFM</a></span></li><li><span><a href="#BPR" data-toc-modified-id="BPR-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>BPR</a></span></li></ul></li></ul></div>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from subprocess import call
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2018-08-06 13:49:07 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.1
pandas 0.23.0
sklearn 0.19.1
matplotlib 2.2.2


# Implicit

## Movielens 100K

In [2]:
# download the data if it's not in the same local directory
file_dir = 'ml-100k'
file_path = os.path.join(file_dir, 'u.data')
if not os.path.isdir(file_dir):
    call(['curl', '-O', 'http://files.grouplens.org/datasets/movielens/' + file_dir + '.zip'])
    call(['unzip', file_dir + '.zip'])

user_col = 'user_id'
item_col = 'item_id'
value_col = 'rating'
timestamp_col = 'timestamp'
names = [user_col, item_col, value_col, timestamp_col]
df = pd.read_csv(file_path, sep = '\t', names = names)
print('data dimension: \n', df.shape)
df.head()

data dimension: 
 (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# remove things < min_rating, and convert to implicit dataset
# by considering ratings as a binary preference only
min_rating = 3
df = df[df[value_col] >= min_rating]
print('data dimension: \n', df.shape)
df.head()

data dimension: 
 (82520, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
5,298,474,4,884182806
7,253,465,5,891628467
8,305,451,3,886324817


In [4]:
from scipy.sparse import csr_matrix

for col in (user_col, item_col):
    df[col] = df[col].astype('category')

# implicit expects item-user format, i.e.
# the each rows represent an item and
# each column represents a user
rows = df[item_col].cat.codes
cols = df[user_col].cat.codes
values = df[value_col].astype(np.float32)
ratings = csr_matrix((values, (rows, cols)))
ratings

<1574x943 sparse matrix of type '<class 'numpy.float32'>'
	with 82520 stored elements in Compressed Sparse Row format>

In [5]:
from implicit.als import AlternatingLeastSquares


model = AlternatingLeastSquares(factors=50, regularization=0.05,
                                iterations=5, calculate_training_loss=True)
model.fit(ratings)

100%|██████████| 5.0/5 [00:00<00:00, 58.73it/s, loss=0.0453]


## Model Interpretation

The `u.item` contains meta-data about the items (movies). This file is a tab separated list of:

movie id, movie title, release date, video release date, IMDb URL, unknown, Action, Adventure, Animation, Children's, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western, The last 19 fields are the genres, a 1 indicates the movie is of that genre, a 0 indicates it is not; movies can be in several genres at once.

The movie ids are the ones used in the `u.data` data set.

In [6]:
names = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date',
    'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western']
item_metadata = pd.read_csv(file_path, sep='|', encoding='latin-1', names=names)
print('dimension: ', item_metadata.shape)
item_metadata.head()

dimension:  (100000, 24)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196\t242\t3\t881250949,,,,,,,,,,...,,,,,,,,,,
1,186\t302\t3\t891717742,,,,,,,,,,...,,,,,,,,,,
2,22\t377\t1\t878887116,,,,,,,,,,...,,,,,,,,,,
3,244\t51\t2\t880606923,,,,,,,,,,...,,,,,,,,,,
4,166\t346\t1\t886397596,,,,,,,,,,...,,,,,,,,,,


In [7]:
index2movie = {}
file_path = os.path.join(file_dir, 'u.item')

with open(file_path, encoding='latin1') as f:
    for line in f.readlines():
        info = line.split('|')
        index = int(info[0]) - 1
        movie = info[1]
        index2movie[index] = movie

index2movie[1]

'GoldenEye (1995)'

In [8]:
movieid = 1
if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
    title = index2movie[movieid]
    for similar_item, score in model.similar_items(movieid, N=5):
        print(title, index2movie[similar_item], score)

GoldenEye (1995) GoldenEye (1995) 1.0000001
GoldenEye (1995) Under Siege (1992) 0.7582087
GoldenEye (1995) Mary Shelley's Frankenstein (1994) 0.6529077
GoldenEye (1995) Die Hard 2 (1990) 0.6431297
GoldenEye (1995) Body Snatchers (1993) 0.643068


In [15]:
from sklearn.decomposition import PCA

pca = PCA()
item_factors_pca = pca.fit_transform(model.item_factors)
item_factors_pca.shape

(1574, 50)

In [16]:
pca.explained_variance_ratio_

array([0.06453663, 0.04607013, 0.04138038, 0.03779353, 0.03071558,
       0.02808258, 0.02616329, 0.02490851, 0.02360559, 0.022663  ,
       0.02179698, 0.02113239, 0.02059747, 0.01997002, 0.01972391,
       0.01963055, 0.01926829, 0.01899473, 0.01808385, 0.01802022,
       0.01780541, 0.01738336, 0.01732925, 0.01693604, 0.01690188,
       0.01676378, 0.01657589, 0.01649823, 0.01616263, 0.01610222,
       0.01602447, 0.01594338, 0.01569973, 0.01561498, 0.01545062,
       0.01521774, 0.01513517, 0.0150071 , 0.01494769, 0.01480642,
       0.01465308, 0.01440753, 0.0143018 , 0.01415103, 0.01403544,
       0.0138078 , 0.01368873, 0.01329734, 0.01316874, 0.00904493],
      dtype=float32)

In [10]:
topn = 5
scores = item_factors_pca[:, 0]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([284, 267, 268, 301, 285])

In [11]:
for item in highest_score_items:
    print(index2movie[item])

Secrets & Lies (1996)
Chasing Amy (1997)
Full Monty, The (1997)
L.A. Confidential (1997)
English Patient, The (1996)


In [12]:
lowest_score_items = np.argpartition(scores, topn)[:topn]
lowest_score_items

array([ 72, 391, 383, 160,  93])

In [13]:
for item in lowest_score_items:
    print(index2movie[item])

Maverick (1994)
Man Without a Face, The (1993)
Naked Gun 33 1/3: The Final Insult (1994)
Top Gun (1986)
Home Alone (1990)


## LastFM

In [2]:
def load_lastfm():
    """
    Load lastfm data and returns the dataset
    in dataframe and csr_matrix format, will
    download it if it doesn't exist in the same
    folder. Depending on our network speed, the
    download might take a while.
    """

    file_dir = 'lastfm-dataset-360K'
    file_path = os.path.join(file_dir, 'usersha1-artmbid-artname-plays.tsv')
    if not os.path.isdir(file_dir):
        call(['curl', '-O', 'http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz'])     
        call(['tar', '-xvzf', 'lastfm-dataset-360K.tar.gz'])

    # read in triplets of user/artist/playcount from the input dataset
    col_names = ['user', 'artist', 'plays']
    df = pd.read_csv(file_path, sep = '\t', usecols = [0, 2, 3], names = col_names)

    # there are 2 rows that contains NaN value, simply drop them
    df = df.dropna(axis = 0)
    return df


df = load_lastfm()
print('dimensions:', df.shape)
df.head()

dimensions: (17535451, 3)


Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.4, random_state=1234)
print('dimension:', df_train.shape)

dimension: (10521270, 3)


In [4]:
import numpy as np
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking


class MatrixFactorizationPipeline:

    def __init__(self, user_col, item_col, value_col):
        self.user_col = user_col
        self.item_col = item_col
        self.value_col = value_col

    def fit(self, data):
        for col in (self.user_col, self.item_col):
            data[col] = data[col].astype('category')

        rows = data[self.item_col].cat.codes
        cols = data[self.user_col].cat.codes
        values = data[self.value_col].astype(np.float32)
        item_users = csr_matrix((values, (rows, cols)))

        n_factors = 50
        model = BayesianPersonalizedRanking(factors=n_factors, iterations=10, num_threads=0)
        model.fit(item_users)

        self.user_factors_ = model.user_factors
        self.item_factors_ = model.item_factors
        self.index2user_ = data[self.user_col].cat.categories
        self.index2item_ = data[self.item_col].cat.categories

        self.user2index_ = {v: k for k, v in enumerate(self.index2user_)}
        self.item2index_ = {v: k for k, v in enumerate(self.index2item_)}
        return self

In [5]:
user_col = 'user'
item_col = 'artist'
value_col = 'plays'
model1 = MatrixFactorizationPipeline(user_col, item_col, value_col)
model1.fit(df_train.copy())

100%|██████████| 10/10 [00:14<00:00,  1.43s/it, correct=62.41%, skipped=1.33%]


<__main__.MatrixFactorizationPipeline at 0x112529400>

In [6]:
model2 = MatrixFactorizationPipeline(user_col, item_col, value_col)
model2.fit(df_test.copy())

100%|██████████| 10/10 [00:09<00:00,  1.10it/s, correct=50.46%, skipped=0.90%]


<__main__.MatrixFactorizationPipeline at 0x10c756b00>

In [32]:
print(len(model1.index2item_))
print(len(model2.index2item_))

items_in_both_embedding = set(model1.index2item_) & set(model2.index2item_)
print(len(items_in_both_embedding))

232689
194393


In [17]:
model1_index = [model1.item2index_[item] for item in items_in_both_embedding]
model2_index = [model2.item2index_[item] for item in items_in_both_embedding]

assert model2.index2item_[model2_index[0]] == model1.index2item_[model1_index[0]]

In [18]:
model1_common_item_factors = model1.item_factors_[model1_index]
model2_common_item_factors = model2.item_factors_[model2_index]

In [25]:
L0 = model1_common_item_factors
L1 = model2_common_item_factors

L1_inv = np.linalg.pinv(L1)
T = np.dot(L1_inv, L0)
u, s, v = np.linalg.svd(T)
si = np.eye(u.shape[0], v.shape[0])
T_rot = np.dot(u, np.dot(si, v))
L1_rot = np.dot(L1, T_rot)

In [26]:
L1_rot.shape

(134719, 51)

In [21]:
model1_common_item_factors.shape

(134719, 51)

In [33]:
topn = 10
scores = model1_common_item_factors[0, :]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([21, 10, 39, 30, 16, 13, 37, 15, 25, 50])

In [34]:
topn = 10
scores = model2_common_item_factors[0, :]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([43, 36,  2,  1, 11, 29, 19,  0, 45, 50])

In [35]:
topn = 10
scores = L1_rot[0, :]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([ 9, 11, 34, 21, 36, 41, 20, 26,  6,  0])

In [29]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
item_factors_pca = pca.fit_transform(model1_common_item_factors)
scores = item_factors_pca[:, 0]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([ 89696, 124989,  70130,  27935, 102439])

In [32]:
item_factors_pca.shape

(134719, 5)

In [30]:
item_factors_pca = pca.fit_transform(model2_common_item_factors)
scores = item_factors_pca[:, 0]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([ 53474, 119670,  26137, 116516, 101698])

In [31]:
item_factors_pca = pca.fit_transform(L1_rot)
scores = item_factors_pca[:, 0]
highest_score_items = np.argpartition(scores, -topn)[-topn:]
highest_score_items

array([ 53474, 119670,  26137, 116516, 101698])

In [None]:
# 1. split the dataset into two parts
# 2. train two embeddings and see if the embeddings meaning change
# e.g. use pca to check or see whether for a given item, its embedding
# feature's distribution is similar, e.g. it will have a high score for dimension 0
# on both embeddings
# 3. if they do, train the rotation solution (find the common item ids
# and find the rotation matrix, then rotate both the new item and user embedding)

## BPR

In [None]:
import numpy as np
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.movielens import get_movielens

titles, ratings = get_movielens(variant='100k')
print(titles)
ratings

In [None]:
# remove things < min_rating, and convert to implicit dataset
# by considering ratings as a binary preference only
min_rating = 4.0
ratings.data[ratings.data < min_rating] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(len(ratings.data))

In [None]:
model = BayesianPersonalizedRanking(factors=30, iterations=10)
model.fit(ratings)

In [None]:
movieid = 1
if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
    title = titles[movieid]
    for other, score in model.similar_items(movieid, N=5):
        print(title, titles[other], score)