# Hybrid Recommender (Content Based + Collaborative Filtering)

In this notebook we will combine the content-based and collaborative filtering embeddings in an attempt to provide better recommendations.

Basically, we will concatenate the embedding features and compute cosine similarity over them.

In [1]:
import pandas as pd
import numpy as np
import faiss
from sklearn.preprocessing import normalize

## Prepare Data

Let's import the embedding features and prepare them to be used in the dashboard. We will save the movie IDs as a Pandas index to allow fast lookup, and the embedding features as NumPy arrays to facilitate computations.

### Create Movie Indexes

In [2]:
# Per-movie metadata (id, title) for each embedding space.
content_meta = pd.read_csv('output/content_embedding_meta.tsv', sep='\t')
cb_meta = pd.read_csv('output/cb_embedding_meta.tsv', sep='\t')

# The vector files are read with explicit column names (no header row is taken
# from the file); each row is expected to hold 300 embedding factors.
col_names = ['factor_{}'.format(i) for i in range(300)]
content_embedding = pd.read_csv('output/tfidf_embedding_vectors.tsv', sep='\t', names=col_names)
cb_embedding = pd.read_csv('output/collab_filt_vectors.tsv', sep='\t', names=col_names)

# pd.concat(axis='columns') aligns on the row index and pads with NaN when the
# frames differ in length, which would silently attach vectors to the wrong
# (or to no) movie. Fail loudly instead.
assert len(content_meta) == len(content_embedding), (
    'content metadata has {} rows but content vectors have {}'.format(
        len(content_meta), len(content_embedding)))
assert len(cb_meta) == len(cb_embedding), (
    'collaborative metadata has {} rows but collaborative vectors have {}'.format(
        len(cb_meta), len(cb_embedding)))

# Row i of each metadata file describes row i of the matching vector file.
content_features = pd.concat([content_meta, content_embedding], axis='columns')
cb_features = pd.concat([cb_meta, cb_embedding], axis='columns')

display(content_features.head())
display(cb_features.head())

Unnamed: 0,id,title,factor_0,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,factor_290,factor_291,factor_292,factor_293,factor_294,factor_295,factor_296,factor_297,factor_298,factor_299
0,2,Jumanji,0.25728,0.14228,-0.05616,-0.22411,-0.04458,-0.09323,-0.04937,0.14945,...,0.00801,-0.03211,0.01339,-0.09783,0.00916,0.04669,0.05414,0.1403,0.01279,-0.01858
1,3,Grumpier Old Men,0.28551,-0.00267,-0.01574,-0.07095,-0.13337,0.00573,0.04262,0.01975,...,0.00297,0.02775,-0.04234,-0.00028,0.00164,-0.03241,-0.02644,-0.03867,-0.00156,-0.00992
2,4,Waiting to Exhale,0.26453,-0.00371,0.04027,-0.19423,-0.00221,0.02685,-0.02611,-0.06494,...,0.00939,0.01479,0.04143,-0.00658,0.00286,0.00697,-0.02153,0.04587,-0.02261,-0.05878
3,5,Father of the Bride Part II,0.26318,-0.01039,0.04838,-0.08738,-0.10563,0.00361,0.03256,0.00233,...,-0.0184,0.01913,-0.00092,0.07639,-0.01378,-0.11314,0.05678,-0.05565,0.02857,0.01363
4,6,Heat,0.28532,0.0599,-0.17971,-0.02468,0.08128,0.07527,-0.10816,-0.03918,...,-0.07329,0.06308,-0.03473,-0.03315,-0.0364,0.00197,0.04664,-0.06036,-0.00945,-0.03277


Unnamed: 0,id,title,factor_0,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,factor_290,factor_291,factor_292,factor_293,factor_294,factor_295,factor_296,factor_297,factor_298,factor_299
0,2,Jumanji,0.13718,-0.03387,-0.03017,0.05062,0.03955,-0.00645,-0.16905,0.03667,...,0.04458,-0.00807,-0.02188,0.0118,0.06933,0.1184,-0.09077,0.0754,0.00299,-0.07051
1,3,Grumpier Old Men,0.03843,0.027,-0.01565,0.02153,-0.09215,0.00628,0.00412,-0.04694,...,-0.01289,0.06133,-0.12022,-0.02296,-0.00038,0.10698,-0.09957,0.01236,0.07305,-0.00638
2,4,Waiting to Exhale,0.07394,-0.06563,-0.05454,0.03857,0.06494,-0.03999,-0.12235,-0.0759,...,-0.05933,0.02223,-0.02603,0.00313,0.08757,0.00446,-0.05378,-0.09055,0.08543,-0.05654
3,5,Father of the Bride Part II,0.05019,-0.07952,-0.03727,0.05436,-0.09217,-0.02115,-0.06872,-0.03769,...,0.00114,0.04767,-0.15716,-0.04998,0.01877,0.0881,-0.11753,-0.02773,0.05437,-0.03121
4,6,Heat,0.00977,0.00258,-0.04626,-0.0333,-0.04444,0.14469,0.09473,0.02177,...,0.03475,0.06998,-0.03812,0.02543,-0.00995,0.09979,-0.02725,-0.03844,0.00601,0.03093


We will consider only those movies that are present in both embedding datasets:

In [3]:
# Movie ids that have both a content embedding and a collaborative embedding.
ids = list(set(content_meta['id']) & set(cb_meta['id']))

# Quick look: how many movies survive, and a sample of their ids.
print(len(ids))
print(ids[:10])

10698
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [4]:
# Keep only the movies present in both datasets, one row per id, ordered by id.
# 'row' records each movie's position; it is later used to index the matrices.
content_features = (
    content_features.loc[content_features['id'].isin(ids)]
    .drop_duplicates('id')
    .sort_values('id')
    .reset_index(drop=True)
)
content_features['row'] = content_features.index

# Same treatment for the collaborative features; the index is reset as well so
# both frames share the same 0..n-1 positional index.
cb_features = (
    cb_features.loc[cb_features['id'].isin(ids)]
    .drop_duplicates('id')
    .sort_values('id')
    .reset_index(drop=True)
)

# Everything downstream assumes row i of both frames is the same movie.
# Check it explicitly rather than relying on it implicitly.
assert np.array_equal(content_features['id'].to_numpy(), cb_features['id'].to_numpy()), (
    'content and collaborative feature rows are not aligned by movie id')

content_features.head()

Unnamed: 0,id,title,factor_0,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,factor_291,factor_292,factor_293,factor_294,factor_295,factor_296,factor_297,factor_298,factor_299,row
0,2,Jumanji,0.25728,0.14228,-0.05616,-0.22411,-0.04458,-0.09323,-0.04937,0.14945,...,-0.03211,0.01339,-0.09783,0.00916,0.04669,0.05414,0.1403,0.01279,-0.01858,0
1,3,Grumpier Old Men,0.28551,-0.00267,-0.01574,-0.07095,-0.13337,0.00573,0.04262,0.01975,...,0.02775,-0.04234,-0.00028,0.00164,-0.03241,-0.02644,-0.03867,-0.00156,-0.00992,1
2,4,Waiting to Exhale,0.26453,-0.00371,0.04027,-0.19423,-0.00221,0.02685,-0.02611,-0.06494,...,0.01479,0.04143,-0.00658,0.00286,0.00697,-0.02153,0.04587,-0.02261,-0.05878,2
3,5,Father of the Bride Part II,0.26318,-0.01039,0.04838,-0.08738,-0.10563,0.00361,0.03256,0.00233,...,0.01913,-0.00092,0.07639,-0.01378,-0.11314,0.05678,-0.05565,0.02857,0.01363,3
4,6,Heat,0.28532,0.0599,-0.17971,-0.02468,0.08128,0.07527,-0.10816,-0.03918,...,0.06308,-0.03473,-0.03315,-0.0364,0.00197,0.04664,-0.06036,-0.00945,-0.03277,4


In [5]:
# Lookup table keyed by movie id: id -> (matrix row, title).
embedding_index = content_features[['id', 'row', 'title']].set_index('id')

# Persist the lookup table next to the other artifacts in output/.
embedding_index.to_pickle('output/embedding_index.pkl')

embedding_index.head()

Unnamed: 0_level_0,row,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0,Jumanji
3,1,Grumpier Old Men
4,2,Waiting to Exhale
5,3,Father of the Bride Part II
6,4,Heat


In [6]:
# Sanity check: fetch the index entry (row position and title) for movie id 260.
embedding_index.loc[260]

row            204
title    Star Wars
Name: 260, dtype: object

### Create Embedding Matrices

The hybrid matrix is just the combination of the content and collaborative embedding features.

In [7]:
# Strip the non-numeric columns and materialise each frame as a fresh,
# C-contiguous float32 array.
content_matrix = (
    content_features.drop(columns=['id', 'row', 'title'])
    .to_numpy()
    .astype('float32', order='C')
)
cb_matrix = (
    cb_features.drop(columns=['id', 'title'])
    .to_numpy()
    .astype('float32', order='C')
)

# Hybrid embedding: content and collaborative factors side by side, then each
# row rescaled (in place) to unit L2 norm.
hybrid_matrix = np.concatenate([content_matrix, cb_matrix], axis=1)
hybrid_matrix = normalize(hybrid_matrix, norm="l2", axis=1, copy=False)

# The three arrays are written back-to-back into a single file, in this order:
# content, collaborative, hybrid. Readers must call np.load three times on the
# same open file handle to get them back.
with open('output/embedding_matrix.npy', 'wb') as f:
    for matrix in (content_matrix, cb_matrix, hybrid_matrix):
        np.save(f, matrix)

for matrix in (content_matrix, cb_matrix, hybrid_matrix):
    print(matrix.shape)

(10698, 300)
(10698, 300)
(10698, 600)


In [8]:
# First ten hybrid factors of movie id 260, located through the id -> row index.
hybrid_matrix[embedding_index.at[260, 'row'], :10]

array([ 0.28336674,  0.18142289, -0.08367924, -0.19149917,  0.0212769 ,
       -0.11614261, -0.03336846,  0.22099975,  0.05528883, -0.00441943],
      dtype=float32)

### Create ANN Index

Now, let's use the `faiss` library to prepare the embedding vectors for efficient search at application time:

In [9]:
%%time

content_ann = faiss.IndexFlatIP(content_matrix.shape[1])
content_ann.add(content_matrix)
faiss.write_index(content_ann, "output/content_ann.bin")

cb_ann = faiss.IndexFlatIP(cb_matrix.shape[1])
cb_ann.add(cb_matrix)
faiss.write_index(cb_ann, "output/cb_ann.bin")

hybrid_ann = faiss.IndexFlatIP(hybrid_matrix.shape[1])
hybrid_ann.add(hybrid_matrix)
faiss.write_index(hybrid_ann, "output/hybrid_ann.bin")

CPU times: user 16 ms, sys: 84 ms, total: 100 ms
Wall time: 96.4 ms


## Hybrid Similarity

Let's make some tests with the hybrid embedding features:

In [10]:
MOVIE_ID = 260
K = 10

# Matrix row of the query movie.
row = embedding_index.at[MOVIE_ID, 'row']

# Ask for K + 1 hits: the query vector is itself part of the index, so it is
# expected to come back as its own best match.
dist, idx = hybrid_ann.search(hybrid_matrix[[row], :], K + 1)

# Map the returned row positions back to ids/titles and attach the scores.
neighbors = embedding_index.iloc[idx.ravel()].assign(similarity=dist.ravel())
display(neighbors)

Unnamed: 0_level_0,row,title,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260,204,Star Wars,1.0
1196,836,The Empire Strikes Back,0.899729
1210,847,Return of the Jedi,0.840677
122886,9828,Star Wars: The Force Awakens,0.680989
166528,10564,Rogue One: A Star Wars Story,0.641269
33493,6336,Star Wars: Episode III - Revenge of the Sith,0.637462
5378,3836,Star Wars: Episode II - Attack of the Clones,0.617322
2628,1897,Star Wars: Episode I - The Phantom Menace,0.606788
68358,7720,Star Trek,0.520273
1198,838,Raiders of the Lost Ark,0.472758


In [11]:
# Same query movie (`row`, `K` come from the previous cell), but searching the
# content-based index only.
dist, idx = content_ann.search(content_matrix[[row], :], K + 1)

# Map the returned row positions back to ids/titles and attach the scores.
neighbors = embedding_index.iloc[idx.ravel()].assign(similarity=dist.ravel())
display(neighbors)

Unnamed: 0_level_0,row,title,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260,204,Star Wars,0.999993
1196,836,The Empire Strikes Back,0.900626
1210,847,Return of the Jedi,0.876032
5378,3836,Star Wars: Episode II - Attack of the Clones,0.808551
2628,1897,Star Wars: Episode I - The Phantom Menace,0.795053
33493,6336,Star Wars: Episode III - Revenge of the Sith,0.787766
1200,840,Aliens,0.697495
113345,9609,Jupiter Ascending,0.697349
68358,7720,Star Trek,0.674341
122886,9828,Star Wars: The Force Awakens,0.665957


In [12]:
# Same query movie (`row`, `K` come from an earlier cell), but searching the
# collaborative filtering index only.
dist, idx = cb_ann.search(cb_matrix[[row], :], K + 1)

# Map the returned row positions back to ids/titles and attach the scores.
neighbors = embedding_index.iloc[idx.ravel()].assign(similarity=dist.ravel())
display(neighbors)

Unnamed: 0_level_0,row,title,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260,204,Star Wars,0.999996
1196,836,The Empire Strikes Back,0.898831
1210,847,Return of the Jedi,0.805315
122886,9828,Star Wars: The Force Awakens,0.696018
166528,10564,Rogue One: A Star Wars Story,0.646935
561,431,Killer,0.54425
31247,6171,The Fighting Sullivans,0.525195
33493,6336,Star Wars: Episode III - Revenge of the Sith,0.487151
5199,3724,The Long Riders,0.470113
136485,10082,Robot Chicken: Star Wars,0.467294


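It looks like the hybrid similarity behaves as an average of the content-based and the collaborative filtering similarities. This is expected: both embeddings are (approximately) unit-norm, so the cosine similarity of the concatenated vectors is the mean of the two individual similarities (e.g. for *The Empire Strikes Back*, (0.9006 + 0.8988) / 2 ≈ 0.8997).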