# Similarity Match with Approximate Nearest Neighbors

In [1]:
import pandas as pd
import numpy as np

from src.similarity import create_search_index, find_top_similar

In [2]:
content_meta = pd.read_csv('output/content_embedding_meta.tsv', sep='\t')
cb_meta = pd.read_csv('output/cb_embedding_meta.tsv', sep='\t')

col_names = ['factor_{}'.format(i) for i in range(300)]
cb_embedding = pd.read_csv('output/collab_filt_vectors.tsv', sep='\t', names=col_names)
tfidf_embedding = pd.read_csv('output/tfidf_embedding_vectors.tsv', sep='\t', names=col_names)
tfidf_crew_embedding = pd.read_csv('output/tfidf_crew_embedding_vectors.tsv', sep='\t', names=col_names)
tfidf_crew_cast_embedding = pd.read_csv('output/tfidf_crew_cast_embedding_vectors.tsv', sep='\t', names=col_names)

cb_embedding.head()

Unnamed: 0,factor_0,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,factor_8,factor_9,...,factor_290,factor_291,factor_292,factor_293,factor_294,factor_295,factor_296,factor_297,factor_298,factor_299
0,0.13721,-0.03493,-0.02982,0.05093,0.03932,-0.00622,-0.16899,0.03577,0.06585,-0.00798,...,0.0444,-0.00539,-0.0228,0.01039,0.06972,0.11879,-0.09154,0.07443,0.00554,-0.07066
1,0.0403,0.02628,-0.01588,0.02088,-0.09249,0.00531,0.00538,-0.04937,0.07632,-0.04975,...,-0.01155,0.05924,-0.11981,-0.02154,0.00139,0.10848,-0.09925,0.01413,0.07187,-0.00866
2,0.07504,-0.06499,-0.05445,0.03758,0.0655,-0.04139,-0.12247,-0.07431,0.08577,-0.01587,...,-0.05923,0.02245,-0.02385,0.00472,0.08617,0.00339,-0.05371,-0.09074,0.08682,-0.05612
3,0.05055,-0.07874,-0.03753,0.05399,-0.09293,-0.02188,-0.07106,-0.03828,0.04357,-0.06199,...,0.00149,0.04665,-0.15634,-0.05171,0.01962,0.09012,-0.11807,-0.02655,0.053,-0.03082
4,0.00942,0.00265,-0.04745,-0.03342,-0.04625,0.14784,0.09784,0.02144,-0.00651,0.01417,...,0.03555,0.06852,-0.04027,0.02531,-0.00936,0.09686,-0.02702,-0.03737,0.00824,0.03044


In [3]:
%%time

cb_index = create_search_index(cb_embedding)

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 11.2 ms


In [4]:
%%time

similars = find_top_similar([260], cb_index, 15, cb_embedding, cb_meta)
similars.head()

CPU times: user 412 ms, sys: 16 ms, total: 428 ms
Wall time: 77 ms


Unnamed: 0,id_right,similarity,id_left
836,1196,0.899685,260
847,1210,0.805527,260
9828,122886,0.696327,260
10564,166528,0.646531,260
431,561,0.542919,260


In [5]:
display(cb_meta.loc[cb_meta['id']==260])
display(pd.merge(similars, cb_meta, left_on='id_right', right_on='id', how='inner'))

Unnamed: 0,id,title
204,260,Star Wars


Unnamed: 0,id_right,similarity,id_left,id,title
0,1196,0.899685,260,1196,The Empire Strikes Back
1,1210,0.805527,260,1210,Return of the Jedi
2,122886,0.696327,260,122886,Star Wars: The Force Awakens
3,166528,0.646531,260,166528,Rogue One: A Star Wars Story
4,561,0.542919,260,561,Killer
5,31247,0.527497,260,31247,The Fighting Sullivans
6,33493,0.485193,260,33493,Star Wars: Episode III - Revenge of the Sith
7,5199,0.471667,260,5199,The Long Riders
8,136485,0.464898,260,136485,Robot Chicken: Star Wars
9,168026,0.453394,260,168026,Marvel One-Shot: Agent Carter


In [6]:
%%time

tfidf_index = create_search_index(tfidf_embedding)

CPU times: user 908 ms, sys: 44 ms, total: 952 ms
Wall time: 52 ms


In [7]:
%%time

similars = find_top_similar([260], tfidf_index, 15, tfidf_embedding, content_meta)
similars.head()

CPU times: user 228 ms, sys: 0 ns, total: 228 ms
Wall time: 11.5 ms


Unnamed: 0,id_right,similarity,id_left
849,1196,0.90088,260
860,1210,0.876365,260
3933,5378,0.809109,260
1929,2628,0.799275,260
6955,33493,0.788824,260


In [8]:
display(content_meta.loc[content_meta['id']==260])
display(pd.merge(similars, content_meta, left_on='id_right', right_on='id', how='inner'))

Unnamed: 0,id,title
204,260,Star Wars


Unnamed: 0,id_right,similarity,id_left,id,title
0,1196,0.90088,260,1196,The Empire Strikes Back
1,1210,0.876365,260,1210,Return of the Jedi
2,5378,0.809109,260,5378,Star Wars: Episode II - Attack of the Clones
3,2628,0.799275,260,2628,Star Wars: Episode I - The Phantom Menace
4,33493,0.788824,260,33493,Star Wars: Episode III - Revenge of the Sith
5,1200,0.697805,260,1200,Aliens
6,113345,0.696866,260,113345,Jupiter Ascending
7,68358,0.679082,260,68358,Star Trek
8,122886,0.669143,260,122886,Star Wars: The Force Awakens
9,112852,0.658093,260,112852,Guardians of the Galaxy


In [9]:
target = pd.read_csv('output/movie_similarity.csv', usecols=['id_left'])

target_ids = set(target['id_left'])
target_ids = target_ids.intersection(set(content_meta['id'].values))
target_ids = list(target_ids.intersection(set(cb_meta['id'].values)))

print(target_ids)

[40962, 6, 7, 10, 2058, 16, 19, 2067, 21, 122904, 45081, 2072, 26, 25, 30749, 100383, 32, 2081, 26662, 39, 2087, 41, 122918, 47, 51255, 2105, 116797, 2115, 70, 116823, 4186, 30810, 2140, 30812, 95, 92259, 30825, 110, 2160, 49272, 2171, 49280, 131, 2186, 4235, 141, 151, 55451, 163, 4262, 8361, 176, 8370, 8371, 2231, 8376, 185, 6333, 2245, 106696, 4306, 112852, 6373, 96488, 2288, 246, 253, 2301, 260, 4361, 2318, 94478, 2324, 282, 296, 2348, 76077, 2352, 168250, 316, 33085, 318, 2366, 86332, 329, 72011, 342, 4440, 2395, 349, 356, 2405, 2406, 6502, 100714, 364, 135532, 367, 49530, 2429, 6535, 6539, 4506, 55721, 440, 457, 4553, 55765, 8665, 475, 480, 2541, 497, 499, 8711, 55820, 527, 529, 532, 535, 541, 72226, 2599, 96811, 6709, 6711, 68157, 4679, 587, 588, 590, 592, 593, 4701, 608, 41569, 6754, 72294, 2671, 2683, 2692, 648, 68237, 94864, 2706, 6802, 663, 2716, 673, 8865, 2724, 4776, 8879, 2747, 2762, 6863, 750, 2804, 8949, 54004, 8961, 778, 2829, 4878, 4886, 6934, 6936, 2857, 2858, 99114, 

In [10]:
%%time

cb_similars = find_top_similar(target_ids, cb_index, 30, cb_embedding, cb_meta)
cb_similars.to_csv('output/movie_similarity_cb.csv', index=False)

CPU times: user 5.01 s, sys: 124 ms, total: 5.14 s
Wall time: 245 ms


In [11]:
%%time

tfidf_similars = find_top_similar(target_ids, tfidf_index, 30, tfidf_embedding, content_meta)
tfidf_similars.to_csv('output/movie_similarity_tfidf.csv', index=False)

CPU times: user 3.37 s, sys: 72 ms, total: 3.44 s
Wall time: 174 ms


In [12]:
%%time

tfidf_crew_index = create_search_index(tfidf_crew_embedding)

tfidf_crew_similars = find_top_similar(target_ids, tfidf_crew_index, 30, tfidf_crew_embedding, content_meta)
tfidf_crew_similars.to_csv('output/movie_similarity_tfidf_crew.csv', index=False)

CPU times: user 3.3 s, sys: 84 ms, total: 3.39 s
Wall time: 149 ms


In [13]:
%%time

tfidf_crew_cast_index = create_search_index(tfidf_crew_cast_embedding)

tfidf_crew_cast_similars = find_top_similar(target_ids, tfidf_crew_cast_index, 30, tfidf_crew_cast_embedding, content_meta)
tfidf_crew_cast_similars.to_csv('output/movie_similarity_tfidf_crew_cast.csv', index=False)

CPU times: user 4.16 s, sys: 88 ms, total: 4.25 s
Wall time: 187 ms
