In [1]:
from nltk.corpus import stopwords
import numpy as np
import pickle
import re
import string
from sklearn.metrics.pairwise import cosine_similarity

Load pickle object from `subtitle_processing.ipynb`

In [2]:
def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
d = load_obj("processed_video_subtitle2")

In [4]:
subtitles = list(d.values())

In [5]:
videonames = list(d.keys())

In [6]:
test = subtitles[0]

In [7]:
def process_word_list(list_of_words):
    """
    Given a list of words, remove stop words, punctuation, and empty strings
    """
    STOPWORDS = stopwords.words("english")
    word_list = [w.lower().strip(string.punctuation) for w in list_of_words]
    word_list = [w for w in word_list if w not in STOPWORDS and w!='']
    return word_list

In [8]:
process_word_list(test)[:10]

['hey', 'everyone', 'phil', 'wong', 'fu', 'want', 'say', 'long', 'time', 'see']

In [9]:
def load_glove(filename):
    """
    Given filename, load glove vector into a dictionary where the key is the word
    and the value is a numpy array
    """

    f = open(filename)
    lines = f.readlines()
    f.close()

    d = {}

    for line in lines:
        words = line.split(' ')
        word = words[0]
        vector = np.array(words[1:], dtype = 'float')
        d[word] = vector

    return d

I use https://nlp.stanford.edu/projects/glove/ 300-dimensional vectors trained on Wikipedia articles

In [10]:
glove = load_glove("glove.6B.300d.txt")

In [11]:
glove['the'][:10]

array([ 0.04656  ,  0.21318  , -0.0074364, -0.45854  , -0.035639 ,
        0.23643  , -0.28836  ,  0.21521  , -0.13486  , -1.6413   ])

In [12]:

def subtitle2vec(subtitletext, gloves):
    """
    Compute the average word embedding of the subtitles. Note that if the word
    is not in the glove dictionary, it will be ignored.
    """

    subtitlevec = [gloves[word] for word in subtitletext if word in gloves]

    lengthvec = len(subtitlevec)
    centroid = [sum(x)/lengthvec for x in zip(*subtitlevec)]

    return centroid

In [13]:
embedding_dict = dict()

In [14]:
for i, subtitle in enumerate(subtitles):
    embedding_dict[videonames[i]] = np.array(subtitle2vec(process_word_list(subtitle), glove)).reshape(1,-1)

In [15]:
cosine_similarity(embedding_dict[videonames[0]], embedding_dict[videonames[1]])[0][0]

0.9244825430387438

In [16]:
def compute_cosine_sim(v1,v2):
    """
    Given two numpy arrays, compute cosine similarity and output an integer
    """
    return cosine_similarity(v1, v2)[0][0]

In [17]:
def closest_video(video, n):
    """
    Given a video subtitle file, recommend the closest n video based on the subtitle content
    """
    cosine_sim_list = []
    for i, subtitle in enumerate(subtitles):
        cosine_sim_list.append(((videonames[i], i), compute_cosine_sim(embedding_dict[video], embedding_dict[videonames[i]])))
    
    cosine_sim_list.sort(key = lambda x: x[1], reverse = True)
    
    return video, cosine_sim_list[1:n+1]

In [18]:
def recommend(select_video, n):
    input_video, recommended = closest_video(select_video, n)
    print("For the video:" + input_video + ",")
    print("https://www.youtube.com/watch?v=" + re.findall(r'\-([\d\w]+)\.en.vtt', input_video)[0])
    print("\nYour recommended videos are:\n")
    
    for video, score in recommended:
        print("Video Number %s" % video[1], ':', video[0], '|', str(np.round(score, 3)))
        print("https://www.youtube.com/watch?v=" + re.findall(r'\-([\d\w]+)\.en.vtt', video[0])[0], "\n")

In [19]:
recommend(videonames[0], 5)

For the video:How Many People Know Wong Fu-g7nod5mS46Y.en.vtt,
https://www.youtube.com/watch?v=g7nod5mS46Y

Your recommended videos are:

Video Number 74 : COMMENTS IN CARS - 'Untouchable'-VVTiiMEq4pE.en.vtt | 0.971
https://www.youtube.com/watch?v=VVTiiMEq4pE 

Video Number 86 : The Long Lost Member-f9WVtmQvM0I.en.vtt | 0.969
https://www.youtube.com/watch?v=f9WVtmQvM0I 

Video Number 76 : Asian Bachelorette-ag1IisyP1ak.en.vtt | 0.965
https://www.youtube.com/watch?v=ag1IisyP1ak 

Video Number 15 : Goodbye 3 Million Subscribers-rjlxc-vxNBA.en.vtt | 0.964
https://www.youtube.com/watch?v=vxNBA 

Video Number 14 : Girls Can't Take a Hint!-3DSYNfFu_NM.en.vtt | 0.963
https://www.youtube.com/watch?v=3DSYNfFu_NM 



In [24]:
recommend(videonames[99], 5)

For the video:Away We Happened - Ep 2-9Ka0aGyFGOk.en.vtt,
https://www.youtube.com/watch?v=9Ka0aGyFGOk

Your recommended videos are:

Video Number 92 : Away We Happened - Ep 5-uNkx6OKoUME.en.vtt | 0.984
https://www.youtube.com/watch?v=uNkx6OKoUME 

Video Number 96 : Away We Happened - Ep 4-tkPd-alWwmA.en.vtt | 0.982
https://www.youtube.com/watch?v=alWwmA 

Video Number 97 : Away We Happened - Ep 3-jh4hjR-s7hY.en.vtt | 0.981
https://www.youtube.com/watch?v=s7hY 

Video Number 48 : Just Another Nice Guy - Part 1-yU58jrx4pXs.en.vtt | 0.981
https://www.youtube.com/watch?v=yU58jrx4pXs 

Video Number 2 : From Here On Out-RboSq7vxKqs.en.vtt | 0.981
https://www.youtube.com/watch?v=RboSq7vxKqs 

