In [187]:
from nltk.corpus import stopwords
import numpy as np
import pickle
import re
import string
from sklearn.metrics.pairwise import cosine_similarity

Load the pickled object produced by `subtitle_processing.ipynb`.

In [2]:
def load_obj(name):
    """
    Given an object name, unpickle and return the contents of obj/<name>.pkl
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were produced by a trusted source.
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [5]:
# Mapping saved by subtitle_processing.ipynb; its keys and values are read
# below as video names and subtitles (presumably a plain dict - confirm).
d = load_obj("processed_video_subtitle")

In [10]:
# Values in the mapping's iteration order, so subtitles[i] lines up with the
# i-th key of d.
subtitles = list(d.values())

In [13]:
# Keys in the mapping's iteration order, so videonames[i] lines up with the
# i-th value of d.
videonames = list(d.keys())

In [15]:
# First subtitle entry, kept as a sample input for trying out the helpers.
test = subtitles[0]

In [24]:
def process_word_list(list_of_words):
    """
    Given a list of words, remove stop words, punctuation, and empty strings

    Every word is lower-cased and stripped of leading/trailing punctuation
    (punctuation inside a word is kept) before it is compared against the
    NLTK English stop word list. The surviving words are returned in their
    original order.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned from the start for every single word.
    stop_words = set(stopwords.words("english"))
    cleaned = (w.lower().strip(string.punctuation) for w in list_of_words)
    return [w for w in cleaned if w and w not in stop_words]

In [32]:
# Preview the first 10 cleaned words of the sample subtitle.
process_word_list(test)[:10]

['hello',
 'everyone',
 'time',
 'kizuna',
 'ai',
 'received',
 'work',
 'offer',
 'bridge',
 'connect']

In [26]:
def load_glove(filename):
    """
    Given filename, load glove vector into a dictionary where the key is the word
    and the value is a numpy array

    Each line of the file is expected to hold a word followed by its vector
    components, separated by single spaces. Blank lines are skipped. A
    ValueError is raised if a component cannot be parsed as a float.
    """
    d = {}

    # The published GloVe text files are UTF-8; being explicit avoids decode
    # errors on platforms whose default encoding differs. Iterating over the
    # handle streams the file instead of holding every line in memory, and
    # the with-block closes it even if parsing fails.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            words = line.rstrip().split(' ')
            if words == ['']:
                # Blank line: nothing to store.
                continue
            d[words[0]] = np.array(words[1:], dtype='float')

    return d

I use the pre-trained 300-dimensional GloVe vectors (`glove.6B.300d.txt`) from https://nlp.stanford.edu/projects/glove/, which were trained on Wikipedia 2014 and Gigaword 5.

In [27]:
# Word -> vector lookup; the path is relative, so the file has to be in the
# notebook's working directory.
glove = load_glove("glove.6B.300d.txt")

In [31]:
# Sanity check: first 10 components of the vector stored for 'the'.
glove['the'][:10]

array([ 0.04656  ,  0.21318  , -0.0074364, -0.45854  , -0.035639 ,
        0.23643  , -0.28836  ,  0.21521  , -0.13486  , -1.6413   ])

In [33]:

def subtitle2vec(subtitletext, gloves):
    """
    Compute the average word embedding of the subtitles. Note that if the word
    is not in the glove dictionary, it will be ignored.

    subtitletext is an iterable of words and gloves maps a word to its vector.
    Returns the centroid as a list with one value per embedding dimension, or
    an empty list when none of the words are in the glove dictionary.
    """

    subtitlevec = [gloves[word] for word in subtitletext if word in gloves]

    # np.mean over an empty sequence would give nan (with a warning), so keep
    # returning an empty centroid when there is nothing to average.
    if not subtitlevec:
        return []

    # A single vectorized mean over the word axis replaces one Python-level
    # sum per embedding dimension.
    return list(np.mean(subtitlevec, axis=0))

In [37]:
# Video name -> embedding; filled in by the loop in the next cell.
embedding_dict = {}

In [74]:
# Store one (1, n_features) row vector per video, keyed by video name; the
# 2-D shape is what cosine_similarity is later called with.
for videoname, subtitle in zip(videonames, subtitles):
    words = process_word_list(subtitle)
    centroid = subtitle2vec(words, glove)
    embedding_dict[videoname] = np.array(centroid).reshape(1,-1)

In [75]:
# Example: cosine similarity between the embeddings of the first two videos.
cosine_similarity(embedding_dict[videonames[0]], embedding_dict[videonames[1]])[0][0]

0.8936984438988281

In [69]:
def compute_cosine_sim(v1,v2):
    """
    Given two numpy arrays in the 2-D layout that cosine_similarity takes,
    return the similarity of their first rows (a float, not an integer)
    """
    similarity_matrix = cosine_similarity(v1, v2)
    first_pair = similarity_matrix[0][0]
    return first_pair

In [171]:
def closest_video(video, n):
    """
    Given a video subtitle file, recommend the closest n video based on the subtitle content

    video must be a key of embedding_dict. Returns a tuple (video, matches),
    where matches holds at most n entries of the form
    ((videoname, index in videonames), cosine similarity), sorted from most to
    least similar. The query video itself is never part of matches, and a
    negative n yields an empty list.
    """
    query_embedding = embedding_dict[video]

    # Leave the query video out by name. Dropping the top-ranked entry instead
    # only works when the query is guaranteed to sort first, which tied or
    # rounded scores can break.
    cosine_sim_list = [
        ((name, i), compute_cosine_sim(query_embedding, embedding_dict[name]))
        for i, name in enumerate(videonames)
        if name != video
    ]

    cosine_sim_list.sort(key = lambda x: x[1], reverse = True)

    return video, cosine_sim_list[:max(n, 0)]

In [218]:
def _youtube_link(subtitle_filename):
    """
    Build the watch URL for a subtitle file named '<title>-<video id>.en.vtt'
    """
    # YouTube video ids are 11 characters drawn from letters, digits, '_' and
    # '-'. Anchoring at the end of the name and fixing the length keeps a '-'
    # inside the id (or in the title) from truncating the captured id.
    match = re.search(r'-([A-Za-z0-9_-]{11})\.en\.vtt$', subtitle_filename)
    if match is None:
        return "(no YouTube id found in file name)"
    return "https://www.youtube.com/watch?v=" + match.group(1)


def recommend(select_video, n):
    """
    Print the n videos closest to select_video, each with its similarity score
    and YouTube link

    select_video is a subtitle file name that closest_video accepts. Nothing is
    returned; the report is written with print. A file name that does not end
    in '-<video id>.en.vtt' gets a placeholder instead of a link.
    """
    input_video, recommended = closest_video(select_video, n)
    print("For the video:" + input_video + ",")
    print(_youtube_link(input_video))
    print("\nYour recommended videos are:\n")

    for video, score in recommended:
        # video is a (file name, index) pair as returned by closest_video.
        print("Video Number %s" % video[1], ':', video[0], '|', str(np.round(score, 3)))
        print(_youtube_link(video[0]), "\n")

In [219]:
# Demo: print the 5 closest videos to the first video in the collection.
recommend(videonames[0], 5)

For the video:【ご指名】訪日促進大使、Kizuna AIです！【ありがとうございます】-kebFBXPprIo.en.vtt,
https://www.youtube.com/watch?v=kebFBXPprIo

Your recommended videos are:

Video Number 103 : 仮想少女的懸賞生活！〜バーチャルでも当選できるか？〜 その２-iZykFCJsOks.en.vtt | 0.966
https://www.youtube.com/watch?v=iZykFCJsOks 

Video Number 172 : 【LIVE】A.I.Channel 1st Anniversary!!!【12_1】-1M6J5gIM1Bk.en.vtt | 0.962
https://www.youtube.com/watch?v=1M6J5gIM1Bk 

Video Number 76 : 【未公開】私の素顔大公開！！【NG集】＃129-CrgeWAp19pU.en.vtt | 0.958
https://www.youtube.com/watch?v=CrgeWAp19pU 

Video Number 328 : 【ねんどろいど】１年越しの願いが叶いました！【ついに完成】-_BN5pFwV_k4.en.vtt | 0.957
https://www.youtube.com/watch?v=_BN5pFwV_k4 

Video Number 168 : 【流行語大賞】私もノミネートされたい！-P1LhzO7PGRg.en.vtt | 0.957
https://www.youtube.com/watch?v=P1LhzO7PGRg 

