In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# use a larger default figure and font size for every plot in the notebook
plt.rcParams.update({
    'figure.figsize': (8, 6),
    'font.size': 12,
})

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2018-05-05 18:21:27 

CPython 3.6.4
IPython 6.3.1

numpy 1.14.2
pandas 0.22.0
sklearn 0.19.1
matplotlib 2.2.2


In [2]:
import json

# input files: one json record per line for the movie lists (sequences),
# plus the accompanying movie file; NOTE these are machine-specific
# absolute paths and need to be adjusted when running elsewhere
sequence_file, movie_file = (
    '/Users/mingyuliu/MovieTaster-Open/datas/doulist_0804_09.json',
    '/Users/mingyuliu/MovieTaster-Open/datas/movie_0804_09.json',
)

In [3]:
# peek at the first record of the sequence file to see what a line looks like;
# the file contains Chinese movie titles, so the encoding is specified explicitly
# instead of relying on the platform's default encoding
with open(sequence_file, encoding='utf-8') as f:
    line = f.readline().strip()

json.loads(line)

{'movie_names': ['盖文·斯通复活',
  '海滩救护队 第一季',
  '检查站',
  '大黄蜂',
  '战狼2',
  '海军陆战队员5：杀戮战场',
  '毁灭者',
  '难以置信',
  '审讯',
  '搏击之王',
  '忍者神龟2：破影而出',
  '掠夺者',
  '姐妹',
  '生活残骸',
  '老爸当家',
  '倒数追击',
  '摩登原始人：石器时代大乱斗',
  '死囚大逃杀2',
  '12回合3：致命禁闭',
  '球手们 第一季',
  '怪客',
  '圣诞追缉令',
  '海军陆战队员4',
  '地心历险记3',
  '海滩游侠'],
 'list_name': 'WWE出品以及WWE明星主演电影',
 'list_id': 1741812}

In [4]:
# extract the movie name and its corresponding count;
# every line of the file is a json record with a 'movie_names' list
movie_counter = {}
with open(sequence_file, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at the end of the file),
            # json.loads would raise on an empty string
            continue

        for movie_name in json.loads(line)['movie_names']:
            movie_counter[movie_name] = movie_counter.get(movie_name, 0) + 1

In [5]:
# keep the movies that appear more than `min_word_freq` times, the position
# of a movie in `index2movie` is its id, and `movie2index` is the reverse lookup
min_word_freq = 0
index2movie = [
    movie for movie, count in movie_counter.items()
    if count > min_word_freq
]

# the unknown token takes the last id
unknown_token = '<unk>'
index2movie.append(unknown_token)
movie2index = {movie: index for index, movie in enumerate(index2movie)}
print('Found {} distinct movies from {}'.format(len(movie2index), sequence_file))

Found 130206 distinct movies from /Users/mingyuliu/MovieTaster-Open/datas/doulist_0804_09.json


In [6]:
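# NOTE: unused alternative way of building the movie to id mapping, kept for reference;
# it assigns ids by descending frequency (ties broken by name) and, unlike the cell
# above, keeps movies whose count is >= min_word_freq instead of > min_word_freq
# movie_freq = filter(lambda _:_[1] >= min_word_freq, movie_counter.items())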
# movie_counter_sorted = sorted(movie_freq, key=lambda x: (-x[1], x[0]))
# movies, _ = list(zip(*movie_counter_sorted))
# movie_name_id_dict = dict(zip(movies, range(len(movies))))
# movie_name_id_dict['<unk>'] = len(movies)
# print('movie_name_id_dict is %d from [%s]' % (len(movie_name_id_dict), sequence_file))

In [7]:
# convert the movie into ids;
# movies that were filtered out of the vocabulary (possible once min_word_freq
# is raised above 0) are mapped to the unknown token instead of raising a KeyError
unknown_index = movie2index[unknown_token]
with open(sequence_file, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # skip blank lines, json.loads would raise on an empty string
            continue

        movie_names = json.loads(line)['movie_names']
        # note this is only converting one of them, i.e. movie_ids is overwritten
        # on every iteration, so only the last list in the file is kept
        movie_ids = [movie2index.get(movie_name, unknown_index) for movie_name in movie_names]

movie_ids

[4848,
 2597,
 1793,
 4208,
 5311,
 9806,
 13810,
 6791,
 13619,
 9273,
 8019,
 7134,
 5337,
 13105,
 13700,
 7929,
 7923,
 544,
 3972,
 796,
 4268,
 3371,
 4576]

In [8]:
vec_file = '/Users/mingyuliu/MovieTaster-Open/models/fasttext_model_0804_09_skipgram.vec'

# read the text file as utf-8 explicitly rather than
# relying on the platform's default encoding
with open(vec_file, encoding='utf-8') as f:
    # for fasttext's word vector, the first line is a header containing the number
    # of words and the dimensionality of the vectors
    header = f.readline()
    # the second line is the first actual entry: a token followed by its vector
    line = f.readline()

print('header: ', header)
print('line: ', line)

header:  30641 100

line:  </s> -0.21131 -0.2162 0.04122 -0.32946 0.060737 0.19275 0.24728 -0.054955 -0.064975 0.021295 -0.052426 -0.12792 0.0095121 0.24508 0.019768 0.13504 0.33133 0.14172 -0.054049 0.059114 0.050887 0.0049271 0.25136 -0.022339 0.032896 0.009978 0.06102 -0.11051 0.053359 -0.20209 0.066269 -0.055175 0.23177 -0.2138 -0.17304 0.53412 -0.04332 -0.085039 0.37848 -0.3774 0.064045 -0.1722 0.00013164 0.084356 0.099483 0.085978 -0.1023 -0.13675 0.095966 0.079536 0.050399 -0.1413 -0.0603 0.0060839 -0.01489 0.28865 -0.15171 -0.043863 0.098124 0.25111 0.032686 -0.00070122 -0.057389 0.015229 -0.18102 -0.20692 -0.010707 -0.053809 -0.046034 0.068155 -0.14465 -0.29584 -0.15645 0.2327 -0.089968 0.19759 0.13405 -0.013056 -0.12529 -0.010635 -0.075038 0.12613 -0.0071746 -0.027426 -0.2734 0.016698 0.033973 -0.41272 0.2506 -0.07961 -0.10346 -0.15892 -0.3009 -0.18918 0.10125 -0.06634 0.19005 -0.20425 -0.10684 0.29993 



In [9]:
vec_file = '/Users/mingyuliu/MovieTaster-Open/models/fasttext_model_0804_09_skipgram.vec'

# mapping of movie id -> embedding vector (np.float32 array)
vectors = {}
with open(vec_file, encoding='utf-8') as f:
    # for fasttext's word vector, the first line is a header containing the number
    # of words and the dimensionality of the vectors
    f.readline()
    for line in f:
        splitted = line.split()
        # skip blank lines (indexing splitted[0] would raise an IndexError) and
        # tokens that are not movie ids, e.g. the end of sentence token </s>
        if not splitted or not splitted[0].isdigit():
            continue

        movie_id = int(splitted[0])
        vector = np.array(splitted[1:], dtype = np.float32)
        vectors[movie_id] = vector

In [10]:
# look up the embedding of the query movie
movie_name = '小时代'
if movie_name not in movie2index:
    # fail right away with a clear message, as opposed to printing
    # a message and then hitting a bare KeyError on the lookup below
    raise KeyError('movie name not in corpus: {}'.format(movie_name))

current_movie_id = movie2index[movie_name]
if current_movie_id not in vectors:
    # the vector file can contain fewer entries than the vocabulary,
    # so a known movie is not guaranteed to have a vector
    raise KeyError('no vector found for movie: {} (id {})'.format(movie_name, current_movie_id))

current_vector = vectors[current_movie_id]
current_vector

array([ 0.018763 , -0.82378  ,  0.21688  , -0.45     , -0.30275  ,
       -0.27608  ,  0.38869  , -0.41264  ,  0.014629 ,  0.41903  ,
       -1.1321   , -0.25654  ,  0.73778  ,  1.0085   , -0.29896  ,
        0.36036  ,  0.10631  , -0.055913 , -0.2414   , -0.24269  ,
        0.17245  ,  0.69607  , -0.031655 ,  0.034579 ,  0.85329  ,
        0.96645  ,  0.29768  , -0.85085  ,  0.60223  , -0.21128  ,
        0.98841  ,  0.27903  ,  0.093009 , -0.68712  , -0.54967  ,
        0.1639   ,  0.48586  ,  0.24759  , -0.54613  , -0.28554  ,
        0.53195  , -0.52514  , -0.5339   , -0.075233 , -0.063432 ,
        0.50362  ,  0.39234  , -0.078006 ,  0.96622  , -0.14415  ,
       -0.075358 , -0.13686  ,  0.15022  ,  1.2478   ,  0.25723  ,
        0.53085  ,  0.004477 , -0.48371  , -0.14993  ,  0.67801  ,
       -0.3439   , -0.72657  ,  0.36315  ,  0.07711  , -0.54221  ,
        0.073791 , -0.4507   , -0.09035  ,  0.21172  , -0.13133  ,
       -0.23066  , -0.84336  , -0.080367 ,  0.056479 , -0.8752

In [11]:
import heapq


class minHeap:
    """
    Bounded min-heap that retains the k largest items added to it.

    The smallest of the retained items sits at the root, so a new item
    only has to be compared against the root to decide whether it belongs
    to the current top k.

    Parameters
    ----------
    k : int
        Maximum number of items to retain. When k <= 0 nothing is retained.
    """

    def __init__(self, k):
        self._k = k
        self._heap = []

    def add(self, item):
        """Add the item if the heap is not full or the item is larger than the current minimum."""
        if self._k <= 0:
            # nothing can be retained, and indexing the empty heap below would fail
            return

        if len(self._heap) < self._k:
            # O(log k) insertion as opposed to re-heapifying the whole list
            heapq.heappush(self._heap, item)
        elif item > self._heap[0]:
            # pop the current minimum and push the new item in a single step
            heapq.heapreplace(self._heap, item)

    def get_min(self):
        """Return the smallest retained item, or the sentinel -2 when the heap is empty."""
        if self._heap:
            return self._heap[0]

        # -2 is smaller than any cosine similarity, whose range is [-1, 1]
        return -2

    def get_all(self):
        """Return the underlying list of retained items (in heap order, not sorted)."""
        return self._heap

def similarity(v1, v2):
    """
    Cosine similarity between two 1d vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of the same length.

    Returns
    -------
    float
        dot(v1, v2) / (norm(v1) * norm(v2)); 0.0 if either vector has zero norm,
        where the cosine is undefined and the division would produce nan.
    """
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0

    return np.dot(v1, v2) / n1 / n2

In [15]:
import heapq

# number of most similar movies to retrieve
k = 5

# lazily score every movie except the query movie itself
scored_candidates = (
    (movie_id, similarity(current_vector, vector))
    for movie_id, vector in vectors.items()
    if movie_id != current_movie_id
)

# nlargest keeps at most k elements around while scanning and returns them ordered
# from most to least similar; this replaces tracking the scores in a separate heap
# while growing a list of candidates that then had to be sorted and truncated
like_candidates = heapq.nlargest(k, scored_candidates, key=lambda pair: pair[1])

for movie_id, sim in like_candidates:
    print('[%d]%s %f' % (movie_id, index2movie[movie_id], sim))

[133]赛尔号大电影6：圣者无敌 0.571262
[19437]老师不是人 0.566645
[493]伊甸湖 0.560819
[21414]初代吸血鬼 第一季 0.559760
[2108]油脂 0.558465
