In [20]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2018-05-08 20:32:30 

CPython 3.6.4
IPython 6.3.1

numpy 1.14.3
pandas 0.22.0
sklearn 0.19.1
matplotlib 2.2.2


Information about the items (movies); this is a tab separated
              list of
              movie id | movie title | release date | video release date |
              IMDb URL | unknown | Action | Adventure | Animation |
              Children's | Comedy | Crime | Documentary | Drama | Fantasy |
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
              Thriller | War | Western |
              The last 19 fields are the genres, a 1 indicates the movie
              is of that genre, a 0 indicates it is not; movies can be in
              several genres at once.
              The movie ids are the ones used in the u.data data set.

In [30]:
import os
file_dir = 'ml-100k'
file_path = os.path.join(file_dir, 'u.item')

names = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date',
    'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western']
item_metadata = pd.read_csv(file_path, sep = '|', encoding='latin-1', names=names)
item_metadata.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [32]:
index2movie = {}
with open(file_path, encoding = 'latin1') as f:
    for line in f.readlines():
        info = line.split('|')
        index = int(info[0]) - 1
        movie = info[1]
        index2movie[index] = movie
        
index2movie[1]

'GoldenEye (1995)'

In [2]:
import numpy as np
from spotlight.datasets.movielens import get_movielens_dataset

max_sequence_length = 200
min_sequence_length = 20
step_size = 200
random_state = np.random.RandomState(100)

dataset = get_movielens_dataset('100K')
dataset

<Interactions dataset (944 users x 1683 items x 100000 interactions)>

In [5]:
from spotlight.cross_validation import user_based_train_test_split

train, rest = user_based_train_test_split(dataset, test_percentage = 0.2,
                                          random_state=random_state)
test, validation = user_based_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)
train

<Interactions dataset (944 users x 1683 items x 80200 interactions)>

In [6]:
# sort first by user id, then by timestamp
sort_indices = np.lexsort((train.timestamps, train.user_ids))
user_ids = train.user_ids[sort_indices]
item_ids = train.item_ids[sort_indices]
user_ids, indices, counts = np.unique(user_ids, return_index = True, return_counts = True)

In [34]:
i = 0

item_sequences = []
for i in range(len(indices) - 1):
    start = indices[i]
    stop = indices[i + 1]
    item_names = [index2movie[item_id] for item_id in item_ids[start:stop]]
    item_sequences.append(item_names)

item_names = [index2movie[item_id] for item_id in item_ids[stop:]]
item_sequences.append(item_names)
item_sequences[0]

['Wrong Trousers, The (1993)',
 'Princess Bride, The (1987)',
 'Manon of the Spring (Manon des sources) (1986)',
 'Platoon (1986)',
 'Graduate, The (1967)',
 'Private Benjamin (1980)',
 'Full Metal Jacket (1987)',
 "Mr. Holland's Opus (1995)",
 'Shall We Dance? (1996)',
 'Supercop (1992)',
 'Twister (1996)',
 'GoodFellas (1990)',
 'Operation Dumbo Drop (1995)',
 'GoldenEye (1995)',
 'Turbo: A Power Rangers Movie (1997)',
 'Austin Powers: International Man of Mystery (1997)',
 'Contact (1997)',
 'Legends of the Fall (1994)',
 'Batman & Robin (1997)',
 'Fifth Element, The (1997)',
 'Steel (1997)',
 '101 Dalmatians (1996)',
 'Home Alone (1990)',
 'Phenomenon (1996)',
 'Angels and Insects (1995)',
 'D3: The Mighty Ducks (1996)',
 'Lone Star (1996)',
 'Long Kiss Goodnight, The (1996)',
 'Babe (1995)',
 'Citizen Ruth (1996)',
 'French Twist (Gazon maudit) (1995)',
 'Air Bud (1997)',
 'Hunt for Red October, The (1990)',
 'Birdcage, The (1996)',
 'Chasing Amy (1997)',
 'Godfather, The (1972)',

In [49]:
from time import time
from joblib import cpu_count
from collections import Counter
from gensim.models import Word2Vec

workers = cpu_count()

start = time()
word2vec = Word2Vec(item_sequences, workers = workers, size = 30, iter = 15)
elapse = time() - start
print('elapse time:', elapse)

elapse time: 0.5821020603179932


In [50]:
word_vectors = pd.DataFrame(word2vec.wv.vectors, index = word2vec.wv.index2word)
print('word vector dimension: ', word_vectors.shape)
word_vectors.head()

word vector dimension:  (1286, 30)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Legends of the Fall (1994),0.434255,0.332639,1.744802,0.443935,1.197831,0.385917,-1.056814,0.60427,0.866952,0.387967,...,-0.004121,2.100556,0.844141,-0.122742,-0.663271,0.031462,0.034988,0.465601,1.041341,0.946296
George of the Jungle (1997),-0.425796,0.022332,1.214881,-0.113738,1.426701,1.215612,0.226714,1.291551,1.413425,0.169951,...,0.892137,2.129802,0.551192,-0.739781,-0.818935,1.859506,-0.709876,0.228494,0.734214,0.31705
GoodFellas (1990),0.976687,0.327187,1.948321,0.421996,0.983316,0.318718,-1.324569,0.647954,0.709073,0.382202,...,-0.14529,2.281569,0.865944,0.004113,-0.347576,-0.008367,0.323535,0.32994,1.08084,0.796611
Heavy Metal (1981),0.620179,-0.336303,1.529305,1.163059,0.434813,0.522439,-1.438472,1.54172,0.536492,0.292472,...,0.428304,2.055949,0.617862,0.437238,0.170796,1.141402,-0.391122,0.338362,0.711422,0.844327
Breakdown (1997),0.288688,0.146565,1.014495,0.685795,0.810605,2.122872,0.087833,0.035605,1.755361,0.102847,...,1.78717,2.803189,-0.033281,-0.241128,-1.381096,0.432927,-1.282368,0.180537,0.344829,0.8414


In [51]:
word2vec.wv.most_similar(positive=['Snow White and the Seven Dwarfs (1937)'])

[('It Happened One Night (1934)', 0.9227335453033447),
 ('Manchurian Candidate, The (1962)', 0.9186711311340332),
 ('Mighty Aphrodite (1995)', 0.9163558483123779),
 ("What's Eating Gilbert Grape (1993)", 0.9145344495773315),
 ('Pump Up the Volume (1990)', 0.901369035243988),
 ('Blues Brothers, The (1980)', 0.9005205035209656),
 ('North by Northwest (1959)', 0.9000575542449951),
 ('Full Metal Jacket (1987)', 0.8984601497650146),
 ('Priest (1994)', 0.89666748046875),
 ('Spawn (1997)', 0.8962401151657104)]