<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to local modules take effect without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import requests
import sys
sys.path.append('../..')

import pandas as pd

from reco_utils.dataset import movielens

In [3]:
# TODO add Tag <Parameter>

# Which MovieLens release to load; one of: 100k, 1m, 10m, or 20m.
MOVIELENS_DATA_SIZE = "100k"

TOP_K = 10

# Query-string fragment appended to each OMDb request URL.
# "&plot=full" asks for the full plot; set to "" for the short version.
PLOT_TYPE = "&plot=full"

In [4]:
# Downloaded data is cached under data/<MOVIELENS_DATA_SIZE> for later use.
DATA_DIR = os.path.join("data", MOVIELENS_DATA_SIZE)
# Create the cache directory (and parents); a no-op when it already exists.
os.makedirs(name=DATA_DIR, exist_ok=True)

In [5]:
# Load the ratings together with the title, genres and year columns,
# caching the download in DATA_DIR.
data = movielens.load_pandas_df(
    MOVIELENS_DATA_SIZE,
    title_col='title',
    genres_col='genres',
    year_col='year',
    local_cache_path=DATA_DIR,
)


def strip_year(title):
    """Remove the trailing "(...)" group from a title: "Kolya (1996)" --> "Kolya".

    Args:
        title (str): Movie title, normally ending with "(<year>)".

    Returns:
        str: The text before the last "(", with trailing whitespace removed.
        A title that contains no "(" is returned intact (only trailing
        whitespace is stripped) instead of losing its last character.
    """
    paren_start = title.rfind("(")
    if paren_start == -1:
        # str.rfind returns -1 when there is no "(", and slicing with [:-1]
        # would silently chop the final character of the title.
        return title.rstrip()
    return title[:paren_start].rstrip()


# TODO some movies have strange title. Verify them.
data['title'] = data['title'].apply(strip_year)
data.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres,year
0,196,242,3.0,881250949,Kolya,Comedy,1996
1,63,242,3.0,875747190,Kolya,Comedy,1996
2,226,242,5.0,883888671,Kolya,Comedy,1996
3,154,242,3.0,879138235,Kolya,Comedy,1996
4,306,242,5.0,876503793,Kolya,Comedy,1996


In [6]:
# One row per distinct (itemID, title, year) combination, ordered by itemID
# and re-indexed from 0.
movie_columns = ['itemID', 'title', 'year']
unique_movies = data[movie_columns].drop_duplicates()
movies = unique_movies.sort_values(by='itemID').reset_index(drop=True)

print("Number of movies:", len(movies))
display(movies.head())

Number of movies: 1682


Unnamed: 0,itemID,title,year
0,1,Toy Story,1995
1,2,GoldenEye,1995
2,3,Four Rooms,1995
3,4,Get Shorty,1995
4,5,Copycat,1995


We fetch movie information from the OMDb API (http://www.omdbapi.com/), which has a daily limit of 1,000 requests.

**TODO**: Check with CELA regarding copyright.

In [7]:
# The OMDb API key is read from the environment so that it is never stored
# in the notebook source or its outputs.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY")
if not OMDB_API_KEY:
    raise RuntimeError(
        "Set the OMDB_API_KEY environment variable to your OMDb API key."
    )

# URL template: {0} = URL-encoded title, {1} = year, {2} = PLOT_TYPE fragment.
# The sample IMDb id ("i=tt3896198") that used to be sent along with every
# title search has been dropped; the lookup is by title and year only.
OMDb_URL = "http://www.omdbapi.com/?apikey=" + OMDB_API_KEY + "&t={0}&y={1}{2}"


def fetch_movie_desc(movie_rows, timeout=30):
    """Query OMDb once per movie and collect the decoded JSON responses.

    Args:
        movie_rows (pd.DataFrame): Rows with `title` and `year` columns.
        timeout (float): Seconds to wait for each HTTP request.

    Returns:
        list: One decoded JSON object per row, in row order. Responses are
        stored as returned, whether or not the lookup found a movie.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
        ValueError: If a response body is not valid JSON.
    """
    from urllib.parse import quote_plus

    return [
        requests.get(
            # Encode the title so characters such as "&", "#" or "+" cannot
            # break or truncate the query string.
            OMDb_URL.format(quote_plus(str(m.title)), m.year, PLOT_TYPE),
            timeout=timeout,
        ).json()
        for m in movies_iter(movie_rows)
    ]


def movies_iter(movie_rows):
    """Iterate over the rows of `movie_rows` as attribute-accessible tuples."""
    return movie_rows.itertuples(index=False)


# The request limit (1,000 per day) does not cover all movies in one run,
# so the catalogue is fetched in two batches.

# First 900 movies (rows 0 to 899):
# movie_desc = fetch_movie_desc(movies[:900])

# Remaining movies (rows 900 to end)
movie_desc = fetch_movie_desc(movies[900:])

In [8]:
# Number of OMDb responses collected in this batch.
len(movie_desc)

782

In [9]:
# Inspect the first response of this batch.
movie_desc[0]

{'Title': 'Mr. Magoo',
 'Year': '1997',
 'Rated': 'PG',
 'Released': '25 Dec 1997',
 'Runtime': '87 min',
 'Genre': 'Adventure, Comedy, Family',
 'Director': 'Stanley Tong',
 'Writer': 'Pat Proft, Tom Sherohman',
 'Actors': 'Leslie Nielsen, Kelly Lynch, Matt Keeslar, Nick Chinlund',
 'Plot': 'Mr.Magoo is an eccentric millionaire with very bad eyesight who refuses to use eyeglasses and therefore always gets into trouble. During the museum robbery he accidentally gets a priceless gem called the Star of Kurdistan, and begins to trace the way for the arch-criminals whose idea was to steal the gem - Austin Cloquet and Ortega "The Piranha" Peru, while two federal agents Stupak and Anders lead the manhunt for Mr.Magoo himself.',
 'Language': 'English',
 'Country': 'USA',
 'Awards': '2 nominations.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BODFiOWY2OTctMzBlNi00MjhhLTg4MWQtMTI5YTUzZTI2N2E1XkEyXkFqcGdeQXVyNTUyMzE4Mzg@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 

In [10]:
import pickle

# Save this batch of OMDb responses to disk so it does not have to be
# fetched again.
with open('movie_901.pkl', mode='wb') as pkl_out:
    pickle.dump(movie_desc, pkl_out)

In [11]:
# Load the previously saved batch from 'movie_900.pkl'
# (presumably written by an earlier run over movies[:900] -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files you created.
with open('movie_900.pkl', mode='rb') as pkl_in:
    first_900 = pickle.load(pkl_in)

first_900[0]

{'Title': 'Toy Story',
 'Year': '1995',
 'Rated': 'G',
 'Released': '22 Nov 1995',
 'Runtime': '81 min',
 'Genre': 'Animation, Adventure, Comedy, Family, Fantasy',
 'Director': 'John Lasseter',
 'Writer': 'John Lasseter (original story by), Pete Docter (original story by), Andrew Stanton (original story by), Joe Ranft (original story by), Joss Whedon (screenplay by), Andrew Stanton (screenplay by), Joel Cohen (screenplay by), Alec Sokolow (screenplay by)',
 'Actors': 'Tom Hanks, Tim Allen, Don Rickles, Jim Varney',
 'Plot': 'A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a

In [16]:
# Note:
# Movie titles that start with 'The' or 'A' are shifted in the MovieLens
# dataset, e.g.
#
#   Original movie title: "The English Patient"
#   MovieLens data:       "English Patient, The"
#
# This causes an invalid movie plot query for those titles.

# Count the responses that contain a 'Title' key.
valid = sum('Title' in desc for desc in first_900)
# Keep `valid` as the last expression so the count is what the cell displays.
valid

634