In [2]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
# Display the value of every top-level expression in a cell, not only the last one
# (this is why later cells show several outputs each).
InteractiveShell.ast_node_interactivity = "all"
# Request 'retina' figure output from the inline plotting backend.
%config InlineBackend.figure_format='retina'

import warnings
# NOTE(review): this silences *all* warnings for the session, including deprecation
# notices from pandas/tqdm; narrow the filter if a warning ever needs to be seen.
warnings.filterwarnings('ignore')

In [1]:
import os

import tmdbsimple as tmdb

# The TMDB API key is a credential and must not live in the notebook itself.
# Export it before starting the kernel, e.g. `export TMDB_API_KEY=...`.
# NOTE(review): a key was previously hardcoded on this line; treat it as leaked and rotate it.
_tmdb_api_key = os.environ.get("TMDB_API_KEY")
if not _tmdb_api_key:
    raise RuntimeError(
        "TMDB_API_KEY environment variable is not set; it is required for the TMDB lookups below."
    )
tmdb.API_KEY = _tmdb_api_key

from editdistance import eval as editdistance

In [3]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from tqdm import tqdm,tqdm_notebook

# Fixed seed so the train/test split is the same on every Restart & Run All.
SPLIT_SEED = 42

data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.25, random_state=SPLIT_SEED)
print(data.ratings_file)

# The builtin dataset only exposes the ratings file; the sibling .dat files are
# located by globbing from two directory levels above it.
path = Path(data.ratings_file)
ml_1m_dir = path.resolve().parents[1]
files = list(ml_1m_dir.glob('**/*.dat'))


def _load_dat(filename, columns):
    """Read one '::'-separated MovieLens .dat file found under `ml_1m_dir`.

    Raises FileNotFoundError (instead of a bare IndexError) when the file is absent.
    """
    matches = [f for f in files if f.name == filename]
    if not matches:
        raise FileNotFoundError("%s not found under %s" % (filename, ml_1m_dir))
    # engine='python' is required because the separator is longer than one character.
    return pd.read_csv(str(matches[0]), sep="::", header=None, names=columns, engine='python')


users = _load_dat("users.dat", ["user_id", "gender", "age", "occupation", "zip"])
movies = _load_dat("movies.dat", ["movie_id", "title", "genres"])
ratings = _load_dat("ratings.dat", ["user_id", "movie_id", "rating", "timestamp"])

# Genres arrive as 'Animation|Comedy'; store them as a lower-cased list per movie.
movies['genres'] = movies['genres'].apply(lambda x: x.lower().split('|'))
# Ids are cast to str in all three frames so they share one dtype.
users['user_id'] = users['user_id'].astype(str)
movies['movie_id'] = movies['movie_id'].astype(str)
ratings['movie_id'] = ratings['movie_id'].astype(str)
ratings['user_id'] = ratings['user_id'].astype(str)
# CountVectorizer and make 1 column for each genre

print(users.shape, movies.shape, ratings.shape)

# Keep only the first three fields of every raw rating record.
user_item_affinities = [(r[0], r[1], r[2]) for r in data.raw_ratings]


/Users/ahemf/.surprise_data/ml-1m/ml-1m/ratings.dat
(6040, 5) (3883, 3) (1000209, 4)


In [4]:
movies.head(3)
# Split "Title (YYYY)" into a title and a 4-digit year string.
# Only rows that still end in " (YYYY)" are touched, so re-running this cell is a
# no-op instead of chopping another 7 characters off every title.
title_parts = movies['title'].str.extract(r'^(?P<title>.*) \((?P<year>\d{4})\)$')
has_year_suffix = title_parts['year'].notna()
if 'year' not in movies.columns:
    # Empty string marks "no year found"; the length checks in the next cell surface it.
    movies['year'] = ''
if has_year_suffix.any():
    movies.loc[has_year_suffix, 'year'] = title_parts.loc[has_year_suffix, 'year']
    movies.loc[has_year_suffix, 'title'] = title_parts.loc[has_year_suffix, 'title']
movies.head(3)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[animation, children's, comedy]"
1,2,Jumanji (1995),"[adventure, children's, fantasy]"
2,3,Grumpier Old Men (1995),"[comedy, romance]"


Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,"[animation, children's, comedy]",1995
1,2,Jumanji,"[adventure, children's, fantasy]",1995
2,3,Grumpier Old Men,"[comedy, romance]",1995


In [5]:
# Sanity checks on the split above: each of the four counts is expected to be 0.
# Missing values in title / year ...
movies['title'].isna().sum()
movies['year'].isna().sum()
# ... and zero-length strings in title / year.
movies['title'].map(len).eq(0).sum()
movies['year'].map(len).eq(0).sum()

0

0

0

0

In [30]:
def get_movie_details_from_tmdb(title, year):
    """Return the TMDB search hit that best matches ``title`` and ``year``.

    TMDB is searched by title only. Each hit is then scored as
    ``0.2 * edit_distance(title, hit title) + |hit release year - year|``
    and the hit with the lowest score is returned. Hits without a usable
    ``release_date`` (missing, empty or unparseable) rank after every dated
    hit and are ordered among themselves by title distance alone.

    Parameters
    ----------
    title : str
        Movie title to search for.
    year : int or str
        Expected release year; converted with ``int()``.

    Returns
    -------
    dict
        The best hit exactly as returned by the search, or ``{'overview': ''}``
        when the search returns no hits.

    Raises
    ------
    ValueError
        If ``year`` cannot be converted to an integer.
    """
    year = int(year)
    results = tmdb.Search().movie(query=title)['results']
    if not results:
        return {'overview': ''}

    def rank(movie):
        # Title distance must be computed against *this* candidate's title.
        title_penalty = 0.2 * editdistance(title, movie.get('title') or '')
        # errors='coerce' turns unparseable dates into NaT; '' / missing become None.
        released = pd.to_datetime(movie.get('release_date') or None, errors='coerce')
        if pd.isna(released):
            return (1, title_penalty)
        return (0, title_penalty + abs(released.year - year))

    # min() with a key gives a well-defined winner; ties keep TMDB's own ordering.
    return min(results, key=rank)


In [31]:
def slow_api_calls(title, year, delay=0.1):
    """Look up a movie on TMDB after pausing, to keep the request rate down.

    Parameters
    ----------
    title : str
        Movie title to search for.
    year : int or str
        Expected release year.
    delay : float, optional
        Seconds to sleep before issuing the request (default 0.1).

    Returns
    -------
    dict
        Whatever ``get_movie_details_from_tmdb(title, year)`` returns.
    """
    time.sleep(delay)
    return get_movie_details_from_tmdb(title, year)

In [34]:
# Smoke test: look up one well-known title through the delayed wrapper and show the raw hit.
slow_api_calls(title="Toy Story", year=1995)

{'popularity': 35,
 'vote_count': 11214,
 'video': False,
 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg',
 'id': 862,
 'adult': False,
 'backdrop_path': '/dji4Fm0gCDVb9DQQMRvAI8YNnTz.jpg',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'genre_ids': [16, 35, 10751],
 'title': 'Toy Story',
 'vote_average': 7.9,
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'release_date': '1995-11-22'}

In [None]:
# Trial run on the first three movies: fetch the best TMDB hit per row and keep only its 'overview' text.
movies.head(3).apply(lambda m:get_movie_details_from_tmdb(m['title'],m['year'])['overview'], axis=1)

In [None]:
# overviews = []
# failed = []
# for title,year in tqdm_notebook(titles_years):
#     ov = slow_api_calls(title=title, year=year)['overview']
#     ov = ov.replace('\t',' ').replace("\n", " ").replace("(", " ").replace(")", " ").replace("\r", " ")
#     overviews.append(ov)
#     if len(ov) == 0:
#         failed.append((title,year))
    
# print(len(failed))
# print(failed)

In [35]:

# (title, year) pairs in the same row order as `movies`.
titles_years = [pair for pair in zip(movies['title'], movies['year'])]
# overviews = Parallel(n_jobs=8)(delayed(get_movie_details_from_tmdb)(title,year) for title,year in tqdm_notebook(titles_years))


def _fetch_overview(title_year):
    """Return the 'overview' text of the best TMDB hit for one (title, year) pair."""
    movie_title, movie_year = title_year
    details = get_movie_details_from_tmdb(title=movie_title, year=movie_year)
    return details['overview']


# Sequential fetch, one request per movie, with a notebook progress bar.
overviews = list(map(_fetch_overview, tqdm_notebook(titles_years)))



    


HBox(children=(IntProgress(value=0, max=3883), HTML(value='')))




In [36]:
# Attach the fetched overviews as a new column; `overviews` was built from the rows of
# `movies` in order, so positions line up.
movies['overview'] = overviews
# Shape of the subset whose overview is an empty string (lookups that yielded no text).
movies[movies['overview']==''].shape

(830, 5)

In [39]:
# Retry the movies that still have no overview, searching only on the text before the
# first comma of the title.
# The mask is computed once, and the whole step is skipped when nothing is missing so it
# never depends on how DataFrame.apply(axis=1) behaves on an empty selection.
missing_overview = movies['overview'] == ''
if missing_overview.any():
    movies.loc[missing_overview, 'overview'] = [
        get_movie_details_from_tmdb(title.split(",")[0], year)['overview']
        for title, year in zip(movies.loc[missing_overview, 'title'],
                               movies.loc[missing_overview, 'year'])
    ]




In [40]:
# Re-check how many rows still have an empty overview after the comma-based retry.
movies[movies['overview']==''].shape

(219, 5)

In [None]:
# Second retry for movies still lacking an overview: search only on the text before the
# first '(' of the title.
# The split leaves a trailing space (e.g. "Shanghai Triad "), which is stripped so it does
# not inflate the edit distance used for ranking.
# The mask is computed once and the step is skipped when nothing is missing.
missing_overview = movies['overview'] == ''
if missing_overview.any():
    movies.loc[missing_overview, 'overview'] = [
        get_movie_details_from_tmdb(title.split("(")[0].strip(), year)['overview']
        for title, year in zip(movies.loc[missing_overview, 'title'],
                               movies.loc[missing_overview, 'year'])
    ]


In [None]:
# Re-check how many rows still have an empty overview after the parenthesis-based retry.
movies[movies['overview']==''].shape


In [None]:
# Show the rows that still have no overview, for manual inspection.
# NOTE(review): this displays every remaining row; add .head() if the list is long.
movies[movies['overview']=='']

In [44]:
# Spot check: a title with a trailing space (as left by splitting on "(") still resolves to a hit.
slow_api_calls(title="Shanghai Triad ", year=1995)

{'popularity': 4.598,
 'id': 37557,
 'video': False,
 'vote_count': 33,
 'vote_average': 6.9,
 'title': 'Shanghai Triad',
 'release_date': '1995-12-22',
 'original_language': 'zh',
 'original_title': '摇啊摇，摇到外婆桥',
 'genre_ids': [18, 80],
 'backdrop_path': '/n78lIMVBMhZT7nMvEwxUcgGaPkh.jpg',
 'adult': False,
 'overview': 'Shanghai, China, 1930. When young Shuisheng arrives from the countryside, his uncle Liushu puts him at the service of Bijou, the mistress of Laoda, supreme boss of the Tang Triad, constantly threatened by his enemies, both those he knows and those lurking in the shadows.',
 'poster_path': '/qcoOCoN7viOhboGwhYXyApdDuiq.jpg'}