In [1]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format='retina'

import warnings
warnings.filterwarnings('ignore')

In [6]:
import tmdbsimple as tmdb
tmdb.API_KEY = '08d3df3441c114a6cdb87682cb5b8013'

from editdistance import eval as editdistance

from surprise import Dataset
from surprise import accuracy
import numpy as np
import time
import pandas as pd
from pathlib import Path
from surprise.model_selection import train_test_split
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.25)
from tqdm import tqdm,tqdm_notebook
from joblib import Parallel, delayed

from hwer.utils import clean_text

In [3]:



path = Path(data.ratings_file)
ml_1m_dir = path.resolve().parents[1]
files = list(ml_1m_dir.glob('**/*.dat'))

users = [f for f in files if "users.dat" in str(f)][0]
movies = [f for f in files if "movies.dat" in str(f)][0]
ratings = [f for f in files if "ratings.dat" in str(f)][0]

users = pd.read_csv(str(users),sep="::", header=None, names=["user_id", "gender", "age", "occupation", "zip"], engine='python')
movies = pd.read_csv(str(movies),sep="::", header=None, names=["movie_id", "title", "genres"], engine='python')
ratings = pd.read_csv(str(ratings),sep="::", header=None, names=["user_id", "movie_id", "rating", "timestamp"], engine='python')

movies['genres'] = movies['genres'].apply(lambda x: x.lower().split('|'))
users['user_id'] = users['user_id'].astype(str)
movies['movie_id'] = movies['movie_id'].astype(str)
ratings['movie_id'] = ratings['movie_id'].astype(str)
ratings['user_id'] = ratings['user_id'].astype(str)

print(users.shape, movies.shape, ratings.shape)

user_item_affinities = list(map(lambda x: tuple([x[0], x[1], x[2]]), data.raw_ratings))


(6040, 5) (3883, 3) (1000209, 4)


In [4]:
movies.head(3)
movies['year'] = movies.title.apply(lambda x: x[-5:-1])
movies.title = movies.title.apply(lambda x: x[:-7])
movies.head(3)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[animation, children's, comedy]"
1,2,Jumanji (1995),"[adventure, children's, fantasy]"
2,3,Grumpier Old Men (1995),"[comedy, romance]"


Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,"[animation, children's, comedy]",1995
1,2,Jumanji,"[adventure, children's, fantasy]",1995
2,3,Grumpier Old Men,"[comedy, romance]",1995


In [5]:
np.sum(movies.title.isna())
np.sum(movies.year.isna())
np.sum(movies.title.apply(len)==0)
np.sum(movies.year.apply(len)==0)

0

0

0

0

In [202]:
users.to_csv("users.csv", sep="\t", index=False)
ratings.to_csv("ratings.csv", sep="\t", index=False)

## Enhancing item features with TMDB

In [14]:
import re

def clean_text(text):
    EMPTY = ' '
    if text is None:
        return EMPTY
    
    text = text.replace("\n", " ").replace("(", " ").replace(")", " ").replace("\r", " ").replace("\t", " ").lower()
    text = re.sub('<pre><code>.*?</code></pre>', EMPTY, text)
    text = re.sub('<code>.*?</code>', EMPTY, text)

    def replace_link(match):
        return EMPTY if re.match('[a-z]+://', match.group(1)) else match.group(1)

    text = re.sub('<a[^>]+>(.*)</a>', replace_link, text)
    text = re.sub('<.*?>', EMPTY, text)
    return text

In [23]:
def get_movie_details_from_tmdb(movie_id, title, year):
    year = int(year)
    
    search = tmdb.Search()
    response = search.movie(query=title)
    results = response['results']
    
    if len(results) == 0:
        tn = title.split(",")[0]
        search = tmdb.Search()
        response = search.movie(query=tn)
        results = response['results']
        
    if len(results) == 0:
        tn = title.split(":")[0]
        search = tmdb.Search()
        response = search.movie(query=tn)
        results = response['results']
        
    if len(results) == 0:
        tn = title.split("(")[0]
        search = tmdb.Search()
        response = search.movie(query=tn)
        results = response['results']
    
    
    
    from functools import cmp_to_key
    def cmp(m1, m2):
        edst_1 = editdistance(title.lower(), m1['title'].lower())
        edst_2 = editdistance(title.lower(), m2['title'].lower())
        if 'release_date' not in m1:
            return 1
        if 'release_date' not in m2:
            return -1
        year_diff_1 = np.abs(pd.to_datetime(m1['release_date']).year - year)
        year_diff_2 = np.abs(pd.to_datetime(m2['release_date']).year - year)
        score_1 = 0.3 * edst_1 + year_diff_1
        score_2 = 0.3 * edst_2 + year_diff_2
        return -1 if score_1 <= score_2 else 1
    results = list(sorted(results, key=cmp_to_key(cmp)))
    if len(results) > 0:
        movie = tmdb.Movies(results[0]['id'])
        keywords = [k['name'] for k in movie.keywords()['keywords']]
        info = movie.info()
        original_language = info['original_language']
        overview = clean_text(info['overview'])
        runtime = info['runtime']
        tagline = clean_text(info['tagline'])
        original_title = info['original_title']
        title = info['title']
        release_date = info['release_date']
        return {"movie_id":movie_id,"title":title, "keywords":keywords, "original_language":original_language, 
                "overview":overview, "runtime":runtime, "tagline":tagline, 
                'original_title':original_title, "release_date":release_date,
               "success":True}
    else:
        return {"movie_id":movie_id,"title":title, "keywords":[], "original_language":'', 
                "overview":'', "runtime":-1, "tagline":'', 
                'original_title':'',"release_date":str(year), 
                "success":False}
    


In [24]:
get_movie_details_from_tmdb(movie_id=100,title="Toy Story", year=1995)

{'movie_id': 100,
 'title': 'Toy Story',
 'keywords': ['martial arts',
  'jealousy',
  'toy',
  'boy',
  'friendship',
  'bullying',
  'elementary school',
  'friends',
  'rivalry',
  'rescue',
  'mission',
  'walkie talkie',
  'boy next door',
  'new toy',
  'neighborhood',
  'toy comes to life',
  'resourcefulness'],
 'original_language': 'en',
 'overview': "led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences.",
 'runtime': 81,
 'tagline': '',
 'original_title': 'Toy Story',
 'release_date': '1995-10-30',
 'success': True}

In [108]:
movies.head(3).apply(lambda m:get_movie_details_from_tmdb(m['title'],m['year'])['overview'], axis=1)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
dtype: object

In [None]:
tmdb_data = {}


In [29]:

titles_years = list(zip(movies['movie_id'], movies['title'],movies['year']))
# overviews = Parallel(n_jobs=8)(delayed(get_movie_details_from_tmdb)(title,year) for title,year in tqdm_notebook(titles_years))


for movie_id,title,year in tqdm_notebook(titles_years):
    if movie_id in tmdb_data:
        continue
    movie_detail = get_movie_details_from_tmdb(movie_id=movie_id, title=title, year=year)
    tmdb_data[movie_id] = movie_detail




HBox(children=(IntProgress(value=0, max=3883), HTML(value='')))




In [34]:
unsuccessful =[k for k,v in tmdb_data.items() if not v['success']]
len(unsuccessful)


30

In [76]:
movies[movies.movie_id.isin(unsuccessful)].head(30).tail(5)

Unnamed: 0,movie_id,title,genres,year
3508,3577,Two Moon Juction,[drama],1988
3693,3762,Daughter of Dr. Jeckyll,[horror],1957
3730,3799,Pok�mon the Movie 2000,"[animation, children's]",2000
3780,3850,Whatever Happened to Aunt Alice?,"[crime, thriller]",1969
3784,3854,Aim�e & Jaguar,"[drama, romance]",1999


In [75]:
movie_id = "3854"
movie_detail = get_movie_details_from_tmdb(movie_id=movie_id, title="Jaguar", year=1999)
movie_detail
if movie_detail["success"]:
    tmdb_data[movie_id] = movie_detail
else:
    print("Fail")


{'movie_id': '3854',
 'title': 'Aimee & Jaguar',
 'keywords': ['berlin germany',
  'world war ii',
  'jew',
  'forbidden love',
  'lesbian relationship',
  'homosexuality',
  'nazi germany',
  'lesbian interest'],
 'original_language': 'de',
 'overview': 'in 1943, while the allies are bombing berlin and the gestapo is purging the capital of jews, a dangerous love affair blossoms between two women - one a jewish member of the underground, the other an exemplar of nazi motherhood.',
 'runtime': 125,
 'tagline': 'bound by desire. torn apart by war.',
 'original_title': 'Aimee & Jaguar',
 'release_date': '1999-02-10',
 'success': True}

In [77]:
unsuccessful =[k for k,v in tmdb_data.items() if not v['success']]
len(unsuccessful)


12

In [83]:
tmdb_df = pd.DataFrame.from_records(list(tmdb_data.values()))
tmdb_df.drop(columns=["success"], inplace=True)
tmdb_df.shape
assert tmdb_df.shape[0] == len(list(tmdb_data.values()))

tmdb_df.to_csv("tmdb_data.csv", sep="\t", index=False)


(3883, 9)

In [85]:
movies.shape
tmdb_df.rename(columns={"title":"tmdb_title"}, inplace=True)
movies = movies.merge(tmdb_df, on="movie_id")
movies.shape


(3883, 4)

(3883, 12)

In [87]:
movies.to_csv("movies.csv", sep="\t", index=False)

## Enhancing item features with "the-movies-dataset"

- https://www.kaggle.com/rounakbanik/the-movies-dataset
- [Google Drive Mirror](https://drive.google.com/open?id=1aBT4ojTiY-2I5NxUJAq2R1BtxbU7mpIQ)



In [None]:
movies.merge(meta_df, on="title", )

In [79]:
len(set(movies['title']) - set(meta_df['title']))

len(set(movies['title']) - set(meta_df['original_title']))

fs = set(meta_df['original_title'])
fs.update(set(meta_df['title']))

len(set(movies['title']) - fs)

1337

1389

1323

In [81]:
meta_df.head()


Unnamed: 0,id,original_language,original_title,title,overview,release_date,runtime,keywords
0,862,en,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,81.0,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,en,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,1995-12-15,104.0,"[board game, disappearance, based on children'..."
2,15602,en,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,1995-12-22,101.0,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,en,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,127.0,"[based on novel, interracial relationship, sin..."
4,11862,en,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,1995-02-10,106.0,"[baby, midlife crisis, confidence, aging, daug..."


In [73]:
meta_df['keywords'] = meta_df['keywords'].apply(literal_eval).apply(lambda x: [o['name'] for o in x])
meta_df['keywords'].head()


0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
3    [based on novel, interracial relationship, sin...
4    [baby, midlife crisis, confidence, aging, daug...
Name: keywords, dtype: object

In [72]:
meta_df.head()

Unnamed: 0,id,original_language,original_title,title,overview,release_date,runtime,keywords
0,862,en,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,81.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,en,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,1995-12-15,104.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,en,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,1995-12-22,101.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,en,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,127.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,en,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,1995-02-10,106.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [71]:
import re


meta_df["id"] = meta_df["id"].apply(lambda x: int(re.sub('[^0-9]+','',x))).astype(int)
key_df.id = key_df.id.astype(int)
meta_df = meta_df.merge(key_df, on="id")



In [57]:
meta_df["id"].head().apply(int).values

array([  862,  8844, 15602, 31357, 11862])

In [35]:
key_df.shape
key_df.columns
key_df.head()


(46419, 2)

Index(['id', 'keywords'], dtype='object')

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [37]:

meta_df = meta_df[["id","original_language","original_title","title","overview","release_date","runtime"]]


In [34]:
meta_df.shape
meta_df.columns
meta_df.head(3)

(45466, 24)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [31]:
meta_df = pd.read_csv("../downloaded-data/the-movies-dataset/movies_metadata.csv")
key_df = pd.read_csv("../downloaded-data/the-movies-dataset/keywords.csv")


In [28]:
movies.genres.head().values


array([list(['animation', "children's", 'comedy']),
       list(['adventure', "children's", 'fantasy']),
       list(['comedy', 'romance']), list(['comedy', 'drama']),
       list(['comedy'])], dtype=object)

In [25]:
from ast import literal_eval

movies.genres = movies.genres.fillna("[]").apply(literal_eval)
movies['year'] = movies['year'].fillna(-1).astype(int)



In [5]:
movies = pd.read_csv("movies.csv", sep="\t", engine="python")
users = pd.read_csv("users.csv", sep="\t")
ratings = pd.read_csv("ratings.csv", sep="\t")


