In [1]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format='retina'

import warnings
warnings.filterwarnings('ignore')

In [27]:
import tmdbsimple as tmdb
tmdb.API_KEY = '08d3df3441c114a6cdb87682cb5b8013'

from editdistance import eval as editdistance

from surprise import Dataset
from surprise import accuracy
import numpy as np
import time
import pandas as pd
from pathlib import Path
from surprise.model_selection import train_test_split
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25)
from tqdm import tqdm,tqdm_notebook
from joblib import Parallel, delayed
import os
import re

from hwer.utils import clean_text

In [12]:
%cd ..

import movielens_torch as movielens

/Users/ahemf/mygit/Hybrid-Weighted-Embedding-Recommender/examples/Movie-Lens


In [14]:
from inspect import getsourcefile
import os.path as path, sys
current_dir = path.dirname(path.abspath(getsourcefile(lambda:0)))
sys.path.insert(0, current_dir[:current_dir.rfind(path.sep)])
import movielens_torch as movielens
sys.path.pop(0)

'/Users/ahemf/mygit/Hybrid-Weighted-Embedding-Recommender/examples'

In [24]:
genre_names = np.array(["unknown", "action", "adventure", "animation",
                   "children", "comedy", "crime", "documentary", "drama", "fantasy",
                   "film-noir", "horror", "musical", "mystery", "romance", "sci-fi",
                   "thriller", "war", "western" ])


In [174]:
def ml100k_default_reader(directory):
    def read_user_line(l):
        id_, age, gender, occupation, zip_ = l.strip().split('|')
        age = np.searchsorted([10, 20, 30, 40, 50, 60], age)  # bin the ages into <20, 20-30, 30-40, ..., >60
        return {'id': int(id_), 'gender': gender, 'age': age, 'occupation': occupation, 'zip': zip_}

    def read_product_line(l):
        fields = l.strip().split('|')
        id_ = fields[0]
        title = fields[1]
        genres = fields[-19:]
        genres = list(map(int, genres))
        genres = np.array(list(map(bool, genres)))
        genres = genre_names[genres]

        # extract year
        if re.match(r'.*\([0-9]{4}\)$', title):
            year = title[-5:-1]
            title = title[:-6].strip()
        else:
            year = 0

        data = {'id': int(id_), 'title': title, 'year': year, 'genres': genres}
        return data

    def read_rating_line(l):
        user_id, product_id, rating, timestamp = l.split()
        return {'user_id': int(user_id), 'product_id': int(product_id), 'rating': float(rating),
                'timestamp': int(timestamp)}

    users = []
    products = []
    ratings = []

    # read ratings
    with open(os.path.join(directory, 'ua.base')) as f:
        for l in f:
            rating = read_rating_line(l)
            ratings.append(rating)
    with open(os.path.join(directory, 'ua.test')) as f:
        for l in f:
            rating = read_rating_line(l)
            ratings.append(rating)

    ratings = pd.DataFrame(ratings)
    product_count = ratings['product_id'].value_counts()
    product_count.name = 'product_count'
    ratings = ratings.join(product_count, on='product_id')

    # read users - if user feature does not exist, we find all unique user IDs
    # appeared in the rating table and create an empty table from that.
    user_file = os.path.join(directory, 'u.user')
    with open(user_file) as f:
        for l in f:
            users.append(read_user_line(l))
    users = pd.DataFrame(users).astype('category')

    # read products
    with open(os.path.join(directory, 'u.item'), encoding='latin1') as f:
        for l in f:
            products.append(read_product_line(l))
    products = (
        pd.DataFrame(products)
            .astype({'year': 'category'}))
    genres = products.columns[products.dtypes == bool]
    return users, products, ratings

In [175]:
users, products, ratings = ml100k_default_reader("100K/ml-100k")

In [84]:
movies = products

In [85]:
products.head()

Unnamed: 0,id,title,year,genres
0,1,Toy Story,1995,"[animation, children, comedy]"
1,2,GoldenEye,1995,"[action, adventure, thriller]"
2,3,Four Rooms,1995,[thriller]
3,4,Get Shorty,1995,"[action, comedy, drama]"
4,5,Copycat,1995,"[crime, drama, thriller]"


In [86]:
np.sum(movies.title.isna())
np.sum(movies.year.isna())
np.sum(movies.title.apply(len)==0)


0

0

0

In [176]:
users.to_csv("users.csv", sep="\t", index=False)
ratings.to_csv("ratings.csv", sep="\t", index=False)

## Enhancing item features with TMDB

In [88]:
import re

def clean_text(text):
    EMPTY = ' '
    if text is None:
        return EMPTY
    
    text = text.replace("\n", " ").replace("(", " ").replace(")", " ").replace("\r", " ").replace("\t", " ").lower()
    text = re.sub('<pre><code>.*?</code></pre>', EMPTY, text)
    text = re.sub('<code>.*?</code>', EMPTY, text)

    def replace_link(match):
        return EMPTY if re.match('[a-z]+://', match.group(1)) else match.group(1)

    text = re.sub('<a[^>]+>(.*)</a>', replace_link, text)
    text = re.sub('<.*?>', EMPTY, text)
    return text

In [108]:
def get_movie_details_from_tmdb(movie_id, title, year):
    year = int(year)
    
    search = tmdb.Search()
    response = search.movie(query=title)
    results = response['results']
    
    if len(results) == 0:
        tn = title.split(",")[0]
        search = tmdb.Search()
        response = search.movie(query=tn)
        results = response['results']
        
    if len(results) == 0:
        tn = title.split(":")[0]
        search = tmdb.Search()
        response = search.movie(query=tn)
        results = response['results']
        
    if len(results) == 0:
        tn = title.split("(")[0]
        search = tmdb.Search()
        response = search.movie(query=tn)
        results = response['results']
        
    if len(results) == 0:
        search = tmdb.Search()
        response = search.tv(query=title)
        results = response['results']
    
    
    
    from functools import cmp_to_key
    def cmp(m1, m2):
        edst_1 = editdistance(title.lower(), m1['title'].lower())
        edst_2 = editdistance(title.lower(), m2['title'].lower())
        if 'release_date' not in m1:
            return 1
        if 'release_date' not in m2:
            return -1
        year_diff_1 = np.abs(pd.to_datetime(m1['release_date']).year - year)
        year_diff_2 = np.abs(pd.to_datetime(m2['release_date']).year - year)
        score_1 = 0.3 * edst_1 + year_diff_1
        score_2 = 0.3 * edst_2 + year_diff_2
        return -1 if score_1 <= score_2 else 1
    results = list(sorted(results, key=cmp_to_key(cmp)))
    if len(results) > 0:
        movie = tmdb.Movies(results[0]['id'])
        keywords = [k['name'] for k in movie.keywords()['keywords']]
        info = movie.info()
        original_language = info['original_language']
        overview = clean_text(info['overview'])
        runtime = info['runtime']
        tagline = clean_text(info['tagline'])
        original_title = info['original_title']
        title = info['title']
        release_date = info['release_date']
        return {"movie_id":movie_id,"title":title, "keywords":keywords, "original_language":original_language, 
                "overview":overview, "runtime":runtime, "tagline":tagline, 
                'original_title':original_title, "release_date":release_date,
               "success":True}
    else:
        return {"movie_id":movie_id,"title":title, "keywords":[], "original_language":'', 
                "overview":'', "runtime":-1, "tagline":'', 
                'original_title':'',"release_date":str(year), 
                "success":False}
    


In [90]:
get_movie_details_from_tmdb(movie_id=100,title="Toy Story", year=1995)

[PID: 7808] [2020-01-20 22:52:11] [DEBUG] [urllib3.connectionpool]: Starting new HTTPS connection (1): api.themoviedb.org:443
[PID: 7808] [2020-01-20 22:52:12] [DEBUG] [urllib3.connectionpool]: https://api.themoviedb.org:443 "GET /3/search/movie?query=Toy+Story&api_key=08d3df3441c114a6cdb87682cb5b8013 HTTP/1.1" 200 None
[PID: 7808] [2020-01-20 22:52:12] [DEBUG] [urllib3.connectionpool]: Starting new HTTPS connection (1): api.themoviedb.org:443
[PID: 7808] [2020-01-20 22:52:13] [DEBUG] [urllib3.connectionpool]: https://api.themoviedb.org:443 "GET /3/movie/862/keywords?api_key=08d3df3441c114a6cdb87682cb5b8013 HTTP/1.1" 200 573
[PID: 7808] [2020-01-20 22:52:13] [DEBUG] [urllib3.connectionpool]: Starting new HTTPS connection (1): api.themoviedb.org:443
[PID: 7808] [2020-01-20 22:52:14] [DEBUG] [urllib3.connectionpool]: https://api.themoviedb.org:443 "GET /3/movie/862?api_key=08d3df3441c114a6cdb87682cb5b8013 HTTP/1.1" 200 None


{'movie_id': 100,
 'title': 'Toy Story',
 'keywords': ['martial arts',
  'jealousy',
  'toy',
  'boy',
  'friendship',
  'bullying',
  'elementary school',
  'friends',
  'rivalry',
  'rescue',
  'mission',
  'walkie talkie',
  'boy next door',
  'new toy',
  'neighborhood',
  'toy comes to life',
  'resourcefulness'],
 'original_language': 'en',
 'overview': "led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences.",
 'runtime': 81,
 'tagline': '',
 'original_title': 'Toy Story',
 'release_date': '1995-10-30',
 'success': True}

In [None]:
movies.head(3).apply(lambda m:get_movie_details_from_tmdb(m['id'], m['title'],m['year'])['overview'], axis=1)

In [95]:
tmdb_data = {}


In [129]:

titles_years = list(zip(movies['id'], movies['title'],movies['year']))
# overviews = Parallel(n_jobs=8)(delayed(get_movie_details_from_tmdb)(title,year) for title,year in tqdm_notebook(titles_years))


for movie_id,title,year in tqdm_notebook(titles_years):
    if movie_id in tmdb_data:
        continue
    print(movie_id, title, year)
    movie_detail = get_movie_details_from_tmdb(movie_id=movie_id, title=title, year=year)
    tmdb_data[movie_id] = movie_detail




HBox(children=(IntProgress(value=0, max=1682), HTML(value='')))




In [163]:
unsuccessful =[k for k,v in tmdb_data.items() if not v['success']]
len(unsuccessful)


2

In [164]:
movies[movies.id.isin(unsuccessful)].head(30).tail(5)

Unnamed: 0,id,title,year,genres
1330,1331,"Last Klezmer: Leopold Kozlowski, His Life and ...",1995,[documentary]
1358,1359,Boys in Venice,1996,[drama]


In [166]:
movie_id = 1331
movie_detail = get_movie_details_from_tmdb(movie_id=movie_id, title="Klezmer", year=1995)
movie_detail
if movie_detail["success"]:
    tmdb_data[movie_id] = movie_detail
else:
    print("Fail")


[PID: 7808] [2020-01-21 11:43:03] [DEBUG] [urllib3.connectionpool]: Starting new HTTPS connection (1): api.themoviedb.org:443
[PID: 7808] [2020-01-21 11:43:03] [DEBUG] [urllib3.connectionpool]: https://api.themoviedb.org:443 "GET /3/search/movie?query=Klezmer&api_key=08d3df3441c114a6cdb87682cb5b8013 HTTP/1.1" 200 None
[PID: 7808] [2020-01-21 11:43:03] [DEBUG] [urllib3.connectionpool]: Starting new HTTPS connection (1): api.themoviedb.org:443
[PID: 7808] [2020-01-21 11:43:03] [DEBUG] [urllib3.connectionpool]: https://api.themoviedb.org:443 "GET /3/movie/9367/keywords?api_key=08d3df3441c114a6cdb87682cb5b8013 HTTP/1.1" 200 227
[PID: 7808] [2020-01-21 11:43:03] [DEBUG] [urllib3.connectionpool]: Starting new HTTPS connection (1): api.themoviedb.org:443
[PID: 7808] [2020-01-21 11:43:04] [DEBUG] [urllib3.connectionpool]: https://api.themoviedb.org:443 "GET /3/movie/9367?api_key=08d3df3441c114a6cdb87682cb5b8013 HTTP/1.1" 200 None


{'movie_id': 1331,
 'title': 'El Mariachi',
 'keywords': ['united states–mexico barrier',
  'killer',
  'legs',
  'cult film',
  'arms',
  'guitar case'],
 'original_language': 'es',
 'overview': 'el mariachi just wants to play his guitar and carry on the family tradition. unfortunately, the town he tries to find work in has another visitor, a killer who carries his guns in a guitar case. the drug lord and his henchmen mistake el mariachi for the killer, azul, and chase him around town trying to kill him and get his guitar case.',
 'runtime': 81,
 'tagline': "he didn't come looking for trouble, but trouble came looking for him.",
 'original_title': 'El Mariachi',
 'release_date': '1992-09-04',
 'success': True}

In [167]:
unsuccessful =[k for k,v in tmdb_data.items() if not v['success']]
len(unsuccessful)


1

In [168]:
tmdb_df = pd.DataFrame.from_records(list(tmdb_data.values()))
tmdb_df.drop(columns=["success"], inplace=True)
tmdb_df.shape
assert tmdb_df.shape[0] == len(list(tmdb_data.values()))

tmdb_df.to_csv("tmdb_data.csv", sep="\t", index=False)


(1682, 9)

In [170]:
movies.shape
tmdb_df.rename(columns={"title":"tmdb_title"}, inplace=True)
movies = movies.merge(tmdb_df, left_on="id", right_on="movie_id")
movies.shape


(1682, 4)

(1682, 13)

In [171]:
movies.to_csv("movies.csv", sep="\t", index=False)

In [172]:
movies = pd.read_csv("movies.csv", sep="\t", engine="python")
users = pd.read_csv("users.csv", sep="\t")
ratings = pd.read_csv("ratings.csv", sep="\t")




In [173]:
movies.head()

Unnamed: 0,id,title,year,genres,movie_id,tmdb_title,keywords,original_language,overview,runtime,tagline,original_title,release_date
0,1,Toy Story,1995,['animation' 'children' 'comedy'],1,Toy Story,"['martial arts', 'jealousy', 'toy', 'boy', 'fr...",en,"led by woody, andy's toys live happily in his ...",81.0,,Toy Story,1995-10-30
1,2,GoldenEye,1995,['action' 'adventure' 'thriller'],2,GoldenEye,"['cuba', 'falsely accused', 'secret identity',...",en,james bond must unmask the mysterious head of ...,130.0,no limits. no fears. no substitutes.,GoldenEye,1995-11-16
2,3,Four Rooms,1995,['thriller'],3,Four Rooms,"['hotel', ""new year's eve"", 'witch', 'bet', 'h...",en,it's ted the bellhop's first night on the job....,98.0,twelve outrageous guests. four scandalous requ...,Four Rooms,1995-12-09
3,4,Get Shorty,1995,['action' 'comedy' 'drama'],4,Get Shorty,"['gambling', 'miami', 'based on novel or book'...",en,chili palmer is a miami mobster who gets sent ...,105.0,"the mob is tough, but it’s nothing like show b...",Get Shorty,1995-10-20
4,5,Copycat,1995,['crime' 'drama' 'thriller'],5,Copycat,"['police brutality', 'psychology', 'police ope...",en,an agoraphobic psychologist and a female detec...,124.0,one man is copying the most notorious killers ...,Copycat,1995-10-27
