In [8]:
import json
import os
import pickle
from os.path import exists

import numpy as np
import pandas as pd
import requests
from cleantext import cleantext
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [19]:
# --- IMDb API access -------------------------------------------------------
# The API key is a credential and should not live in the notebook. Set the
# IMDB_API_KEY environment variable to supply your own key; the fallback below
# only keeps the notebook runnable as it was before.
# The previous revision kept two keys here, annotated "2500" (k_af2q00ae) and
# "50" (k_60sb8gc9) -- presumably their request quotas, TODO confirm.
imdb_api_key = os.environ.get('IMDB_API_KEY') or 'k_af2q00ae'
url_reviews_api = f'https://imdb-api.com/en/API/Reviews/{imdb_api_key}/'
url_rating_api = f'https://imdb-api.com/en/API/Ratings/{imdb_api_key}/'

# --- Local files -----------------------------------------------------------
plk_path = 'data/movies.plk'                    # enriched dataset (pickled DataFrame)
csv_path = 'data/dataset/mpst_full_data.csv'    # raw dataset
model_path = 'data/model.sav'                   # pickled model used to score reviews
tv_path = 'data/tv.sav'                         # pickled vectorizer paired with the model
final_model_path = 'data/final_model.plk'       # cached TF-IDF matrix of the overviews
sig_path = 'data/sig.plk'                       # cached sigmoid-kernel similarity matrix

In [11]:
# Load the pickled model and the vectorizer it is used with (the enrichment
# loop feeds `tv.transform(...)` into `model.predict(...)` to score reviews).
# Context managers make sure the file handles are closed right after loading.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

with open(tv_path, 'rb') as tv_file:
    tv = pickle.load(tv_file)

In [12]:
# Load the working dataset.
# If a cleaned copy was already saved as a pickle, reuse it (it may already
# contain reviews and rates gathered by a previous run).
if exists(plk_path):
    # NOTE: read_pickle unpickles arbitrary objects -- only open trusted files.
    movies_cleaned_df = pd.read_pickle(plk_path)
    print(f'File {plk_path} opened successfully')

# Otherwise, open the original csv data file, and clean it up
else:
    movies_df = pd.read_csv(csv_path)
    movies_cleaned_df = movies_df.drop(
        columns=['tags', 'split', 'synopsis_source']
    )

    # One independent empty list per row; the enrichment loop appends to it.
    movies_cleaned_df['reviews'] = [[] for _ in range(len(movies_cleaned_df))]

    # Rates are stored on a 0-10 float scale, with 0 meaning "no value yet".
    # They are created as floats (0.0, not 0) because fractional values are
    # written into them later; assigning a float into an int64 column is an
    # incompatible-dtype assignment that recent pandas versions warn about.
    # `avg_rate` is created here as well so the schema is complete up front.
    for rate_column in (
        'imDb_rate',
        'metacritic_rate',
        'theMovieDb_rate',
        'rottenTomatoes_rate',
        'filmAffinity_rate',
        'reviews_avg_rate',
        'sentiment_avg_rate',
        'avg_rate',
    ):
        movies_cleaned_df[rate_column] = 0.0

    movies_cleaned_df.to_pickle(plk_path)
    print(f'File {plk_path} created successfully')

File data/movies.plk opened successfully


In [13]:
# Preview the first rows of the working dataset.
movies_cleaned_df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,reviews,imDb_rate,metacritic_rate,theMovieDb_rate,rottenTomatoes_rate,filmAffinity_rate,reviews_avg_rate,sentiment_avg_rate,avg_rate
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,[This terrifying film with plenty of vampires ...,7.0,8.2,7.2,8.8,6.5,8.5,9.2,7.91
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...","[Alright, given the reviews and the ratings on...",4.4,0.0,4.3,0.0,3.3,7.1,2.4,4.3
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[The Stewart /Sullavan relationship and the wa...,8.1,9.6,8.4,10.0,8.0,9.4,10.0,9.07
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","[""Mr. Holland's Opus"" is the story of a musici...",7.3,5.9,7.0,7.5,6.3,8.9,10.0,7.56
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","[""Scarface"" has a major cult following even no...",8.3,6.5,8.2,8.1,8.2,8.8,9.6,8.24


In [14]:
"""
max = 10
current = 0
"""
# Enrich every film that has no reviews yet with ratings and reviews fetched
# from the IMDb API, then save the dataset.
#
# Films for which no reviews are returned are removed from the dataset. They
# are collected during the loop and dropped once afterwards: dropping and
# re-indexing while iterating shifts the remaining rows, which skips the row
# after every drop and eventually looks up an index that no longer exists.
#
# NOTE(review): a film is also dropped when the API answers with a null title
# for any other reason (e.g. an exhausted request quota -- TODO confirm how the
# API reports that) -- consider checking the response's error field first.

RATE_COLUMNS = [
    'imDb_rate',
    'metacritic_rate',
    'theMovieDb_rate',
    'rottenTomatoes_rate',
    'filmAffinity_rate',
    'reviews_avg_rate',
    'sentiment_avg_rate',
]
REQUEST_TIMEOUT = 30        # seconds; without it a stalled connection hangs forever
MAX_MOVIES_PER_RUN = None   # set to an int to stop after that many API lookups

rows_without_reviews = []
lookups_done = 0

for i in movies_cleaned_df.index:
    if MAX_MOVIES_PER_RUN is not None and lookups_done >= MAX_MOVIES_PER_RUN:
        break

    # Skip films that were already treated (they have at least one review)
    if len(movies_cleaned_df.loc[i, 'reviews']) > 0:
        continue

    movie_id = movies_cleaned_df.loc[i, 'imdb_id']
    movie_title = movies_cleaned_df.loc[i, 'title']

    # Get the data for the movie. On a network or decoding failure, stop here
    # with the row untouched so that everything gathered so far still gets
    # saved below and the next run resumes from this film.
    try:
        reviews_response = requests.get(url_reviews_api + movie_id, timeout=REQUEST_TIMEOUT)
        rating_response = requests.get(url_rating_api + movie_id, timeout=REQUEST_TIMEOUT)
        json_reviews = json.loads(reviews_response.text)
        json_rating = json.loads(rating_response.text)
    except (requests.RequestException, ValueError) as error:
        print(f'Request failed for {movie_id} ({error}); stopping and saving progress')
        break

    lookups_done += 1

    # Check if the ratings were properly gotten from the API and, if so,
    # insert them using a 0-10 scale, or zero if no value is defined
    # (metacritic and rottenTomatoes are divided by 10 to fit that scale).
    if json_rating['title'] is not None:
        movies_cleaned_df.loc[i, 'imDb_rate'] = float(json_rating['imDb'] or 0)
        movies_cleaned_df.loc[i, 'metacritic_rate'] = float(json_rating['metacritic'] or 0) / 10
        movies_cleaned_df.loc[i, 'theMovieDb_rate'] = float(json_rating['theMovieDb'] or 0)
        movies_cleaned_df.loc[i, 'rottenTomatoes_rate'] = float(json_rating['rottenTomatoes'] or 0) / 10
        movies_cleaned_df.loc[i, 'filmAffinity_rate'] = float(json_rating['filmAffinity'] or 0)

    # Check if there are reviews gotten from the API (a null `items` counts as none)
    review_items = (json_reviews['items'] or []) if json_reviews['title'] is not None else []

    if not review_items:
        print('No reviews or rating found for the film : ' + movie_title)
        rows_without_reviews.append(i)
        continue

    rate_sum = 0
    rates_count = 0
    sentiment_sum = 0

    # `.loc` returns the list object stored in the cell, so appending to it
    # updates the dataframe in place.
    stored_reviews = movies_cleaned_df.loc[i, 'reviews']

    for review in review_items:
        stored_reviews.append(review['content'])

        # Score the review with the model; each prediction is scaled by 10 so
        # the average lands on the same 0-10 scale as the other rates.
        cleaned_review = [cleantext(review['content'])]
        sentiment = model.predict(tv.transform(cleaned_review).toarray())
        # predict() returns an array with one entry per input; take that entry
        # explicitly instead of calling int() on the whole array.
        sentiment_sum += int(sentiment[0]) * 10

        # Reviews without a rate (empty or missing) are left out of the average
        review_rate = review.get('rate')
        if review_rate:
            rate_sum += int(review_rate)
            rates_count += 1

    movies_cleaned_df.loc[i, 'reviews_avg_rate'] = round(rate_sum / rates_count, 1) if rates_count > 0 else 0
    movies_cleaned_df.loc[i, 'sentiment_avg_rate'] = round(sentiment_sum / len(review_items), 1)

    # Overall rate: mean of the rates that are defined (zero means "no value").
    all_rates = movies_cleaned_df.loc[i, RATE_COLUMNS].to_numpy(dtype=float)
    known_rates = all_rates[all_rates != 0]
    # With no defined rate at all, store 0 rather than the NaN of an empty mean.
    movies_cleaned_df.loc[i, 'avg_rate'] = round(known_rates.mean(), 2) if known_rates.size else 0.0

    print('Rates saved for the film : ' + movie_title)

# Remove the films without reviews in one go, then persist the dataset.
if rows_without_reviews:
    movies_cleaned_df = movies_cleaned_df.drop(index=rows_without_reviews).reset_index(drop=True)

movies_cleaned_df.to_pickle(plk_path)

In [15]:
# Number of movies whose review list is still empty.
has_no_reviews = movies_cleaned_df['reviews'].str.len() == 0
int(has_no_reviews.sum())

0

In [16]:
# Give the columns the names used by the recommender cells below, then show
# the number of movies and a preview.
recommender_column_names = {
    'imdb_id': 'id',
    'title': 'original_title',
    'plot_synopsis': 'overview',
}
movies = movies_cleaned_df.rename(columns=recommender_column_names)
print(movies.shape[0])
movies.head()

14212


Unnamed: 0,id,original_title,overview,reviews,imDb_rate,metacritic_rate,theMovieDb_rate,rottenTomatoes_rate,filmAffinity_rate,reviews_avg_rate,sentiment_avg_rate,avg_rate
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,[This terrifying film with plenty of vampires ...,7.0,8.2,7.2,8.8,6.5,8.5,9.2,7.91
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...","[Alright, given the reviews and the ratings on...",4.4,0.0,4.3,0.0,3.3,7.1,2.4,4.3
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[The Stewart /Sullavan relationship and the wa...,8.1,9.6,8.4,10.0,8.0,9.4,10.0,9.07
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","[""Mr. Holland's Opus"" is the story of a musici...",7.3,5.9,7.0,7.5,6.3,8.9,10.0,7.56
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","[""Scarface"" has a major cult following even no...",8.3,6.5,8.2,8.1,8.2,8.8,9.6,8.24


In [22]:
# Build (or load from cache) the TF-IDF matrix of the movie overviews.
#
# IMPORTANT: the cache is only validated by its number of rows. After changing
# the vectorizer settings below, delete `final_model_path` (and the similarity
# cache computed from this matrix) so they are rebuilt.
tfv_matrix = None

if exists(final_model_path):
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(final_model_path, 'rb') as cache_file:
        cached_matrix = pickle.load(cache_file)

    # The dataset can lose rows between runs (films without reviews are
    # dropped), so a cache with a different row count no longer lines up
    # with `movies` and must not be used.
    if cached_matrix.shape[0] == len(movies):
        tfv_matrix = cached_matrix
        print(f'File {final_model_path} opened successfully')
    else:
        print(f'File {final_model_path} is stale '
              f'({cached_matrix.shape[0]} rows for {len(movies)} movies), rebuilding it')

if tfv_matrix is None:
    tfv = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents="unicode",
        analyzer="word",
        # Raw string, and "one or more" word characters: the previous pattern
        # "\w{1}" matched exactly one character, so every token was a single
        # letter/digit instead of a word.
        token_pattern=r"\w{1,}",
        ngram_range=(1, 3),
        stop_words="english",
    )

    # Missing overviews are treated as empty text instead of raising.
    tfv_matrix = tfv.fit_transform(movies["overview"].fillna(""))
    with open(final_model_path, 'wb') as cache_file:
        pickle.dump(tfv_matrix, cache_file)
    print(f'File {final_model_path} created successfully')

File data/final_model.plk opened successfully


In [23]:
# Build (or load from cache) the pairwise sigmoid-kernel similarity between
# all rows of `tfv_matrix`.
#
# The cache is only validated by its number of rows: delete `sig_path` whenever
# `tfv_matrix` is rebuilt with different settings.
sig = None

if exists(sig_path):
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(sig_path, 'rb') as cache_file:
        cached_sig = pickle.load(cache_file)

    # A similarity matrix computed for a different number of movies would
    # silently point recommendations at the wrong rows.
    if cached_sig.shape[0] == tfv_matrix.shape[0]:
        sig = cached_sig
        print(f'File {sig_path} opened successfully')
    else:
        print(f'File {sig_path} is stale '
              f'({cached_sig.shape[0]} rows for {tfv_matrix.shape[0]} movies), rebuilding it')

if sig is None:
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    with open(sig_path, 'wb') as cache_file:
        pickle.dump(sig, cache_file)
    print(f'File {sig_path} created successfully')

File data/sig.plk opened successfully


In [24]:
# Lookup table: movie title -> row index in `movies`.
# Several films can share a title. `Series.drop_duplicates()` compares the
# *values* (the row indices, which are all unique), so it never removed those
# repeated titles and a lookup could return several rows. De-duplicate on the
# title itself instead, keeping the first film found for each title.
indices = pd.Series(movies.index, index=movies['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [25]:
def give_rec(title, sig=sig, top_n=10):
    """Recommend the movies most similar to ``title``, best rated first.

    Parameters
    ----------
    title : str
        Title to look up in ``indices`` (values of ``movies['original_title']``).
    sig : 2-D array
        Pairwise similarity scores; ``sig[k]`` is read as the scores of movie
        ``k`` against every movie. Defaults to the module-level ``sig`` as it
        was when this function was defined.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Frame without data columns, indexed by ``(Title, Rating)`` and sorted
        by ``Rating`` in descending order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # When several films share the title the lookup yields one entry per film;
    # use the first one instead of failing further down.
    if not np.isscalar(idx):
        idx = idx.iloc[0]
    # Row labels are used as positions here (as the `.iloc` below does), which
    # assumes `movies` has a default 0..n-1 index.
    idx = int(idx)

    # Rank every movie by decreasing similarity; the stable sort keeps ties in
    # their original row order.
    scores = np.asarray(sig[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried movie explicitly rather than assuming it is always
    # ranked first, then keep the best `top_n`.
    movie_indices = ranked[ranked != idx][:top_n]

    result = movies[['original_title', 'avg_rate']].iloc[movie_indices]
    result = result.rename(columns={'original_title': 'Title', 'avg_rate': 'Rating'})
    result = result.set_index(['Title', 'Rating'])
    result = result.sort_values(by=['Rating'], ascending=False)

    return result

In [26]:
# Example: recommendations for one of the titles in the dataset.
give_rec("Mr. Holland's Opus")

Title,Rating
King Kong,8.33
Frozen,8.0
Contact,7.41
Contagion,6.94
Twilight Zone: The Movie,6.43
The Life of David Gale,5.93
The Rules of Attraction,5.86
The Family,5.53
Deep Impact,5.21
Glen or Glenda,3.87
