Récuperation et calcul des reviews et ratings.

Utilise le dataset mpst_full_data.csv et crée les fichiers movies.plk, final_model.plk et sig.plk

In [1]:
import numpy as np
import pandas as pd
import pickle
import requests
import json

from time import time
from pprint import pprint
from os.path import exists
from get_data import getData
from cleantext import cleantext
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dalex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
tfIdfOverview_path = 'data/tfidf_overview_reccommendation.plk'
tfIdfScores_path = 'data/tfidf_scores_reccommendation.plk'
sig_path = 'data/sig_reccommendation.plk'

In [3]:
movies_cleaned_df = getData()

File data/full_data.plk opened successfully


In [4]:
movies_cleaned_df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,reviews,imDb_rate,metacritic_rate,theMovieDb_rate,rottenTomatoes_rate,filmAffinity_rate,reviews_avg_rate,sentiment_avg_rate,avg_rate
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,[This terrifying film with plenty of vampires ...,7.0,8.2,7.2,8.8,6.5,8.5,9.2,7.91
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...","[Alright, given the reviews and the ratings on...",4.4,0.0,4.3,0.0,3.3,7.1,2.4,4.3
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[The Stewart /Sullavan relationship and the wa...,8.1,9.6,8.4,10.0,8.0,9.4,10.0,9.07
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","[""Mr. Holland's Opus"" is the story of a musici...",7.3,5.9,7.0,7.5,6.3,8.9,10.0,7.56
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","[""Scarface"" has a major cult following even no...",8.3,6.5,8.2,8.1,8.2,8.8,9.6,8.24


In [5]:
# Check that all the films have reviews 
len(movies_cleaned_df[movies_cleaned_df['reviews'].str.len() == 0])

0

In [6]:
movies = movies_cleaned_df.rename(columns={'imdb_id': 'id', 'plot_synopsis' : 'overview', 'title' : 'original_title'})
print(len(movies))

14212


In [7]:
movies.head()

Unnamed: 0,id,original_title,overview,reviews,imDb_rate,metacritic_rate,theMovieDb_rate,rottenTomatoes_rate,filmAffinity_rate,reviews_avg_rate,sentiment_avg_rate,avg_rate
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,[This terrifying film with plenty of vampires ...,7.0,8.2,7.2,8.8,6.5,8.5,9.2,7.91
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...","[Alright, given the reviews and the ratings on...",4.4,0.0,4.3,0.0,3.3,7.1,2.4,4.3
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[The Stewart /Sullavan relationship and the wa...,8.1,9.6,8.4,10.0,8.0,9.4,10.0,9.07
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","[""Mr. Holland's Opus"" is the story of a musici...",7.3,5.9,7.0,7.5,6.3,8.9,10.0,7.56
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","[""Scarface"" has a major cult following even no...",8.3,6.5,8.2,8.1,8.2,8.8,9.6,8.24


In [8]:
# Create a set of unique word along all the 50k reviews
unique_words = set([])

for overview in movies['overview']:
    split = overview.split()
    for word in split:
        unique_words.add(word)

print('There are %s different words along all the overviews.' %len(unique_words))

There are 420958 different words along all the overviews.


In [31]:
overviews = movies['overview'].astype('str')
rates = movies['avg_rate'].astype('str')

In [32]:
print(len(overviews))
print(len(rates))

14212
14212


In [47]:
if exists(tfIdfOverview_path) and exists(tfIdfScores_path) :
    tfv_matrix = pickle.load(open(tfIdfOverview_path, 'rb'))
    print(f'File {tfIdfOverview_path} opened successfully')
    tfv_rate = pickle.load(open(tfIdfScores_path, 'rb'))
    print(f'File {tfIdfScores_path} opened successfully')
    
else :
    tfv = TfidfVectorizer(
        min_df=3,
        max_features=None,
        analyzer='word',
        token_pattern='\w{1}',
        ngram_range=(1, 3),
    )

    tfv_matrix = tfv.fit_transform(overviews)
    tfv_rate = tfv.transform(rates)


    pickle.dump(tfv_matrix, open(tfIdfOverview_path, 'wb'))
    print(f'File {tfIdfOverview_path} created successfully')
    pickle.dump(tfv_rate, open(tfIdfScores_path, 'wb'))
    print(f'File {tfIdfScores_path} created successfully')

File data/tfidf_overview_reccommendation.plk created successfully
File data/tfidf_scores_reccommendation.plk created successfully


In [50]:
print(tfv_matrix.shape)
print(tfv_rate.shape)

(14212, 21874)
(14212, 21874)


In [51]:
if exists(sig_path) :
    sig = pickle.load(open(sig_path, 'rb'))
    print(f'File {sig_path} opened successfully')

else :
    sig = sigmoid_kernel(tfv_matrix, tfv_rate)
    pickle.dump(sig, open(sig_path, 'wb'))
    print(f'File {sig_path} created successfully')

File data/sig_reccommendation.plk created successfully


In [52]:
indices = pd.Series(movies.index, index=movies['original_title']).drop_duplicates()

In [67]:
def give_rec(title, sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    sig_scores = sig_scores[1:11]
    movie_indices = [i[0] for i in sig_scores]
    
    result = pd.concat([
        movies['original_title'].iloc[movie_indices],
        movies['avg_rate'].iloc[movie_indices]], axis=1)
    result = result.rename(columns={'original_title': 'Title', 'avg_rate': 'Rating'})
    result = result.set_index(['Title', 'Rating'])
    result = result.sort_values(by=['Rating'], ascending=False)
    return result

In [68]:
give_rec("Mr. Holland's Opus")

Title,Rating
Combat Shock,6.0
The Exorcism of Emily Rose,6.0
Psycho III,6.0
Brooklyn's Finest,6.0
Stand Up Guys,6.0
Way of the Vampire,1.98
Zaat,1.98
Meet the Spartans,1.97
The Garbage Pail Kids Movie,1.97
2010: Moby Dick,1.96
