In [2]:
import pandas as pd
import numpy as np
import re

## Data Cleaning

In [3]:
# Load the two raw tables (movie catalogue and user ratings) from the working directory.
movies = pd.read_csv(filepath_or_buffer='movies.csv')
ratings = pd.read_csv(filepath_or_buffer='ratings.csv')

In [4]:
# Preview the first rows of both tables to check that they loaded as expected.
display(movies.head(), ratings.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# Accumulators for the cleaned movie names and their release years
# (filled by the title-parsing loop).
movie_title, year = [], []

In [6]:
# Split every raw title of the form "Name (YYYY)" into a name and a release year.
#
# The two result lists are rebuilt from scratch here: the loop only appends, so
# re-running this cell with lists left over from a previous run would double
# their length and make the later column assignment fail.
title_pattern = re.compile(r"(.+) \((\d{4})\)")
movie_title = []
year = []

for element in movies['title']:
    match = title_pattern.match(element)
    if match:
        # Group 1 is the movie name, group 2 the four-digit year.
        movie_title.append(match.group(1))
        year.append(match.group(2))
    else:
        # No "(YYYY)" part was found: only the year is unknown, so keep the
        # title as it is instead of replacing it with None (a None title loses
        # the film's name and cannot be concatenated into a search URL or a
        # progress message later on).
        movie_title.append(element)
        year.append(None)

In [7]:
# Store the parsed values on the catalogue: a new `year` column, and the
# cleaned names in place of the raw `title` column.
movies['year'] = year
movies['title'] = movie_title

In [8]:
# Attach every rating to its movie (default inner join on the shared `movieId` key).
df = pd.merge(movies, ratings, on='movieId')

In [9]:
# Give films that have no genre a readable placeholder label.
df['genres'] = df['genres'].replace({'(no genres listed)': 'Not specified'})

In [10]:
# Turn each pipe-separated genre string into a list of genre names.
df['genres'] = df['genres'].map(lambda genre_string: genre_string.split('|'))

## Scraping

In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
import pickle

In [12]:
# Distinct movie titles, in order of first appearance, to drive the scraping.
l_movies = df['title'].unique().tolist()

In [13]:
def retrieve_movies_results(url):
    """Download the HTML of ``url``, replaying the cookies the site hands out.

    A first request is sent through a temporary session only to collect the
    cookies set by the server. The page is then requested a second time with
    those cookies and a browser-like user agent, and the text of that second
    response is returned. No HTTP status check is performed, so the body of an
    error page is returned like any other page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str or None
        Text of the second response, or ``None`` when either request failed
        (a message is printed in that case).
    """
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    REQUEST_TIMEOUT = 20  # seconds, applied to both requests

    try:
        # The session is only needed for the cookie handshake; the context
        # manager releases its connections instead of leaking them on every call.
        with requests.Session() as session:
            # Without a timeout a stalled server would block the scrape forever.
            session.get(url, timeout=REQUEST_TIMEOUT)
            cookies_dictionary = session.cookies.get_dict()

        # Serialise the collected cookies into a single Cookie header value.
        cookie = '; '.join(f'{key}={value}' for key, value in cookies_dictionary.items())

        # Fetch the page itself with the browser-like header and the cookies.
        headers = {"user-agent": USER_AGENT, "cookie": cookie}
        return requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    except Exception:
        # Deliberately best effort: any ordinary failure yields None. Catching
        # Exception rather than using a bare `except` lets KeyboardInterrupt and
        # SystemExit through, so a long scrape can still be stopped.
        print('La requête a échouée')
        return None

In [14]:
def process_results(html_text):
    """Collect the synopsis texts found in a search-result page.

    The HTML is parsed with BeautifulSoup (lxml parser) and the text of every
    ``<div>`` carrying the class ``ipc-html-content-inner-div`` is gathered, in
    document order.

    Parameters
    ----------
    html_text : str
        Raw HTML of the page.

    Returns
    -------
    pandas.DataFrame
        A single ``Synopsis`` column with one row per matching ``<div>``.
    """
    parsed_page = BeautifulSoup(html_text, features="lxml")
    synopsis_blocks = parsed_page.find_all('div', attrs={'class': 'ipc-html-content-inner-div'})
    return pd.DataFrame({'Synopsis': [block.text for block in synopsis_blocks]})

In [28]:
# Scrape one synopsis per title and pair each title with the first search result found for it.
def merging(url, movies, output_file='synopsis.pkl'):
    """Scrape a synopsis for every title in ``movies``, checkpointing as it goes.

    For each title the search URL ``url + <percent-encoded title>`` is fetched
    and the first synopsis found on the page is kept. When the page cannot be
    fetched or contains no synopsis, the placeholder
    ``"Description non disponible"`` is stored instead. After every title the
    progress is pickled to ``output_file``, so a later call with the same file
    resumes where the previous one stopped.

    Parameters
    ----------
    url : str
        Search URL prefix; the encoded title is appended to it.
    movies : list
        Titles to look up. The checkpoint is assumed to have been produced
        from this same list (the stored counter is used as an offset into it).
    output_file : str or path-like, default 'synopsis.pkl'
        Location of the checkpoint file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Synopsis`` and ``title``, one row per entry of ``movies``.
    """
    import os
    from urllib.parse import quote

    synops = []
    i = 0

    try:
        # Resume from an earlier run when a checkpoint exists.
        # NOTE(security): unpickling can execute arbitrary code; only point
        # `output_file` at checkpoints written by this function.
        with open(output_file, 'rb') as file:
            saved_data = pickle.load(file)
        synops.extend(saved_data['synops'])
        i = saved_data['i']
    except FileNotFoundError:
        # No checkpoint yet: start from the first title.
        pass

    for val in movies[i:]:
        try:
            # Percent-encode the whole title: escaping only spaces lets
            # characters such as '&', '#', '?' or '+' cut or alter the query.
            complete_url = url + quote(val, safe='')
            html_text = retrieve_movies_results(complete_url)
            results_df = process_results(html_text)
            synops.append(results_df['Synopsis'][0])
        except Exception:
            # Some films have no description on the site, and some titles do
            # not match the site's naming. `Exception` (not a bare `except`)
            # keeps Ctrl-C / kernel interrupts able to stop the loop.
            synops.append("Description non disponible")
        i += 1
        # str() keeps a missing (None) title from crashing the progress message.
        print("Scraping du", i, "e film : " + str(val))

        # Save the progress after every title. Writing to a temporary file and
        # renaming it means an interruption mid-write cannot leave a truncated,
        # unreadable checkpoint behind.
        data_to_save = {'synops': synops, 'i': i}
        temporary_file = f'{output_file}.tmp'
        with open(temporary_file, 'wb') as file:
            pickle.dump(data_to_save, file)
        os.replace(temporary_file, output_file)

        # Wait 3 seconds between requests to go easy on the server.
        time.sleep(3)

    match = pd.DataFrame({'Synopsis': synops, 'title': movies})
    return match


In [29]:
# Load the movie names
def movie_names_import(path):
    """Read the CSV file at ``path`` and return its content as a DataFrame."""
    return pd.read_csv(path)

In [30]:
def load_existing_base(file="descriptions.csv"):
    """Return the ``title`` column of the description database stored in ``file``.

    When the file does not exist yet, or exists but is completely empty, an
    empty database with the expected columns is written to ``file`` first and
    the (empty) ``title`` column of that new table is returned.

    Any other read failure (bad encoding, malformed rows, permissions, ...) is
    propagated to the caller: treating it as "no database yet" would overwrite
    an existing file with an empty table.

    Parameters
    ----------
    file : str or path-like, default "descriptions.csv"
        Location of the CSV database.

    Returns
    -------
    pandas.Series
        The ``title`` column of the database.
    """
    try:
        # Try to load the existing database...
        base = pd.read_csv(file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # ...otherwise create it with the expected columns and save it as CSV.
        # NOTE(review): the "" column plus the index written by to_csv gives
        # the file two unnamed leading columns - confirm this layout is intended.
        columns = ["", "Synopsis", "title"]
        base = pd.DataFrame(columns=columns)
        base.to_csv(file)
    return base["title"]

In [31]:
# Run the scrape over every distinct title (resumes from the checkpoint file if one exists).
IMDB_SEARCH_URL = 'https://www.imdb.com/search/title/?title='
synopsis = merging(url=IMDB_SEARCH_URL, movies=l_movies)

In [32]:
# Show the scraped synopsis table (the rendered output is truncated to its first and last rows).
synopsis

Unnamed: 0,Synopsis,title
0,A cowboy doll is profoundly threatened and jea...,Toy Story
1,Four teenagers are sucked into a magical video...,Jumanji
2,John and Max resolve to save their beloved bai...,Grumpier Old Men
3,"Based on Terry McMillan's novel, this film fol...",Waiting to Exhale
4,George Banks must deal not only with his daugh...,Father of the Bride Part II
...,...,...
55113,"The story of Marie-Laure, a blind French teena...",We
55114,Scotland Yard Commander Clare Blake is called ...,Window of the Soul
55115,33-years old Tamás is heartbroken after his gi...,Bad Poems
55116,Sworn enemies find themselves in each other's ...,A Girl Thing


In [38]:
# Add the scraped synopsis to every rating row, matching on the cleaned title.
final_df = df.merge(right=synopsis, how='inner', on='title')

In [40]:
# Persist the combined movies / ratings / synopsis table.
final_df.to_csv(path_or_buf='movie_db.csv')