In [1]:
pip install unidecode scikit-learn BeautifulSoup4 pandas requests lxml selenium regex

Note: you may need to restart the kernel to use updated packages.


In [19]:
from sklearn.preprocessing import StandardScaler
from requests.auth import HTTPBasicAuth
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
import numpy as np
import unidecode
import requests
import locale
import pickle
import json
import re 
import lxml
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

## 1. Get the film data from the website

The reasons why I chose the following features:
1. It's easy to manage numerical and categorical values for future predictions.
2. After going through several online projects regarding movie predictions, I found the most commonly used features from IMDb datasets to be `languages`, `budgets`, `genres`, `runtime`, `vote_average`, `vote_count`.
3. After checking the SensCritique website, I found features that might be useful for prediction. 
   1. The chosen features:
      1. Numerical: `ranking`, `year`, `duration`, `stars`, `favorites`, `saves`, `number of critiques`.
      2. Categorical: `director`, `genres`, `group`, `country of origin (pays d'origine)`.
      3. Other: `original_title` could be used to create a linked table with information from IMDb datasets.
   2. The features that I did not choose:
      1. `Synopsis`: Requires further NLP (Natural Language Processing) analysis, which could make it challenging to process and find a link to the ranking.

In [31]:
search_dict_main_page = {
    'rating': {'name': 'div', 'attrs': {"data-testid": "Rating"}},
    'director': {'name': 'a', 'attrs': {"data-testid": "link"}},
    'duration': {'name': 'span', 'attrs': {"data-testid": "duration"}},
    'genres': {'name': 'span', 'attrs': {"data-testid": "genres"}},
    'ranking': {'name': 'span', 'attrs':{'data-testid': 'product-title-wrapper'}}
}

search_dict_film_page = {
    'film_year': {'name': 'p', 'attrs': {"class": "Text__SCTitle-sc-1aoldkr-1 CoverProductInfos__StyledText-sc-1un0kh1-13 eGhlHy jugtWW"}},
    'original_title':{'name': 'p', 'attrs': {"class": "Text__SCTitle-sc-1aoldkr-1 CoverProductInfos__StyledText-sc-1un0kh1-13 eGhlHy kuMSsq"}}
}


In [33]:
class RequestSensCritique:
    def __init__(self):
        self._base_urls = 'https://www.senscritique.com/'
    
    def create_soup(self, content, is_url=True):
        if is_url:
            page_content = requests.get(content).content
        else:
            page_content = content
        
        return BeautifulSoup(page_content, "html.parser")

    def find_text(self, element, search_criteria):
        found_element = element.find(**search_criteria)
        return found_element.text if found_element else None

    def get_full_page(self, link):
        full_url = self._base_urls + link
        driver = webdriver.Chrome()

        try:
            driver.get(full_url)

            for _ in range(8):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
            
            page_source = driver.page_source

        finally:
            driver.quit()

        return self.create_soup(page_source, is_url=False)
        
    def get_film_page_info(self, film_link, dict_name):
        film_url = self._base_urls + film_link
        film_page_soup = self.create_soup(film_url)

        film_page_dict = {key: self.find_text(film_page_soup, criteria) for key, criteria in dict_name.items()}

        stats_elements = film_page_soup.select('p.Text__SCText-sc-1aoldkr-0.Stats__Text-sc-1u6v943-2.gATBvI')
        if len(stats_elements) == 3:
            film_page_dict['stars'] = stats_elements[0].text
            film_page_dict['saves'] = stats_elements[1].text
            film_page_dict['favorites'] = stats_elements[2].text
        
        critique_number_text = film_page_soup.select('.NavigationTab__WrapperTextStyled-sc-18dtd9d-7')[2].text
        film_page_dict['critique_number'] = int(critique_number_text.split('(')[-1].split(')')[0])

        # certain movies do not have film_group and origin_country
        group_label = film_page_soup.find('span', string='Groupe : ')
        if group_label:
            film_page_dict['film_group'] = group_label.find_next('a', {'class': 'Text__SCText-sc-1aoldkr-0 Link__PrimaryLink-sc-1v081j9-0 gATBvI bGxijB'}).text

        country_label = film_page_soup.find('h3', string=re.compile('Pays d\'origine :'))
        if country_label:
            film_page_dict['origin_country'] = country_label.find_next_sibling('span').get_text()
        
        return film_page_dict
    
    def get_main_page(self, link, main_dict, film_info_dict):
        film_soup = self.get_full_page(link)

        films = film_soup.find_all('div', class_="ProductListItem__Wrapper-sc-1jkxxpj-1 kusRkg")

        all_films_dict = {}
        count = 0

        for film in films:
            if count <= 2:
                title = self.find_text(film, {'name': 'a', 'attrs': {"data-testid": "product-title"}})

                new_film_dict = {key: self.find_text(film, criteria) for key, criteria in main_dict.items()}

                film_link = film.find('a', {"data-testid": "product-title"})['href']
                new_film_dict.update(self.get_film_page_info(film_link, film_info_dict))
                
                all_films_dict[title] = new_film_dict
                count += 1
        
        return all_films_dict


In [34]:
create_request = RequestSensCritique()

top_111_link = 'films/tops/top111'
films_info_dict = create_request.get_main_page(top_111_link, search_dict_main_page,search_dict_film_page )

In [35]:
# save the data to the json file
with open('film_info.json', 'w') as json_file:
    json.dump(films_info_dict, json_file, indent=4)

## 2. Read and Clean the data

In [36]:
df = pd.read_json('film_info.json', orient='index')

df.head()

Unnamed: 0,rating,director,duration,genres,ranking,film_year,original_title,stars,saves,favorites,critique_number,film_group,origin_country
Douze Hommes en colère (1957),8.7,Sidney Lumet,1 h 36 min.,"Policier, Drame",1. Douze Hommes en colère (1957),1957,12 Angry Men,51.7K,16.3K,6.8K,592,Drame,États-Unis
Harakiri (1962),8.6,Masaki Kobayashi,2 h 13 min.,Drame,2. Harakiri (1962),1962,Seppuku,8.2K,12.7K,1.5K,115,Hara-Kiri,Japon
Blade Runner : The Final Cut (2007),8.5,Ridley Scott,1 h 57 min.,Science-fiction,3. Blade Runner : The Final Cut (2007),2007,,1.2K,269,111,3,Blade Runner,Groupe :


In [39]:
df.sample(5,random_state=2000)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, Douze Hommes en colère (1957) to Blade Runner : The Final Cut (2007)
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rating           3 non-null      float64
 1   director         3 non-null      object 
 2   duration         3 non-null      object 
 3   genres           3 non-null      object 
 4   ranking          3 non-null      object 
 5   film_year        3 non-null      int64  
 6   original_title   2 non-null      object 
 7   stars            3 non-null      object 
 8   saves            3 non-null      object 
 9   favorites        3 non-null      object 
 10  critique_number  3 non-null      int64  
 11  film_group       3 non-null      object 
 12  origin_country   3 non-null      object 
dtypes: float64(1), int64(2), object(10)
memory usage: 336.0+ bytes


Noticed Problems:
1. `duration` should be converted to `int` for future analysis. For example, `1 h 36 min.` should be converted to `96`.
2. `genres` should use the `exploded` function to save them to separate rows.
3. `stars`, `saves`, and `favorites` should be converted to numerical values.

In [25]:
# convert duration to numerique values
def convert_to_minutes(duration_str):
    parts = duration_str.split()
    hours = int(parts[0])
    minutes = int(parts[-2])
    total_minutes = hours * 60 + minutes

    return total_minutes

df['duration_minutes'] = df['duration'].apply(convert_to_minutes)

df.head(5)

Unnamed: 0,rating,director,duration,genres,ranking,film_year,original_title,stars,saves,favorites,critique_number,duration_minutes
Douze Hommes en colère (1957),8.7,Sidney Lumet,1 h 36 min.,"Policier, Drame",1. Douze Hommes en colère (1957),1957,12 Angry Men,51.7K,16.3K,6.8K,592,96
Harakiri (1962),8.6,Masaki Kobayashi,2 h 13 min.,Drame,2. Harakiri (1962),1962,Seppuku,8.2K,12.7K,1.5K,115,133
Blade Runner : The Final Cut (2007),8.5,Ridley Scott,1 h 57 min.,Science-fiction,3. Blade Runner : The Final Cut (2007),2007,,1.2K,269,111,3,117


In [27]:
# get ranking number from ranking column
df['ranking_number'] = df['ranking'].str.split('.').str.get(0)

df.head(5)

Unnamed: 0,rating,director,duration,genres,ranking,film_year,original_title,stars,saves,favorites,critique_number,duration_minutes,ranking_number
Douze Hommes en colère (1957),8.7,Sidney Lumet,1 h 36 min.,"Policier, Drame",1. Douze Hommes en colère (1957),1957,12 Angry Men,51.7K,16.3K,6.8K,592,96,1
Harakiri (1962),8.6,Masaki Kobayashi,2 h 13 min.,Drame,2. Harakiri (1962),1962,Seppuku,8.2K,12.7K,1.5K,115,133,2
Blade Runner : The Final Cut (2007),8.5,Ridley Scott,1 h 57 min.,Science-fiction,3. Blade Runner : The Final Cut (2007),2007,,1.2K,269,111,3,117,3


In [28]:
# using explode function to process genres
df['genres_split'] = df['genres'].str.split(', ')
df_genre = df.explode('genres_split')

df_genre.head(5)

Unnamed: 0,rating,director,duration,genres,ranking,film_year,original_title,stars,saves,favorites,critique_number,duration_minutes,ranking_number,genres_split
Douze Hommes en colère (1957),8.7,Sidney Lumet,1 h 36 min.,"Policier, Drame",1. Douze Hommes en colère (1957),1957,12 Angry Men,51.7K,16.3K,6.8K,592,96,1,Policier
Douze Hommes en colère (1957),8.7,Sidney Lumet,1 h 36 min.,"Policier, Drame",1. Douze Hommes en colère (1957),1957,12 Angry Men,51.7K,16.3K,6.8K,592,96,1,Drame
Harakiri (1962),8.6,Masaki Kobayashi,2 h 13 min.,Drame,2. Harakiri (1962),1962,Seppuku,8.2K,12.7K,1.5K,115,133,2,Drame
Blade Runner : The Final Cut (2007),8.5,Ridley Scott,1 h 57 min.,Science-fiction,3. Blade Runner : The Final Cut (2007),2007,,1.2K,269,111,3,117,3,Science-fiction


In [30]:
# convert stars, saves favorites to numbers
def deal_with_k(number_str):
    if 'K' in number_str:
        parts = number_str.split('K')[0]
        return int(float(parts) * 1000)
    else:
        return int(number_str)

df['stars_number'] = df['stars'].apply(deal_with_k)
df['saves_number'] = df['saves'].apply(deal_with_k)
df['favorites_number'] = df['favorites'].apply(deal_with_k)

df.head(5)


Unnamed: 0,rating,director,duration,genres,ranking,film_year,original_title,stars,saves,favorites,critique_number,duration_minutes,ranking_number,genres_split,stars_number,saves_number,favorites_number
Douze Hommes en colère (1957),8.7,Sidney Lumet,1 h 36 min.,"Policier, Drame",1. Douze Hommes en colère (1957),1957,12 Angry Men,51.7K,16.3K,6.8K,592,96,1,"[Policier, Drame]",51700,16300,6800
Harakiri (1962),8.6,Masaki Kobayashi,2 h 13 min.,Drame,2. Harakiri (1962),1962,Seppuku,8.2K,12.7K,1.5K,115,133,2,[Drame],8200,12700,1500
Blade Runner : The Final Cut (2007),8.5,Ridley Scott,1 h 57 min.,Science-fiction,3. Blade Runner : The Final Cut (2007),2007,,1.2K,269,111,3,117,3,[Science-fiction],1200,269,111


## 3. The Movie Popularity Prediction 