In [None]:
pip install unidecode scikit-learn BeautifulSoup4 pandas requests lxml selenium regex

In [315]:
import requests
import time
import json
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


##  1. Get the film data from the website

The reasons why I chose the following features:
1. It's easy to manage numerical and categorical values for future predictions.
2. After going through several online projects regarding movie predictions, I found the most commonly used features from IMDb datasets to be `languages`, `budgets`, `genres`, `runtime`, `vote_average`, `vote_count`.
3. After checking the SensCritique website, I found features that might be useful for prediction. 
   1. The chosen features:
      1. Numerical: `ranking`, `year`, `duration`, `stars`, `favorites`, `saves`, `number of critiques`.
      2. Categorical: `director`, `genres`, `group`, `country of origin (pays d'origine)`.
      3. Other: `original_title` could be used to create a linked table with information from IMDb datasets.
   2. The features that I did not choose:
      1. `Synopsis`: Requires further NLP (Natural Language Processing) analysis, which could make it challenging to process and find a link to the ranking.

In [None]:
search_dict_main_page = {
    'rating': {'name': 'div', 'attrs': {"data-testid": "Rating"}},
    'director': {'name': 'a', 'attrs': {"data-testid": "link"}},
    'duration': {'name': 'span', 'attrs': {"data-testid": "duration"}},
    'genres': {'name': 'span', 'attrs': {"data-testid": "genres"}},
    'ranking': {'name': 'span', 'attrs':{'data-testid': 'product-title-wrapper'}}
}

search_dict_film_page = {
    'film_year': {'name': 'p', 'attrs': {"class": "Text__SCTitle-sc-1aoldkr-1 CoverProductInfos__StyledText-sc-1un0kh1-13 eGhlHy jugtWW"}},
    'original_title':{'name': 'p', 'attrs': {"class": "Text__SCTitle-sc-1aoldkr-1 CoverProductInfos__StyledText-sc-1un0kh1-13 eGhlHy kuMSsq"}}
}


In [None]:
class RequestSensCritique:
    def __init__(self):
        self._base_urls = 'https://www.senscritique.com/'
    
    def create_soup(self, content, is_url=True):
        if is_url:
            page_content = requests.get(content).content
        else:
            page_content = content
        
        return BeautifulSoup(page_content, "html.parser")

    def find_text(self, element, search_criteria):
        found_element = element.find(**search_criteria)
        return found_element.text if found_element else None

    def get_full_page(self, link):
        full_url = self._base_urls + link
        driver = webdriver.Chrome()

        try:
            driver.get(full_url)

            for _ in range(8):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
            
            page_source = driver.page_source

        finally:
            driver.quit()

        return self.create_soup(page_source, is_url=False)
        
    def get_film_page_info(self, film_link, dict_name):
        film_url = self._base_urls + film_link
        film_page_soup = self.create_soup(film_url)

        film_page_dict = {key: self.find_text(film_page_soup, criteria) for key, criteria in dict_name.items()}

        stats_elements = film_page_soup.select('p.Text__SCText-sc-1aoldkr-0.Stats__Text-sc-1u6v943-2.gATBvI')
        if len(stats_elements) == 3:
            film_page_dict['stars'] = stats_elements[0].text
            film_page_dict['saves'] = stats_elements[1].text
            film_page_dict['favorites'] = stats_elements[2].text
        
        critique_number_text = film_page_soup.select('.NavigationTab__WrapperTextStyled-sc-18dtd9d-7')[2].text
        film_page_dict['critique_number'] = int(critique_number_text.split('(')[-1].split(')')[0])

        # certain movies do not have film_group and origin_country
        group_label = film_page_soup.find('span', string='Groupe : ')
        if group_label:
            film_page_dict['film_group'] = group_label.find_next('a', {'class': 'Text__SCText-sc-1aoldkr-0 Link__PrimaryLink-sc-1v081j9-0 gATBvI bGxijB'}).text

        country_label = film_page_soup.find('h3', string=re.compile('Pays d\'origine :'))
        if country_label:
            film_page_dict['origin_country'] = country_label.find_next_sibling('span').get_text()
        
        return film_page_dict
    
    def get_main_page(self, link, main_dict, film_info_dict):
        film_soup = self.get_full_page(link)

        films = film_soup.find_all('div', class_="ProductListItem__Wrapper-sc-1jkxxpj-1 kusRkg")

        all_films_dict = {}

        for film in films:
            title = self.find_text(film, {'name': 'a', 'attrs': {"data-testid": "product-title"}})

            new_film_dict = {key: self.find_text(film, criteria) for key, criteria in main_dict.items()}

            film_link = film.find('a', {"data-testid": "product-title"})['href']
            new_film_dict.update(self.get_film_page_info(film_link, film_info_dict))
            
            all_films_dict[title] = new_film_dict
        
        return all_films_dict


In [None]:
create_request = RequestSensCritique()

top_111_link = 'films/tops/top111'
films_info_dict = create_request.get_main_page(top_111_link, search_dict_main_page,search_dict_film_page )

In [None]:
# save the data to the json file
with open('film_info.json', 'w') as json_file:
    json.dump(films_info_dict, json_file, indent=4)

## 2. Read and Clean the data

### 2.1 Getting to know the data

In [None]:
df = pd.read_json('film_info.json', orient='index')

df.head()

In [None]:
df.sample(5,random_state=2000)

Noticed Problems from the sample:
1. The first column should be named as `title`, and we should delete the year from the title.
2. `duration` should be converted to `int` for future analysis. For example, `1 h 36 min.` should be converted to `96`.
3. `genres` should use the `exploded` function to save them to separate rows.
4. `stars`, `saves`, and `favorites` should be converted to numerical values.

Check the categorical data.

In [None]:
df['origin_country'].value_counts()

We need to delete the commas in the column of `origin_country`.

In [None]:
df['director'].value_counts()

In [None]:
df['genres'].value_counts()

In [None]:
df.info()

In [None]:
df['film_year']

Summaries from the info:
1. There are two films that don't have `genres`, check the website to validate the correctness. -> They don't have genres on the website as well.
2. `film_group` has lots of empty data, so I won't use it to do the prediction.

In [None]:
empty_genres = df[df['genres'].isna()]

empty_genres.head()

In [None]:
df['rating'].describe()

### 2.2 Clean the Data
Based on what I've observed from the sample data.

In [None]:
# name the first column to title
df.reset_index(inplace=True)

df.rename(columns={'index': 'title'}, inplace=True)


In [None]:
# clean the title without the year
def clean_title(title):
    return re.sub(r' \(\d{4}\)', '', title)
  
    
df['clean_title'] = df['title'].apply(clean_title)

df.head()


In [None]:
# convert duration to numerique values
def convert_to_minutes(duration_str):
    parts = duration_str.split()
    hours = int(parts[0])
    minutes = int(parts[-2])
    total_minutes = hours * 60 + minutes

    return total_minutes

df['duration_minutes'] = df['duration'].apply(convert_to_minutes)

df.head(5)

In [None]:
# get ranking number from ranking column
df['ranking_number'] = df['ranking'].str.split('.').str.get(0)

df.head(5)

In [None]:
# delete the commas in the column of `origin_country`.
df['origin_country_clean'] = df['origin_country'].str.replace(',', ' ')

# clean the extra space here
df['origin_country_clean'] = df['origin_country_clean'].str.strip()

df['origin_country_clean'].value_counts()


In [None]:
# convert stars, saves favorites to numbers
def deal_with_k(number_str):
    if 'K' in number_str:
        parts = number_str.split('K')[0]
        return int(float(parts) * 1000)
    else:
        return int(number_str)

df['stars_number'] = df['stars'].apply(deal_with_k)
df['saves_number'] = df['saves'].apply(deal_with_k)
df['favorites_number'] = df['favorites'].apply(deal_with_k)

df.head(5)


In [None]:
# process genres
df['genres_split'] = df['genres'].str.split(', ')

# Exploding the DataFrame
exploded_df = df.explode('genres_split')

# Performing one-hot encoding
one_hot_encoded_df = pd.get_dummies(exploded_df, columns=['genres_split'], prefix='', prefix_sep='')

# Grouping by title and aggregating the one-hot encoded genres
final_df = one_hot_encoded_df.groupby('title').sum().reset_index()

final_df.head()


## 3. The Movie Popularity Prediction 

### 3.1 Prepare the dataframes for prediction

The reason I selected these features is to analyze the ratings and rankings on this website more effectively. These may be influenced by preferences towards specific directors or countries. 

Additionally, by examining these features, we might uncover relationships between the ratings and various factors such as the number of stars, saves, favorites, and critiques.

In [None]:
features_temp_df = df[['clean_title','director','origin_country_clean', 'genres_split', 'duration_minutes','stars_number','saves_number','favorites_number','critique_number','film_year']]

labels_df = df[['ranking_number', 'rating']]

ranking_df = labels_df['ranking_number']
rating_df = labels_df['rating']

#### 3.1.1 Deal with numerique datas

Have a look at the dataset and drop values that would be outliers.

In [None]:
features_temp_df.head()

In [None]:
features_temp_df.info()

In [None]:
features_temp_df.describe()

In [None]:
features_temp_df.isnull().any()

#### 3.1.2 Deal with categorical data

In [None]:
# check if we have empty values in the categorical data
features_temp_df.isnull().any()

# create a copy of it
features_temp_df = features_temp_df.copy()

features_temp_df[features_temp_df.isnull().values==True]


In [None]:
# fillna instead of dropna, as we need to have the same amout of data in features_df and labels_df
features_temp_df['origin_country_clean'].fillna("empty", inplace = True)
features_temp_df['genres_split'].fillna("empty", inplace = True)


In [None]:
features_temp_df[features_temp_df.isnull().values==True]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Creating an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fitting and transforming the director column
features_temp_df['director_encoded'] = label_encoder.fit_transform(features_temp_df['director'])

features_temp_df.head()


In [None]:
# Getting one-hot encoded columns for origin_country_clean
country_dummies = pd.get_dummies(features_temp_df['origin_country_clean'], prefix='country')

# Concatenating the one-hot encoded columns to the original dataframe
features_temp_df = pd.concat([features_temp_df, country_dummies], axis=1)

features_temp_df.head()

In [None]:
# Exploding the DataFrame
exploded_df = features_temp_df.explode('genres_split')

# Performing one-hot encoding
one_hot_encoded_df = pd.get_dummies(exploded_df, columns=['genres_split'], prefix='', prefix_sep='')

# Grouping by title and aggregating the one-hot encoded genres
features_df = one_hot_encoded_df.groupby('clean_title').sum().reset_index()

features_df.head()


In [None]:
features_df.describe()

In [None]:
features_df['film_year']

In [313]:
# Important step to ensure the labels align with the features.
rating_df = rating_df.loc[features_df.index]
rating_df.head()

0    8.7
1    8.6
2    8.5
3    8.5
4    8.5
Name: rating, dtype: float64

In [314]:
ranking_df = ranking_df.loc[features_df.index]
ranking_df.head()

0    1
1    2
2    3
3    4
4    5
Name: ranking_number, dtype: object

In [None]:
# we assume that they have the same length
assert len(features_df) == len(rating_df)

### 3.2 Exploratory Data Analysis

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features_df, ranking_df, test_size=0.2, random_state = 42)

In [None]:
x_train['film_year']

In [None]:
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = 12, 12

fig, axs = plt.subplots(3, 2)
fig.suptitle('Numerical Feature Histograms', y=1.05, fontsize=16)

axs[0, 0].hist(x_train['film_year'].values, bins=30, color='salmon')
axs[0, 0].set_title('film_year')

axs[0, 1].hist(x_train['duration_minutes'].values, bins=30, color='salmon')
axs[0, 1].set_title('duration_minutes')

axs[1, 0].hist(x_train['stars_number'].values, bins=30, color='salmon')
axs[1, 0].set_title('stars_number')

axs[1, 1].hist(x_train['saves_number'].values, bins=30, color='salmon')
axs[1, 1].set_title('saves_number')

axs[2, 0].hist(x_train['favorites_number'].values, bins=30, color='salmon')
axs[2, 0].set_title('favorites_number')

axs[2, 1].hist(x_train['critique_number'].values, bins=30, color='salmon')
axs[2, 1].set_title('critique_number')

plt.tight_layout()
plt.show()

Except for `film_year`, other plots seem to be skewed distribution. 