<img style="display: block; margin-left: auto; margin-right: auto; width: 100%;" src="https://dilancroos.com/mind_map.jpg" alt="banner">

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

# Listing page to scrape (SensCritique "top111" films ranking).
url_sc = 'https://www.senscritique.com/films/tops/top111'

# A timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting later cells parse an error page and silently build empty tables.
page_sc = requests.get(url_sc, timeout=30)
page_sc.raise_for_status()
soup = BeautifulSoup(page_sc.content, "html.parser")

In [2]:
def _split_title_year(label):
    """Split a listing label such as 'Title (1957)' into (title, year).

    The split happens at the LAST '(' so a title that itself contains
    parentheses is kept whole. When the label has no parenthesised suffix
    the stripped label is returned with an empty year.
    """
    head, sep, tail = label.rpartition('(')
    if not sep:
        return label.strip(), ''
    return head.strip(), tail.split(')')[0].strip()


def _duration_to_minutes(text):
    """Convert a runtime such as '1 h 36 min', '2 h' or '45 min' to minutes."""
    hours, sep, rest = text.partition('h')
    if not sep:  # no hour part at all, e.g. '45 min'
        hours, rest = '0', text
    minutes = rest.split('min')[0].strip()
    # A whole-hour runtime ('2 h') has no minutes part; count it as 0
    # instead of failing on int('').
    return int(hours) * 60 + (int(minutes) if minutes else 0)


movie_id = [] # movie id; last path segment of the movie link
titles = [] # movie title
ratings = [] # movie rating (text as shown on the page)
director_links = [] # director link
movie_links = [] # movie link
poster_links = [] # poster link
movie_year = [] # movie year
duration_in_min = [] # movie duration in minutes
genres = [] # movie genres (comma-separated text)
uniq_genres = [] # unique movie genres, in order of first appearance

films = soup.find_all('div', class_="ProductListItem__Wrapper-sc-1jkxxpj-1 kusRkg")
website_url = 'https://www.senscritique.com' # Base url

for film in films: # Loop through each film
    # Parse every field first ...
    title_text = film.find('a', attrs={'data-testid': 'product-title'}).text
    title, year = _split_title_year(title_text)
    rating = film.find('div', attrs={'data-testid': 'Rating'}).text
    director_link = website_url + film.find('a', attrs={'data-testid': 'link'}).get('href')
    movie_link = website_url + film.find('a', attrs={'data-testid': 'poster'}).get('href')
    poster_link = film.find('span', attrs={'data-testid': 'poster-img-wrapper'}).get('data-srcname')
    duration = _duration_to_minutes(film.find('span', attrs={'data-testid': 'duration'}).text)
    genres_str = film.find('span', attrs={'data-testid': 'genres'}).text

    # ... and append only afterwards, so a film that fails to parse cannot
    # leave the column lists with different lengths.
    movie_id.append(movie_link.split('/')[-1])
    titles.append(title)
    movie_year.append(year)
    ratings.append(rating)
    duration_in_min.append(duration)
    genres.append(genres_str)
    director_links.append(director_link)
    movie_links.append(movie_link)
    poster_links.append(poster_link)

    for genre in genres_str.split(', '): # collect genres not seen before
        if genre not in uniq_genres:
            uniq_genres.append(genre)


# Create a dataframe with the extracted data
df_extracted_films = pd.DataFrame(
    {
        'movie_id': movie_id,
        'titles': titles,
        'movie_year': movie_year,
        'ratings': ratings,
        'duration_in_mins': duration_in_min,
        'genres': genres,
        'director_links': director_links,
        'movie_links': movie_links,
        'poster_links': poster_links
    })
df_extracted_films.head()

Unnamed: 0,movie_id,titles,movie_year,ratings,duration_in_mins,genres,director_links,movie_links,poster_links
0,370894,Douze Hommes en colère,1957,8.7,96,"Policier, Drame",https://www.senscritique.com/contact/Sidney_Lu...,https://www.senscritique.com/film/douze_hommes...,https://media.senscritique.com/media/000017381...
1,402373,Harakiri,1962,8.6,133,Drame,https://www.senscritique.com/contact/Masaki_Ko...,https://www.senscritique.com/film/harakiri/402373,https://media.senscritique.com/media/000015754...
2,42244431,Blade Runner : The Final Cut,2007,8.5,117,Science-fiction,https://www.senscritique.com/contact/Ridley_Sc...,https://www.senscritique.com/film/blade_runner...,https://media.senscritique.com/media/000019400...
3,368376,"Le Bon, la Brute et le Truand",1966,8.5,179,"Western, Aventure",https://www.senscritique.com/contact/Sergio_Le...,https://www.senscritique.com/film/le_bon_la_br...,https://media.senscritique.com/media/000008032...
4,368097,Barberousse,1965,8.5,185,Drame,https://www.senscritique.com/contact/Akira_Kur...,https://www.senscritique.com/film/barberousse/...,https://media.senscritique.com/media/000019677...


In [3]:
# Single-column table holding the distinct genres collected in `uniq_genres`
df_uniq_genres = pd.DataFrame(dict(genres=uniq_genres))
df_uniq_genres

Unnamed: 0,genres
0,Policier
1,Drame
2,Science-fiction
3,Western
4,Aventure
5,Arts martiaux
6,Gangster
7,Thriller
8,Animation
9,Fantasy


In [4]:
# Preview: print the movie and director URLs of the first five films
preview = df_extracted_films[['movie_links', 'director_links']].head(5)
for movie_url, director_url in preview.itertuples(index=False, name=None):
    print(f'movie: {movie_url}\ndirector: {director_url}\n')

movie: https://www.senscritique.com/film/douze_hommes_en_colere/370894
director: https://www.senscritique.com/contact/Sidney_Lumet/7800

movie: https://www.senscritique.com/film/harakiri/402373
director: https://www.senscritique.com/contact/Masaki_Kobayashi/20521

movie: https://www.senscritique.com/film/blade_runner_the_final_cut/42244431
director: https://www.senscritique.com/contact/Ridley_Scott/573

movie: https://www.senscritique.com/film/le_bon_la_brute_et_le_truand/368376
director: https://www.senscritique.com/contact/Sergio_Leone/3267

movie: https://www.senscritique.com/film/barberousse/368097
director: https://www.senscritique.com/contact/Akira_Kurosawa/1114



In [5]:
def k_to_int(x):
    """Convert a count string such as '464' or '10.7K' to an int.

    A value containing 'K' is read as thousands: the text before the 'K'
    is parsed as a float and multiplied by 1000. Any other value is passed
    straight to int().

    The product is rounded rather than truncated because binary floating
    point can land just below the intended integer (32.3 * 1000 evaluates
    to 32299.999999999996), and int() alone would turn that into 32299.

    Raises ValueError if the numeric part cannot be parsed.
    """
    if 'K' in x:
        return round(float(x.split('K')[0]) * 1000)
    return int(x)

In [6]:
actions_no = [] # number of actions
want_to_see_no = [] # number of people who want to see the movie
favorites_no = [] # number of people who have the movie in their favorites
technical_sheet_links = [] # link to the technical sheet
native_country = [] # native country of the movie
num_reviews = [] # number of reviews
actors_link_obj = [] # one list of actor links per movie

# Supporting list for native_country
elements = []

for movie_link in df_extracted_films['movie_links']: # Loop through each movie link
    # timeout: a stalled connection raises instead of hanging the kernel
    page_movie = requests.get(movie_link, timeout=30)
    soup_movie = BeautifulSoup(page_movie.content, "html.parser")

    stats = soup_movie.find_all('div', attrs={'data-testid':'stats'})

    both = [] # values of the stats rendered with the 'hzycEu' class, in page order
    for stat in stats: # Loop through each stat (actions, want to see, favorites)
        # A stat carries its number in one of two <p> class variants. Test for
        # the first explicitly rather than relying on a bare `except:` to
        # detect its absence (which also swallowed unrelated errors).
        primary = stat.find('p', attrs={'class': 'Text__SCText-sc-1aoldkr-0 Stats__Text-sc-1u6v943-2 gATBvI hzycEu'})
        if primary is not None:
            both.append(k_to_int(primary.text)) # converting the number of "actions" from K to int
        else:
            val2 = stat.find('p', attrs={'class': 'Text__SCText-sc-1aoldkr-0 Stats__Text-sc-1u6v943-2 gATBvI irORIr'}).text
            favorites_no.append(k_to_int(val2)) # converting the number of "favorites" from K to int

    # Even positions are "actions", odd positions are "want to see"
    actions_no.extend(both[0::2])
    want_to_see_no.extend(both[1::2])

    get_infos = soup_movie.find('div', attrs={'data-testid':'product-infos'})

    # NOTE(review): `elements` receives an entry only when one of the branches
    # below fires, and can receive several for one movie, so it is not
    # guaranteed to stay in step with the other per-movie lists -- confirm
    # against the live page structure.
    for el in get_infos: # Loop through each element in the product infos
        coun_obj = el.find_all('span', attrs={'class': 'Movie__Ellipsis-sc-1tik1a1-4 KQVGi'})
        if len(coun_obj) > 0: # if there is a country object, get the native country
            for co in coun_obj: # Loop through each country object
                if len(coun_obj) < 3: # if there is no native country
                    elements.append("None")
                    break
                if "Pays d'origine" in co.text: # if there is a native country
                    country = co.text.split("Pays d'origine : ")[1]
                    elements.append(country)

    # get the number of reviews: third space-separated token of the button label
    review_sec = soup_movie.find('div', attrs={'class':'ProductReviews__ButtonWrapper-sc-mgib11-4 ffAAmZ'})
    num_reviews.append(int(review_sec.find('a', attrs={'data-testid':'button'}).text.split(' ')[2]))

    # get the technical sheet link
    technical_sheet_links.append(movie_link+'/details')

    # getting info from technical sheet (no longer overwrites `page_movie`)
    technical_sheet = requests.get(movie_link+'/details', timeout=30)
    soup_sheet = BeautifulSoup(technical_sheet.content, "html.parser")
    prim_actors = soup_sheet.find_all('div', attrs={'class':'ContactCard__Name-sc-3teq8m-1 FRNZA'})
    secon_actors = soup_sheet.find_all('div', attrs={'data-testid':'more-casting'})

    # Primary actor cards first, then the 'more-casting' blocks; `find` takes
    # only the first link inside each card/block.
    actors_link = [] # actors link
    for card in [*prim_actors, *secon_actors]:
        contact_href = card.find('a', attrs={'data-testid': 'link'}).get('href')
        actors_link.append(website_url + contact_href)

    actors_link_obj.append(actors_link) # append both primary and secondary actors link to a list

native_country.extend(elements) # copy the collected countries into native_country

In [11]:
actor_id = [] # actor id; last path segment of the actor link
actor_name = [] # actor name; path segment 4 of the actor link
actor_link = [] # actor link
actor_likes_obj = [] # actor likes

# Likes already fetched, keyed by actor link, so an actor credited in several
# movies costs one HTTP request instead of one per credit.
likes_by_actor_link = {}

for movie_actor_links in actors_link_obj: # one list of actor links per movie
    for actor in movie_actor_links: # Loop through each actor link for each movie
        if actor not in likes_by_actor_link:
            # timeout: a stalled connection raises instead of hanging the kernel
            page_actor = requests.get(actor, timeout=30)
            soup_actor = BeautifulSoup(page_actor.content, "html.parser")
            # first space-separated token of the stats text is the like count
            actor_likes = soup_actor.find('div', attrs={'data-testid':'stats'}).text.split(' ')[0]
            likes_by_actor_link[actor] = k_to_int(actor_likes) # converting the "number of actor likes" from K to int

        # Append only after the likes are known so the four lists stay aligned
        actor_id.append(actor.split('/')[-1]) # actor id
        actor_name.append(actor.split('/')[4]) # actor name
        actor_link.append(actor) # actor link
        actor_likes_obj.append(likes_by_actor_link[actor]) # actor likes

In [12]:
# Actors table built from the scraped lists, ordered by likes (highest first)
actor_columns = dict(
    actor_id=actor_id,
    actor_name=actor_name,
    actor_link=actor_link,
    likes=actor_likes_obj,
)
df_actors_extracted = pd.DataFrame(actor_columns).sort_values(by=['likes'], ascending=False)
df_actors_extracted.head()

Unnamed: 0,actor_id,actor_name,actor_link,likes
726,26764,Quentin_Tarantino,https://www.senscritique.com/contact/Quentin_T...,10700
69,2888,clint_eastwood,https://www.senscritique.com/contact/clint_eas...,6200
842,13,robert_de_niro,https://www.senscritique.com/contact/robert_de...,5700
589,13,robert_de_niro,https://www.senscritique.com/contact/robert_de...,5700
754,13,robert_de_niro,https://www.senscritique.com/contact/robert_de...,5700


In [23]:
director_id = [] # director id; last path segment of the director link
director_name = [] # director name; path segment 4 of the director link
director_likes_obj = [] # director likes

# Likes already fetched, keyed by director link, so a director with several
# films in the list costs one HTTP request instead of one per film.
likes_by_director_link = {}

for director_link in df_extracted_films['director_links']: # Loop through each director link
    if director_link not in likes_by_director_link:
        # timeout: a stalled connection raises instead of hanging the kernel
        director_page = requests.get(director_link, timeout=30)
        soup_director = BeautifulSoup(director_page.content, "html.parser")
        # first space-separated token of the stats text is the like count
        director_likes = soup_director.find('div', attrs={'data-testid':'stats'}).text.split(' ')[0]
        likes_by_director_link[director_link] = k_to_int(director_likes) # converting the number of director likes from K to int

    # Append only after the likes are known so the three lists stay aligned
    director_id.append(director_link.split('/')[-1]) # director id
    director_name.append(director_link.split('/')[4]) # director name
    director_likes_obj.append(likes_by_director_link[director_link]) # director likes

# Create directors dataframe with the extracted data
df_directors_extracted = pd.DataFrame(
    {
        'director_id': director_id,
        'director_name': director_name,
        'director_link': director_links,
        'likes': director_likes_obj
    }).sort_values(by=['likes'], ascending=False) # sort by likes in descending order
df_directors_extracted.head()

Unnamed: 0,director_id,director_name,director_link,likes
22,26764,Quentin_Tarantino,https://www.senscritique.com/contact/Quentin_T...,10700
12,31665,Hayao_Miyazaki,https://www.senscritique.com/contact/Hayao_Miy...,8800
11,31665,Hayao_Miyazaki,https://www.senscritique.com/contact/Hayao_Miy...,8800
28,2101,Martin_Scorsese,https://www.senscritique.com/contact/Martin_Sc...,7600
46,1129,Stanley_Kubrick,https://www.senscritique.com/contact/Stanley_K...,7200


In [14]:
# Create a dataframe with the extracted data
df_extracted_films = pd.DataFrame({
    'movie_id': movie_id,
    'titles': titles, 
    'movie_year': movie_year, 
    'ratings': ratings,
    'duration_in_mins': duration_in_min, 
    'genres': genres,
    'director_links': director_links,
    'director_likes': director_likes_obj, 
    'movie_links': movie_links, 
    'poster_links': poster_links,
    'actions_no': actions_no,
    'want_to_see_no': want_to_see_no,
    'favorites_no': favorites_no,
    'reviews_no': num_reviews,
    'native_country': native_country,
    'technical_sheet_links': technical_sheet_links,
    'actors_links': actors_link_obj
  })
df_extracted_films.head()

Unnamed: 0,movie_id,titles,movie_year,ratings,duration_in_mins,genres,director_links,director_likes,movie_links,poster_links,actions_no,want_to_see_no,favorites_no,reviews_no,native_country,technical_sheet_links,actors_links
0,370894,Douze Hommes en colère,1957,8.7,96,"Policier, Drame",https://www.senscritique.com/contact/Sidney_Lu...,1600,https://www.senscritique.com/film/douze_hommes...,https://media.senscritique.com/media/000017381...,51800,16400,6800,592,États-Unis,https://www.senscritique.com/film/douze_hommes...,[https://www.senscritique.com/contact/henry_fo...
1,402373,Harakiri,1962,8.6,133,Drame,https://www.senscritique.com/contact/Masaki_Ko...,464,https://www.senscritique.com/film/harakiri/402373,https://media.senscritique.com/media/000015754...,8200,12700,1500,115,Japon,https://www.senscritique.com/film/harakiri/402...,[https://www.senscritique.com/contact/tatsuya_...
2,42244431,Blade Runner : The Final Cut,2007,8.5,117,Science-fiction,https://www.senscritique.com/contact/Ridley_Sc...,3600,https://www.senscritique.com/film/blade_runner...,https://media.senscritique.com/media/000019400...,1200,274,111,3,,https://www.senscritique.com/film/blade_runner...,[https://www.senscritique.com/contact/harrison...
3,368376,"Le Bon, la Brute et le Truand",1966,8.5,179,"Western, Aventure",https://www.senscritique.com/contact/Sergio_Le...,2800,https://www.senscritique.com/film/le_bon_la_br...,https://media.senscritique.com/media/000008032...,54500,13600,5500,379,"Italie, Espagne, Allemagne, États-Unis",https://www.senscritique.com/film/le_bon_la_br...,[https://www.senscritique.com/contact/clint_ea...
4,368097,Barberousse,1965,8.5,185,Drame,https://www.senscritique.com/contact/Akira_Kur...,2700,https://www.senscritique.com/film/barberousse/...,https://media.senscritique.com/media/000019677...,4300,7700,833,88,Japon,https://www.senscritique.com/film/barberousse/...,[https://www.senscritique.com/contact/toshiro_...


In [15]:
# Movie/actor link table: one row per (movie_id, actor link) pair
movie_actor_pairs = df_extracted_films[['movie_id', 'actors_links']].explode('actors_links')
df_movies_actors = movie_actor_pairs.rename(columns={'actors_links': 'actor_link'})

# Derive the actor id (last URL segment) and actor name (URL segment 4) from
# the link, then drop the link itself and renumber the rows
link_segments = df_movies_actors['actor_link'].str.split('/')
df_movies_actors_cleaned = (
    df_movies_actors
    .assign(actor_id=link_segments.str[-1], actor_name=link_segments.str[4])
    .drop(columns=['actor_link'])
    .reset_index(drop=True)
)
df_movies_actors_cleaned.head()

Unnamed: 0,movie_id,actor_id,actor_name
0,370894,6117,henry_fonda
1,370894,3063,Lee_J_Cobb
2,370894,4165,ed_begley
3,370894,3330,e_g_marshall
4,370894,4650,jack_warden


In [16]:
from IPython.display import Image

# Print the links of the film in row 0, then render its poster from the URL
first_film = df_extracted_films.loc[0]
print(f"movie: {first_film['movie_links']}\ndirector: {first_film['director_links']}")

Image(url=first_film['poster_links']) # get the poster

movie: https://www.senscritique.com/film/douze_hommes_en_colere/370894
director: https://www.senscritique.com/contact/Sidney_Lumet/7800


### Dataframes

In [24]:
# Actors table: first five rows
df_actors_extracted.head(n=5)

Unnamed: 0,actor_id,actor_name,actor_link,likes
726,26764,Quentin_Tarantino,https://www.senscritique.com/contact/Quentin_T...,10700
69,2888,clint_eastwood,https://www.senscritique.com/contact/clint_eas...,6200
842,13,robert_de_niro,https://www.senscritique.com/contact/robert_de...,5700
589,13,robert_de_niro,https://www.senscritique.com/contact/robert_de...,5700
754,13,robert_de_niro,https://www.senscritique.com/contact/robert_de...,5700


In [25]:
# Directors table: first five rows
df_directors_extracted.head(n=5)

Unnamed: 0,director_id,director_name,director_link,likes
22,26764,Quentin_Tarantino,https://www.senscritique.com/contact/Quentin_T...,10700
12,31665,Hayao_Miyazaki,https://www.senscritique.com/contact/Hayao_Miy...,8800
11,31665,Hayao_Miyazaki,https://www.senscritique.com/contact/Hayao_Miy...,8800
28,2101,Martin_Scorsese,https://www.senscritique.com/contact/Martin_Sc...,7600
46,1129,Stanley_Kubrick,https://www.senscritique.com/contact/Stanley_K...,7200


In [19]:
# Films table: first five rows
df_extracted_films.head(n=5)

Unnamed: 0,movie_id,titles,movie_year,ratings,duration_in_mins,genres,director_links,director_likes,movie_links,poster_links,actions_no,want_to_see_no,favorites_no,reviews_no,native_country,technical_sheet_links,actors_links
0,370894,Douze Hommes en colère,1957,8.7,96,"Policier, Drame",https://www.senscritique.com/contact/Sidney_Lu...,1600,https://www.senscritique.com/film/douze_hommes...,https://media.senscritique.com/media/000017381...,51800,16400,6800,592,États-Unis,https://www.senscritique.com/film/douze_hommes...,[https://www.senscritique.com/contact/henry_fo...
1,402373,Harakiri,1962,8.6,133,Drame,https://www.senscritique.com/contact/Masaki_Ko...,464,https://www.senscritique.com/film/harakiri/402373,https://media.senscritique.com/media/000015754...,8200,12700,1500,115,Japon,https://www.senscritique.com/film/harakiri/402...,[https://www.senscritique.com/contact/tatsuya_...
2,42244431,Blade Runner : The Final Cut,2007,8.5,117,Science-fiction,https://www.senscritique.com/contact/Ridley_Sc...,3600,https://www.senscritique.com/film/blade_runner...,https://media.senscritique.com/media/000019400...,1200,274,111,3,,https://www.senscritique.com/film/blade_runner...,[https://www.senscritique.com/contact/harrison...
3,368376,"Le Bon, la Brute et le Truand",1966,8.5,179,"Western, Aventure",https://www.senscritique.com/contact/Sergio_Le...,2800,https://www.senscritique.com/film/le_bon_la_br...,https://media.senscritique.com/media/000008032...,54500,13600,5500,379,"Italie, Espagne, Allemagne, États-Unis",https://www.senscritique.com/film/le_bon_la_br...,[https://www.senscritique.com/contact/clint_ea...
4,368097,Barberousse,1965,8.5,185,Drame,https://www.senscritique.com/contact/Akira_Kur...,2700,https://www.senscritique.com/film/barberousse/...,https://media.senscritique.com/media/000019677...,4300,7700,833,88,Japon,https://www.senscritique.com/film/barberousse/...,[https://www.senscritique.com/contact/toshiro_...


In [20]:
# Movie/actor link table: first five rows
df_movies_actors_cleaned.head(n=5)

Unnamed: 0,movie_id,actor_id,actor_name
0,370894,6117,henry_fonda
1,370894,3063,Lee_J_Cobb
2,370894,4165,ed_begley
3,370894,3330,e_g_marshall
4,370894,4650,jack_warden
