# Scraping senscritique.com: movie and music.

In this notebook, we write a scraper that generates a dataset containing the preferences of senscritique.com's users in terms of movies and songs. The resulting dataset is a dictionary in which each key is a username and each value is a dataframe.

In [2]:
# All of the libraries we require
import pandas as pd
import numpy as np
import re
import requests
from requests import Session
from bs4 import BeautifulSoup
from lxml import html
import time 

In [3]:
# Profile paths of the Senscritique.com users, one per line of the text file.
# The file itself was produced by a separate scraping pass, which is straightforward to reproduce.
with open('url_users_senscritique.txt', 'r') as users_file:
    # Drop the trailing newline (and any surrounding whitespace) of every line.
    users = list(map(str.strip, users_file))

# Single-column frame, used here only to preview the list.
users_df = pd.DataFrame(users, columns=['users'])
display(users_df)

Unnamed: 0,users
0,/Chlo%C3%A9Pzk
1,/Electra
2,/Fanny__Boutonn%C3%A9
3,/mikmikmik
4,/Ondine26
...,...
47405,/papa53
47406,/GregBodaway
47407,/Adrien_Plaine
47408,/Tib%C3%A8reDebout%C3%A9


In [4]:
# Some functions we will need during the computation of the entries

def parser_rating(rate):
    """Convert a scraped rating string into a float.

    Parameters
    ----------
    rate : str
        Text extracted from the page.

    Returns
    -------
    float
        ``np.nan`` when ``rate`` is one of the placeholders ``''``, ``'-'``
        or ``' '``; otherwise ``float(rate)``.

    Raises
    ------
    ValueError
        If ``rate`` is not a placeholder and cannot be parsed by ``float``.
    """
    is_placeholder = rate in {'', '-', ' '}
    return np.nan if is_placeholder else float(rate)

def parser_year(rate):
    """Convert a scraped string into an integer.

    Parameters
    ----------
    rate : str
        Text extracted from the page (a year, judging by the function name).

    Returns
    -------
    int or float
        ``np.nan`` when ``rate`` is one of the placeholders ``''``, ``'-'``
        or ``' '``; otherwise ``int(rate)``.

    Raises
    ------
    ValueError
        If ``rate`` is not a placeholder and cannot be parsed by ``int``.
    """
    is_placeholder = rate in {'', '-', ' '}
    return np.nan if is_placeholder else int(rate)

In [11]:
# SCRAPING EVERY RATING OF EVERY USER ON SENSCRITIQUE WITH AT LEAST ONE MOVIE AND ONE MUSIC REVIEW

# Before scraping, we need to be logged in. Otherwise, profiles set to 'private' are not visible.
# An account is required to run this code, as most profiles will not show up otherwise.
# Once you have created an account, log in through your web browser and use the inspector
# to obtain the cookie and header values, then fill them in below.
# The structure of both dictionaries is kept so the code can be reproduced; the author's own values were removed.
cookies = {
    'SC_DEVICE_CATEGORY': '',
    'SC_SESSIONS_ID': '',
    'blocksPush': '',
    'SC_SHOW_MODAL_LOGIN': '',
    'SC_ID_TOKEN': '',
    'SC_REFRESH_TOKEN': '',
    'SC_AUTH': '',
    'SC_AUTH_UID': '',
}

headers = {
    'User-Agent': '',
    'Accept': '',
    'Accept-Language': '',
    'Accept-Encoding': '',
    'Referer': '',
    'DNT': '',
    'Connection': '',
    'Upgrade-Insecure-Requests': '',
    'Sec-Fetch-Dest': '',
    'Sec-Fetch-Mode': '',
    'Sec-Fetch-Site': '',
    'Sec-Fetch-User': '',
}

login_url='https://www.senscritique.com/auth/login'

BASE_URL = 'https://www.senscritique.com/'

# Pause (in seconds) applied before and after every GET request.
REQUEST_DELAY = 0.01

# One entry per scraped collection, in the order the dataframes are stored for each user:
# (value written to the 'Category' column, URL suffix to which the page number is appended).
COLLECTIONS = [
    ('movie', '/collection/all/films/all/all/all/all/all/all/all/page-'),
    ('tracks', '/collection/all/morceaux/all/all/all/all/all/all/list/page-'),
    ('albums', '/collection/all/albums/all/all/all/all/all/all/list/page-'),
]


def _strip_layout(text):
    """Remove the newline and tab characters from a scraped text node."""
    return text.replace('\n', '').replace('\t', '')


def _fetch_soup(session, url):
    """GET ``url`` with the notebook's ``headers``/``cookies`` and return the parsed HTML.

    The request is surrounded by two ``REQUEST_DELAY`` pauses.
    """
    time.sleep(REQUEST_DELAY)
    response = session.get(url, headers=headers, cookies=cookies)
    time.sleep(REQUEST_DELAY)
    return BeautifulSoup(response.text, 'html.parser')


def _review_counts(profile_soup):
    """Read the 'FILMS' and 'MUSIQUE' values from the statistics pies of a profile page.

    Returns a dict ``{'movie': int, 'music': int}``; a count stays at 0 when the
    corresponding pie (or the whole statistics block) is absent from the page.
    """
    counts = {'movie': 0, 'music': 0}
    pies_blocks = profile_soup.find_all('div', class_='uvi-stats-pies')
    if not pies_blocks:
        return counts
    key_for_label = {'FILMS': 'movie', 'MUSIQUE': 'music'}
    for pie in pies_blocks[0].find_all('a', class_="uvi-stats-pie"):
        key = key_for_label.get(pie['data-sc-pie-label'])
        if key is not None:
            counts[key] = int(pie['data-sc-pie-value'])
    return counts


def _count_pages(soup):
    """Return the number of pages of a collection, read from its pagination bar.

    A page without any pagination item is a single-page collection.
    """
    page_items = soup.find_all('li', class_="eipa-page")
    if not page_items:
        return 1
    # The last pagination item carries the highest page number; dots and newlines are stripped.
    return int(page_items[-1].text.replace('.', '').replace('\n', ''))


def _parse_collection_page(soup, category):
    """Turn one collection page into a list of rating records (one dict per item).

    Each record holds the keys 'Category', 'title', 'misc_info', 'user_rating',
    'total_rating_count' and 'total_rating_average', in that order.

    Raises ``IndexError`` when the page holds fewer titles, misc lines or global
    ratings than user ratings, so that misaligned rows are never produced silently.
    """
    # The item count is half the number of these spans, and the user's rating is
    # read from the second span of each pair.
    action_spans = soup.find_all('span', class_='elrua-useraction-inner only-child')
    n_items = len(action_spans) // 2
    user_ratings = [parser_rating(_strip_layout(action_spans[2 * k + 1].text)) for k in range(n_items)]

    # The link text is the site-wide average; its 'title' attribute holds the number of ratings.
    global_links = soup.find_all('a', class_='erra-global')
    averages = [parser_rating(_strip_layout(link.text)) for link in global_links]
    rating_counts = [
        parser_rating(link['title'].replace('Note globale pondérée sur :', '').replace('avis', ''))
        for link in global_links
    ]

    # Extra descriptive line kept to tell apart items sharing a title
    # (there are about ten different 'Rocky', for instance).
    misc_infos = [_strip_layout(node.text) for node in soup.find_all('p', class_='elco-baseline elco-options')]

    # Looked up once per page rather than once per item.
    title_nodes = soup.find_all('h2', class_='d-heading2 elco-title')
    titles = []
    for k in range(n_items):
        raw_title = title_nodes[k].text
        match = re.search('\n(.*)\n', raw_title)
        # Fall back to the stripped text when the title is not wrapped in newlines.
        titles.append(match.group(1) if match else raw_title.strip())

    return [
        {
            'Category': category,
            'title': titles[k],
            'misc_info': misc_infos[k],
            'user_rating': user_ratings[k],
            'total_rating_count': rating_counts[k],
            'total_rating_average': averages[k],
        }
        for k in range(n_items)
    ]


def _scrape_collection(session, user, category, url_suffix):
    """Download every page of one collection of ``user`` and return it as a DataFrame.

    ``user`` may be given with or without a leading '/'.
    """
    collection_url = BASE_URL + user.lstrip('/') + url_suffix
    # Page 1 provides both the pagination and the first batch of records.
    first_page = _fetch_soup(session, collection_url + '1')
    records = _parse_collection_page(first_page, category)
    for page_number in range(2, _count_pages(first_page) + 1):
        page_soup = _fetch_soup(session, collection_url + str(page_number))
        records.extend(_parse_collection_page(page_soup, category))
    return pd.DataFrame(records)


with Session() as c:
    # NOTE(review): this POST carries no credentials; access to private profiles
    # presumably relies on the cookies above -- confirm.
    c.post(login_url)

    # The object we build is a dictionary whose keys are the usernames and whose values
    # are three-element lists of dataframes: movie ratings, track ratings, album ratings.
    # Albums and tracks will probably be merged later; how is not decided yet.
    dico_users = dict()

    # NOTE(review): only the single entry 'auth' is processed; presumably a placeholder
    # for the full `users` list loaded earlier -- confirm before a full run.
    for user in ['auth']:

        # Keep only the users who have rated both movies and music.
        profile_soup = _fetch_soup(c, BASE_URL + user.lstrip('/'))
        review_counts = _review_counts(profile_soup)
        if review_counts['movie'] == 0 or review_counts['music'] == 0:
            continue

        dico_users[user] = []
        for category, url_suffix in COLLECTIONS:
            dico_users[user].append(_scrape_collection(c, user, category, url_suffix))
