Library import

In [1]:
import csv

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

We first extract a list of movies.
Here are the parameters for the query:

In [2]:
# base URL of the site being scraped; every relative link is prefixed with it
site = 'https://www.imdb.com'

# title types included in the search (movies only)
title_types = ['feature', 'tv_movie']

# release-year windows searched one at a time: one per decade from the 1920s
# through the 2000s, followed by a final window covering 2010-2020
year_intervals = [(decade, decade + 9) for decade in range(1920, 2010, 10)]
year_intervals.append((2010, 2020))

# lowest rating a title may have to be considered (stored as a string)
min_rating = '7.0'

# lowest number of votes a title may have to be considered (stored as a string)
min_num_votes = '100000'

Scrape movie links

In [3]:
# Collect the absolute link of every title returned by the search, one year
# interval at a time, following the result pages until an empty page is reached.
movie_links = []

# query-string pieces that are the same for every interval and every page
query_link = '/search/title/?'
title_type = 'title_type=' + ','.join(title_types)
# NOTE(review): the trailing comma presumably leaves the upper bound open — confirm
user_rating = f'user_rating={min_rating},'
num_votes = f'num_votes={min_num_votes},'
sort = 'sort=user_rating,desc'
ref = 'ref_=adv_nxt'

for first_year, last_year in year_intervals:
    release_date = f'release_date={first_year}-01-01,{last_year}-12-31'

    # this variable indicates the starting position in the query result;
    # it advances by 50 because the website only allows 50 results per page
    start = 1

    while True:
        # compute the link for the current page of results
        num_start = f'start={start}'
        query = site + query_link + '&'.join([title_type, release_date, user_rating, num_votes, sort, num_start, ref])

        # a timeout keeps one stalled connection from hanging the whole cell
        page = requests.get(query, timeout=30)
        # fail loudly on an HTTP error: an error page contains no result
        # headers and would otherwise be mistaken for "no more results",
        # silently dropping the rest of this interval
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # find the list of movie links on this page
        link_list = [site + h3.find('a')['href'] for h3 in soup.findAll('h3', {'class': 'lister-item-header'})]

        # an empty page means we have walked past the last result
        if not link_list:
            break

        movie_links.extend(link_list)
        start += 50

For every movie, collect the users listed on the first page of its reviews

In [4]:
# maps each movie link to the list of user profile links found on its reviews page
user_links = {}

for movie_link in tqdm(movie_links):
    reviews_url = movie_link + 'reviews'
    response = requests.get(reviews_url)
    reviews_soup = BeautifulSoup(response.content, 'html.parser')

    # every reviewer name is wrapped in a span holding the link to the profile
    reviewer_links = []
    for name_span in reviews_soup.findAll('span', {'class': 'display-name-link'}):
        profile_href = name_span.find('a')['href']
        reviewer_links.append(site + profile_href)

    user_links[movie_link] = reviewer_links

100%|██████████| 1132/1132 [17:32<00:00,  1.08it/s]


For each of the first 300 movies, scrape one user and all of their ratings

In [17]:
def _ratings_on_page(soup, known_movies):
    """Return ``{movie_link: rating_text}`` for the items listed on one ratings page.

    Only items whose absolute link is contained in ``known_movies`` are kept.
    """
    ratings = {}
    for div in soup.findAll('div', {'class': 'lister-item-content'}):
        movie = site + div.find('h3', {'class': 'lister-item-header'}).find('a')['href']
        if movie in known_movies:
            # NOTE(review): index 1 is presumably the user's own rating (index 0
            # being another rating shown for the item) — confirm against the markup
            ratings[movie] = div.findAll('span', {'class': 'ipl-rating-star__rating'})[1].text
    return ratings


# users whose ratings were scraped, in the order they were scraped
users = []
# set mirrors of the lists above/earlier, for constant-time membership tests
seen_users = set()
known_movies = set(movie_links)

# we write the results in a file
with open('ratings.csv', 'a') as file:
    # the file is opened for appending, so only emit the header when it is
    # still empty; otherwise re-running the cell would put a second header
    # line in the middle of the data
    if file.tell() == 0:
        file.write('user,movie,rating\n')

    for movie_index, movie_link in enumerate(movie_links[:300]):
        for user_index, user_link in enumerate(user_links[movie_link]):
            if user_link in seen_users:
                continue

            link = user_link + 'ratings'
            scrapped_one_user = False
            starting_position = 0

            # while we can find new links for the ratings of a user
            while link:
                print(movie_index, user_index, user_link, starting_position)

                # a timeout keeps one stalled connection from hanging the whole cell
                page = requests.get(link, timeout=30)

                # some users have a 'private' profile and we get an error
                # if we try to access their ratings
                if page.status_code != 200:
                    break

                # record the user once, not once per page of ratings
                if not scrapped_one_user:
                    users.append(user_link)
                    seen_users.add(user_link)
                    scrapped_one_user = True

                soup = BeautifulSoup(page.content, 'html.parser')

                for movie, rating in _ratings_on_page(soup, known_movies).items():
                    file.write(f'{user_link},{movie},{rating}\n')

                list_pagination = soup.find('div', {'class': 'list-pagination'})
                if list_pagination is None:
                    break

                # test the raw href: once the site prefix has been added the
                # value can never equal '#', so the last page would not be detected
                next_href = list_pagination.findAll('a')[1]['href']
                if next_href == '#':
                    break

                link = site + next_href
                starting_position += 100

            # if we scraped one user for this movie, we continue to the next movie
            if scrapped_one_user:
                break

289 0 https://www.imdb.com/user/ur0446812/ 0
289 1 https://www.imdb.com/user/ur1002035/ 0
289 3 https://www.imdb.com/user/ur0175770/ 0
289 4 https://www.imdb.com/user/ur0279405/ 0
289 5 https://www.imdb.com/user/ur5400700/ 0
289 7 https://www.imdb.com/user/ur5295094/ 0
289 8 https://www.imdb.com/user/ur2512513/ 0
289 9 https://www.imdb.com/user/ur3499855/ 0
289 10 https://www.imdb.com/user/ur4111911/ 0
289 11 https://www.imdb.com/user/ur4493111/ 0
289 12 https://www.imdb.com/user/ur6643268/ 0
289 12 https://www.imdb.com/user/ur6643268/ 100
289 12 https://www.imdb.com/user/ur6643268/ 200
289 12 https://www.imdb.com/user/ur6643268/ 300
289 12 https://www.imdb.com/user/ur6643268/ 400
289 12 https://www.imdb.com/user/ur6643268/ 500
289 12 https://www.imdb.com/user/ur6643268/ 600
289 12 https://www.imdb.com/user/ur6643268/ 700
289 12 https://www.imdb.com/user/ur6643268/ 800
289 12 https://www.imdb.com/user/ur6643268/ 900
289 12 https://www.imdb.com/user/ur6643268/ 1000
289 12 https://www.im

Make sure that the 300 users scraped cover all of the movies

In [22]:
# load the scraped ratings back from disk
df = pd.read_csv('ratings.csv')

# count the distinct users and the distinct movies present in the ratings
n_users = len(df['user'].unique())
n_movies = len(df['movie'].unique())

print(f'Number of unique movies reviewed by {n_users}: {n_movies}/{len(movie_links)}')

Number of unique movies reviewed by 300: 1132/1132


In [31]:
# Scrape the title, year and poster link of every movie into titles.csv.
# The file is opened once, as UTF-8, and rows go through csv.writer so that
# titles containing commas or quotes are quoted instead of breaking the columns.
with open('titles.csv', 'w', newline='', encoding='utf-8') as title_file:
    title_writer = csv.writer(title_file, lineterminator='\n')
    title_writer.writerow(['link', 'title', 'year', 'poster_html'])

    for movie_link in tqdm(movie_links):
        # a timeout keeps one stalled connection from hanging the whole cell
        page = requests.get(movie_link, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        # the heading text is split on the non-breaking space; the code treats
        # the first part as the title and the second as the year
        title_wrapper = soup.find('div', {'class': 'title_wrapper'}).find('h1').text.split(u'\xa0')

        # keep the title as text: encoding it to bytes made the f-string
        # write the bytes repr (b'...') into the file
        title = title_wrapper[0]
        # drop the first character and the last two around the year
        # NOTE(review): presumably '(' and ') ' — confirm against the page markup
        year = title_wrapper[1][1:-2]

        poster_link = soup.find('div', {'class': 'poster'}).find('img')['src']

        title_writer.writerow([movie_link, title, year, poster_link])







  0%|          | 0/1132 [00:00<?, ?it/s][A[A[A[A[A[A





  0%|          | 1/1132 [00:00<16:02,  1.17it/s][A[A[A[A[A[A





  0%|          | 2/1132 [00:03<24:52,  1.32s/it][A[A[A[A[A[A





  0%|          | 3/1132 [00:04<23:06,  1.23s/it][A[A[A[A[A[A





  0%|          | 4/1132 [00:05<22:01,  1.17s/it][A[A[A[A[A[A





  0%|          | 5/1132 [00:06<21:34,  1.15s/it][A[A[A[A[A[A





  1%|          | 6/1132 [00:07<20:04,  1.07s/it][A[A[A[A[A[A





  1%|          | 7/1132 [00:08<20:44,  1.11s/it][A[A[A[A[A[A





  1%|          | 8/1132 [00:09<22:26,  1.20s/it][A[A[A[A[A[A





  1%|          | 9/1132 [00:11<22:21,  1.19s/it][A[A[A[A[A[A





  1%|          | 10/1132 [00:12<23:34,  1.26s/it][A[A[A[A[A[A





  1%|          | 11/1132 [00:13<24:00,  1.28s/it][A[A[A[A[A[A





  1%|          | 12/1132 [00:14<22:31,  1.21s/it][A[A[A[A[A[A





  1%|          | 13/1132 [00:15<20:41,  1.11s/it][A[A[A[A[