In [1]:
!pip install -U BeautifulSoup4
!pip install -U tqdm

In [1]:
import sqlite3
import numpy as np
import pandas as pd
import requests
from lxml import etree
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

In [2]:
class IMDBScraper:
    """Scrape IMDb's top-rated chart and reshape the result into tables.

    Typical order of use:

    1. ``get_movies_and_directors()`` - scrapes the chart plus every title and
       credits page, caches the per-film details and returns the movies and
       directors tables.
    2. ``get_actors()`` - builds the actors table from the cached details.
    3. ``get_casting()`` - builds the movie/actor link table from the cached
       details and the actors table.

    Steps 2 and 3 only read cached state (``_movie_info`` / ``_actors``) and
    raise ``AttributeError`` if the earlier step has not been run.
    """

    # Keep at most this many cast members per film.
    MAX_ACTORS_PER_MOVIE = 15

    # Titles whose runtime (in minutes) is hard-coded instead of being parsed
    # from the title page.
    RUNTIME_OVERRIDES = {"Das Boot": 149}

    # CSS selector of the runtime entry on a title page.
    RUNTIME_SELECTOR = "#__next > main > div > section.ipc-page-background.ipc-page-background--base > section > div:nth-child(4) > section > section > div > div > ul > li:nth-child(3)"

    # CSS selector of the director link(s) on a full-credits page.
    DIRECTOR_SELECTOR = "#fullcredits_content > table:nth-child(2) > tbody > tr > td.name > a"

    def __init__(self):
        """Prepare the HTTP headers sent with every request."""
        # NOTE(review): the user-agent *value* itself starts with the literal
        # text 'user-agent: '. It is left untouched because changing it could
        # alter how the site responds - confirm whether the prefix is intended.
        self.request_headers = {
            'accept-language': "en-US,en;q=1.0",
            'content-language': 'en-US',
            'user-agent': 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }

    def _get_soup(self, url):
        """Fetch ``url`` with the scraper's headers and return the parsed HTML.

        Raises ``requests.HTTPError`` for 4xx/5xx responses so that an error
        page is never parsed as if it were the expected content.
        """
        response = requests.get(url, headers=self.request_headers)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _parse_runtime(runtime_text):
        """Convert a runtime such as ``'2h 22m'``, ``'2h'`` or ``'45m'`` to minutes.

        Raises ``ValueError`` when the text is empty or contains a component
        that is not ``<number>h`` or ``<number>m``.
        """
        tokens = runtime_text.split()
        if not tokens:
            raise ValueError("empty runtime text")
        total_minutes = 0
        for token in tokens:
            if token.endswith('h'):
                total_minutes += int(token[:-1]) * 60
            elif token.endswith('m'):
                total_minutes += int(token[:-1])
            else:
                raise ValueError(f"unrecognised runtime component {token!r}")
        return total_minutes

    def get_top_rated(self):
        """Scrape the chart page and return it as a DataFrame.

        Returns a frame with the columns ``id`` (1-based position in the
        chart), ``title``, ``release_year`` and ``rating``. As a side effect
        the titles and the relative links to the title pages are stored on
        ``self.movie_titles`` and ``self.movie_page_links``.
        """
        soup = self._get_soup('https://www.imdb.com/chart/top')
        title_links = soup.select('.titleColumn a')
        movie_titles = [link.text for link in title_links]
        movie_years = [int(e.text.replace('(', '').replace(')', '')) for e in soup.select('.secondaryInfo')]
        movie_ratings = [float(e.text) for e in soup.select("strong")]
        top_rated_movies = pd.DataFrame({
            'id': range(1, len(movie_titles) + 1),
            'title': movie_titles,
            'release_year': movie_years,
            'rating': movie_ratings,
        })
        self.movie_titles = movie_titles
        self.movie_page_links = [link.get("href") for link in title_links]
        return top_rated_movies

    def get_movie_info(self):
        """Scrape director, runtime and cast for every film of the chart.

        Returns a list with one dict per film holding ``id``, ``director``,
        ``runtime`` (minutes), ``actor`` (list of names, at most
        ``MAX_ACTORS_PER_MOVIE``) and ``ord`` (1-based billing positions).
        The list is cached on ``self._movie_info`` and the chart frame it was
        built from on ``self._top_rated_movies``.

        Raises ``IndexError`` when a page lacks the runtime or director
        element and ``ValueError`` when a runtime cannot be parsed, instead of
        silently reusing the value of the previously processed film.
        """
        top_rated_movies = self.get_top_rated()
        # Cached so that callers can reuse exactly the chart the ids refer to.
        self._top_rated_movies = top_rated_movies
        movie_ids = top_rated_movies['id'].values
        movie_info = []
        movies = zip(self.movie_page_links, movie_ids, self.movie_titles)
        # zip() has no length, so the total is passed for a proper progress bar.
        for mplink, mid, mtitle in tqdm(movies, total=len(self.movie_page_links)):
            if mtitle in self.RUNTIME_OVERRIDES:
                movie_time_mins = self.RUNTIME_OVERRIDES[mtitle]
            else:
                title_soup = self._get_soup(f"https://www.imdb.com{mplink}")
                runtime_nodes = title_soup.select(self.RUNTIME_SELECTOR)
                if not runtime_nodes:
                    raise IndexError(f"No runtime element found on the title page of {mtitle!r}")
                try:
                    movie_time_mins = self._parse_runtime(runtime_nodes[0].text)
                except ValueError as exc:
                    raise ValueError(f"Could not parse the runtime of {mtitle!r}: {exc}") from exc
            credits_soup = self._get_soup(f"https://www.imdb.com{mplink}fullcredits")
            actors = [e.text.strip() for e in credits_soup.select('.primary_photo+ td a')]
            actors = actors[:self.MAX_ACTORS_PER_MOVIE]
            director_nodes = credits_soup.select(self.DIRECTOR_SELECTOR)
            if not director_nodes:
                raise IndexError(f"No director found on the credits page of {mtitle!r}")
            # Only the first listed director is kept.
            director = director_nodes[0].text.strip()
            movie_info.append({
                'id': mid,
                'director': director,
                'runtime': movie_time_mins,
                'actor': actors,
                'ord': list(range(1, len(actors) + 1)),
            })
        self._movie_info = movie_info
        return movie_info

    def get_movies_and_directors(self):
        """Run the full scrape and return ``(movies, directors)``.

        ``movies`` has the columns ``id``, ``title``, ``release_year``,
        ``rating``, ``runtime`` and ``director_id``; ``directors`` has ``id``
        and ``name``, with ids assigned in alphabetical order of the names.
        """
        movie_info = self.get_movie_info()
        # Reuse the chart fetched by get_movie_info() instead of requesting it again.
        top_rated = self._top_rated_movies
        movie_info_df = pd.DataFrame([
            {'id': item['id'], 'director': item['director'], 'runtime': item['runtime']}
            for item in movie_info
        ])
        top_rated_movies = pd.merge(top_rated, movie_info_df, left_on='id', right_on='id', how="left")
        ordered_directors = sorted(top_rated_movies["director"].unique())
        directors = pd.DataFrame()
        directors["director_id"] = list(range(1, len(ordered_directors) + 1))
        directors["name"] = ordered_directors
        top_rated_movies_directors = pd.merge(top_rated_movies, directors, how="left", left_on="director", right_on="name")
        top_rated_movies_selected = top_rated_movies_directors[['id', 'title', 'release_year', 'rating', 'runtime', 'director_id']]
        # Renamed only after the merge, which needs the distinct `director_id` column name.
        directors.columns = ["id", "name"]
        return top_rated_movies_selected, directors

    def __getitem__(self):
        """Backward-compatible alias of ``get_movies_and_directors()``.

        The signature takes no key, so this only works when called explicitly
        (``scraper.__getitem__()``); ``scraper[key]`` raises ``TypeError``.
        """
        return self.get_movies_and_directors()

    def get_actors(self):
        """Return the actors table (``id``, ``name``) built from the cached film details.

        Ids are assigned in alphabetical order of the distinct names. The
        frame is cached on ``self._actors``. Requires ``get_movie_info()`` to
        have been run (directly or via ``get_movies_and_directors()``).
        """
        movie_info = self._movie_info
        unique_actors = set()
        for item in tqdm(movie_info):
            unique_actors.update(item['actor'])
        ordered_actors = sorted(unique_actors)
        actors = pd.DataFrame({
            'id': range(1, len(ordered_actors) + 1),
            'name': ordered_actors,
        })
        self._actors = actors
        return actors

    def get_casting(self):
        """Return the casting table linking films to actors.

        Columns: ``id`` (1-based row number), ``movie_id``, ``actor_id`` and
        ``ord`` (billing position). Requires ``get_movie_info()`` and
        ``get_actors()`` to have been run; actors are matched by name.
        """
        movie_info = self._movie_info
        actors = self._actors
        # Collect plain records and build the frame once; concatenating a
        # DataFrame per film inside the loop copies all previous rows each time.
        records = []
        for item in tqdm(movie_info):
            for actor_name, position in zip(item['actor'], item['ord']):
                records.append({'actor_name': actor_name, 'ord': position, 'movie_id': item['id']})
        casting = pd.DataFrame(records, columns=['actor_name', 'ord', 'movie_id'])
        casting["casting_id"] = range(1, len(casting) + 1)
        casting_merged = pd.merge(casting, actors, left_on='actor_name', right_on='name', how='left')
        casting_selected = casting_merged[['casting_id', 'movie_id', 'id', 'ord']].copy()
        casting_selected.columns = ['id', 'movie_id', 'actor_id', 'ord']
        return casting_selected

In [3]:
# `IMDBScraper()` returns a scraper instance, not a (movies, directors) pair,
# so the instance is kept under a name and the two tables are requested from
# it explicitly. The method takes no key, hence the direct `__getitem__()` call.
imdb_scraper = IMDBScraper()
top_rated_movies, directors = imdb_scraper.__getitem__()
# get_actors() reads the film details cached by the scrape above, and
# get_casting() additionally reads the actors table cached by get_actors(),
# so the three calls must run in this order.
actors = imdb_scraper.get_actors()
casting = imdb_scraper.get_casting()

250it [14:39,  3.52s/it]
100%|█████████████████████████████████████| 250/250 [00:00<00:00, 230001.32it/s]
100%|████████████████████████████████████████| 250/250 [00:00<00:00, 397.08it/s]


In [30]:
# Preview the first rows of the movies table.
top_rated_movies.head()

Unnamed: 0,id,title,release_year,rating,runtime,director_id
0,1,The Shawshank Redemption,1994,9.2,142,43
1,2,The Godfather,1972,9.2,175,41
2,3,The Dark Knight,2008,9.0,152,20
3,4,The Godfather Part II,1974,9.0,202,41
4,5,12 Angry Men,1957,9.0,96,130


In [26]:
# Preview the first rows of the directors table.
directors.head()

(154, 2)

In [6]:
# Preview the first rows of the actors table.
actors.head()

Unnamed: 0,id,name
0,1,Aamir Khan
1,2,Aaron Eckhart
2,3,Abbas-Ali Roomandi
3,4,Abbey Lee
4,5,Abbie Cornish


In [7]:
# Preview the first rows of the casting (movie/actor link) table.
casting.head()

Unnamed: 0,id,movie_id,actor_id,ord
0,1,1,2943,1
1,2,1,2194,2
2,3,1,325,3
3,4,1,3134,4
4,5,1,543,5


In [8]:
# Write each table to its own CSV file, leaving the DataFrame index out.
csv_exports = (
    (top_rated_movies, 'movies.csv'),
    (directors, 'directors.csv'),
    (actors, 'actors.csv'),
    (casting, 'casting.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

In [20]:
# Open (or create) the SQLite database file and load the four tables into it.
# `if_exists='replace'` drops a table left over from a previous run before
# writing, so the cell can be re-executed; with the default ('fail') a second
# run raises because the tables already exist.
con = sqlite3.connect('imdb.db')
top_rated_movies.to_sql('movies', con, index=False, if_exists='replace')
directors.to_sql('directors', con, index=False, if_exists='replace')
actors.to_sql('actors', con, index=False, if_exists='replace')
casting.to_sql('casting', con, index=False, if_exists='replace')
# Cursor used by the following cells to run the schema scripts.
cur = con.cursor()

In [21]:
# Rebuild `movies` with an explicit schema and primary key: the existing table
# is renamed aside to `movies_no_keys`, a typed table with `id` as PRIMARY KEY
# is created under the original name, and every row is copied across.
# Foreign-key enforcement is switched off for the rebuild and back on afterwards.
# `INSERT ... SELECT *` relies on both tables listing their columns in the
# same order.
create_movies = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
ALTER TABLE movies RENAME TO movies_no_keys;
CREATE TABLE movies (
    id INTEGER,
    title TEXT,
    release_year INTEGER,
    rating REAL,
    runtime INTEGER,
    director_id INTEGER,
    PRIMARY KEY (id)
);
INSERT INTO movies SELECT * FROM movies_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
cur.executescript(create_movies)
con.commit()

In [22]:
# Rebuild `directors` with an explicit schema and primary key: the existing
# table is renamed aside to `directors_no_keys`, a typed table with `id` as
# PRIMARY KEY is created under the original name, and every row is copied
# across. Foreign-key enforcement is off during the rebuild.
#
# No foreign key is declared here. A constraint from directors.id to
# movies.director_id would point at a column that is neither PRIMARY KEY nor
# UNIQUE, which SQLite rejects with "foreign key mismatch" on any later
# INSERT/UPDATE/DELETE once enforcement is on. The real relationship runs the
# other way (movies.director_id refers to directors.id).
create_directors = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
ALTER TABLE directors RENAME TO directors_no_keys;
CREATE TABLE directors (
    id INTEGER,
    name TEXT,
    PRIMARY KEY (id)
);
INSERT INTO directors SELECT * FROM directors_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
cur.executescript(create_directors)
con.commit()

In [23]:
# Rebuild `actors` with an explicit schema and primary key: the existing table
# is renamed aside to `actors_no_keys`, a typed table with `id` as PRIMARY KEY
# is created under the original name, and every row is copied across.
# Foreign-key enforcement is switched off for the rebuild and back on afterwards.
# `INSERT ... SELECT *` relies on both tables listing their columns in the
# same order.
create_actors = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
ALTER TABLE actors RENAME TO actors_no_keys;
CREATE TABLE actors (
    id INTEGER,
    name TEXT,
    PRIMARY KEY (id)
);
INSERT INTO actors SELECT * FROM actors_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
cur.executescript(create_actors)
con.commit()

In [24]:
# Rebuild `casting` with an explicit schema, a primary key and foreign keys to
# `movies` and `actors`: the existing table is renamed aside to
# `casting_no_keys`, the typed table is created under the original name, and
# every row is copied across. Foreign-key enforcement is off during the rebuild.
# The table constraints are separated by commas as standard SQL requires;
# SQLite happens to tolerate their absence, other SQL parsers do not.
create_casting = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
ALTER TABLE casting RENAME TO casting_no_keys;
CREATE TABLE casting (
    id INTEGER,
    movie_id INTEGER,
    actor_id INTEGER,
    ord INTEGER,
    PRIMARY KEY (id),
    FOREIGN KEY (movie_id) REFERENCES movies (id) 
            ON DELETE CASCADE ON UPDATE NO ACTION,
    FOREIGN KEY (actor_id) REFERENCES actors (id) 
            ON DELETE CASCADE ON UPDATE NO ACTION
);
INSERT INTO casting SELECT * FROM casting_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
cur.executescript(create_casting)
con.commit()

In [25]:
# Remove the four `*_no_keys` tables that the rebuild cells renamed the
# original tables to. The statements have no IF EXISTS clause, so the script
# fails if any of those tables is missing.
drop_tables = """
DROP TABLE movies_no_keys;
DROP TABLE directors_no_keys;
DROP TABLE actors_no_keys;
DROP TABLE casting_no_keys;
"""
cur.executescript(drop_tables)
con.commit()