In [1]:
!pip install -U beautifulsoup4

Requirement already up-to-date: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (4.9.3)


In [2]:
import sqlite3
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

In [3]:
class IMDBScraper:
    def get_top_rated(self):
        request_headers = {
            'accept-language': "en-US,en;q=1.0",
            'content-language': 'en-US',
            'user-agent': 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        response = requests.get('https://www.imdb.com/chart/top', headers=request_headers)
        soup = BeautifulSoup(response.text)
        movie_titles = [e.text for e in soup.select('.titleColumn a')]
        movie_years = [int(e.text.replace('(', '').replace(')', '')) for e in soup.select('.secondaryInfo')]
        top_rated_movies = pd.DataFrame()
        top_rated_movies['id'] = range(1, len(movie_titles) + 1)
        top_rated_movies['title'] = movie_titles
        top_rated_movies['release_year'] = movie_years
        movie_page_links = ["https://www.imdb.com{}".format(e.get("href")) for e in soup.select('.titleColumn a')]
        self.movie_page_links = movie_page_links
        return top_rated_movies
    def get_movie_info(self):
        top_rated_movies = self.get_top_rated()
        movie_ids = top_rated_movies['id'].values
        movie_info = []
        for link, mid in tqdm(zip(self.movie_page_links, movie_ids)):
            request_headers = {
            'accept-language': "en-US,en;q=1.0",
            'content-language': 'en-US',
            'user-agent': 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
            }
            response = requests.get(link, headers=request_headers)
            soup = BeautifulSoup(response.text)
            rating = float(soup.select('strong span')[0].text)
            director = soup.select('.credit_summary_item a')[0].text
            genre = [e.text.strip() for e in soup.select('.subtext a')]
            release_date = genre.pop()
            release_date_split = release_date.split()
            release_date_split.pop()
            release_date = '-'.join(release_date_split)
            movie_time = soup.select('time')[0].text.strip().split()
            if len(movie_time) == 2:
                hours, mins = movie_time
                hours = int(hours.replace('h', ''))
                mins = int(mins.replace('min', ''))
                movie_time_mins = hours * 60 + mins
            else:
                hours_mins = movie_time[0]
                if 'h' in hours_mins:
                    hours = int(hours_mins.replace('h', ''))
                    movie_time_mins = hours * 60
                elif 'min' in hours_mins:
                    mins = int(hours_mins.replace('min', ''))
                    movie_time_mins = mins
            actor = [e.text.strip() for e in soup.select('.primary_photo+ td a')]
            ord = list(range(1, len(actor) + 1))
            movie_data = {
                'id': mid,
                'rating': rating,
                'director': director,
                'genre': genre,
                'release_date': release_date,
                'runtime': movie_time_mins,
                'actor': actor,
                'ord': ord
            }
            movie_info.append(movie_data)
        self._movie_info = movie_info
        return movie_info
    def get_top_rated_movies(self):
        top_rated = self.get_top_rated()
        movie_info = self.get_movie_info()
        movie_info_list = []
        for item in movie_info:
            movie_data = {
                'id': item['id'],
                'rating': item['rating'],
                'director': item['director'],
                'runtime': item['runtime']
            }
            movie_info_list.append(movie_data)
        movie_info_df = pd.DataFrame(movie_info_list)
        top_rated_movies = pd.merge(top_rated, movie_info_df, left_on='id', right_on='id')
        return top_rated_movies[['id', 'title', 'release_year', 'rating', 'director', 'runtime']]
    def get_actors(self):
        movie_info = self._movie_info
        actors_list = []
        for item in tqdm(movie_info):
            actor = item['actor']
            actors_list += actor
        unique_actors = set(actors_list)
        ordered_actors = sorted(list(unique_actors))
        ids = list(range(1, len(ordered_actors) + 1))
        actors = pd.DataFrame()
        actors['id'] = ids
        actors['name'] = ordered_actors
        self._actors = actors
        return actors
    def get_casting(self):
        movie_info = self._movie_info
        actors = self._actors
        casting = pd.DataFrame()
        for item in tqdm(movie_info):
            movie_id = item['id']
            names = item['actor']
            ords = item['ord']
            df = pd.DataFrame()
            df['actor_name'] = names
            df['ord'] = ords
            df['movie_id'] = movie_id
            casting = casting.append(df)
        casting_merged = pd.merge(casting, actors, left_on='actor_name', right_on='name', how='left')
        casting_selected = casting_merged[['movie_id', 'id', 'ord']]
        casting_selected.columns = ['movie_id', 'actor_id', 'ord']
        return casting_selected

In [4]:
imdb_scraper = IMDBScraper()
top_rated_movies = imdb_scraper.get_top_rated_movies()
actors = imdb_scraper.get_actors()
casting = imdb_scraper.get_casting()

250it [03:54,  1.07it/s]
100%|██████████| 250/250 [00:00<00:00, 181007.42it/s]
100%|██████████| 250/250 [00:00<00:00, 319.27it/s]


In [7]:
conn = sqlite3.connect('imdb.db')
top_rated_movies.to_sql('movies', conn, index=False)
actors.to_sql('actors', conn, index=False)
casting.to_sql('casting', conn, index=False)

In [None]:
#top_rated_movies.to_csv('top_rated_movies.csv', index=False)

In [None]:
#top_rated_movies.to_json('imdb_top_rated.json', orient='records', force_ascii=False)

In [None]:
#top_rated_movies.to_excel('imdb_top_rated.xlsx', index=False)