#### Scrape and store movie scripts from IMSDb (the Internet Movie Script Database)

In [1]:
import os
import bs4
import requests
import time
import json

In [2]:
def get_all_movies():
    """
    Get a list of all movie titles from imsdb

    Downloads the "all scripts" index page and collects the text of the first
    link found in each ``<p>`` element, in page order. Paragraphs that contain
    no link are skipped.

    :return: the link texts found on the index page
    :rtype: list[str]
    :raises requests.HTTPError: if the index page responds with an error status
    """

    all_scripts_url = "https://imsdb.com/all-scripts.html"
    page = requests.get(all_scripts_url)
    # Fail loudly rather than parsing an error page as if it were the index.
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.content, 'html.parser')

    movie_titles = []
    for paragraph in soup.find_all('p'):
        link = paragraph.find('a')
        # find() returns None when the paragraph holds no <a> tag; skipping
        # those avoids an AttributeError on `.text`.
        if link is not None:
            movie_titles.append(link.text)

    return movie_titles

In [3]:
def convert_title_to_url(title: str):
    """
    Build the imsdb script page URL for a movie title

    Spaces become hyphens and colons are dropped; every other character is
    kept as-is.

    :param title: movie title
    :return: URL of the form ``https://imsdb.com/scripts/<slug>.html``
    """
    # One pass over the string: ' ' -> '-', ':' -> removed.
    slug_table = str.maketrans({' ': '-', ':': None})
    slug = title.translate(slug_table)
    return "https://imsdb.com/scripts/" + slug + ".html"

# Quick check of the generated URL for a title with no spaces or colons.
convert_title_to_url("42")

'https://imsdb.com/scripts/42.html'

In [8]:
def get_script(script_url: str):
    """
    Fetches and returns script from the given URL

    The page is requested once. The script text is taken from the first
    ``<td class="scrtext">`` element in the response.

    :param script_url: URL of script to fetch
    :return: The extracted script text if found, otherwise None (non-200
        response, or no ``scrtext`` cell on the page).
    :rtype: Optional[str]
    """
    response = requests.get(script_url)
    if response.status_code != 200:
        print(f"No Script Found for {script_url}")
        return None

    # Reuse the response already fetched instead of requesting the URL again.
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    td_tag = soup.find('td', class_='scrtext')
    # find() returns None when the page has no script cell; report that as
    # "not found" rather than failing on `.text`.
    if td_tag is None:
        print(f"No Script Found for {script_url}")
        return None
    return td_tag.text

# Try get_script on a single script page (this performs a live HTTP request).
get_script("https://imsdb.com/scripts/This-is-40.html")

In [5]:
def fix_title(title):
    """
    Fixes the formatting of a title string

    Titles ending in ``", The"`` have the article moved to the front, e.g.
    ``"Dark Knight, The"`` becomes ``"The Dark Knight"``. Only the last comma
    is treated as the separator, so commas inside the title itself are kept.
    Any other title is returned unchanged.

    :param title: movie title
    :return: the title with a trailing ", The" moved to the front
    :rtype: str
    """
    # Split on the LAST comma only; splitting on every comma and reversing
    # would scramble titles such as "Good, the Bad and the Ugly, The".
    head, sep, tail = title.rpartition(',')
    if sep and tail == ' The':
        return f"The {head}".strip()
    return title

In [6]:
# Download every script listed on imsdb, save each usable one as a text file
# under `script_directory`, and build `movie_dict`, which maps the display
# title to the saved file name (or None when no usable script was found).
script_directory = "../../scripts"
# Create the output directory up front; without it every write below fails.
os.makedirs(script_directory, exist_ok=True)
movie_titles = get_all_movies()
movie_dict = {}
for ind, title in enumerate(movie_titles):
    try:
        # Progress marker every 100 titles.
        if ind % 100 == 0:
            print(ind)
        title_url = convert_title_to_url(title)
        script = get_script(title_url)
        fixed_title = fix_title(title)
        # Pages with fewer than 100 whitespace-separated words are treated
        # the same as a missing script.
        if script is None or len(script.split()) < 100:
            movie_dict[fixed_title] = None
        else:
            filename = title.replace(':', '-') + ".txt"
            with open(f"{script_directory}/{filename}", "w", encoding="utf-8") as f:
                f.write(script)
            # Record the file only after the write succeeded, so the mapping
            # never points at a file that was not created.
            movie_dict[fixed_title] = filename
    except Exception as e:
        # Best effort: report the failure and carry on with the next title.
        print(f"{title}: {e}")
    # Pause between requests to avoid hammering the site.
    time.sleep(1)

0
No Script Found for https://imsdb.com/scripts/A.I..html
100
No Script Found for https://imsdb.com/scripts/Batman-and-Robin.html
No Script Found for https://imsdb.com/scripts/Batman-Begins.html
No Script Found for https://imsdb.com/scripts/Batman-Forever.html
No Script Found for https://imsdb.com/scripts/Batman-Returns.html
200
No Script Found for https://imsdb.com/scripts/Casablanca.html
300
No Script Found for https://imsdb.com/scripts/Contact.html
No Script Found for https://imsdb.com/scripts/Dark-Knight,-The.html
No Script Found for https://imsdb.com/scripts/Donnie-Darko.html
400
No Script Found for https://imsdb.com/scripts/Executive-Decision.html
No Script Found for https://imsdb.com/scripts/Eyes-Wide-Shut.html
No Script Found for https://imsdb.com/scripts/Full-Metal-Jacket.html
No Script Found for https://imsdb.com/scripts/Fury.html
500
No Script Found for https://imsdb.com/scripts/Ginger-Snaps.html
No Script Found for https://imsdb.com/scripts/Goodfellas.html
No Script Found f

In [7]:
# Persist the title -> script file name mapping as JSON.
with open('../../data/movie_list.json', 'w') as out_file:
    out_file.write(json.dumps(movie_dict))