# Movie script crawler and parser

- ## Imports

In [1]:
import os, sys, json, re, argparse, urllib, html5lib
from bs4 import BeautifulSoup, Tag, UnicodeDammit

- ### Extract functions

In [None]:
def extract_characters(script):
    '''
    Lists every character that speaks in the script, without duplicates

    Only blocks whose type is the speech type are considered; the names are
    returned in the order in which they first appear in script['movie_script'].
    '''
    speech_type = BLOCK_TYPES[SPEECH]
    unique_names = []

    for entry in script['movie_script']:
        # anything that is not a speech block carries no speaker
        if entry['type'] != speech_type:
            continue

        name = entry['character']
        if name not in unique_names:
            unique_names.append(name)

    return unique_names

def extract_locations(script):
    '''
    Lists every location mentioned in the script, without duplicates

    Only blocks whose type is the location type are considered; the texts are
    returned in the order in which they first appear in script['movie_script'].
    '''
    location_texts = (entry['text']
                      for entry in script['movie_script']
                      if entry['type'] == BLOCK_TYPES[LOCATION])

    seen = []
    for place in location_texts:
        if place in seen:
            continue
        seen.append(place)

    return seen

def extract_directions(script):
    '''
    Collects the text of every stage direction block of the script

    Duplicates are kept, and the texts are returned in script order.
    '''
    return [entry['text']
            for entry in script['movie_script']
            if entry['type'] == BLOCK_TYPES[DIRECTIONS]]

def extract_speech_given_character(script, character, strict_match=False):
    '''
    Collects the utterances of the given character from the script

    The user is first asked (y/N) whether each utterance should be preceded
    by the name of the character who speaks it.

    The comparison is always case-insensitive:
      - with strict_match, the speaker's name must be equal (==) to `character`;
      - otherwise, `character` only has to be contained (in) in the speaker's name.

    Returns a flat list of strings in script order: the utterance texts, each
    one preceded by its speaker's name when the user asked for it.
    '''
    answer = input('Do you want to keep the character\'s names? (y/N) ')
    keep_character_name = answer in ('y', 'Y')

    speeches = []
    for entry in script['movie_script']:
        if entry['type'] != BLOCK_TYPES[SPEECH]:
            continue

        wanted = character.lower()
        speaker = entry['character'].lower()
        is_match = (wanted == speaker) if strict_match else (wanted in speaker)
        if not is_match:
            continue

        if keep_character_name:
            # the name is stored as written in the script, not lower-cased
            speeches.append(entry['character'])
        speeches.append(entry['text'])

    return speeches

def extract_all_characters_speech(script):
    '''
    Collects the utterances of every character of the script
    '''
    # The empty string is contained in any name, so a non-strict match on it
    # selects every speech block.
    any_character = ''
    return extract_speech_given_character(script, any_character)

def extract_speech_asking_user(script):
    '''
    Collects utterances for a character whose name is typed in by the user

    The user is also asked (y/N) whether the name must match perfectly (==)
    or only partially (in).
    '''
    character = input_string('Please provide the name of the character: ')

    prompt = 'Do you want utterances of this exact character (or any character that matches \'{}\')? (y/N) '
    reply = input(prompt.format(character))
    strict_match = reply in ('y', 'Y')

    return extract_speech_given_character(script, character, strict_match)

def extract_speech_using_characters_list(script):
    '''
    Collects utterances for a character picked by the user from the sorted characters list

    The user is also asked (y/N) whether the name must match perfectly (==)
    or only partially (in).
    '''
    characters = extract_characters(script)
    characters = sorted(characters)

    chosen_index = input_from_list('Please choose a character:', characters)
    character = characters[chosen_index]

    prompt = 'Do you want utterances of this exact character (or any character that matches \'{}\')? (y/N) '
    reply = input(prompt.format(character))
    strict_match = reply in ('y', 'Y')

    return extract_speech_given_character(script, character, strict_match)

def extract_speech(script):
    '''
    Asks the user which speeches to extract, and delegates to the matching extractor

    Returns whatever the selected extractor returns, or None when the value
    returned by input_from_list is not one of the offered choices (0, 1 or 2).
    '''
    choices = ['all characters',
               'give the character\'s name',
               'choose from the characters list']

    # one extractor per entry of `choices`, keyed by its position in the list
    extractors = {
        0: extract_all_characters_speech,
        1: extract_speech_asking_user,
        2: extract_speech_using_characters_list,
    }

    action = input_from_list("Which character speeches do you want to extract?", choices)

    extractor = extractors.get(action)
    if extractor is None:
        return None
    return extractor(script)


In [None]:
# Kinds of blocks a parsed script is made of; the constants below are the
# positions of each kind inside BLOCK_TYPES.
CHARACTER, SPEECH, DIRECTIONS, LOCATION = range(4)
BLOCK_TYPES = [
    'character',
    'speech',
    'stage direction',
    'location',
]

# Extractions offered to the user; the constants below are the positions of
# each action inside ACTIONS.
EXTRACT_CHARACTERS, EXTRACT_SPEECH, EXTRACT_DIRECTIONS, EXTRACT_LOCATIONS = range(4)
ACTIONS = [
    'extract all character names',
    'extract some speech',
    'extract all stage directions',
    'extract all locations',
]

- ### Script loader and parser

In [None]:
# loop until we get a valid script_url
#
# Fetches the page at script_url, parses it and keeps its first <pre> element
# (or the <pre> nested inside it) in script_text. The loop only ends once the
# user has confirmed that the extracted text is the expected script; every
# failure or rejection asks for another URL instead of retrying the same one.

# `import urllib` alone does not guarantee that these submodules are loaded
import urllib.error
import urllib.request

# Set the URL here to skip the first prompt; leave it empty to be asked for it.
script_url = ''
is_webpage_fetched = False
while not is_webpage_fetched:
    # an empty URL means "not provided yet" or "the previous one was discarded"
    if not script_url:
        script_url = input('Please provide the URL of the movie script: ').strip()

    try:
        request = urllib.request.Request(script_url)
        # the parser reads the whole response while building the soup, so the
        # connection can be closed as soon as the constructor returns
        with urllib.request.urlopen(request) as webpage_bytes:
            soup = BeautifulSoup(webpage_bytes, 'lxml')
        print('Detected encoding is ', soup.original_encoding)
    except urllib.error.URLError as err:
        print('Catched an URLError while fetching the URL:', err)
        print()
        script_url = ''   # discard the failing URL so that a new one is requested
        continue
    except ValueError as err:
        # raised for instance when the URL is empty or has no scheme
        print('Catched a ValueError while fetching the URL:', err)
        print()
        script_url = ''
        continue

    #script_text = soup.find("td", class_="scrtext").find("pre")
    script_text = soup.find("pre")

    if script_text is None:
        # find() returns None when the page has no <pre> at all
        print('No <pre> was found in {}'.format(request.full_url))
        print()
        script_url = ''
        continue

    inner_pre = script_text.find("pre")
    if inner_pre is not None:
        print('Found a <pre> inside the <pre>')
        script_text = inner_pre

    print('Parsing {} and extracting the first <pre> resulted in the following text:'.format(request.full_url))
    print(str(script_text)[:256])
    answer = input('Is that the script you expected? (Y/n) ')

    if answer in ('N', 'n'):
        answer = input('Shall we try with another URL? (Y/n) ')
        if answer in ('N', 'n'):
            raise ValueError('The result was not what we expected.')
        # the user wants to retry: forget this URL and go around again
        script_url = ''
        continue

    # only reached once the user has accepted the extracted text
    is_webpage_fetched = True



# Leave two blank lines, then announce that some questions will be asked
# before the parsing starts.
print()
print()
print('OK, we have the text. A few questions before we get parsing:')