# Movie script crawler and parser

- ## Imports

In [12]:
import os, sys, json, re, argparse, urllib2, html5lib
from bs4 import BeautifulSoup, Tag, UnicodeDammit

- ### Extract functions

In [None]:
def extract_characters(script):
    '''
    Lists every speaking character of the script, without duplicates

    Walks script['movie_script'] and collects the 'character' field of each
    speech block, in order of first appearance.
    '''
    unique_names = []
    for entry in script['movie_script']:
        # only speech blocks carry a 'character' field
        if entry['type'] != BLOCK_TYPES[SPEECH]:
            continue

        name = entry['character']
        if name not in unique_names:
            unique_names.append(name)

    return unique_names

def extract_locations(script):
    '''
    Lists every location of the script, without duplicates

    Walks script['movie_script'] and collects the 'text' field of each
    location block, in order of first appearance.
    '''
    seen_locations = []
    for entry in script['movie_script']:
        is_location = entry['type'] == BLOCK_TYPES[LOCATION]
        if is_location and entry['text'] not in seen_locations:
            seen_locations.append(entry['text'])

    return seen_locations

def extract_directions(script):
    '''
    Lists the text of every stage direction block of the script

    Duplicates are kept and the script order is preserved.
    '''
    return [entry['text']
            for entry in script['movie_script']
            if entry['type'] == BLOCK_TYPES[DIRECTIONS]]

def extract_speech_given_character(script, character, strict_match=False):
    '''
    Extracts the given character's utterances from the script

    If strict_match is True, we only keep speech blocks whose character name
    is equal (==) to the parameter; otherwise, we keep speech blocks whose
    character name contains (in) the parameter. An empty parameter therefore
    matches every speech block when strict_match is False.
    In both cases, the match is case-insensitive.

    Also asks the user whether to keep the character's name before each
    utterance. When kept, the name is appended to the returned list as its
    own item, right before the utterance.

    Returns the list of matching utterances, in script order.
    '''
    # Python 2's input() eval()s whatever the user types (a bare "y" raises
    # NameError); raw_input() returns the text as typed.
    try:
        read_answer = raw_input
    except NameError:  # Python 3: input() already returns the raw string
        read_answer = input

    answer = read_answer('Do you want to keep the character\'s names? (y/N) ')
    keep_character_name = answer.strip() in ('y', 'Y')

    wanted = character.lower()
    speeches = []
    for block in script['movie_script']:
        if block['type'] != BLOCK_TYPES[SPEECH]:
            continue

        speaker = block['character'].lower()
        if strict_match:
            is_match = (wanted == speaker)
        else:
            is_match = (wanted in speaker)
        if not is_match:
            continue

        if keep_character_name:
            speeches.append(block['character'])
        speeches.append(block['text'])

    return speeches

def extract_all_characters_speech(script):
    '''
    Extracts the speeches of every character of the script

    Delegates to extract_speech_given_character with an empty name and a
    partial match: the empty string is contained in every character name.
    '''
    match_everyone = ''
    return extract_speech_given_character(script, match_everyone, strict_match=False)

def extract_speech_asking_user(script):
    '''
    Extracts utterances by asking the user which character one wants to get

    The character name is read with input_string (defined outside this
    block). Also asks whether the user wants a perfect (==) or partial (in)
    match, then delegates to extract_speech_given_character.
    '''
    # Python 2's input() eval()s whatever the user types (a bare "y" raises
    # NameError); raw_input() returns the text as typed.
    try:
        read_answer = raw_input
    except NameError:  # Python 3: input() already returns the raw string
        read_answer = input

    character = input_string('Please provide the name of the character: ')

    answer = read_answer('Do you want utterances of this exact character (or any character that matches \'{}\')? (y/N) '.format(character))
    strict_match = answer.strip() in ('y', 'Y')

    return extract_speech_given_character(script, character, strict_match)

def extract_speech_using_characters_list(script):
    '''
    Extracts utterances by providing the user with the characters list

    The sorted list of the script's characters is handed to input_from_list
    (defined outside this block), whose return value is used as an index
    into that list. Also asks whether the user wants a perfect (==) or
    partial (in) match, then delegates to extract_speech_given_character.
    '''
    # Python 2's input() eval()s whatever the user types (a bare "y" raises
    # NameError); raw_input() returns the text as typed.
    try:
        read_answer = raw_input
    except NameError:  # Python 3: input() already returns the raw string
        read_answer = input

    characters = sorted(extract_characters(script))
    character = characters[input_from_list('Please choose a character:', characters)]

    answer = read_answer('Do you want utterances of this exact character (or any character that matches \'{}\')? (y/N) '.format(character))
    strict_match = answer.strip() in ('y', 'Y')

    return extract_speech_given_character(script, character, strict_match)

def extract_speech(script):
    '''
    Lets the user pick how speeches are selected, then runs the matching extractor

    The menu is shown through input_from_list, whose return value selects
    the extractor (0, 1 or 2, in menu order). Any other value yields None.
    '''
    menu = ['all characters',
            'give the character\'s name',
            'choose from the characters list']
    picked = input_from_list("Which character speeches do you want to extract?", menu)

    if picked == 0:
        return extract_all_characters_speech(script)
    if picked == 1:
        return extract_speech_asking_user(script)
    if picked == 2:
        return extract_speech_using_characters_list(script)

    # unrecognized choice: nothing to extract
    return None


In [None]:
# Labels stored in the 'type' field of each parsed block;
# the four constants below are the indexes into this list.
BLOCK_TYPES = [
    'character',
    'speech',
    'stage direction',
    'location',
]
CHARACTER, SPEECH, DIRECTIONS, LOCATION = 0, 1, 2, 3

# Labels of the extraction actions;
# the four constants below are the indexes into this list.
ACTIONS = [
    'extract all character names',
    'extract some speech',
    'extract all stage directions',
    'extract all locations',
]
EXTRACT_CHARACTERS, EXTRACT_SPEECH, EXTRACT_DIRECTIONS, EXTRACT_LOCATIONS = 0, 1, 2, 3

- ### Script loader and parser

In [17]:
# loop until we get a valid script_url

# Python 2's input() eval()s whatever the user types;
# raw_input() returns the text as typed.
try:
    read_answer = raw_input
except NameError:  # Python 3: input() already returns the raw string
    read_answer = input

script_url = 'http://www.imsdb.com/scripts/Star-Wars-The-Force-Awakens.html'
is_webpage_fetched = False
while not is_webpage_fetched:
    try:
        request = urllib2.urlopen(script_url)
        soup = BeautifulSoup(request, 'lxml')
    except urllib2.URLError as err:
        print('Catched an URLError while fetching the URL:', err)
        print()
    except ValueError as err:
        print('Catched a ValueError while fetching the URL:', err)
        print()
    except:
        print('Catched an unrecognized error')
        raise
    else:
        print('Detected encoding is ', soup.original_encoding)

        #script_text = soup.find("td", class_="scrtext").find("pre")
        script_text = soup.find("pre")

        if script_text is None:
            # find() returns None when the page has no <pre> element:
            # there is no script text to parse on this page
            print('No <pre> element found in {}'.format(script_url))
            print()
        else:
            # some pages wrap the script in a second, nested <pre>
            nested_pre = script_text.find("pre")
            if nested_pre:
                print('Found a <pre> inside the <pre>')
                script_text = nested_pre

            print('Parsing {} and extracting the first <pre> resulted in the following text:'.format(script_url))
            print(str(script_text)[:256])
            is_webpage_fetched = True

    if not is_webpage_fetched:
        # Retrying the very same URL would loop forever on the same error,
        # so ask for another one before the next attempt
        script_url = read_answer('Please provide another script URL: ').strip()



print()
print()
print('OK, we have the text. A few questions before we get parsing:')

('Detected encoding is ', u'iso-8859-1')
Parsing http://www.imsdb.com/scripts/Star-Wars-The-Force-Awakens.html and extracting the first <pre> resulted in the following text:
<pre>

 
<b>                               STAR WARS: THE FORCE AWAKENS
</b>
                         

                         

                                       Written by

                         
                      Lawrence Kasda
()
()
OK, we have the text. A few questions before we get parsing:


In [25]:
# script dict to be serialized as JSON
script=dict()


# Let's fill what we can here
script['movie_url'] = script_url

# movie's name (not filled in here)


# movie's year (not filled in here)



# Labels stored in the 'type' field of each parsed block;
# the constants below are the indexes into this list.
BLOCK_TYPES=['character', 'speech', 'stage direction', 'location']
CHARACTER=0
SPEECH=1
DIRECTIONS=2
LOCATION=3


# COMPILE ALL THE REGULAR EXPRESSIONS!
# (raw strings, so that \s and \. reach the regex engine untouched instead of
# being read as string escape sequences)
# group(1) captures the leading whitespace of a line
spaces_regex = re.compile(r"^(\s*).*")
# a location heading starts with INT. or EXT.
location_regex = re.compile(r"^\s*(INT\.|EXT\.)")

def get_line_type(line, stripped_line, usual_spaces, characters):
    '''
    Classifies one script line and returns its index into BLOCK_TYPES

    The checks run in this order:
      1. a line matching location_regex is a LOCATION;
      2. a line whose stripped text is already in characters is a CHARACTER;
      3. a line whose number of leading whitespace characters is listed in
         usual_spaces gets the block type whose list contains that number;
      4. otherwise the user is asked for the block type, and whether that
         number should be remembered for the following lines.

    line: the raw line, leading whitespace included
    stripped_line: the same line without its surrounding whitespace
    usual_spaces: one list of known leading-space counts per block type;
                  it is modified in place when the user asks to remember
    characters: the character names found so far
    '''
    # Python 2's input() eval()s whatever the user types, so a bare "n"
    # raised NameError instead of being read; raw_input() returns the text.
    try:
        read_answer = raw_input
    except NameError:  # Python 3: input() already returns the raw string
        read_answer = input

    if location_regex.search(line) is not None:
        return LOCATION

    if stripped_line in characters:
        return CHARACTER

    # Counting the number of spaces at the beginning of the line
    spaces_number = len(spaces_regex.search(line).group(1))

    # Look for a block type already associated with that many spaces
    for known_type, known_spaces in enumerate(usual_spaces):
        if spaces_number in known_spaces:
            return known_type

    print('There are {:d} space(s) at the beginning of this line'.format(spaces_number))
    question = "What kind of block is that?\n"
    for i, label in enumerate(BLOCK_TYPES):
        question += '\t('+str(i)+') ' + label + '\n'
    print(question)

    # Ask again until we get an integer that is a valid BLOCK_TYPES index
    prompt = '? [0-{:d}] '.format(len(BLOCK_TYPES)-1)
    block_type = -1
    while not 0 <= block_type < len(BLOCK_TYPES):
        try:
            block_type = int(read_answer(prompt))
        except ValueError:
            block_type = -1

    answer_spaces = read_answer('Are all  lines with {:d} leading spaces \'{:s}\' blocks ? (Y/n) '.format(
            spaces_number, BLOCK_TYPES[block_type])).strip()

    if answer_spaces in ('n', 'N'):
        print('You said no: we will ask you again next time.')
        return block_type

    # anything but an explicit "no" counts as a yes
    print('You said yes: ' +
          'every new block with {:d} leading spaces '.format(spaces_number) +
          'will now be considered a \'{:s}\'.'.format(BLOCK_TYPES[block_type]) )
    usual_spaces[block_type].append(spaces_number)

    return block_type



# In[53]:

# DA big loop

# Python 2's input() eval()s whatever the user types;
# raw_input() returns the text as typed.
try:
    read_answer = raw_input
except NameError:  # Python 3: input() already returns the raw string
    read_answer = input

# One list of known leading-space counts per block type (see get_line_type)
usual_spaces=[[] for i in range(len(BLOCK_TYPES))]

# Here we define the variables that we will fill with text
is_intro = True
movie_script = []
intro = []
last_line_type = -1  # -1 = not initialized
last_character = ''
text = []
characters=[]


def _flush_block(block_type, lines):
    '''
    Closes the block made of the given lines

    A character cue only updates last_character and the characters list;
    any other block is appended to movie_script, and a speech block also
    records the character who says it.
    '''
    global last_character

    if block_type == CHARACTER:
        last_character = '\n'.join(lines)
        if last_character not in characters:
            characters.append(last_character)
        return

    parsed_block = {'type': BLOCK_TYPES[block_type]}
    if block_type == SPEECH:
        parsed_block[BLOCK_TYPES[CHARACTER]] = last_character
    parsed_block['text'] = '\n'.join(lines)

    movie_script.append(parsed_block)
    print('We just parsed this JSON block:')
    print(movie_script[-1])


print()
print()
print("Here we go for some kickass movie script parsing!")
print()
print()
print("Start by telling me when the introduction will end.")

for block in script_text.descendants:
    # If block is a bs4.Tag instance, it is wrapped in HTML tags.
    # The next block will contain the same text without the tags,
    # so we move on without parsing this block.
    if isinstance(block, Tag):
        continue

    # UnicodeDammit is meant to give us the text as unicode
    # does not work so well
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and ending end of lines
    block = block.strip('\n')

    # if the block doesn't have any text, skip it
    if re.search(r'\w', block) is None:
        continue

    # bs4 does not always split the different blocks cleanly,
    # so it is better to split again by line and handle them one by one
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if re.search(r'\w', line) is None:
            continue

        print('------------------------------ Begin line ------------------------------')
        print(line)
        print('                        ------- End line -------')

        if is_intro:
            print()
            answer = read_answer("Is that still part of the intro? (Y/n) ").strip()

            if answer in ('n', 'N'):
                is_intro = False
                movie_script.append({
                    'type': 'introduction',
                    'text': '\n'.join(intro)})

                print(movie_script[-1])
                # no continue: the current line is the first one of the
                # script itself, so it goes through the classification below
            else:
                print("OK")
                print()
                intro.append(stripped_line)
                continue


        line_type = get_line_type(line, stripped_line, usual_spaces, characters)
        print("The last line was interpreted as '{}'".format(BLOCK_TYPES[line_type]))
        print()

        if last_line_type == -1 or last_line_type == line_type:
            # same block as the previous line: keep accumulating
            text.append(stripped_line)
        else:
            # the block type changed: close the previous block, start a new one
            _flush_block(last_line_type, text)
            text=[stripped_line]

        last_line_type = line_type

        print()

    print()
    print()

# Close whatever was still open when the text ran out. This goes through the
# same code as inside the loop, so a final speech keeps its 'character' key
# and a final character cue is not stored as a block of its own.
if is_intro and intro:
    # the user never ended the introduction: keep what was collected
    movie_script.append({
        'type': 'introduction',
        'text': '\n'.join(intro)})
    print(movie_script[-1])
elif last_line_type != -1:
    _flush_block(last_line_type, text)

print()
print()

script['movie_script'] = movie_script

print('All done, biiiiitch!')


# In[64]:

#print(flush=True)
#print(flush=True)
# Python 2's input() eval()s whatever the user types (a bare filename raises
# NameError); raw_input() returns the text as typed.
try:
    read_answer = raw_input
except NameError:  # Python 3: input() already returns the raw string
    read_answer = input

print('(Our current directory is: {})'.format(os.getcwd()))
out_filename = read_answer('Now, gimme an output filename: ')

try:
    # the with block closes the file even when json.dump fails, and there
    # is nothing to close when open() itself fails
    with open(out_filename, 'w') as fd:
        json.dump(script, fd, indent=True)
    # 'movie_title' may be missing from the script dict: do not let a
    # missing title turn a successful write into a reported failure
    print('We just successfully wrote {}\'s script as JSON to {} .'.format(
        script.get('movie_title', 'this movie'), fd.name))
    print('Bravo!')
except Exception:
    # best effort: report the problem and carry on
    print("Shit happened: ", sys.exc_info()[0])
finally:
    print()
    print('Done.')


()
()
Here we go for some kickass movie script parsing!
()
()
Start by telling me when the introduction will end.
------------------------------ Begin line ------------------------------
                               STAR WARS: THE FORCE AWAKENS
                        ------- End line -------
()
Is that still part of the intro? (Y/n) 'Y'
OK
()
()
()
------------------------------ Begin line ------------------------------
                                       Written by
                        ------- End line -------
()
Is that still part of the intro? (Y/n) 'y'
OK
()
------------------------------ Begin line ------------------------------
                      Lawrence Kasdan, J.J. Abrams & Michael Arndt
                        ------- End line -------
()
Is that still part of the intro? (Y/n) 'y'
OK
()
------------------------------ Begin line ------------------------------
                       Based on characters created by George Lucas
                        ------- End line 

KeyboardInterrupt: 

In [27]:
# script dict to be serialized as JSON
script=dict()


# Let's fill what we can here
script['movie_url'] = script_url

# movie's name (not filled in here)


# movie's year (not filled in here)



# Labels stored in the 'type' field of each parsed block;
# the constants below are the indexes into this list.
BLOCK_TYPES=['character', 'speech', 'stage direction', 'location']
CHARACTER=0
SPEECH=1
DIRECTIONS=2
LOCATION=3


# COMPILE ALL THE REGULAR EXPRESSIONS!
# (raw strings, so that \s and \. reach the regex engine untouched instead of
# being read as string escape sequences)
# group(1) captures the leading whitespace of a line
spaces_regex = re.compile(r"^(\s*).*")
# a location heading starts with INT. or EXT.
location_regex = re.compile(r"^\s*(INT\.|EXT\.)")


# DA big loop

# One list of known leading-space counts per block type. get_line_type
# indexes this list by block type, so it needs len(BLOCK_TYPES) entries
# even when only the first one is pre-filled.
usual_spaces=[[] for i in range(len(BLOCK_TYPES))]
# NOTE(review): index 0 is CHARACTER, so lines with 26 or 32 leading spaces
# are classified as character cues without asking - confirm this is intended
usual_spaces[0] = [26, 32]
# Here we define the variables that we will fill with text
is_intro = True
movie_script = []
intro = []
last_line_type = -1  # -1 = not initialized
last_character = ''
text = []
characters=[]



32

In [None]:
def _flush_block(block_type, lines):
    '''
    Closes the block made of the given lines

    A character cue only updates last_character and the characters list;
    any other block is appended to movie_script, and a speech block also
    records the character who says it.
    '''
    global last_character

    if block_type == CHARACTER:
        last_character = '\n'.join(lines)
        if last_character not in characters:
            characters.append(last_character)
        return

    parsed_block = {'type': BLOCK_TYPES[block_type]}
    if block_type == SPEECH:
        parsed_block[BLOCK_TYPES[CHARACTER]] = last_character
    parsed_block['text'] = '\n'.join(lines)

    movie_script.append(parsed_block)
    print('We just parsed this JSON block:')
    print(movie_script[-1])


for block in script_text.descendants:
    # If block is a bs4.Tag instance, it is wrapped in HTML tags.
    # The next block will contain the same text without the tags,
    # so we move on without parsing this block.
    if isinstance(block, Tag):
        continue

    # UnicodeDammit is meant to give us the text as unicode
    # does not work so well
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and ending end of lines
    block = block.strip('\n')

    # if the block doesn't have any text, skip it
    if re.search(r'\w', block) is None:
        continue

    # bs4 does not always split the different blocks cleanly,
    # so it is better to split again by line and handle them one by one
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        # a line without any text carries nothing to classify: skip it
        # rather than asking the user about it or storing it as content
        if re.search(r'\w', line) is None:
            continue

        line_type = get_line_type(line, stripped_line, usual_spaces, characters)
        print("The last line was interpreted as '{}'".format(BLOCK_TYPES[line_type]))
        print()

        if last_line_type == -1 or last_line_type == line_type:
            # same block as the previous line: keep accumulating
            text.append(stripped_line)
        else:
            # the block type changed: close the previous block, start a new one
            _flush_block(last_line_type, text)
            text=[stripped_line]

        last_line_type = line_type

        print()

    print()
    print()

# Close whatever was still open when the text ran out. This goes through the
# same code as inside the loop, so a final speech keeps its 'character' key
# and a final character cue is not stored as a block of its own.
if last_line_type != -1:
    _flush_block(last_line_type, text)

print()
print()

script['movie_script'] = movie_script

print('All done!')


# In[64]:

#print(flush=True)
#print(flush=True)
print('(Our current directory is: {})'.format(os.getcwd()))
out_filename = 'test.txt'

try:
    # the with block closes the file even when json.dump fails, and there
    # is nothing to close when open() itself fails
    with open(out_filename, 'w') as fd:
        json.dump(script, fd, indent=True)
    # 'movie_title' may be missing from the script dict: do not let a
    # missing title turn a successful write into a reported failure
    print('We just successfully wrote {}\'s script as JSON to {} .'.format(
        script.get('movie_title', 'this movie'), fd.name))
    print('Bravo!')
except Exception:
    # best effort: report the problem and carry on
    print("Shit happened: ", sys.exc_info()[0])
finally:
    print()
    print('Done.')
