# Movie script crawler and parser

- ## Imports

In [1]:
import os, sys, json, re, argparse, urllib2, html5lib
from bs4 import BeautifulSoup, Tag, UnicodeDammit
import pandas as pd
from pandas.io.json import json_normalize




- ### Extract functions

- ### Script loader and parser

In [2]:
# Download the script page and isolate the <pre> element holding the screenplay.
script_url = 'http://www.imsdb.com/scripts/Star-Wars-The-Force-Awakens.html'
is_webpage_fetched = False
# Upper bound on download attempts so that a persistent network failure
# cannot keep the cell spinning forever.
MAX_FETCH_ATTEMPTS = 3
request_headers = {
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    #"Connection": "keep-alive"
}

for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
    try:
        request = urllib2.Request(script_url, headers=request_headers)
        contents = urllib2.urlopen(request)
        try:
            soup = BeautifulSoup(contents, 'lxml')
        finally:
            # Release the HTTP connection whether or not parsing succeeded.
            contents.close()
        print('Detected encoding is ', soup.original_encoding)
        is_webpage_fetched = True
        break
    except urllib2.URLError as err:
        # Possibly transient: report it and let the loop try again.
        print('Catched an URLError while fetching the URL:', err)
    except ValueError as err:
        # Retrying the same input cannot succeed, so fail immediately
        # instead of looping.
        print('Catched a ValueError while fetching the URL:', err)
        raise
    except:
        print('Catched an unrecognized error')
        raise

if not is_webpage_fetched:
    raise RuntimeError('Could not fetch {} after {:d} attempts'.format(
        script_url, MAX_FETCH_ATTEMPTS))

# A narrower selector would be soup.find("td", class_="scrtext").find("pre");
# the first <pre> of the page is used instead.
script_text = soup.find("pre")
if script_text is None:
    raise ValueError('No <pre> element found in {}'.format(script_url))

# Some pages wrap the script in a second, nested <pre>: descend into it.
nested_pre = script_text.find("pre")
if nested_pre is not None:
    print('Found a <pre> inside the <pre>')
    script_text = nested_pre

print('Parsing {} and extracting the first <pre> resulted in the following text:'.format(script_url))
print(str(script_text)[:256])




print('\n \n OK, we have the text. A few questions before we get parsing:')

('Detected encoding is ', u'iso-8859-1')
Parsing http://www.imsdb.com/scripts/Star-Wars-The-Force-Awakens.html and extracting the first <pre> resulted in the following text:
<pre>

 
<b>                               STAR WARS: THE FORCE AWAKENS
</b>
                         

                         

                                       Written by

                         
                      Lawrence Kasda

 
 OK, we have the text. A few questions before we get parsing:


- ### White space analysis

In [3]:
%%time
# Collect, for each line containing at least one word character, the number
# of leading white-space characters. The result (space_vector) is used to
# decide which indentation widths map to which kind of script block.
n = 0
# Index of the last text block to analyse: the loop stops right after
# processing block number LAST_BLOCK_INDEX (counting from 0).
LAST_BLOCK_INDEX = 151
# Raw strings keep the regex escapes (\s, \.) from being read as (invalid)
# string escapes.
spaces_regex = re.compile(r"^(\s*).*")
location_regex = re.compile(r"^\s*(INT\.|EXT\.)")
space_vector = []

for block in script_text.descendants:
    # A bs4.Tag is an element wrapped in HTML tags; its text shows up again
    # as a later descendant without the tags, so the Tag itself is skipped.
    if isinstance(block, Tag):
        continue

    # UnicodeDammit converts the string to unicode
    # (original author's note: does not work so well)
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and trailing end-of-line characters
    block = block.strip('\r\n')

    # if the block doesn't have any text, skip it
    if re.search(r'\w', block) is None:
        continue

    for line in block.split('\n'):
        # stripped_line is not used by the loop itself; it is kept because
        # later cells inspect it.
        stripped_line = line.strip(' \n\t\r')
        if re.search(r'\w', line) is None:
            continue
        # Counting the number of spaces at the beginning of the line
        spmatch = spaces_regex.search(line)
        space_vector.append(len(spmatch.group(1)))

    if n == LAST_BLOCK_INDEX:
        break
    n += 1

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 4.34 ms


In [4]:
# Inspect the last text block left bound by the white-space analysis loop.
block

u'          of her canteen-hitting. She sets her things on a piece of\r\n          sheet metal and sits next to it, sliding down the sand dune.\r\n          She RIDES DOWN THE MOUNTAIN OF SAND. WE HOLD FOR A LONG\r\n          TIME, looking down, as she recedes from us, toward the distant\r\n          SALT FLATS below. Random SCATTERED DEBRIS. Her SPEEDER,\r\n          tiny from here.\r\n          Rey moves to her junker SPEEDER, jumps on, fires the sputtery\r\n          engine and DRIVES OFF.'

In [5]:
# Split the block on '\n', the same way the analysis loop does; pieces can
# keep a trailing '\r' when the source text uses '\r\n' line endings.
block.split('\n')

[u'          of her canteen-hitting. She sets her things on a piece of\r',
 u'          sheet metal and sits next to it, sliding down the sand dune.\r',
 u'          She RIDES DOWN THE MOUNTAIN OF SAND. WE HOLD FOR A LONG\r',
 u'          TIME, looking down, as she recedes from us, toward the distant\r',
 u'          SALT FLATS below. Random SCATTERED DEBRIS. Her SPEEDER,\r',
 u'          tiny from here.\r',
 u'          Rey moves to her junker SPEEDER, jumps on, fires the sputtery\r',
 u'          engine and DRIVES OFF.']

In [6]:
# Last line visited by the analysis loop, leading white space intact.
line

u'          engine and DRIVES OFF.'

In [7]:
# The same line after stripping spaces, tabs and line endings from both ends.
stripped_line

u'engine and DRIVES OFF.'

In [8]:
# Length of the unstripped line, leading white space included.
len(line)

32

In [9]:
# Human-readable label of every block type; a block type code is an index
# into this list.
BLOCK_TYPES = [
    'character',
    'speech',
    'stage direction',
    'location',
    'unknown',
]
CHARACTER, SPEECH, DIRECTIONS, LOCATION = 0, 1, 2, 3


# Leading-white-space counts regarded as typical of each block type, in the
# same order as BLOCK_TYPES ('unknown' has none).
usual_spaces = [
    [26],
    [11],
    [10],
    [25],
    [],
]

# Variables that will be filled with text while parsing.
is_intro = True
last_line_type = -1
last_character = ''
movie_script, intro, text, characters = [], [], [], []

def write_csv(data, name, path):
    """Write ``data`` to ``<path>/<name>.csv`` as a '|'-separated latin1 file.

    Parameters
    ----------
    data : object exposing a pandas-style ``to_csv`` method.
    name : file name without the ``.csv`` extension.
    path : destination directory, with or without a trailing separator.
        It is created when missing; an empty string means the current
        working directory.

    Returns
    -------
    None
    """
    # An empty path designates the current directory: nothing to create
    # (os.makedirs('') would raise).
    if path and not os.path.isdir(path):
        os.makedirs(path)
    # os.path.join inserts the separator only when needed, so both
    # '../data' and '../data/' resolve to the same file.
    data.to_csv(os.path.join(path, '%s.csv' % name), sep='|', encoding='latin1')

def get_line_type(line, stripped_line, usual_spaces):
    """Classify a script line from its leading white space.

    Parameters
    ----------
    line : the raw line, leading white space included.
    stripped_line : the line without surrounding white space; accepted but
        not used by the classification.
    usual_spaces : list of lists; entry ``i`` holds the leading-white-space
        counts that identify block type ``i``.

    Returns
    -------
    int
        ``LOCATION`` when the line matches ``location_regex`` (checked
        first); otherwise the index of the first entry of ``usual_spaces``
        containing the line's count; otherwise 4, the position of
        'unknown' in ``BLOCK_TYPES``.
    """
    # Width of the white-space run at the start of the line.
    indent_width = len(spaces_regex.search(line).group(1))

    # INT./EXT. headings are locations whatever their indentation.
    if location_regex.search(line) is not None:
        return LOCATION

    for type_index, accepted_widths in enumerate(usual_spaces):
        if indent_width in accepted_widths:
            return type_index

    # No known indentation width matched: 'unknown'.
    return 4


# Try the classifier on the line/stripped_line pair left over from the
# white-space analysis loop.
line_type = get_line_type(line, stripped_line, usual_spaces)

In [16]:
%%time
# Parse the whole script: classify every line with get_line_type, merge
# consecutive lines of the same type into one record, and export the records.
n = 0
# Raw strings keep the regex escapes (\s, \.) from being read as (invalid)
# string escapes.
spaces_regex = re.compile(r"^(\s*).*")
location_regex = re.compile(r"^\s*(INT\.|EXT\.)")
space_vector = []


BLOCK_TYPES = ['character', 'speech', 'stage direction', 'location', 'unknown']
CHARACTER = 0
SPEECH = 1
DIRECTIONS = 2
LOCATION = 3


# Leading-white-space counts identifying each block type, in BLOCK_TYPES order.
usual_spaces = [[26], [11], [10], [25], []]

# Variables that will be filled with text while parsing.
is_intro = True
movie_script = []
intro = []
last_line_type = -1  # -1 = not initialized
last_character = 'unknown'
text = []
characters = []


def _flush_lines(block_type, lines, speaker):
    """Store a finished run of same-type lines; return the current speaker.

    A 'character' run is not stored as a record: it becomes the speaker
    attached to the following speech records and is added to ``characters``
    when new. Any other run is appended to ``movie_script``; speech records
    also carry the speaker under the 'character' key.
    """
    joined = '\n'.join(lines)
    if block_type == CHARACTER:
        # TODO (original author): regex to suppress (parentheses) and
        # replicate the speaker.
        if joined not in characters:
            characters.append(joined)
        return joined
    record = {'type': BLOCK_TYPES[block_type], 'text': joined}
    if block_type == SPEECH:
        record[BLOCK_TYPES[CHARACTER]] = speaker
    movie_script.append(record)
    return speaker


for block in script_text.descendants:
    # A bs4.Tag is an element wrapped in HTML tags; its text shows up again
    # as a later descendant without the tags, so the Tag itself is skipped.
    if isinstance(block, Tag):
        continue

    # UnicodeDammit converts the string to unicode
    # (original author's note: does not work so well)
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and trailing end-of-line characters
    block = block.strip('\r\n')

    # if the block doesn't have any text, skip it
    if re.search(r'\w', block) is None:
        continue

    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if re.search(r'\w', line) is None:
            continue
        # Counting the number of spaces at the beginning of the line
        spmatch = spaces_regex.search(line)
        space_vector.append(len(spmatch.group(1)))
        line_type = get_line_type(line, stripped_line, usual_spaces)

        # A change of line type closes the run accumulated so far.
        if last_line_type != -1 and last_line_type != line_type:
            last_character = _flush_lines(last_line_type, text, last_character)
            text = []
        text.append(stripped_line)
        last_line_type = line_type

    # n counts the text blocks processed.
    n += 1

# The loop only stores a run when the NEXT line has a different type, so the
# final run is still pending here: store it too, otherwise the last block of
# the script is lost.
if text:
    last_character = _flush_lines(last_line_type, text, last_character)
    text = []

result = json_normalize(movie_script)
write_csv(result,'test','../data/')

CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 78.2 ms


In [17]:
result

Unnamed: 0,character,text,type
0,,STAR WARS: THE FORCE AWAKENS\nWritten by\nLawr...,unknown
1,,"A long time ago in a galaxy far, far away...",stage direction
2,,STAR WARS\nEPISODE VII\nTHE FORCE AWAKENS,unknown
3,unknown,"Luke Skywalker has vanished. In his absence,\n...",speech
4,,PAN across the star field to a bright moon. A ...,stage direction
5,,ARE IN:\nINT. TROOP TRANSPORT VEHICLE - NIGHT,location
6,,"TWENTY STORMTROOPERS. Holding on at attention,...",stage direction
7,,QUIET:\nEXT. JAKKU VILLAGE - NIGHT,location
8,,"WIDE SHOT of a small, peaceful village. Distan...",stage direction
9,,INT. LARGE HUT - NIGHT,location


In [None]:
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

# Histogram of the leading-white-space counts, one unit-wide bin per integer.
space_vector_ = np.array(space_vector)
# Bin k must span [k - 0.5, k + 0.5). Covering the integers min..max takes
# (max - min + 2) edges, hence max + 2: stopping at max + 1 would end the
# last bin at max - 0.5 and leave out every line with the maximum count.
bin_edges = np.arange(space_vector_.min(), space_vector_.max() + 2) - 0.5
plt.figure(figsize=(20,10))
plt.hist(space_vector_, bins=bin_edges)
plt.title('White space (head of line) distributions')
plt.xlabel('Number of white spaces')
plt.ylabel('Count')


In [None]:
from collections import Counter
# Tally how many lines start with each leading-white-space count.
# Author's note: all the occurrences except the top 3 are occurrences from
# global stage references -- TODO confirm against the tally.
Counter(space_vector)