In [1]:
from startrek.script import Script

In [13]:
a = Script('tng', script_path='./scripts/tng/Episode 49 - The Ensigns of Command.txt')

In [None]:
a.dialogue

In [21]:
a.extract_dialogue_from_script()

In [4]:
from collections import defaultdict
from itertools import tee
import re

In [5]:
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

In [6]:
def _number_header_from_line(line):
    line = line.split()
    return line[0], ' '.join(line[1:])

In [7]:
def get_between_indices(s, begin, end):
    return s[begin:end]

In [23]:
sections, indices = section_headers(a.dialogue)

In [None]:
sections

In [25]:
from collections import deque
import copy
import itertools

In [26]:
b = sectioned_script(a.dialogue, indices)

NameError: name 'sectioned_script' is not defined

In [19]:
chars = get_characters(a.dialogue)

TypeError: 'NoneType' object is not iterable

In [133]:
chars

{"ARD'RIAN",
 'BEVERLY',
 'DATA',
 'GEORDI',
 'GOSHEVEN',
 'HARITATH',
 'KENTOR',
 "O'BRIEN",
 'PICARD',
 'RIKER',
 'SHELIAK',
 'TROI',
 'WESLEY',
 'WORF'}

In [58]:
def section_headers(dialogue):
    '''Returns the section headers from a block of dialogue and their
    respective line numbers in said block. Run once.'''

    sections = {}
    indices = []
    _regex_number = r'^\d{1,3}?[a-zA-Z]{0,1}'
    regex_number = re.compile(_regex_number)

    for index, line in enumerate(dialogue):
        words = line.split()
        if not words:
            continue
        try:
            int(words[0][0])
            number = words[0]
            name = " ".join(words[1:]).replace(':', '')
            if not name:
                name = 'OMITTED'
            # Corner case check if year is in section number
            # if not re.findall(regex_number, number):
            #     print(number, re.findall(regex_number, number))
            if len(number) > 3 and number[3].isdigit():
                continue
            # Check for same section number
            if number in sections.keys():
                sections[number].append(name)
            else:
                sections[number] = [name]
            indices.append(index)
        except:
            continue
    return sections, indices

def get_characters(dialogue):
    """
    Gets a set of all the characters with dialogue from a sectioned script.
    """
    ACT = ['ACT']
    END = ['END OF']
    NUMBERS = ['ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN', 'EIGHT', 'NINE', 'TEN']
    SKIPS = ['THE END', 'END OF TEASER', 'FADE OUT', 'FADE OUT.', 'FADE OUT:', 'END OF THE TEASER', 'ENTER.', 'ENTERS.']
    for combo in itertools.product(END, ACT, NUMBERS):
        SKIPS.append(' '.join(combo))
    for combo in itertools.product(ACT, NUMBERS):
        SKIPS.append(' '.join(combo))
    _regex_character = r"^\s*([A-Z-.'\"() ]+)\s*$"
    regex_character = re.compile(_regex_character)

    characters = set()
    for line in dialogue:
        matches = re.findall(regex_character, line.strip())
        if matches:
            for match in matches:
                # Yay random corner cases!
                if match in SKIPS:
                    continue
                parens = match.find('(')
                quote = match.find('\'')
                if parens != -1:
                    match = match[:parens - 1]
                if quote != -1:
                    # Taking a stab at possesive names instead of names with random asterisks 
                    # in them to sound all scify and bullshit for this project.
                    if match[quote+1] == 'S':
                        match = match[:quote]
                if 'VOICE' in match:
                    match = match.replace('VOICE', '')
                if match.endswith('.'):
                    while match.endswith('.'):
                        match = match[:-1]
                if match.startswith('('):
                    continue
                if match.endswith(')'):
                    continue
                characters.add(match.replace('"', '').strip())

    return characters

def _separate_dialogue_block(block, characters):
    """
    Takes a list of lines of dialogue and character names and returns a dictionary
    with each character's lines Dict[name, text].
    """
    block = deque(block)
    dialogue = {}
    temp = ''

    # Check if any initial lines are text and save them.
    while True:
        if not block:
            break
        line = block.popleft()
        if not line.isupper():
            # First line is dialogue/text
            temp = f"{temp} {line}"
        else:
            block.appendleft(line)
            break

    if temp:
        dialogue[0] = dict(name='None', text=temp.strip())
        index = 1
    else:
        index = 0

    name = ''
    text = ''
    for line in block:
        if not line:
            continue
        if line.isupper():
            # Denotes a character's name on it's own line, indicating dialogue.
            if name == line:
                # Character continuing dialogue.
                continue
            else:
                # Different character dialogue.
                if name:
                    # New dialogue. Push dialogue into dict and start over.
                    dialogue[index] = dict(name=_character_name(name, characters), text=text.strip())
                    name = line
                    text = ''
                    index += 1
                else:
                    # Character name
                    name = line
                    continue
        else:
            # Dialogue line
            text = f"{text} {line}"

    return dialogue

def _character_name(input_name, characters):
    """
    Cleans up character names. E.g., remove posession or voice over notes.
    """
    for character in characters:
        if character in input_name:
            return character
    return 'None'

def separate_lines_by_character(dialogue, characters=None):
#     temp_sectioned = _sectioned_script(dialogue, indices)
#     sectioned = copy.deepcopy(temp_sectioned)
    if not characters:
        characters = get_characters(dialogue)
    sectioned = copy.deepcopy(dialogue)
    for number, section in dialogue.items():
        for index, part in enumerate(section):
            block = part['part']
            if block:
                block = _separate_dialogue_block(block, characters)
#                 print(sectioned[number][index])
                sectioned[number][index]['part'] = block
            else:
                del sectioned[number]
                continue
            
    return sectioned

def _sectioned_script(dialogue, header_indices):
    """
    Separates dialogue into sections using the indices of the headers found in the dialogue list of lines.
    """
    sections = {}
    index_pairs = pairwise(header_indices)

    for pair in index_pairs:
        # Get the lines between two indices
        part = get_between_indices(dialogue, *pair)
        # The first line is the header of the section
        head = part.pop(0)
        # The header is composed of a number and text.
        number, header = _number_header_from_line(head)
        # Make a dictionary with the header and text.
        d = dict(header=header, part=part)
        # Check if the section number already exists.
        if number in sections.keys():
            sections[number].append(d)
        else:
            sections[number] = [d]

    return sections

In [49]:
# a -> extract dialogue into list (extract_dialogue_from_script) -> get headers and locations (section_headers) 
# -> split dialogue into sections (_sectioned_script) -> split lines by characters

In [59]:
q = _sectioned_script(a.dialogue, indices)

In [51]:
r = _sectioned_script(a.dialogue, indices)

In [62]:
get_characters(a.dialogue)

{"ARD'RIAN",
 'BEVERLY',
 'DATA',
 'GEORDI',
 'GOSHEVEN',
 'HARITATH',
 'KENTOR',
 "O'BRIEN",
 'PICARD',
 'RIKER',
 'SHELIAK',
 'TROI',
 'WESLEY',
 'WORF'}

In [60]:
separate_lines_by_character(q)

{'1': [{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)',
   'part': {0: {'name': 'None',
     'text': 'Moving at impulse near some extraordinarily interesting astronomical object.'}}}],
 '2': [{'header': 'INT. TEN-FORWARD',
   'part': {0: {'name': 'None',
     'text': "Present are PICARD, BEVERLY, and TWELVE N.D. CREWMEMBERS. A VULCAN and ONE WOMAN are seated, holding their instruments -- a violin and a viola. O'BRIEN tunes his cello. DATA ENTERS carrying a violin. He checks at the door, startled to see the captain. Picard beckons, and Data crosses to him."},
    1: {'name': 'None',
     'text': 'Captain, Doctor, I am honored by your presence, but may I suggest you attend the second concert.'},
    2: {'name': 'None', 'text': 'Why, Data?'},
    3: {'name': 'None',
     'text': 'Ensign Ortiz will perform the violin part. My rendition will be less enjoyable.'},
    4: {'name': 'None', 'text': 'Oh?'},
    5: {'name': 'None',
     'text': 'While I am quite proficient Technically, accordi

In [298]:
y = separate_lines_by_character(q, chars)

In [299]:
y

{'1': [{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)',
   'part': ['Moving at impulse near some extraordinarily interesting',
    'astronomical object.']}],
 '2': [{'header': 'INT. TEN-FORWARD',
   'part': ['Present are PICARD, BEVERLY, and TWELVE N.D.',
    'CREWMEMBERS. A VULCAN and ONE WOMAN are seated,',
    'holding their instruments -- a violin and a viola.',
    "O'BRIEN tunes his cello. DATA ENTERS carrying a",
    'violin. He checks at the door, startled to see the',
    'captain. Picard beckons, and Data crosses to him.',
    'DATA',
    'Captain, Doctor, I am honored by',
    'your presence, but may I suggest',
    'you attend the second concert.',
    'BEVERLY',
    'Why, Data?',
    'DATA',
    'Ensign Ortiz will perform the',
    'violin part. My rendition will',
    'be less enjoyable.',
    'PICARD',
    'Oh?',
    'DATA',
    'While I am quite proficient',
    'Technically, according to my',
    'fellow performers, I lack soul.',
    'BEVERLY',
    "Data, telling u

In [None]:
q

In [214]:
separate_lines_by_character(b, chars)

TypeError: unhashable type: 'slice'

In [201]:
replace_character_names(b, chars)

AttributeError: 'list' object has no attribute 'items'

In [188]:
replace_character_names(e, chars)

AttributeError: 'list' object has no attribute 'items'

In [198]:
d = separate_lines_by_character(b, indices)

TypeError: unhashable type: 'slice'

In [199]:
d

{'1': [{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)',
   'part': ['Moving at impulse near some extraordinarily interesting',
    'astronomical object.']}],
 '2': [{'header': 'INT. TEN-FORWARD',
   'part': ['Present are PICARD, BEVERLY, and TWELVE N.D.',
    'CREWMEMBERS. A VULCAN and ONE WOMAN are seated,',
    'holding their instruments -- a violin and a viola.',
    "O'BRIEN tunes his cello. DATA ENTERS carrying a",
    'violin. He checks at the door, startled to see the',
    'captain. Picard beckons, and Data crosses to him.',
    'DATA',
    'Captain, Doctor, I am honored by',
    'your presence, but may I suggest',
    'you attend the second concert.',
    'BEVERLY',
    'Why, Data?',
    'DATA',
    'Ensign Ortiz will perform the',
    'violin part. My rendition will',
    'be less enjoyable.',
    'PICARD',
    'Oh?',
    'DATA',
    'While I am quite proficient',
    'Technically, according to my',
    'fellow performers, I lack soul.',
    'BEVERLY',
    "Data, telling u

In [156]:
c = replace_character_names(e, chars)

[{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)', 'part': ['Moving at impulse near some extraordinarily interesting', 'astronomical object.']}]
{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)', 'part': ['Moving at impulse near some extraordinarily interesting', 'astronomical object.']}


AttributeError: 'list' object has no attribute 'items'

In [None]:
a.sectioned_script()