In [1]:
from startrek.script import Script
from collections import defaultdict, deque

In [2]:
a = Script('tng', script_path='./scripts/tng/Episode 49 - The Ensigns of Command.txt')

In [3]:
b = a.script.splitlines()

In [None]:
b

In [8]:
# TODO: store tab counts in attribute to save time.

def _script_str_or_list(script):
    if isinstance(script, str):
        return script.splitlines()
    elif isinstance(script, list):
        return script
    else:
        raise ValueError('Invalid script.')

def _count_tabs_in_line(line):
    return line.count('\t')

def count_tabs_in_script(script):
    """Group the lines of a script by the number of tabs each contains.

    Args:
        script: the script as one string or as a list of lines.

    Returns:
        A ``defaultdict(list)`` mapping tab count -> the lines having that
        many tabs, in their original order. Looking up a count that never
        occurred yields an empty list.
    """
    lines_by_tabs = defaultdict(list)
    for text in _script_str_or_list(script):
        lines_by_tabs[_count_tabs_in_line(text)].append(text)
    return lines_by_tabs

def raw_numbered_lines(script):
    """Return, in order, every line of ``script`` whose first character is a digit.

    Empty lines are ignored. ``script`` may be a string or a list of lines.
    """
    numbered = []
    for text in _script_str_or_list(script):
        if text and text[0].isdigit():
            numbered.append(text)
    return numbered

def raw_character_list(script):
    """Return the set of distinct raw lines that contain exactly five tabs.

    Five tabs is the indentation this notebook treats as a character-name
    cue. The lines are returned unprocessed (tabs and annotations included).
    """
    lines = _script_str_or_list(script)
    by_tab_count = count_tabs_in_script(lines)
    character_lines = by_tab_count[5]
    return set(character_lines)

def _strip_character_name(character_line):
    name = character_line
    parens = name.find('(')
    if parens != -1:
        name = name[:parens - 1]
    quote = name.find('\'')
    if quote != -1:
        # Taking a stab at possesive names instead of names with random asterisks 
        # in them to sound all scify and bullshit for this project.
        if name[quote+1] == 'S':
            name = name[:quote]
    if 'VOICE' in name:
        name = name.replace('VOICE', '')
    if name.endswith('.'):
        while name.endswith('.'):
            name = name[:-1]
    name = name.replace('"', '').strip()
    
    return name

def processed_character_list(script=None, characters=None):
    """Return the set of cleaned character names.

    Args:
        script: the script as a string or list of lines. Used to discover
            the raw character lines when ``characters`` is missing or empty.
        characters: optional iterable of raw character lines to clean.

    Returns:
        A set of non-empty names, each passed through
        ``_strip_character_name``.

    Raises:
        ValueError: if neither ``script`` nor ``characters`` is given, or if
            ``script`` is of an unsupported type.
    """
    if script:
        script = _script_str_or_list(script)
    if not characters:
        if script is not None:
            characters = raw_character_list(script)
        elif characters is None:
            raise ValueError('Either script or characters must be provided.')
        else:
            # An explicitly empty collection with no script: nothing to clean.
            characters = ()

    processed_characters = set()
    for character in characters:
        name = _strip_character_name(character)
        if name:
            processed_characters.add(name)

    return processed_characters

def number_header_from_line(line):
    """Split a numbered header line into ``(number, header_text)``.

    The first whitespace-separated token is the number; the remaining
    tokens are re-joined with single spaces to form the header text
    (an empty string when there are none).
    """
    tokens = line.split()
    number = tokens[0]
    header = ' '.join(tokens[1:])
    return number, header

# def raw_header_lines(script):
#     script = _script_str_or_list(script)
#     c = count_tabs_in_script(script)
#     return c[0]

# def _is_header_line(line):
#     return line[0].isdigit()

# def _is_character_line_from_raw(line):
#     return _count_tabs_in_line(line) == 5

def _remove_star_trek(script):
    # TODO: Fix this
    if STAR_TREK in line:
        del line

def _part_of_script(line, with_page_headers=True):
    """Classify a raw script line by the number of tabs it contains.

    Args:
        line: one raw line of the script.
        with_page_headers: when true, a line without tabs that does not
            start with a digit is reported as ``'page_header'`` instead of
            ``'section_header'``.

    Returns:
        One of ``'page_header'``, ``'section_header'``,
        ``'external_action'``, ``'dialogue'``, ``'internal_action'``,
        ``'character_name'`` or ``'scene_transition'``.

    Raises:
        KeyError: if the tab count is not one of the known values
            (0, 1, 3, 4, 5, 9).
    """
    tabs = _count_tabs_in_line(line)
    # ``line[:1]`` rather than ``line[0]``: an empty line must not raise
    # IndexError; it is treated like any other tab-less, non-numbered line.
    if tabs == 0 and with_page_headers and not line[:1].isdigit():
        # found a page header since it doesn't start with a number
        tabs = -1
    parts = {-1: 'page_header', 0: 'section_header', 1: 'external_action', 3: 'dialogue',
             4: 'internal_action', 5: 'character_name', 9: 'scene_transition'}
    return parts[tabs]

def cleaned_part_of_script(line, with_page_headers=True):
    """Return the cleaned text of a script line, or ``'Invalid line'``.

    The line is classified with ``_part_of_script`` and the cleaner
    registered for that part is applied. Every part currently uses the same
    cleaner (strip surrounding whitespace, which removes the indentation
    tabs); the table exists so individual parts can get their own cleaner.

    NOTE(review): the previous body looked the part *name* up in a table
    keyed by tab count and then called the resulting string, so it could
    only raise. Returning the stripped line is an interpretation of the
    intent; confirm the desired output.

    Args:
        line: one raw line of the script.
        with_page_headers: forwarded to ``_part_of_script``.

    Returns:
        The cleaned line, or the string ``'Invalid line'`` when the line
        cannot be classified.
    """
    cleaners = {
        'page_header': str.strip,
        'section_header': str.strip,
        'external_action': str.strip,
        'dialogue': str.strip,
        'internal_action': str.strip,
        'character_name': str.strip,
        'scene_transition': str.strip,
    }
    try:
        part = _part_of_script(line, with_page_headers)
    except (KeyError, IndexError):
        # Unknown tab count or an empty line.
        return 'Invalid line'

    cleaner = cleaners.get(part)
    if cleaner is None:
        return 'Invalid line'
    return cleaner(line)
    
def _separate_front_matter(script, remove_blank_lines=True):
    """Split a script into its front matter and its body.

    The body starts at the first non-empty line whose first character is a
    digit; everything before it is front matter.

    Args:
        script: the script as a string or a list of lines.
        remove_blank_lines: when true, empty lines are dropped from the
            whole script before splitting. When false, empty lines are
            still left out of the front matter but are kept in the body.

    Returns:
        ``(front_matter, body)`` where ``front_matter`` is a list and
        ``body`` is a ``deque`` beginning with the first numbered line.
        If no numbered line exists, ``([], [])`` is returned.
    """
    lines = _script_str_or_list(script)
    if remove_blank_lines:
        lines = [text for text in lines if text]

    for position, text in enumerate(lines):
        if text and text[0].isdigit():
            front_matter = [earlier for earlier in lines[:position] if earlier]
            return front_matter, deque(lines[position:])

    return [], []
    
    
def scr(script):
    """Print each body line of ``script`` preceded by its detected part type.

    Front matter is skipped; nothing is returned.
    """
    _front_matter, body = _separate_front_matter(script)
    for text in body:
        label = _part_of_script(text)
        print(label, text)
        
def big_ass_script_dict(script, *, use_front_matter=False, remove_blank_lines=True, with_page_headers=True):
    """Unfinished: intended to build a dictionary representation of a script.

    As written, the function separates the front matter, classifies every
    body line with ``_part_of_script`` and skips page headers and character
    names, but never stores anything in ``script_dict`` and has no
    ``return`` statement, so it always returns ``None``.

    Args:
        script: passed to ``_separate_front_matter``.
        use_front_matter: accepted but not read anywhere in the body yet.
        remove_blank_lines: forwarded to ``_separate_front_matter``.
        with_page_headers: forwarded to ``_part_of_script``.
    """
    front_matter, script = _separate_front_matter(script, remove_blank_lines)
    # TODO: never populated or returned yet.
    script_dict = {}
    for line in script:
        part = _part_of_script(line, with_page_headers)
        if part == 'page_header' and with_page_headers:
            # We don't want page headers.
            continue
        if part == 'character_name':
            continue
            # TODO (unreachable placeholder for the planned logic):
            # while next lines are dialogue
            # attach the dialogue to the character name
        

In [9]:
class Dialogue:
    """One piece of dialogue: the speaking character and the spoken text."""

    def __init__(self, character_name=None, dialogue=None):
        # Both values are stored as given; no validation or cleaning is done.
        self.character_name = character_name
        self.dialogue = dialogue

    def __repr__(self):
        template = '<class Dialogue - character_name={} - dialogue={}>'
        return template.format(self.character_name, self.dialogue)

    def __str__(self):
        return '{}: {}'.format(self.character_name, self.dialogue)

    def clean_dialogue(self):
        """Placeholder for dialogue clean-up; currently does nothing."""
        return None
    
class Section:
    """A numbered script section that collects dialogue in insertion order."""

    def __init__(self, section_number=0, section_name=None):
        self.section_number = section_number
        self.section_name = section_name
        # Maps a running integer index (0, 1, 2, ...) to the added dialogue.
        self.dialogue = dict()
        # Index that the next call to add_dialogue will use.
        self.dialogue_index = 0

    def __repr__(self):
        template = '<class Section - section_number={} - section_name={}>'
        return template.format(self.section_number, self.section_name)

    def __str__(self):
        return '{}: {}'.format(self.section_number, self.section_name)

    def add_dialogue(self, dialogue):
        """Store ``dialogue`` under the next free index and advance the index."""
        slot = self.dialogue_index
        self.dialogue[slot] = dialogue
        self.dialogue_index = slot + 1

In [10]:
repr(Dialogue('DATA', 'Test'))

'<class Dialogue - character_name=DATA - dialogue=Test>'

In [11]:
scr(a.script)

section_header 1    EXT. SPACE - THE ENTERPRISE (OPTICAL)
external_action 	Moving at impulse near some extraordinarily interesting
external_action 	astronomical object.
section_header 2    INT. TEN-FORWARD
external_action 	Present are PICARD, BEVERLY, and TWELVE N.D.
external_action 	CREWMEMBERS. A VULCAN and ONE WOMAN are seated,
external_action 	holding their instruments -- a violin and a viola.
external_action 	O'BRIEN tunes his cello. DATA ENTERS carrying a
external_action 	violin. He checks at the door, startled to see the
external_action 	captain. Picard beckons, and Data crosses to him.
character_name 					DATA
dialogue 			Captain, Doctor, I am honored by
dialogue 			your presence, but may I suggest
dialogue 			you attend the second concert.
character_name 					BEVERLY
dialogue 			Why, Data?
character_name 					DATA
dialogue 			Ensign Ortiz will perform the
dialogue 			violin part. My rendition will
dialogue 			be less enjoyable.
character_name 					PICARD
dialogue 			Oh?
charac

In [237]:
_separate_front_matter(_separate_front_matter(a.script))

ValueError: Invalid script.

In [None]:
raw_numbered_lines(a.script)

In [197]:
isinstance(a.script, str)

True

In [198]:
processed_character_list(script=a.script)

{"ARD'RIAN",
 'BEVERLY',
 'DATA',
 'GEORDI',
 'GOSHEVEN',
 'HARITATH',
 'KENTOR',
 "O'BRIEN",
 'PICARD',
 'RIKER',
 'SHELIAK',
 'TROI',
 'WESLEY',
 'WORF'}

In [154]:
d = count_tabs_in_script(b)

In [155]:
set(d.keys())

{0}

In [73]:
actions = d[1]

In [76]:
text = d[3]

In [78]:
internal_actions = d[4]

In [80]:
names = d[5]

In [82]:
transitions = d[9]

In [47]:
find_numbered_lines(b)

['1    EXT. SPACE - THE ENTERPRISE (OPTICAL)',
 '2    INT. TEN-FORWARD',
 '2    CONTINUED:',
 '2A   ANGLE ON PICARD AND BEVERLY',
 '3    INT. MAIN BRIDGE',
 '3    CONTINUED:',
 '4    ANGLE ON VIEWSCREEN (OPTICAL)',
 '4    CONTINUED:',
 '5    EXT. SPACE - THE ENTERPRISE (OPTICAL)',
 '6    INT. MAIN BRIDGE',
 '6    CONTINUED:',
 '7    EXT. PLANET SURFACE - SHUTTLE LANDING SITE - DAY',
 '7A   NEW ANGLE',
 '7A   CONTINUED: (2)',
 '8    INT. ENTERPRISE - MAIN BRIDGE',
 '8    CONTINUED:',
 '9    EXT. PLANET SURFACE - SHUTTLE LANDING SITE - DAY',
 '10   INTERCUTS',
 '10   CONTINUED:',
 '11   EXT. PLANET SURFACE - MAIN STREET - DAY',
 '11   CONTINUED:',
 '11   CONTINUED: (2)',
 '11   CONTINUED: (3)',
 '11   CONTINUED: (4)',
 '11A  ON DATA',
 '11B  WIDER',
 '11B  CONTINUED:',
 '11B  CONTINUED: (2)',
 '12   INT. OBSERVATION LOUNGE',
 '12   CONTINUED:',
 '13   EXT. SPACE - THE ENTERPRISE (OPTICAL)',
 '14   INT. MAIN BRIDGE',
 '15   ANGLE ON VIEWSCREEN - (OPTICAL)',
 '15   CONTINUED:',
 '15   CONT

In [None]:
a.dialogue

In [21]:
a.extract_dialogue_from_script()

In [4]:
from collections import defaultdict
from itertools import tee
import re

In [5]:
def pairwise(iterable):
    """Yield overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Returns a ``zip`` iterator; inputs with fewer than two items produce
    no pairs.
    """
    leading, trailing = tee(iterable, 2)
    # Advance the second copy by one so it runs one item ahead of the first.
    next(trailing, None)
    return zip(leading, trailing)

In [6]:
def _number_header_from_line(line):
    line = line.split()
    return line[0], ' '.join(line[1:])

In [7]:
def get_between_indices(s, begin, end):
    """Return the slice of ``s`` from ``begin`` (inclusive) to ``end`` (exclusive)."""
    window = slice(begin, end)
    return s[window]

In [23]:
sections, indices = section_headers(a.dialogue)

In [None]:
sections

In [25]:
from collections import deque
import copy
import itertools

In [26]:
b = sectioned_script(a.dialogue, indices)

NameError: name 'sectioned_script' is not defined

In [19]:
chars = get_characters(a.dialogue)

TypeError: 'NoneType' object is not iterable

In [133]:
chars

{"ARD'RIAN",
 'BEVERLY',
 'DATA',
 'GEORDI',
 'GOSHEVEN',
 'HARITATH',
 'KENTOR',
 "O'BRIEN",
 'PICARD',
 'RIKER',
 'SHELIAK',
 'TROI',
 'WESLEY',
 'WORF'}

In [71]:
def section_headers(dialogue):
    '''Returns the section headers from a block of dialogue and their
    respective line numbers in said block. Run once.

    A line is a section header when its first token starts with a decimal
    digit, e.g. ``2A   ANGLE ON PICARD AND BEVERLY``. Tokens whose fourth
    character is also a digit (such as a year like ``1989``) are not
    treated as section numbers.

    Args:
        dialogue: iterable of text lines.

    Returns:
        ``(sections, indices)``: ``sections`` maps each section number
        (a string) to the list of header names seen for it, with colons
        removed and ``'OMITTED'`` standing in for an empty name;
        ``indices`` lists the positions of the header lines in ``dialogue``.
    '''
    sections = {}
    indices = []

    for index, line in enumerate(dialogue):
        words = line.split()
        if not words:
            continue

        number = words[0]
        # str.isdecimal() accepts exactly the characters int() can parse, so
        # this is an explicit digit test with no exception handling needed.
        if not number[0].isdecimal():
            continue
        # Corner case check if year is in section number
        if len(number) > 3 and number[3].isdigit():
            continue

        name = " ".join(words[1:]).replace(':', '') or 'OMITTED'
        # The same section number can occur several times (e.g. CONTINUED).
        sections.setdefault(number, []).append(name)
        indices.append(index)

    return sections, indices

def get_characters(dialogue):
    """
    Gets a set of all the characters with dialogue from a list of script lines.

    A line counts as a character cue when, once stripped, it consists only
    of upper-case letters, spaces and the punctuation ``- . ' " ( )``.
    Known non-character lines (act markers, fades, ...) are skipped, and
    the cue is cleaned: parenthetical notes, possessive suffixes, the word
    ``VOICE``, trailing periods and double quotes are removed.

    Args:
        dialogue: iterable of text lines.

    Returns:
        A set of cleaned, non-empty character names.
    """
    ACT = ['ACT']
    END = ['END OF']
    NUMBERS = ['ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN', 'EIGHT', 'NINE', 'TEN']
    SKIPS = ['THE END', 'END OF TEASER', 'FADE OUT', 'FADE OUT.', 'FADE OUT:', 'FADE IN',
             'FADE IN:', 'FADE IN.', 'END OF THE TEASER', 'ENTER.', 'ENTERS.']
    for combo in itertools.product(END, ACT, NUMBERS):
        SKIPS.append(' '.join(combo))
    for combo in itertools.product(ACT, NUMBERS):
        SKIPS.append(' '.join(combo))
    skips = set(SKIPS)
    _regex_character = r"^\s*([A-Z-.'\"() ]+)\s*$"
    regex_character = re.compile(_regex_character)

    characters = set()
    for line in dialogue:
        for match in regex_character.findall(line.strip()):
            # Yay random corner cases!
            if match in skips:
                continue
            # A line that is only a parenthetical is not a character cue.
            if match.startswith('('):
                continue

            # Drop a trailing parenthetical such as "(V.O.)" or "(CONT'D)".
            parens = match.find('(')
            if parens != -1:
                match = match[:parens].rstrip()

            # Look for the apostrophe only after the parenthetical is gone,
            # so a quote inside the note cannot index past the end of the
            # name. Heuristic for possessives ("PICARD'S ...") as opposed to
            # names that simply contain an apostrophe ("O'BRIEN").
            quote = match.find('\'')
            if quote != -1 and match[quote + 1:quote + 2] == 'S':
                match = match[:quote]

            match = match.replace('VOICE', '')
            match = match.rstrip('.')
            if match.endswith(')'):
                continue

            name = match.replace('"', '').strip()
            # An empty name would later match every line as a substring.
            if name:
                characters.add(name)

    return characters

def _separate_dialogue_block(block, characters):
    """
    Takes a list of lines of dialogue and character names and returns a dictionary
    with each character's lines.

    A line that is entirely upper case is treated as a character cue; the
    lines that follow, up to the next different cue, are that character's
    text. Lines before the first cue are stored under the name ``'None'``.
    A cue that repeats the current speaker just continues the same entry.

    Args:
        block: iterable of text lines belonging to one section.
        characters: known character names, passed to ``_character_name``
            to normalise each cue.

    Returns:
        A dict mapping a running index (0, 1, 2, ...) to
        ``{'name': <character>, 'text': <joined lines>}``.
    """
    block = deque(block)
    dialogue = {}

    # Check if any initial lines are text and save them.
    leading = ''
    while block and not block[0].isupper():
        leading = f"{leading} {block.popleft()}"

    index = 0
    leading = leading.strip()
    if leading:
        dialogue[index] = dict(name='None', text=leading)
        index += 1

    name = ''
    text = ''
    for line in block:
        if not line:
            continue
        if line.isupper():
            # Denotes a character's name on its own line, indicating dialogue.
            if name == line:
                # Character continuing dialogue.
                continue
            if name:
                # Different speaker: push the finished entry and start over.
                dialogue[index] = dict(name=_character_name(name, characters),
                                       text=text.strip())
                index += 1
                text = ''
            name = line
        else:
            # Dialogue line
            text = f"{text} {line}"

    # Flush the final speaker; without this the last entry of every block
    # (or the only entry, for a single-speaker block) would be lost.
    if name:
        dialogue[index] = dict(name=_character_name(name, characters),
                               text=text.strip())

    return dialogue

def _character_name(input_name, characters):
    """
    Cleans up character names. E.g., remove posession or voice over notes.
    """
    for character in characters:
        if character in input_name:
            return character
    return 'None'

def separate_lines_by_character(dialogue, characters=None):
    """Split every section part of a sectioned script into per-character dialogue.

    Args:
        dialogue: mapping of section number -> list of part dicts, each
            with at least a ``'part'`` key holding that part's lines
            (the structure produced by ``_sectioned_script``).
        characters: known character names. When missing or empty they are
            collected with ``get_characters`` from all lines of all parts.

    Returns:
        A new mapping with the same layout in which each ``'part'`` has
        been replaced by the result of ``_separate_dialogue_block``. Parts
        without any lines are left out, and a section is left out when none
        of its parts has lines. The input mapping is not modified.
    """
    if not characters:
        # get_characters expects lines, not the section mapping itself.
        all_lines = [line
                     for section in dialogue.values()
                     for part in section
                     for line in part['part']]
        characters = get_characters(all_lines)

    sectioned = {}
    for number, section in dialogue.items():
        kept_parts = []
        for part in section:
            block = part['part']
            if not block:
                # Skip only the empty part; siblings in this section stay.
                continue
            separated = dict(part)
            separated['part'] = _separate_dialogue_block(block, characters)
            kept_parts.append(separated)
        if kept_parts:
            sectioned[number] = kept_parts

    return sectioned

def _sectioned_script(dialogue, header_indices):
    """
    Separates dialogue into sections using the indices of the headers found in the dialogue list of lines.

    Each section runs from its header line up to the next header. The last
    section runs to the end of ``dialogue``.

    Args:
        dialogue: list of text lines.
        header_indices: ascending positions of the header lines in
            ``dialogue`` (as returned by ``section_headers``).

    Returns:
        A dict mapping section number -> list of
        ``{'header': <header text>, 'part': <lines after the header>}``.
        A number that occurs several times gets one list entry per
        occurrence, in order.
    """
    sections = {}
    # Close the final section with an end-of-dialogue sentinel; pairing the
    # header indices alone would silently drop everything after the last
    # header.
    boundaries = list(header_indices) + [len(dialogue)]

    for pair in pairwise(boundaries):
        # Get the lines between two indices
        part = get_between_indices(dialogue, *pair)
        # The first line is the header of the section
        head = part.pop(0)
        # The header is composed of a number and text.
        number, header = _number_header_from_line(head)
        # Group repeated section numbers under the same key.
        sections.setdefault(number, []).append(dict(header=header, part=part))

    return sections

In [49]:
# a -> extract dialogue into list (extract_dialogue_from_script) -> get headers and locations (section_headers) 
# -> split dialogue into sections (_sectioned_script) -> split lines by characters

In [59]:
q = _sectioned_script(a.dialogue, indices)

In [51]:
r = _sectioned_script(a.dialogue, indices)

In [69]:
get_characters(a.dialogue)

{"ARD'RIAN",
 'BEVERLY',
 'DATA',
 'GEORDI',
 'GOSHEVEN',
 'HARITATH',
 'KENTOR',
 "O'BRIEN",
 'PICARD',
 'RIKER',
 'SHELIAK',
 'TROI',
 'WESLEY',
 'WORF'}

In [70]:
j = separate_lines_by_character(q, get_characters(a.dialogue))

{'GOSHEVEN', "O'BRIEN", 'DATA', 'KENTOR', 'RIKER', 'BEVERLY', 'WORF', 'TROI', 'PICARD', 'GEORDI', 'WESLEY', 'HARITATH', 'SHELIAK', "ARD'RIAN"}


In [298]:
y = separate_lines_by_character(q, chars)

In [74]:
a.script.splitlines()

['',
 '',
 '',
 '',
 '',
 '                STAR TREK: THE NEXT GENERATION ',
 '                              ',
 '                   "The Ensigns of Command" ',
 '                          #40273-149 ',
 '                              ',
 '                          Written by ',
 '                         H. B. Savage ',
 '                              ',
 '                          Directed by ',
 '                          Cliff Bole ',
 '',
 '',
 'THE WRITING CREDITS MAY NOT BE FINAL AND SHOULD NOT BE USED',
 'FOR PUBLICITY OR ADVERTISING PURPOSES WITHOUT FIRST CHECKING',
 'WITH THE TELEVISION LEGAL DEPARTMENT.',
 '',
 'Copyright 1989 Paramount Pictures Corporation. All Rights',
 'Reserved. This script is not for publication or',
 'reproduction. No one is authorized to dispose of same. If',
 'lost or destroyed, please notify the Script Department.',
 '',
 '                    3RD REVISED FINAL DRAFT',
 ' ',
 '                         JULY 13, 1989',
 '',
 '   STAR TREK: "The Ensigns

In [None]:
q

In [214]:
separate_lines_by_character(b, chars)

TypeError: unhashable type: 'slice'

In [201]:
replace_character_names(b, chars)

AttributeError: 'list' object has no attribute 'items'

In [188]:
replace_character_names(e, chars)

AttributeError: 'list' object has no attribute 'items'

In [198]:
d = separate_lines_by_character(b, indices)

TypeError: unhashable type: 'slice'

In [199]:
d

{'1': [{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)',
   'part': ['Moving at impulse near some extraordinarily interesting',
    'astronomical object.']}],
 '2': [{'header': 'INT. TEN-FORWARD',
   'part': ['Present are PICARD, BEVERLY, and TWELVE N.D.',
    'CREWMEMBERS. A VULCAN and ONE WOMAN are seated,',
    'holding their instruments -- a violin and a viola.',
    "O'BRIEN tunes his cello. DATA ENTERS carrying a",
    'violin. He checks at the door, startled to see the',
    'captain. Picard beckons, and Data crosses to him.',
    'DATA',
    'Captain, Doctor, I am honored by',
    'your presence, but may I suggest',
    'you attend the second concert.',
    'BEVERLY',
    'Why, Data?',
    'DATA',
    'Ensign Ortiz will perform the',
    'violin part. My rendition will',
    'be less enjoyable.',
    'PICARD',
    'Oh?',
    'DATA',
    'While I am quite proficient',
    'Technically, according to my',
    'fellow performers, I lack soul.',
    'BEVERLY',
    "Data, telling u

In [156]:
c = replace_character_names(e, chars)

[{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)', 'part': ['Moving at impulse near some extraordinarily interesting', 'astronomical object.']}]
{'header': 'EXT. SPACE - THE ENTERPRISE (OPTICAL)', 'part': ['Moving at impulse near some extraordinarily interesting', 'astronomical object.']}


AttributeError: 'list' object has no attribute 'items'

In [None]:
a.sectioned_script()