In [2]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string 
from collections import namedtuple

def default_script_line_parser(line):
    """Split a raw script line into ``(character, spoken_text)``.

    Everything before the first ``:`` is taken as the speaker name, cut at
    the first ``(`` if one is present; everything after the ``:`` is the
    spoken text. Both parts are whitespace-stripped.

    Returns ``(None, line)`` with ``line`` untouched when it has no ``:``.
    """
    speaker, colon, spoken = line.partition(':')
    if not colon:
        return None, line

    # Keep only what precedes a parenthetical, e.g. "BOB (V.O.)" -> "BOB".
    speaker = speaker.strip().partition('(')[0].strip()
    return speaker, spoken.strip()

# One spoken line: the speaker, the text, and an integer position index.
Dialogue = namedtuple('Dialogue', 'character line index')
class Script:
    """A script file parsed into speaker-attributed dialogue lines.

    After :meth:`read_script` the instance exposes:

    * ``characters`` - speaker names in order of first appearance,
    * ``all_dialogue`` - every ``Dialogue`` in file order,
    * ``character_dialogue`` - ``{speaker: [Dialogue, ...]}``.

    Args:
        line_parser: callable mapping one raw line to ``(character, text)``.
            Lines for which it returns ``None`` as the character are skipped.
    """

    def __init__(self, line_parser=default_script_line_parser):
        self.line_parser = line_parser
        # Start empty so the attributes exist before read_script() is called.
        self.characters = []
        self.all_dialogue = []
        self.character_dialogue = {}

    def read_script(self, path):
        """Parse the file at ``path`` and (re)populate the instance attributes."""
        characters, lines = self._read_from_path(path, self.line_parser)
        all_dialogue, character_dialogue = self._generate_dialogue(lines)
        self.characters = characters
        self.all_dialogue = all_dialogue
        self.character_dialogue = character_dialogue

    def _read_from_path(self, path, line_parser):
        """Return ``(characters, lines)`` read from ``path``.

        ``characters`` lists each distinct speaker once, in order of first
        appearance; ``lines`` is a list of ``(character, text)`` tuples.
        """
        characters = []
        seen = set()  # O(1) duplicate check; `characters` keeps the order
        lines = []

        # Decode explicitly rather than with the platform default encoding;
        # utf-8-sig also strips a leading byte-order mark if there is one.
        with open(path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            for raw in f:
                character, line = line_parser(raw)
                if character is None:
                    continue

                if character not in seen:
                    seen.add(character)
                    characters.append(character)

                lines.append((character, line))

        return characters, lines

    def _generate_dialogue(self, lines):
        """Wrap ``(character, text)`` tuples in ``Dialogue`` records.

        Returns ``(all_dialogue, character_dialogue)``; the same ``Dialogue``
        objects appear in both, indexed by their position in ``lines``.
        """
        character_dialogue = {}
        all_dialogue = []

        for i, (character, text) in enumerate(lines):
            d = Dialogue(character, text, i)
            character_dialogue.setdefault(character, []).append(d)
            all_dialogue.append(d)

        return all_dialogue, character_dialogue

In [3]:
class Subtitle:
    """Mutable record for a single subtitle entry.

    Every attribute is created with a placeholder value and is expected to be
    filled in by whoever builds the record.
    """

    def __init__(self):
        # Timing fields all start at zero.
        self.start_frame = self.end_frame = 0
        self.start_time = self.end_time = self.duration = 0
        # Displayed text.
        self.text = ""
        # Index fields use -1 as their initial placeholder.
        self.relative_index = self.absolute_index = -1
        # Speaker information and the token list (a fresh list per instance).
        self.character = None
        self.num_speakers = 0
        self.tokens = []
        
class Subtitles:
    """Reader for subtitle files made of blank-line-separated entries.

    Each entry is expected to be an index line, a ``start --> end`` time line
    (``HH:MM:SS,mmm``), and one or more text lines.

    Args:
        ignore_last: when true, ``read_subtitles`` drops the final entry.
    """

    def __init__(self, ignore_last=True):
        self.ignore_last = ignore_last
        # Exists from construction so it can be read before read_subtitles().
        self.subtitles = []

    def read_subtitles(self, path):
        """Parse ``path``, store the result on ``self.subtitles`` and return it."""
        subs = self._read_from_path(path)
        if self.ignore_last:
            subs = subs[:-1]

        self.subtitles = subs
        return subs

    def _read_from_path(self, path):
        """Return the list of ``Subtitle`` objects parsed from ``path``.

        ``absolute_index`` of each result is its position in the returned
        list (entries rejected by ``_parse_sub_data`` are not counted).
        """
        # utf-8-sig strips a leading byte-order mark, which would otherwise
        # end up glued to the first index line and break int().
        with open(path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            parsed = (self._parse_sub_data(block) for block in self._split_blocks(f))
            subs = [s for s in parsed if s is not None]

        for i, s in enumerate(subs):
            s.absolute_index = i

        return subs

    @staticmethod
    def _split_blocks(lines):
        """Yield lists of stripped, non-empty lines separated by blank lines.

        Leading and repeated blank lines are skipped, and a final block that
        is not followed by a blank line is still yielded.
        """
        block = []
        for raw in lines:
            text = raw.strip()
            if text:
                block.append(text)
            elif block:
                yield block
                block = []

        if block:
            yield block

    def _parse_sub_data(self, data):
        """Build a ``Subtitle`` from ``[index, time_line, text_line, ...]``.

        Returns ``None`` when ``data`` has fewer than three lines. Raises
        ``ValueError`` if the index or the time line cannot be parsed.
        """
        if len(data) < 3:
            return None

        index, time, lines = data[0], data[1], data[2:]
        start_time_str, end_time_str = time.split('-->')
        start_time = self._parse_sub_time(start_time_str)
        end_time = self._parse_sub_time(end_time_str)
        num_speakers, joined_line = self._parse_lines(lines)

        s = Subtitle()
        # Store the parsed number, not the raw text of the index line.
        s.relative_index = int(index)
        s.start_time = start_time
        s.end_time = end_time
        s.duration = end_time - start_time
        s.text = joined_line
        s.num_speakers = num_speakers

        return s

    def _parse_lines(self, lines):
        """Return ``(num_speakers, joined_text)`` for an entry's text lines.

        ``num_speakers`` counts the lines that begin with ``'- '``; the text
        lines are joined with single spaces.
        """
        speaker_tag = '- '
        num_speakers = sum(1 for l in lines if l.startswith(speaker_tag))
        joined_line = ' '.join(lines)
        return num_speakers, joined_line

    def _parse_sub_time(self, time_str):
        """Convert ``'HH:MM:SS,mmm'`` to an integer number of milliseconds."""
        # format = '00:00:02,800'
        parts, ms = time_str.split(',')
        h, m, s = parts.split(':')

        # int() tolerates the spaces left around each side of '-->'.
        return ((int(h) * 60 + int(m)) * 60 + int(s)) * 1000 + int(ms)

In [4]:
def default_tagger_tokenizer(text, remove_stopwords=True, remove_punc=True, remove_empty=True):
    """Lower-case ``text`` and split it into a list of tokens.

    The optional steps run in this order:

    Args:
        text: the string to tokenize.
        remove_stopwords: drop tokens found in NLTK's English stop-word list.
        remove_punc: delete every ``string.punctuation`` character from each
            remaining token (which can leave empty tokens behind).
        remove_empty: drop tokens that are empty strings.

    Returns:
        The list of surviving tokens.
    """
    tokens = word_tokenize(text.lower())

    if remove_stopwords:
        # The corpus reader returns a list; a set makes each lookup O(1).
        sw = set(stopwords.words('english'))
        tokens = [t for t in tokens if t not in sw]

    if remove_punc:
        translator = str.maketrans('', '', string.punctuation)
        tokens = [t.translate(translator) for t in tokens]

    if remove_empty:
        tokens = [t for t in tokens if t]

    return tokens
    
from collections import Counter
class Tag:
    """Collects candidate dialogue lines for one subtitle.

    ``dialogue_guesses`` maps ``dialogue.index`` to a two-item list
    ``[vote_count, dialogue]``; ``decision`` stays ``None`` until set by the
    code using the tag.
    """

    def __init__(self, subtitle):
        self.subtitle = subtitle
        self.dialogue_guesses = {}
        self.decision = None

    def add_guess(self, dialogue):
        """Record one more vote for ``dialogue``, creating its entry if new."""
        key = dialogue.index
        entry = self.dialogue_guesses.get(key)
        if entry is None:
            entry = [0, dialogue]
            self.dialogue_guesses[key] = entry
        entry[0] += 1
        
class SubtitleTagger:
    """Assign script characters to subtitles by matching word occurrences.

    A token that occurs the same number of times in the script dialogue and
    in the subtitles is paired up occurrence-by-occurrence; every pairing is
    one vote that the subtitle corresponds to that dialogue line.

    Args:
        tokenizer: callable ``(text, **kwargs) -> list of tokens`` applied to
            both dialogue lines and subtitle text.
    """

    def __init__(self, tokenizer=default_tagger_tokenizer):
        self.tokenizer = tokenizer

    def tag(self, script, subtitles, **kwargs):
        """Set ``character`` on every subtitle whose speaker can be resolved.

        Args:
            script: object whose ``all_dialogue`` yields items with ``line``,
                ``character`` and ``index`` attributes.
            subtitles: iterable of subtitle objects with ``text`` and
                ``absolute_index``; their ``tokens`` attribute is overwritten.
            **kwargs: forwarded unchanged to the tokenizer.

        Returns:
            ``(resolved, unresolved)`` lists of ``Tag`` objects, so callers
            can inspect the subtitles that received conflicting guesses.
        """
        guesses = self._guess_by_word_occurrence(script, subtitles, **kwargs)
        resolved, unresolved = self._resolve_guesses(guesses)
        self._add_tags(resolved)
        return resolved, unresolved

    def _guess_by_word_occurrence(self, script, subtitles, **kwargs):
        """Return ``{subtitle.absolute_index: Tag}`` built from word matches."""
        script_words = []
        word_to_dialogue = {}  # token -> one dialogue per occurrence, in order
        for d in script.all_dialogue:
            line_tokens = self.tokenizer(d.line, **kwargs)
            script_words.extend(line_tokens)
            for lt in line_tokens:
                word_to_dialogue.setdefault(lt, []).append(d)

        subtitle_words = []
        word_to_subtitle = {}  # token -> one subtitle per occurrence, in order
        for sub in subtitles:
            sub_tokens = self.tokenizer(sub.text, **kwargs)
            sub.tokens = sub_tokens
            subtitle_words.extend(sub_tokens)
            for st in sub_tokens:
                word_to_subtitle.setdefault(st, []).append(sub)

        common_words = set(script_words) & set(subtitle_words)
        script_counts = self._count_common_word_appearances(common_words, script_words)
        subtitle_counts = self._count_common_word_appearances(common_words, subtitle_words)

        tags = {}
        # Walk words in script order so the result order is reproducible.
        for word, dialogues in word_to_dialogue.items():
            if word not in common_words:
                continue
            if script_counts[word] != subtitle_counts[word]:
                continue

            # Equal counts: the i-th occurrence in the script is paired with
            # the i-th occurrence in the subtitles.
            for dialogue, subtitle in zip(dialogues, word_to_subtitle[word]):
                t = tags.get(subtitle.absolute_index)
                if t is None:
                    t = tags[subtitle.absolute_index] = Tag(subtitle)
                t.add_guess(dialogue)

        return tags

    def _count_common_word_appearances(self, common_words, word_list):
        """Count how often each word of ``common_words`` occurs in ``word_list``."""
        common = set(common_words)  # accepts any iterable; O(1) membership
        return Counter(w for w in word_list if w in common)

    def _resolve_guesses(self, guesses):
        """Split tags into ``(resolved, unresolved)``.

        A tag is resolved, and gets its ``decision`` set, when all of its
        guessed dialogue lines belong to the same character.
        """
        resolved, unresolved = [], []
        for g in guesses.values():
            character_guesses = {d.character for _, d in g.dialogue_guesses.values()}
            if len(character_guesses) == 1:
                g.decision = character_guesses.pop()
                resolved.append(g)
            else:
                unresolved.append(g)

        return resolved, unresolved

    def _add_tags(self, tags):
        """Copy each tag's decision onto its subtitle's ``character``."""
        for t in tags:
            # Internal invariant: only resolved tags may be passed in.
            assert t.decision is not None, "tag has no decision"
            t.subtitle.character = t.decision