In [1]:
import chardet
import pandas as pd
import re
from time import strptime

In [15]:
# Guess the subtitle file's text encoding by handing chardet the first
# 100,000 raw bytes of the file.
fpath = '../subtitles/Westworld - 2x01 - Journey Into Night.WEB.DEFLATE.en.srt'
with open(fpath, mode='rb') as binary_file:
    leading_bytes = binary_file.read(100000)
result = chardet.detect(leading_bytes)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [128]:
# Compiled once: a line holding only digits is a subtitle index, and a timing
# line carries two HH:MM:SS,mmm stamps around '-->'.
INDEX_PAT = re.compile(r'([0-9]+)\n')
TIMESTAMP_PAT = re.compile(r'([0-9]{2}:[0-9]{2}:[0-9]{1,2},[0-9]{2,3})')


def parse_srt_lines(lines):
    """Parse the lines of an .srt file into a dict of equal-length columns.

    Each subtitle is assembled as one record (index line, timing line, text
    lines) and written to all four columns at the same moment, so the columns
    can never drift out of alignment. Compared with filling the four lists
    independently, this means:

    * extra or trailing blank lines do not create empty text entries;
    * a digits-only line inside a subtitle is kept as dialogue instead of
      being mistaken for the next subtitle index;
    * a timing line that cannot be parsed is printed and recorded as
      ``None``/``None`` rather than repeating the previous subtitle's times.

    Parameters
    ----------
    lines : iterable of str
        Lines of the subtitle file, each keeping its trailing newline (as
        produced by iterating over a file opened in text mode).

    Returns
    -------
    dict
        Keys 'subtitle_index', 'start', 'end' and 'text'; each value is a list
        with one entry per subtitle. 'start' and 'end' hold the raw timestamp
        strings, or None when the timing line was missing or unparseable.
        Multi-line dialogue is joined with single spaces.
    """
    columns = {
        'subtitle_index': [],
        'start': [],
        'end': [],
        'text': []
    }
    current = None  # the subtitle being assembled, or None between subtitles

    def commit(entry):
        # Write one finished subtitle to every column in a single step.
        columns['subtitle_index'].append(entry['index'])
        columns['start'].append(entry['start'])
        columns['end'].append(entry['end'])
        columns['text'].append(' '.join(entry['lines']))

    for line in lines:
        if current is None:
            # Between subtitles only an index line can open a new one.
            index_match = INDEX_PAT.match(line)
            if index_match:
                current = {
                    'index': int(index_match.group(1)),
                    'start': None,
                    'end': None,
                    'timed': False,
                    'lines': [],
                }
            elif line.strip():
                # Report stray content instead of attaching it to the wrong row.
                print('Skipping line outside any subtitle:', line.strip())
            continue

        if line == '\n':
            # Blank line: the current subtitle is complete.
            commit(current)
            current = None
        elif not current['timed'] and '-->' in line:
            current['timed'] = True
            stamps = TIMESTAMP_PAT.findall(line)
            if len(stamps) == 2:
                current['start'], current['end'] = stamps
            else:
                # Keep the diagnostic print, but leave start/end as None so the
                # row stays aligned and no stale values are reused.
                print(line)
        else:
            # Text of the dialogue (possibly one of several lines).
            current['lines'].append(line.strip())

    # The last subtitle is usually not followed by a blank line.
    if current is not None:
        commit(current)
    return columns


fpath = '../subtitles/Westworld - 2x01 - Journey Into Night.WEB.DEFLATE.en.srt'
with open(fpath, 'r', encoding='ISO-8859-1') as f:
    subtitle_dict = parse_srt_lines(f)

# Sanity check: every column should report the same length.
for k, v in subtitle_dict.items():
    print(k, len(v))

subtitle_index 701
start 701
end 701
text 701


In [102]:
# `all_lines` is not created by any other cell shown in this notebook, so it is
# rebuilt here to keep the cell runnable from a fresh kernel.
# NOTE(review): assumed to be the raw lines of the subtitle file at `fpath`
# (the displayed output matches the text of subtitle 2) — confirm.
with open(fpath, 'r', encoding='ISO-8859-1') as f:
    all_lines = f.readlines()
all_lines[6].strip()

'Welcome to Westworld.'

In [129]:
# One row per subtitle; the keys of subtitle_dict become the columns.
df = pd.DataFrame(subtitle_dict)
df.head()

Unnamed: 0,subtitle_index,start,end,text
0,1,"00:00:00,018","00:00:01,251",_
1,2,"00:00:01,254","00:00:02,802",Welcome to Westworld.
2,3,"00:00:02,805","00:00:04,262",Which would you prefer?
3,4,"00:00:05,262","00:00:07,303",This place is the answer
4,5,"00:00:07,304","00:00:09,590",to that question that you've been asking yours...


In [133]:
# Replace the 'HH:MM:SS,mmm' strings in both timing columns with time-of-day
# values (the date part produced by to_datetime is discarded via .dt.time).
timestamp_format = '%H:%M:%S,%f'
for col in ('start', 'end'):
    parsed = pd.to_datetime(df[col], format=timestamp_format)
    df[col] = parsed.dt.time

In [137]:
# Remove every row whose text contains a '<font' tag.
has_font_tag = df['text'].str.contains('<font')
font_rows = df.loc[has_font_tag]
df = df.drop(index=font_rows.index)

In [138]:
# Display the frame as it stands after the '<font' rows were dropped.
df

Unnamed: 0,subtitle_index,start,end,text
0,1,00:00:00.018000,00:00:01.251000,_
1,2,00:00:01.254000,00:00:02.802000,Welcome to Westworld.
2,3,00:00:02.805000,00:00:04.262000,Which would you prefer?
3,4,00:00:05.262000,00:00:07.303000,This place is the answer
4,5,00:00:07.304000,00:00:09.590000,to that question that you've been asking yours...
...,...,...,...,...
694,695,01:11:33.979000,01:11:36.354000,which means I need yours.
695,696,01:11:38.081000,01:11:40.089000,Can you tell me what happened?
696,697,01:11:58.730000,01:12:01.831000,I...
697,698,01:12:01.834000,01:12:03.831000,I killed them.


---
## Post-automated processing

In [38]:
# Load the subtitle CSV and preview its first rows.
subtitle_csv_path = '../data/subtitle_data.csv'
df_subs = pd.read_csv(subtitle_csv_path)
df_subs.head()

Unnamed: 0,subtitle_index,start,end,text,season_num,episode_num,episode_name,filename,offscreen
0,1,00:01:52.076000,00:01:53.876000,<i>Bring her back online.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True
1,2,00:02:02.117000,00:02:03.787000,<i>Can you hear me?</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True
2,3,00:02:03.887000,00:02:05.417000,<i>Yes.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True
3,4,00:02:05.527000,00:02:08.088000,<i>I'm sorry. I'm not feeling quite myself.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True
4,5,00:02:08.198000,00:02:09.898000,<i>You can lose the accent.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True


In [41]:
def split_speaker_turns(text):
    """Split one subtitle's text on '- ' and discard the empty pieces.

    A '- ' marks the start of a new line of dialogue in multi-line,
    multi-person subtitles.

    Parameters
    ----------
    text : str or missing value
        The subtitle text. Empty CSV fields are read back by pandas as NaN,
        so non-string values are accepted and treated as "no dialogue".

    Returns
    -------
    list of str
        The non-empty pieces in their original order; an empty list when
        `text` is not a string.
    """
    if not isinstance(text, str):
        # Iterating over a float NaN would raise TypeError.
        return []
    return [piece for piece in text.split('- ') if piece]


# Split text on hyphens (multi-line, multi-person subtitle)
df_subs['textsplit'] = df_subs['text'].map(split_speaker_turns)
df_subs.head()

Unnamed: 0,subtitle_index,start,end,text,season_num,episode_num,episode_name,filename,offscreen,textsplit
0,1,00:01:52.076000,00:01:53.876000,<i>Bring her back online.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True,[<i>Bring her back online.</i>]
1,2,00:02:02.117000,00:02:03.787000,<i>Can you hear me?</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True,[<i>Can you hear me?</i>]
2,3,00:02:03.887000,00:02:05.417000,<i>Yes.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True,[<i>Yes.</i>]
3,4,00:02:05.527000,00:02:08.088000,<i>I'm sorry. I'm not feeling quite myself.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True,[<i>I'm sorry. I'm not feeling quite myself.</i>]
4,5,00:02:08.198000,00:02:09.898000,<i>You can lose the accent.</i>,1,1,The Original,Westworld - 1x01 - The Original.HDTV.FLEET.en.srt,True,[<i>You can lose the accent.</i>]


In [45]:
# Give every entry of each 'textsplit' list a row of its own; the values of
# the other columns are repeated for each new row.
df_subs = df_subs.explode(column='textsplit')

In [78]:
# A speaker label is whatever precedes the FIRST colon. The lazy `.+?` stops
# there; a greedy `.+` would run to the last colon and pull dialogue that
# itself contains a colon (e.g. a clock time) into the label.
speaker_pat = r'(.+?):.*'
# expand=False returns a Series (one capture group) instead of a one-column frame.
df_subs['speaker'] = (
    df_subs['textsplit']
    .str.extract(speaker_pat, expand=False)
    .str.upper()
)
# Flag (rather than drop) the candidates: only labels of one to three words
# are treated as attributable to a speaker; rows with no label count zero words.
df_subs['speaker_word_count'] = df_subs['speaker'].fillna('').str.split().map(len)
df_subs['speaker_attributable'] = df_subs['speaker_word_count'].between(1, 3)

In [79]:
# Inspect the rows that were flagged as having an attributable speaker.
preview_cols = ['textsplit', 'offscreen', 'speaker']
df_subs.loc[df_subs['speaker_attributable'], preview_cols]

Unnamed: 0,textsplit,offscreen,speaker
373,man #2: Hyah! Hyah!,False,MAN #2
2246,"Man, distorted: Aw, can you fucking kill it?",False,"MAN, DISTORTED"
2935,"Old Bill: You got any stories, friend?",False,OLD BILL
2969,Man: Find me.,False,MAN
2970,Dolores: Show me how.,False,DOLORES
...,...,...,...
4311,"Lutz: <i>It's your codebase,</i>",True,LUTZ
4320,Sylvester: <i>Lots.</i>,True,SYLVESTER
4356,Elsie:<i> Hey.</i>,True,ELSIE
4381,Dr. Ford: I don't want you to worry about Jock.,False,DR. FORD


In [74]:
# Tally how often each attributable speaker label occurs.
attributable_speakers = df_subs.loc[df_subs['speaker_attributable'], 'speaker']
attributable_speakers.str.upper().value_counts()

LOGAN               9
MAN                 9
DR. FORD            6
WOMAN               5
LUTZ                4
ELSIE               3
SYLVESTER           3
EL LAZO             2
TEDDY               2
WAITER              2
FEMALE VOICE        2
WILLIAM             2
MAN IN BLACK        2
LAWRENCE            2
BARTENDER           2
THERESA             1
STUBBS              1
CAPTAIN             1
BERNARD             1
WOMAN'S VOICE       1
MAN #2              1
OLD BILL            1
WOMAN #2            1
EXCEPT ONE THING    1
SLIM                1
MAN, DISTORTED      1
MAEVE               1
MALE VOICE          1
BOY                 1
LAST CONTACT        1
ROBERT              1
SOLDIER             1
DOLORES             1
Name: speaker, dtype: int64

In [75]:
# Tally the extracted labels that were NOT flagged as attributable.
rejected_speakers = df_subs.loc[~df_subs['speaker_attributable'], 'speaker']
rejected_speakers.str.upper().value_counts()

SYMPTOMS OF CRITICAL CORRUPTION                  1
BUT ONE THING IS CONSTANT                        1
I PRESENT OUR GUESTS' NEXT OBSESSION             1
LET ME TELL YOU SOMETHING                        1
SHE HAD IT ENGRAVED... ON THE BOTTOM. IT SAID    1
THE REAL QUESTION IS                             1
TIME REMAINING BEFORE TERMINAL MALFUNCTION       1
Name: speaker, dtype: int64

---
## Post-post-cleaning

In [89]:
# Read back only the columns needed from the subtitle CSV.
cols = [
    'season_num',
    'episode_num',
    'episode_name',
    'subtitle_index',
    'text',
    'speaker',
    'offscreen',
    'start',
    'end',
]
cleaned_csv_path = '../data/subtitle_data.csv'
df_subs2 = pd.read_csv(cleaned_csv_path, usecols=cols)

In [90]:
# Display the frame loaded from the subtitle CSV.
df_subs2

Unnamed: 0,subtitle_index,start,end,season_num,episode_num,episode_name,text,offscreen,speaker
0,1,00:01:52.076000,00:01:53.876000,1,1,The Original,Bring her back online.,True,
1,2,00:02:02.117000,00:02:03.787000,1,1,The Original,Can you hear me?,True,
2,3,00:02:03.887000,00:02:05.417000,1,1,The Original,Yes.,True,
3,4,00:02:05.527000,00:02:08.088000,1,1,The Original,I'm sorry. I'm not feeling quite myself.,True,
4,5,00:02:08.198000,00:02:09.898000,1,1,The Original,You can lose the accent.,True,
...,...,...,...,...,...,...,...,...,...
13233,833,01:30:53.210000,01:30:54.908000,2,10,The Passenger,Longer than we thought.,False,
13234,834,01:30:58.833000,01:31:00.705000,2,10,The Passenger,I have a few questions for you.,False,
13235,835,01:31:01.923000,01:31:05.251000,2,10,The Passenger,The last step's a baseline interview to allow ...,False,
13236,836,01:31:08.538000,01:31:09.954000,2,10,The Passenger,Verify what?,False,


In [91]:
# Look for rows whose text is exactly one space character.
is_single_space = df_subs2['text'] == ' '
df_subs2.loc[is_single_space]

Unnamed: 0,subtitle_index,start,end,season_num,episode_num,episode_name,text,offscreen,speaker
