In [1]:
import re
import pandas as pd
from datetime import timedelta
pd.set_option('display.max_rows', 100)

ALIASES = {
    "Joe Biden": [],
    "Bernie Sanders": [],
    "Pete Buttigieg": ["Mayor Buttigieg"],
    "Amy Klobuchar": ["Sen Klobuchar"],
    "Kamala Harris": [],
    "Beto O’Rourke": [],
    "Andrew Yang": ["Yang"],
    "Cory Booker": ["Corey Booker", "Senator Booker", "Corey Booker"],
    "Tulsi Gabbard": [],
    "Julian Castro": [],
    "Elizabeth Warren": ["E. Warren", "Elizabeth W.", "Senator Warren", "Sen. Warren", "Elizabeth W"],
    "Tom Steyer": [],
    "Tim Ryan": [],
    "John Delaney": [],
    "Kirsten Gillibrand": ["Kirsten G.", "Gillibrand", "Kristen Gillibr"],
    "Jay Inslee": [],
    "Michael Bennet": ["Senator Bennet", "Bennett"],
    "Steve Bullock": [],
    "Julián Castro": [],
    "Bill de Blasio": ["Bill De Blasio", "Mayor de Blasio"],
    "Marianne Williamson": ["Ms. Williamson", "Marianne W.", "Williamson"],
    "Eric Swalwell": ["Eric Stalwell"],
    "John Hickenloop": ["John H.", "John H"],
}

ALIAS_INDEX = {}
for name, aliases in ALIASES.items():
    reverse_index = {alias: name for alias in aliases}
    ALIAS_INDEX.update(reverse_index)

def parse_time(time_string):
    time = [int(t) for t in time_string.split(":")]
    if len(time) == 2:
        # Add a 0th hour.
        time = [0] + time
    hours, minutes, seconds = time
    return timedelta(hours=hours, minutes=minutes, seconds=seconds)
    
    def __str__(self):
        return "{0:02d}:{1:02d}:{2:02d}".format(self.hours, self.minutes, self.seconds)

## Multiline Parser
```
Anderson Cooper: (00:00)
It’s the CNN, New York Times Democratic presidential debate. We want to welcome our viewers in the United States and watching around the world, watching us on CNN, CNN International, CNN Espanol, cnn.com, thenewyorktimes.com CNN, CNN’s Facebook page, and listening on the Westworld One Radio Network, Sirius XM satellite radio, NPR, and the American Forces Network. I’m Anderson Cooper moderating tonight’s debate along with CNN’s Erin Burnett and New York Times national editor, Marc Lacey. We are in Ohio tonight because it’s one of the most critical battleground states. Ohio has backed all but two presidential winners in every election since 1896.

Erin Burnett: (00:43)
The top 12 democratic presidential candidates are at their positions behind the podiums. This is a record number of candidates for a presidential primary debate. So to accommodate the large group, there are no opening statements tonight.

Marc Lacey: (00:55)
Before we begin a reminder of the ground rules, you’ll each receive 75 seconds to answer questions, 45 seconds for responses and rebuttals, and 15 seconds for clarifications. Please refrain from interrupting your fellow candidates as that will count against your time.
```

In [2]:
annotation_pattern = re.compile("^.*: \(\d+:\d+\)")
time_pattern = re.compile("(\d+:\d+)")
name_pattern = re.compile("(^.*):\W")

assert annotation_pattern.match("Bernie Sanders: (54:49)'")
assert time_pattern.search("Bernie Sanders: (54:49)'")[0] == '54:49'
assert name_pattern.match("Bernie Sanders: (54:49)'").group(1) == 'Bernie Sanders'

def multi_line_parser(raw_text):
    lines = raw_text.split('\n')
    lines = [line for line in lines if line]
    idx = 0
    data = []
    while idx < len(lines):
        annotation_line = lines[idx]
        if annotation_pattern.match(annotation_line):
            time = time_pattern.search(annotation_line)[0].strip()
            name = name_pattern.match(annotation_line).group(1).strip()
            idx += 1
            text = lines[idx]
            data.append({'raw_time': time, 'name': name, 'text': text})
        idx += 1
    return pd.DataFrame(data)

## Single line parser

```
George S:                      00:00:00           The field has been narrowed. For one night only, the top 10 candidates are here. Our Democratic primary debate starts right now.

Joe Biden:                     00:00:11           I will be a president for every American.

Sen. Warren:                00:00:14           This is our moment.

Bernie Sanders:             00:00:16           We are in a struggle for the future of this country.
```

In [3]:
line_pattern = re.compile("(^.*):\s+(\d*:?\d+:\d+)\W*(\S+.*)$")
def single_line_parser(raw_text):
    lines = raw_text.split('\n')
    lines = [line for line in lines if line]
    idx = 0
    data = []
    for line in lines:
        if line_pattern.match(line):
            name = line_pattern.match(line).group(1).strip()
            time = line_pattern.match(line).group(2).strip()
            text = line_pattern.match(line).group(3).strip()
            data.append({'raw_time': time, 'name': name, 'text': text})
    return pd.DataFrame(data)

In [4]:
def _fix_times(df):
    current_time = df.iloc[0]['time']
    df['actual_time'] = current_time
    for i, row in df.iterrows():
        if i == 0: continue
        df.loc[i, 'actual_time'] = df.loc[i - 1, 'actual_time'] + df.loc[i - 1]['time_difference']
    return df

def dedup_alias(name):
    if name in ALIAS_INDEX:
        return ALIAS_INDEX[name]
    return name

def add_features(df):
    df['time'] = df['raw_time'].apply(parse_time)
    df['time_difference'] = df['time'].shift(periods=-1, fill_value=timedelta(0)) - df['time']
    df.loc[df['time_difference'] < timedelta(0), 'time_difference'] = timedelta(0)
    df['name'] = df['name'].apply(dedup_alias)
    df = _fix_times(df)
    return df

In [5]:
DATASETS = [
    {
        'file': 'raw_data/first_1.txt',
        'debate': 'first_1',
        'parser': single_line_parser,
    },
    {
        'file': 'raw_data/first_2.txt',
        'debate': 'first_2',
        'parser': single_line_parser,
    },
    {
        'file': 'raw_data/second_1.txt',
        'debate': 'second_1',
        'parser': single_line_parser,
    },
    {
        'file': 'raw_data/second_2.txt',
        'debate': 'second_2',
        'parser': single_line_parser,
    },
    {
        'file': 'raw_data/third.txt',
        'debate': 'third',
        'parser': single_line_parser,
    },
    {
        'file': 'raw_data/fourth.txt',
        'debate': 'fourth',
        'parser': multi_line_parser,
    },
    {
        'file': 'raw_data/fifth.txt',
        'debate': 'fifth',
        'parser': multi_line_parser,
    },
    {
        'file': 'raw_data/sixth.txt',
        'debate': 'sixth',
        'parser': multi_line_parser,
    },
]

dfs = []
for dataset in DATASETS:
    print("Parsing debate {}".format(dataset['debate']))
    raw_text = open(dataset['file']).read()
    parser = dataset['parser']
    df = parser(raw_text)
    df = add_features(df)
    df['debate'] = dataset['debate']
    dfs.append(df)

all_debates_df = pd.concat(dfs)

Parsing debate first_1
Parsing debate first_2
Parsing debate second_1
Parsing debate second_2
Parsing debate third
Parsing debate fourth
Parsing debate fifth
Parsing debate sixth


In [6]:
all_debates_df.to_csv('debates.csv', index=False)

In [9]:
all_debates_df.groupby('name')['time_difference'].agg(sum).sort_values(ascending=False)[0:100]

name
Elizabeth Warren      01:17:34
Joe Biden             01:15:18
Bernie Sanders        01:11:14
Pete Buttigieg        01:01:19
Amy Klobuchar         00:57:18
Kamala Harris         00:51:26
Cory Booker           00:47:13
Andrew Yang           00:34:44
Beto O’Rourke         00:34:18
Tulsi Gabbard         00:23:14
Julian Castro         00:22:41
Jake Tapper           00:18:05
Kirsten Gillibrand    00:17:43
Tom Steyer            00:17:11
Tim Ryan              00:16:56
Michael Bennet        00:16:54
Bill de Blasio        00:15:16
John Delaney          00:15:06
Rachel Maddow         00:14:59
John Hickenloop       00:13:53
Chuck Todd            00:13:49
Marianne Williamson   00:13:47
Jay Inslee            00:11:57
Lester Holt           00:10:50
Steve Bullock         00:10:10
Dana Bash             00:09:01
Julián Castro         00:08:36
Speaker 26            00:08:29
Judy Woodruff         00:07:34
David Muir            00:07:04
Don Lemon             00:06:54
Jose                  00:06:40
Ste