In [62]:
import re
import pandas as pd
from nltk import word_tokenize,sent_tokenize,Text

In [89]:
class DebateTracker():   
    def __init__(self, fp):
        self.fp = fp
        self.file = open(self.fp)
        self.members = []
        self.moderators = []
        self.speech_entries = []
    
    def parse_speech(self):
        new_speaker = re.compile('([\sA-Z]+):(.+)')
        audience = re.compile('\[.+?\]')

        speech_no = 0
        current_speaker = None
        current_words = None
        
        for line in self.file:
            line = line.strip('\n')

            if line == 'PARTICIPANTS:':
                line = next(self.file).strip()
                while line and ':' not in line:
                    self.members.append(line)
                    line = next(self.file).strip()
            if line == 'MODERATORS:':
                line = next(self.file).strip()
                while line and ':' not in line:
                    self.moderators.append(line)
                    line = next(self.file).strip()
            speaker_match = re.match(new_speaker, line)
            if speaker_match:
                speaker,words = speaker_match.groups()
                if current_speaker:
                    self.speech_entries.append((current_speaker,current_words, Text(word_tokenize(current_words)),speech_no))
                    speech_no += 1
                current_speaker = speaker.strip()
                current_words = words.strip()
            elif line:
                current_words += line.strip()
            else:
                pass
        self.speech_entries = pd.DataFrame(self.speech_entries, columns=['speaker','string','text','no'])
        self.speech_entries['word_count'] = self.speech_entries.text.apply(len)
        self.speech_entries['role'] = self.speech_entries.speaker.apply(self.get_speaker_role)
    
    def get_speaker_role(self, name):
        name = name.lower()
        for mod in self.moderators:
            if name in mod.lower():
                return 'MOD'
        for mem in self.members:
            if name in mem.lower():
                return 'CAND'
        return 'UNK'

debate = DebateTracker('data/RepublicanCandidatesDebateCleveland_20150806')
debate.parse_speech()        

In [93]:
debate.speech_entries.head(21)

Unnamed: 0,speaker,string,text,no,word_count,role
0,KELLY,Welcome to the first debate night of the 2016 ...,"(Welcome, to, the, first, debate, night, of, t...",0,83,MOD
1,BAIER,"Less than a year from now, in this very arena,...","(Less, than, a, year, from, now, ,, in, this, ...",1,79,MOD
2,WALLACE,"Also of note, Fox News is partnering for tonig...","(Also, of, note, ,, Fox, News, is, partnering,...",2,72,MOD
3,KELLY,As for the candidates who will be answering th...,"(As, for, the, candidates, who, will, be, answ...",3,66,MOD
4,BAIER,"Neurosurgeon, Dr. Ben Carson. [applause]Texas ...","(Neurosurgeon, ,, Dr., Ben, Carson, ., [, appl...",4,25,MOD
5,WALLACE,Kentucky Senator Rand Paul. [applause]New Jers...,"(Kentucky, Senator, Rand, Paul, ., [, applause...",5,35,MOD
6,WALLACE,"Brett -- Brett, I think you would call that a ...","(Brett, --, Brett, ,, I, think, you, would, ca...",6,15,MOD
7,BAIER,It might be. It might be. We'll see.[UNKNOWN]:...,"(It, might, be, ., It, might, be, ., We, 'll, ...",7,27,MOD
8,BAIER,It might be. The rules for tonight are simple....,"(It, might, be, ., The, rules, for, tonight, a...",8,134,MOD
9,BAIER,"Gentlemen, we know how much you love hand-rais...","(Gentlemen, ,, we, know, how, much, you, love,...",9,111,MOD


In [97]:
#Who had the most speaking opportunities?
debate.speech_entries[debate.speech_entries.role == 'CAND'].speaker.value_counts()

TRUMP       36
PAUL        23
CHRISTIE    18
RUBIO       12
KASICH      11
WALKER      11
BUSH        11
HUCKABEE    10
CARSON       9
CRUZ         8
Name: speaker, dtype: int64

In [99]:
#Who tends to speak the longest (most words)?
debate.speech_entries[debate.speech_entries.role == 'CAND'].groupby('speaker')['word_count'].mean()

speaker
BUSH        157.363636
CARSON      147.222222
CHRISTIE     89.444444
CRUZ        158.625000
HUCKABEE    141.400000
KASICH      142.545455
PAUL         49.826087
RUBIO       140.416667
TRUMP        68.000000
WALKER      139.545455
Name: word_count, dtype: float64

In [100]:
debate.speech_entries[debate.speech_entries.role == 'CAND'].groupby('speaker')['word_count'].median()

speaker
BUSH        178.0
CARSON      161.0
CHRISTIE     34.5
CRUZ        152.0
HUCKABEE    159.5
KASICH      178.0
PAUL         26.0
RUBIO       171.5
TRUMP        17.5
WALKER      145.0
Name: word_count, dtype: float64