In [452]:
# Import dependencies
import pandas as pd
import numpy as np
import json
import re

from os import listdir
from os.path import isfile, join
from collections import Counter
from datetime import datetime
from matplotlib import pyplot as plt

import plotly.express as px

## Data Retrieval

In [453]:
# Define directory path
mypath = "./data_dir"

# Create list of json file names in directory
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [454]:
# Initialize empty list for all debate data
all_debates_list = []

# Loop through and read in all json files
for file in onlyfiles:
    
    # Open next JSON file
    with open(f"{mypath}/{file}") as json_data:

        # Load data from current JSON file
        data = json.load(json_data)

        # Fix dates for 1992 debate parts 1 and 2
        if (data['date'][0] == 'Part'):
            if data['date'][1] == '1':
                data['date'] = ['October', '11', '1992']
            elif data['date'][1] == '2':
                data['date'] = ['October', '15', '1992']

        # Append to full debate data list
        all_debates_list.append(data)

In [455]:
# Get date in datetime format from a particular debate JSON
def get_date(debate):
    
    # Create date string from provided date array
    date_string = '-'.join(debate['date'])

    # Create date object from date string
    date_object = datetime.strptime(date_string, '%B-%d-%Y')

    # Return date of given debate in datetime
    return date_object

In [456]:
# Get list of unique actors from a particular debate JSON
def get_unique_actors(debate):
    # Empty list to store all actors names (including duplicates)
    actor_list = []

    # Create list of prefixes to remove from names
    prefixes = ['Mr.', 'Ms.', 'Senator', 'Governor', 'Admiral']

    # Create dictionary of actor name corrections to be made
    typo_corrections = {
        # 'The President':'Reagan',
        '^Obam$':'Obama',
        'Barbara Walters':'Walters',
        'Bill Shadel': 'Shadel',
        'Edwin Newman': 'Newman',
        'Frank Mcgee': 'McGee',
        'Hal Bruno': 'Bruno',
        'Harry Ellis': 'Ellis',
        'Jim Lehrer': 'Lehrer',
        'Quincy Howe': 'Howe',
        'Sander Vanocur': 'Vanocur',
        'President Bush': 'Bush',
        '^Frederic$': 'Frederick'
    }

    # Loop through each speaking turn in the debate transcription content
    for turn in debate['content']:

        # Store actor name for each speaking turn
        actor = turn['actor']

        # Loop through prefixes in list
        for prefix in prefixes:

            # Check if the actor's name contains current prefix
            if prefix in actor:

                # Remove prefix and strip whitespace
                actor = actor.replace(prefix, '').strip()

        # Append actor to list of non-unique names
        actor_list.append(actor)

    # Loop through enumerated list of actor names
    for index, data in enumerate(actor_list):

        # Loop through items in typo corrections dict
        for k, v in typo_corrections.items():

            # Check if typo is present in current actor name
            if k in data:

                # Correct typo
                actor_list[index]=data.replace(k, typo_corrections[k])

    # Create unique list of cleaned actor names
    cleaned_actors = list(set(actor_list))

    # Return list of unique actor names for given debate
    return cleaned_actors

In [457]:
# Get non-unique words given actor name and debate JSON data
def get_actor_dialogue(debate, actor):

    # Initialize actor's dialogue to empty string
    filtered_dialogue = ''

    # Initialize speaking turn counter to 0
    speaking_turn_count = 0

    # Loop through speaking turns in debate JSON data
    for turn in debate['content']:
        
        # Check if speaking turn is made by specified actor
        if actor in turn['actor'].split():

            # Increment actor's speaking turn count
            speaking_turn_count += 1

            # Concatenate to running dialogue string for selected actor
            filtered_dialogue += f"{turn['dialogue']} "
    # Create nonunique word list from dialogue with punctuation and whitespace removed
    turn_dialogue_list = re.sub(r'[^\w\s]', '', filtered_dialogue).split()
    
    # Make lowercase list of all words
    dialogue_list_lower = [word.lower() for word in turn_dialogue_list]

    
    # Return actor's speaking turn count and nonunique word list
    return speaking_turn_count, dialogue_list_lower

In [458]:
### Collect data for dataframe

# Initialize row data for dataframe to empty list
row_data_list = []

# Loop through all debate data
for debate in all_debates_list:

    # Store date of currently selected debate
    debate_date = get_date(debate)

    # Loop through unique actor list for each debate
    for actor in get_unique_actors(debate):
        
        # Store speaking turn count and nonunique word list for currently selected debate/actor
        speaking_turn_count, actor_dialogue_list = get_actor_dialogue(debate, actor)

        # Store nonunique word count for selected debate/actor
        total_word_count = len(actor_dialogue_list)
    
        # Store unique word count for selected debate/actor
        unique_word_count = len(set(actor_dialogue_list))

        # Create row of relevant data for selected debate/actor
        row_data = [actor, debate_date, speaking_turn_count, total_word_count, unique_word_count]

        # Append row data to list
        row_data_list.append(row_data)

In [459]:
# Create dataframe from collected row data
debate_dialogue_df = pd.DataFrame(row_data_list, columns=['actor', 'date', 'speaking_turn_count','total_word_count', 'unique_word_count'])

# Preview dataframe
debate_dialogue_df

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count
0,Participants,2020-09-29,1,7,7
1,Trump,2020-09-29,341,7394,1155
2,Moderator,2020-09-29,1,4,4
3,Wallace,2020-09-29,246,4711,963
4,Biden,2020-09-29,269,6529,1224
...,...,...,...,...,...
303,Obama,2008-10-07,39,7046,1344
304,Brokaw,2008-10-07,58,1703,533
305,Mccain,2008-10-07,40,6281,1270
306,Transcription By,2008-10-07,0,0,0


In [460]:
# Create new column with calculated average number of words per turn
debate_dialogue_df['avg_words_per_turn'] = debate_dialogue_df['total_word_count'] / debate_dialogue_df['speaking_turn_count']

# Create new column with calculated average number of words per turn
debate_dialogue_df['avg_unique_words_per_turn'] = debate_dialogue_df['unique_word_count'] / debate_dialogue_df['speaking_turn_count']

# Fix Reagan's name in 1984
debate_dialogue_df['actor'].replace({"The President": "Reagan"})

0          Participants
1                 Trump
2             Moderator
3               Wallace
4                 Biden
             ...       
303               Obama
304              Brokaw
305              Mccain
306    Transcription By
307          descriptor
Name: actor, Length: 308, dtype: object

In [461]:
# Export dataframe as CSV
debate_dialogue_df.to_csv('./debate_data.csv', index=False)

debate_dialogue_df.sort_values(by="unique_word_count", ascending=False)

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn
251,Kaine,2016-10-04,188,7560,1464,40.212766,7.787234
298,Lieberman,2000-10-05,34,6685,1462,196.617647,43.000000
158,Kerry,2004-10-08,37,7252,1449,196.000000,39.162162
301,Cheney,2000-10-05,30,6663,1446,222.100000,48.200000
106,Dole,1996-10-06,46,8077,1426,175.586957,31.000000
...,...,...,...,...,...,...,...
142,Audience Member,1992-10-15,0,0,0,,
286,McGee,1960-10-07,0,0,0,,
174,A Reminder,2004-10-05,0,0,0,,
175,The Rules,2004-10-05,0,0,0,,


In [462]:
# Create function to count number of times each word was used
def actor_word_count(debate, actor):
    remove_words = ['the', 'to', 'of', 'in', 'and', 'that', 'a', 'is', 'for', 'it']
    turn_count, dialogue_list = get_actor_dialogue(debate, actor)

    dialogue_list_cleaned = [word for word in dialogue_list if word not in remove_words]

    c = Counter(dialogue_list_cleaned)

    return c

In [463]:
for debate in all_debates_list:
    for actor in get_unique_actors(debate):
        if actor == 'Obama':
            print(actor, debate['date'], actor_word_count(debate, actor).most_common(10))

Obama ['September', '26', '2008'] [('we', 410), ('i', 234), ('have', 232), ('are', 184), ('not', 146), ('you', 132), ('our', 128), ('this', 118), ('with', 102), ('going', 100)]
Obama ['October', '15', '2008'] [('i', 156), ('we', 118), ('have', 79), ('on', 59), ('you', 56), ('think', 56), ('going', 55), ('what', 47), ('if', 44), ('are', 43)]
Obama ['OCTOBER', '3', '2012'] [('we', 115), ('i', 90), ('you', 75), ('are', 66), ('but', 62), ('were', 45), ('governor', 44), ('make', 43), ('not', 42), ('do', 41)]
Obama ['OCTOBER', '22', '2012'] [('we', 176), ('you', 89), ('have', 79), ('our', 78), ('i', 74), ('not', 66), ('are', 63), ('were', 61), ('with', 55), ('but', 53)]
Obama ['OCTOBER', '16', '2012'] [('i', 125), ('we', 121), ('are', 81), ('not', 71), ('thats', 70), ('but', 58), ('what', 57), ('going', 57), ('you', 56), ('governor', 55)]
Obama ['October', '7', '2008'] [('we', 124), ('i', 101), ('have', 97), ('you', 88), ('going', 67), ('are', 62), ('on', 52), ('but', 50), ('so', 48), ('were

In [464]:
debates_wiki_df = pd.read_html("http://en.wikipedia.org/wiki/United_States_presidential_debates")

candidates_table = debates_wiki_df[2]
viewership_table = debates_wiki_df[3]

## Data Cleaning

In [465]:
vp_candidates = pd.DataFrame()
pres_candidates = pd.DataFrame()

candidates_table = candidates_table.rename(columns={"Election":"year",
                                "Presidential debates":"pres_debate_count",
                                "Presidential debates.1":"pres_candidate",
                                "Vice presidential debates":"vp_debate_count",
                                "Vice presidential debates.1":"vp_candidate"})

candidates_cleaned_df = candidates_table.drop(index=2)

pres_candidates[['year', 'debate_count', 'candidate']] = candidates_table[['year', 'pres_debate_count','pres_candidate']]
pres_candidates['type'] = 'P'
vp_candidates[['year', 'debate_count', 'candidate']] = candidates_table[['year', 'vp_debate_count','vp_candidate']]
vp_candidates['type'] = 'VP'

In [466]:
candidates_df = pd.concat([pres_candidates, vp_candidates], ignore_index=True).sort_values(by='year').reset_index(drop=True)

candidates_df.drop(index=[56,57], inplace=True)

In [467]:
candidates_df["debate_count"][candidates_df["year"] == '2020'] = '2'

candidates_df["debate_count"][candidates_df["debate_count"].str.contains('ebate', na=False)] = 0

candidates_df.head() 

Unnamed: 0,year,debate_count,candidate,type
0,1960,4,Vice President Richard Nixon (R),P
1,1960,4,Senator John F. Kennedy (D),P
2,1960,0,No debates until 1976,VP
3,1960,0,No debates until 1976,VP
4,1976,3,President Gerald Ford (R),P


In [468]:
candidates_df[['debate_count', 'year']] = candidates_df[['debate_count', 'year']].astype(int)

In [469]:
candidates_df = candidates_df[candidates_df["candidate"].str.contains('ebate')==False]

In [470]:
candidates_df[["candidate", "party"]] = candidates_df["candidate"].str.split("(", expand=True)
candidates_df["party"] = candidates_df["party"].str[0]
candidates_df['last_name'] = candidates_df['candidate'].str.split().str[-1]
candidates_df = candidates_df.reset_index(drop=True)


In [471]:
won_election = [False, True, False, True, False, True, False, True, False, False,
                 True, True, False, False, True, True, False, False, True, False,
                 True, False, False, True, False, False, True, True, False, True,
                 False, False, False, True, True, True, False, True, False, False,
                 False, True, True, False, True, True, False, True, False, False,
                 True]

candidates_df.insert(6, 'won_election', won_election)

candidates_df.head()

Unnamed: 0,year,debate_count,candidate,type,party,last_name,won_election
0,1960,4,Vice President Richard Nixon,P,R,Nixon,False
1,1960,4,Senator John F. Kennedy,P,D,Kennedy,True
2,1976,3,President Gerald Ford,P,R,Ford,False
3,1976,3,Former Governor Jimmy Carter,P,D,Carter,True
4,1976,1,Senator Bob Dole,VP,R,Dole,False


In [472]:
debate_dialogue_df['year'] = pd.DatetimeIndex(debate_dialogue_df['date']).year

debate_dialogue_df = debate_dialogue_df.merge(candidates_df[['last_name', 'won_election']], left_on='actor', right_on='last_name')

In [473]:
# Create list of candidate titles
titles = ['Former Vice President', 'Vice President', 'President',  'Former Senator', 'Senator', 'Former Governor',
          'Governor', 'Congressman', 'Congresswoman', 'Businessman', 'Ret. Vice Admiral', 'Former HUD Secretary',
          'Former Secretary of State']

# Define function to return the title string contained in candidate name
def title_split(a):
    for title in titles:
        if title in a:
            return title

# Create new title column
candidates_df['title'] = candidates_df['candidate'].apply(lambda x: title_split(x))

# Fix existing candidate column name to remove title
candidates_df['candidate'] = candidates_df['candidate'].apply(lambda x: x.split(title_split(x))[1])

In [474]:
candidate_debates_df = pd.merge(candidates_df, debate_dialogue_df)


debates = candidate_debates_df.groupby(['date', 'type'], as_index=False).sum()

debates = debates.reset_index().rename(columns={'index':'debate_id'})

debates = debates[['debate_id', 'date', 'type']]

debates.to_csv('./Tables/debates.csv')

debates

Unnamed: 0,debate_id,date,type
0,0,1960-09-26,P
1,1,1960-10-07,P
2,2,1960-10-13,P
3,3,1960-10-21,P
4,4,1976-09-23,P
5,5,1976-10-06,P
6,6,1976-10-22,P
7,7,1980-09-21,P
8,8,1980-10-28,P
9,9,1984-10-07,P


In [475]:
candidates_df.sort_values('year')

# candidates = candidates_df.groupby(['candidate', 'party'], as_index=False).sum()
candidates = candidates_df.drop_duplicates(subset='candidate')

candidates = candidates[['candidate', 'party', 'debate_count', 'last_name']]

candidates = candidates.reset_index().rename(columns={'index':'candidate_id'})

candidates.to_csv('./Tables/candidates.csv')

candidates

Unnamed: 0,candidate_id,candidate,party,debate_count,last_name
0,0,Richard Nixon,R,4,Nixon
1,1,John F. Kennedy,D,4,Kennedy
2,2,Gerald Ford,R,3,Ford
3,3,Jimmy Carter,D,3,Carter
4,4,Bob Dole,R,1,Dole
5,5,Walter Mondale,D,1,Mondale
6,7,Ronald Reagan,R,2,Reagan
7,8,John B. Anderson,I,2,Anderson
8,10,George H. W. Bush,R,1,Bush
9,12,Geraldine Ferraro,D,1,Ferraro


In [476]:
debates

Unnamed: 0,debate_id,date,type
0,0,1960-09-26,P
1,1,1960-10-07,P
2,2,1960-10-13,P
3,3,1960-10-21,P
4,4,1976-09-23,P
5,5,1976-10-06,P
6,6,1976-10-22,P
7,7,1980-09-21,P
8,8,1980-10-28,P
9,9,1984-10-07,P


In [485]:
performances = pd.merge(debates, candidate_debates_df, on='date')

performances = performances.merge(candidates_df)

performances = performances.merge(candidates)

performances = performances[['debate_id', 'date', 'candidate_id', 'type_x', 'speaking_turn_count', 'total_word_count', 'won_election']]

performances.to_csv('./Tables/performances.csv')

# performances = candidate_debates_df.drop_duplicates(subset=['date', 'candidate_id'])

performances = performances.rename(columns={'type_x':'type'})

performances

Unnamed: 0,debate_id,date,candidate_id,type,speaking_turn_count,total_word_count,won_election
0,0,1960-09-26,0,P,10,4111,False
1,1,1960-10-07,0,P,12,4336,False
2,2,1960-10-13,0,P,14,4642,False
3,3,1960-10-21,0,P,10,4617,False
4,0,1960-09-26,1,P,17,4693,True
...,...,...,...,...,...,...,...
107,40,2016-10-09,46,P,57,5847,False
108,41,2016-10-19,46,P,82,6662,False
109,39,2016-10-04,43,VP,188,7560,False
110,39,2016-10-04,45,VP,207,6982,True


In [478]:
performances

Unnamed: 0,debate_id,date,candidate_id,type,speaking_turn_count,total_word_count,won_election
0,0,1960-09-26,1,P,17,4693,True
1,1,1960-10-07,1,P,13,4403,True
2,2,1960-10-13,1,P,14,4018,True
3,3,1960-10-21,1,P,11,4878,True
4,0,1960-09-26,0,P,10,4111,False
...,...,...,...,...,...,...,...
299,39,2016-10-04,45,VP,207,6982,False
300,43,2020-10-07,45,VP,89,6411,True
301,43,2020-10-07,45,VP,89,6411,False
302,39,2016-10-04,43,VP,188,7560,False
