In [384]:
# Import dependencies
import pandas as pd
import numpy as np
import json
import re

from os import listdir
from os.path import isfile, join
from collections import Counter
from datetime import datetime
from matplotlib import pyplot as plt

import plotly.express as px

In [385]:
# Define directory path
mypath = "./data_dir"

# Create list of json file names in directory
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [386]:
# Initialize empty list for all debate data
all_debates_list = []

# Loop through and read in all json files
for file in onlyfiles:
    
    # Open next JSON file
    with open(f"{mypath}/{file}") as json_data:

        # Load data from current JSON file
        data = json.load(json_data)

        # Fix dates for 1992 debate parts 1 and 2
        if (data['date'][0] == 'Part'):
            if data['date'][1] == '1':
                data['date'] = ['October', '11', '1992']
            elif data['date'][1] == '2':
                data['date'] = ['October', '15', '1992']

        # Append to full debate data list
        all_debates_list.append(data)

In [387]:
# Get date in datetime format from a particular debate JSON
def get_date(debate):
    
    # Create date string from provided date array
    date_string = '-'.join(debate['date'])

    # Create date object from date string
    date_object = datetime.strptime(date_string, '%B-%d-%Y')

    # Return date of given debate in datetime
    return date_object

In [388]:
# Get list of unique actors from a particular debate JSON
def get_unique_actors(debate):
    # Empty list to store all actors names (including duplicates)
    actor_list = []

    # Create list of prefixes to remove from names
    prefixes = ['Mr.', 'Ms.', 'Senator', 'Governor', 'Admiral']

    # Create dictionary of actor name corrections to be made
    typo_corrections = {
        # 'The President':'Reagan',
        '^Obam$':'Obama',
        'Barbara Walters':'Walters',
        'Bill Shadel': 'Shadel',
        'Edwin Newman': 'Newman',
        'Frank Mcgee': 'McGee',
        'Hal Bruno': 'Bruno',
        'Harry Ellis': 'Ellis',
        'Jim Lehrer': 'Lehrer',
        'Quincy Howe': 'Howe',
        'Sander Vanocur': 'Vanocur',
        'President Bush': 'Bush',
        '^Frederic$': 'Frederick'
    }

    # Loop through each speaking turn in the debate transcription content
    for turn in debate['content']:

        # Store actor name for each speaking turn
        actor = turn['actor']

        # Loop through prefixes in list
        for prefix in prefixes:

            # Check if the actor's name contains current prefix
            if prefix in actor:

                # Remove prefix and strip whitespace
                actor = actor.replace(prefix, '').strip()

        # Append actor to list of non-unique names
        actor_list.append(actor)

    # Loop through enumerated list of actor names
    for index, data in enumerate(actor_list):

        # Loop through items in typo corrections dict
        for k, v in typo_corrections.items():

            # Check if typo is present in current actor name
            if k in data:

                # Correct typo
                actor_list[index]=data.replace(k, typo_corrections[k])

    # Create unique list of cleaned actor names
    cleaned_actors = list(set(actor_list))

    # Return list of unique actor names for given debate
    return cleaned_actors

In [389]:
# Get non-unique words given actor name and debate JSON data
def get_actor_dialogue(debate, actor):

    # Initialize actor's dialogue to empty string
    filtered_dialogue = ''

    # Initialize speaking turn counter to 0
    speaking_turn_count = 0

    # Loop through speaking turns in debate JSON data
    for turn in debate['content']:
        
        # Check if speaking turn is made by specified actor
        if actor in turn['actor'].split():

            # Increment actor's speaking turn count
            speaking_turn_count += 1

            # Concatenate to running dialogue string for selected actor
            filtered_dialogue += f"{turn['dialogue']} "
    # Create nonunique word list from dialogue with punctuation and whitespace removed
    turn_dialogue_list = re.sub(r'[^\w\s]', '', filtered_dialogue).split()
    
    # Make lowercase list of all words
    dialogue_list_lower = [word.lower() for word in turn_dialogue_list]

    
    # Return actor's speaking turn count and nonunique word list
    return speaking_turn_count, dialogue_list_lower

In [390]:
### Collect data for dataframe

# Initialize row data for dataframe to empty list
row_data_list = []

# Loop through all debate data
for debate in all_debates_list:

    # Store date of currently selected debate
    debate_date = get_date(debate)

    # Loop through unique actor list for each debate
    for actor in get_unique_actors(debate):
        
        # Store speaking turn count and nonunique word list for currently selected debate/actor
        speaking_turn_count, actor_dialogue_list = get_actor_dialogue(debate, actor)

        # Store nonunique word count for selected debate/actor
        total_word_count = len(actor_dialogue_list)
    
        # Store unique word count for selected debate/actor
        unique_word_count = len(set(actor_dialogue_list))

        # Create row of relevant data for selected debate/actor
        row_data = [actor, debate_date, speaking_turn_count, total_word_count, unique_word_count]

        # Append row data to list
        row_data_list.append(row_data)

In [391]:
# Create dataframe from collected row data
debate_dialogue_df = pd.DataFrame(row_data_list, columns=['actor', 'date', 'speaking_turn_count','total_word_count', 'unique_word_count'])

# Preview dataframe
debate_dialogue_df

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count
0,Participants,2020-09-29,1,7,7
1,Trump,2020-09-29,341,7394,1155
2,Moderator,2020-09-29,1,4,4
3,Wallace,2020-09-29,246,4711,963
4,Biden,2020-09-29,269,6529,1224
...,...,...,...,...,...
303,Mccain,2008-10-07,40,6281,1270
304,Transcription By,2008-10-07,0,0,0
305,Obama,2008-10-07,39,7046,1344
306,descriptor,2008-10-07,7,7,2


In [392]:
# Create new column with calculated average number of words per turn
debate_dialogue_df['avg_words_per_turn'] = debate_dialogue_df['total_word_count'] / debate_dialogue_df['speaking_turn_count']

# Create new column with calculated average number of words per turn
debate_dialogue_df['avg_unique_words_per_turn'] = debate_dialogue_df['unique_word_count'] / debate_dialogue_df['speaking_turn_count']

# Fix Reagan's name in 1984
debate_dialogue_df['actor'].replace({"The President": "Reagan"})

0          Participants
1                 Trump
2             Moderator
3               Wallace
4                 Biden
             ...       
303              Mccain
304    Transcription By
305               Obama
306          descriptor
307              Brokaw
Name: actor, Length: 308, dtype: object

In [393]:
# Export dataframe as CSV
debate_dialogue_df.to_csv('./debate_data.csv', index=False)

debate_dialogue_df.sort_values(by="unique_word_count", ascending=False)

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn
251,Kaine,2016-10-04,188,7560,1464,40.212766,7.787234
300,Lieberman,2000-10-05,34,6685,1462,196.617647,43.000000
148,Kerry,2004-10-08,37,7252,1449,196.000000,39.162162
299,Cheney,2000-10-05,30,6663,1446,222.100000,48.200000
106,Dole,1996-10-06,46,8077,1426,175.586957,31.000000
...,...,...,...,...,...,...,...
143,Audience Question,1992-10-15,0,0,0,,
147,Audience Member,1992-10-15,0,0,0,,
171,A Reminder,2004-10-05,0,0,0,,
176,The Rules,2004-10-05,0,0,0,,


In [394]:
# Create function to count number of times each word was used
def actor_word_count(debate, actor):
    remove_words = ['the', 'to', 'of', 'in', 'and', 'that', 'a', 'is', 'for', 'it']
    turn_count, dialogue_list = get_actor_dialogue(debate, actor)

    dialogue_list_cleaned = [word for word in dialogue_list if word not in remove_words]

    c = Counter(dialogue_list_cleaned)

    return c

In [395]:
for debate in all_debates_list:
    for actor in get_unique_actors(debate):
        if actor == 'Obama':
            print(actor, debate['date'], actor_word_count(debate, actor).most_common(10))

Obama ['September', '26', '2008'] [('we', 410), ('i', 234), ('have', 232), ('are', 184), ('not', 146), ('you', 132), ('our', 128), ('this', 118), ('with', 102), ('going', 100)]
Obama ['October', '15', '2008'] [('i', 156), ('we', 118), ('have', 79), ('on', 59), ('you', 56), ('think', 56), ('going', 55), ('what', 47), ('if', 44), ('are', 43)]
Obama ['OCTOBER', '3', '2012'] [('we', 115), ('i', 90), ('you', 75), ('are', 66), ('but', 62), ('were', 45), ('governor', 44), ('make', 43), ('not', 42), ('do', 41)]
Obama ['OCTOBER', '22', '2012'] [('we', 176), ('you', 89), ('have', 79), ('our', 78), ('i', 74), ('not', 66), ('are', 63), ('were', 61), ('with', 55), ('but', 53)]
Obama ['OCTOBER', '16', '2012'] [('i', 125), ('we', 121), ('are', 81), ('not', 71), ('thats', 70), ('but', 58), ('what', 57), ('going', 57), ('you', 56), ('governor', 55)]
Obama ['October', '7', '2008'] [('we', 124), ('i', 101), ('have', 97), ('you', 88), ('going', 67), ('are', 62), ('on', 52), ('but', 50), ('so', 48), ('were

In [396]:
df = debate_dialogue_df

annual_average_df = df.groupby(df.date.dt.year).mean().reset_index()
annual_average_df

Unnamed: 0,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn
0,1960,8.233333,1413.833333,352.866667,156.536252,49.036258
1,1976,14.368421,2119.105263,540.368421,133.760234,39.10922
2,1980,11.6875,1437.5625,387.875,129.599787,51.448356
3,1984,12.708333,1260.375,353.375,86.905834,33.629969
4,1988,26.578947,2382.105263,565.315789,90.09211,27.76428
5,1992,25.083333,1497.805556,398.055556,82.75919,28.088993
6,1996,12.709677,1510.677419,324.451613,71.469144,31.813999
7,2000,41.3125,3812.25,808.125,83.231956,17.741722
8,2004,15.35,1533.825,327.975,61.753283,24.725779
9,2008,44.7,3878.55,645.35,74.639637,14.603787


In [397]:
df = annual_average_df
fig = px.line(df, x="date", y="avg_words_per_turn", title=f"avg_words_per_turn vs. date")
# fig.add_scatter(x=df['date'], y=df['unique_word_count'])
fig.add_scatter(x=df['date'], y=df['avg_unique_words_per_turn'])
fig.show()

# fig.update_traces(marker_size=10)

In [398]:

fig = px.scatter(df, x="speaking_turn_count", y="avg_words_per_turn",
	         size="total_word_count", color="date",
                 hover_name="date", log_x=True, size_max=60)
fig.show()

In [399]:
df = debate_dialogue_df

fig = px.histogram(df, x="total_word_count",
                   marginal="box", # or violin, rug
                   hover_data=df.columns)
fig.show()

In [400]:
debates_wiki_df = pd.read_html("http://en.wikipedia.org/wiki/United_States_presidential_debates")

candidates_table = debates_wiki_df[2]
viewership_table = debates_wiki_df[3]

In [401]:
vp_candidates = pd.DataFrame()
pres_candidates = pd.DataFrame()

candidates_table = candidates_table.rename(columns={"Election":"year",
                                "Presidential debates":"pres_debate_count",
                                "Presidential debates.1":"pres_candidate",
                                "Vice presidential debates":"vp_debate_count",
                                "Vice presidential debates.1":"vp_candidate"})

candidates_cleaned_df = candidates_table.drop(index=2)

pres_candidates[['year', 'debate_count', 'candidate']] = candidates_table[['year', 'pres_debate_count','pres_candidate']]
pres_candidates['type'] = 'P'
vp_candidates[['year', 'debate_count', 'candidate']] = candidates_table[['year', 'vp_debate_count','vp_candidate']]
vp_candidates['type'] = 'VP'

In [402]:
candidates_df = pd.concat([pres_candidates, vp_candidates], ignore_index=True).sort_values(by='year').reset_index(drop=True)

candidates_df.drop(index=[56,57], inplace=True)

In [403]:
candidates_df["debate_count"][candidates_df["year"] == '2020'] = '2'

candidates_df["debate_count"][candidates_df["debate_count"].str.contains('ebate', na=False)] = 0

candidates_df.head() 

Unnamed: 0,year,debate_count,candidate,type
0,1960,4,Vice President Richard Nixon (R),P
1,1960,4,Senator John F. Kennedy (D),P
2,1960,0,No debates until 1976,VP
3,1960,0,No debates until 1976,VP
4,1976,3,President Gerald Ford (R),P


In [404]:
candidates_df[['debate_count', 'year']] = candidates_df[['debate_count', 'year']].astype(int)

In [405]:
candidates_df = candidates_df[candidates_df["candidate"].str.contains('ebate')==False]

In [406]:
candidates_df[["candidate", "party"]] = candidates_df["candidate"].str.split("(", expand=True)
candidates_df["party"] = candidates_df["party"].str[0]


In [407]:
candidates_df['last_name'] = candidates_df['candidate'].str.split().str[-1]

In [408]:
won_election = [False, True, False, True, False, True, False, True, False, False,
                 True, True, False, False, True, True, False, False, True, False,
                 True, False, False, True, False, False, True, True, False, True,
                 False, False, False, True, True, True, False, True, False, False,
                 False, True, True, False, True, True, False, True, False, False,
                 True]

In [409]:
candidates_df = candidates_df.reset_index(drop=True)

candidates_df.insert(6, 'won_election', won_election)

In [410]:
candidates_df

Unnamed: 0,year,debate_count,candidate,type,party,last_name,won_election
0,1960,4,Vice President Richard Nixon,P,R,Nixon,False
1,1960,4,Senator John F. Kennedy,P,D,Kennedy,True
2,1976,3,President Gerald Ford,P,R,Ford,False
3,1976,3,Former Governor Jimmy Carter,P,D,Carter,True
4,1976,1,Senator Bob Dole,VP,R,Dole,False
5,1976,1,Senator Walter Mondale,VP,D,Mondale,True
6,1980,2,President Jimmy Carter,P,D,Carter,False
7,1980,2,Former Governor Ronald Reagan,P,R,Reagan,True
8,1980,2,Congressman John B. Anderson,P,I,Anderson,False
9,1984,2,Former Vice President Walter Mondale,P,D,Mondale,False


In [411]:
debate_dialogue_df['year'] = pd.DatetimeIndex(debate_dialogue_df['date']).year

debate_dialogue_df.head()

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn,year
0,Participants,2020-09-29,1,7,7,7.0,7.0,2020
1,Trump,2020-09-29,341,7394,1155,21.683284,3.387097,2020
2,Moderator,2020-09-29,1,4,4,4.0,4.0,2020
3,Wallace,2020-09-29,246,4711,963,19.150407,3.914634,2020
4,Biden,2020-09-29,269,6529,1224,24.271375,4.550186,2020


In [412]:
candidate_debates_df = pd.merge(candidates_df, debate_dialogue_df, left_on=['year', 'last_name'], right_on=['year', 'actor']).drop(columns='actor')

candidate_debates_df.head()

Unnamed: 0,year,debate_count,candidate,type,party,last_name,won_election,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn
0,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-09-26,10,4111,820,411.1,82.0
1,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-10-13,14,4642,934,331.571429,66.714286
2,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-10-21,10,4617,948,461.7,94.8
3,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-10-07,12,4336,945,361.333333,78.75
4,1960,4,Senator John F. Kennedy,P,D,Kennedy,True,1960-09-26,17,4693,953,276.058824,56.058824


In [413]:
df = candidate_debates_df
fig = px.scatter(df, x="speaking_turn_count", y="avg_words_per_turn", color="party",
	         size="total_word_count", log_x=True, size_max=60)
fig.show()

In [414]:
grouped_candidate_debates = candidate_debates_df.groupby(['year', 'last_name']).mean()

grouped_candidate_debates['avg_words_per_turn'] = grouped_candidate_debates['total_word_count'] / grouped_candidate_debates['speaking_turn_count']
grouped_candidate_debates['avg_unique_words_per_turn'] = grouped_candidate_debates['unique_word_count'] / grouped_candidate_debates['speaking_turn_count']
grouped_candidate_debates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,debate_count,won_election,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn
year,last_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1960,Kennedy,4.0,True,13.75,4498.0,977.25,327.127273,71.072727
1960,Nixon,4.0,False,11.5,4426.5,911.75,384.913043,79.282609
1976,Carter,3.0,True,19.333333,5921.666667,1270.666667,306.293103,65.724138
1976,Ford,3.0,False,19.333333,4840.333333,1034.0,250.362069,53.482759
1980,Anderson,2.0,False,13.0,3697.0,1023.0,284.384615,78.692308


In [415]:
candidate_debates_df

Unnamed: 0,year,debate_count,candidate,type,party,last_name,won_election,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn
0,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-09-26,10,4111,820,411.100000,82.000000
1,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-10-13,14,4642,934,331.571429,66.714286
2,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-10-21,10,4617,948,461.700000,94.800000
3,1960,4,Vice President Richard Nixon,P,R,Nixon,False,1960-10-07,12,4336,945,361.333333,78.750000
4,1960,4,Senator John F. Kennedy,P,D,Kennedy,True,1960-09-26,17,4693,953,276.058824,56.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2020,2,Former Vice President Joe Biden,P,D,Biden,True,2020-10-22,84,6979,1329,83.083333,15.821429
88,2020,2,Vice President Mike Pence,VP,R,Pence,False,2020-10-07,89,6411,1341,72.033708,15.067416
89,2020,2,President Donald Trump,P,R,Trump,False,2020-09-29,341,7394,1155,21.683284,3.387097
90,2020,2,President Donald Trump,P,R,Trump,False,2020-10-22,122,7654,1200,62.737705,9.836066


In [416]:
import requests
from bs4 import BeautifulSoup

In [417]:
url = "https://www.loc.gov/rr/print/list/059_vp_alpha.html"
page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

In [418]:
table = soup.find_all('table')[2]

vp_winners = []

for a in table.find_all('a')[13:62]:
    vp_winners.append(a.text.split(',')[0])

In [419]:
debate_dialogue_df

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,avg_words_per_turn,avg_unique_words_per_turn,year
0,Participants,2020-09-29,1,7,7,7.000000,7.000000,2020
1,Trump,2020-09-29,341,7394,1155,21.683284,3.387097,2020
2,Moderator,2020-09-29,1,4,4,4.000000,4.000000,2020
3,Wallace,2020-09-29,246,4711,963,19.150407,3.914634,2020
4,Biden,2020-09-29,269,6529,1224,24.271375,4.550186,2020
...,...,...,...,...,...,...,...,...
303,Mccain,2008-10-07,40,6281,1270,157.025000,31.750000,2008
304,Transcription By,2008-10-07,0,0,0,,,2008
305,Obama,2008-10-07,39,7046,1344,180.666667,34.461538,2008
306,descriptor,2008-10-07,7,7,2,1.000000,0.285714,2008
