In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import json
import re
import nltk

from os import listdir
from os.path import isfile, join
from collections import Counter
from datetime import datetime
from matplotlib import pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import plotly.express as px


## Data Retrieval

In [2]:
# Define directory path
mypath = "./data_dir"

# Create list of json file names in directory
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [3]:
# Initialize empty list for all debate data
all_debates_list = []

# Loop through and read in all json files
for file in onlyfiles:
    
    # Open next JSON file
    with open(f"{mypath}/{file}") as json_data:

        # Load data from current JSON file
        data = json.load(json_data)

        # Fix dates for 1992 debate parts 1 and 2
        if (data['date'][0] == 'Part'):
            if data['date'][1] == '1':
                data['date'] = ['October', '11', '1992']
            elif data['date'][1] == '2':
                data['date'] = ['October', '15', '1992']

        # Append to full debate data list
        all_debates_list.append(data)

In [4]:
# Get date in datetime format from a particular debate JSON
def get_date(debate):
    
    # Create date string from provided date array
    date_string = '-'.join(debate['date'])

    # Create date object from date string
    date_object = datetime.strptime(date_string, '%B-%d-%Y')

    # Return date of given debate in datetime
    return date_object

In [5]:
# Get list of unique actors from a particular debate JSON
def get_unique_actors(debate):
    # Empty list to store all actors names (including duplicates)
    actor_list = []

    # Create list of prefixes to remove from names
    prefixes = ['Mr.', 'Ms.', 'Senator', 'Governor', 'Admiral']

    # Create dictionary of actor name corrections to be made
    typo_corrections = {
        'The President':'Reagan',
        '^Obam$':'Obama',
        'Barbara Walters':'Walters',
        'Bill Shadel': 'Shadel',
        'Edwin Newman': 'Newman',
        'Frank Mcgee': 'McGee',
        'Hal Bruno': 'Bruno',
        'Harry Ellis': 'Ellis',
        'Jim Lehrer': 'Lehrer',
        'Quincy Howe': 'Howe',
        'Sander Vanocur': 'Vanocur',
        'President Bush': 'Bush',
        '^Frederic$': 'Frederick'
    }

    # Loop through each speaking turn in the debate transcription content
    for turn in debate['content']:

        # Store actor name for each speaking turn
        actor = turn['actor']

        # Loop through prefixes in list
        for prefix in prefixes:

            # Check if the actor's name contains current prefix
            if prefix in actor:

                # Remove prefix and strip whitespace
                actor = actor.replace(prefix, '').strip()

        # Append actor to list of non-unique names
        actor_list.append(actor)

    # Loop through enumerated list of actor names
    for index, data in enumerate(actor_list):

        # Loop through items in typo corrections dict
        for k, v in typo_corrections.items():

            # Check if typo is present in current actor name
            if k in data:

                # Correct typo
                actor_list[index]=data.replace(k, typo_corrections[k])

    # Create unique list of cleaned actor names
    cleaned_actors = list(set(actor_list))

    # Return list of unique actor names for given debate
    return cleaned_actors

In [6]:
# Get non-unique words given actor name and debate JSON data
def get_actor_dialogue(debate, actor):

    # Initialize actor's dialogue to empty string
    filtered_dialogue = ''

    # ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Initialize speaking turn counter to 0
    speaking_turn_count = 0

    # Loop through speaking turns in debate JSON data
    for turn in debate['content']:
        
        # Check if speaking turn is made by specified actor
        if actor in turn['actor'].split():

            # Increment actor's speaking turn count
            speaking_turn_count += 1

            # Concatenate to running dialogue string for selected actor
            filtered_dialogue += f"{turn['dialogue']} "
            
    # Create nonunique word list from dialogue with punctuation and whitespace removed
    turn_dialogue_list = re.sub(r'[^\w\s]', '', filtered_dialogue).split()
    
    # Make lowercase list of all words
    dialogue_list_lemmed = ' '.join([lemmatizer.lemmatize(word) for word in turn_dialogue_list])
    
    # Return actor's speaking turn count and nonunique word list
    return speaking_turn_count, dialogue_list_lemmed

In [7]:
# Get non-unique words given actor name and debate JSON data
def get_actor_dialogue_string(debate, actor):

    # Initialize actor's dialogue to empty list
    dialogue_string = ''

    # Loop through speaking turns in debate JSON data
    for turn in debate['content']:
        
        # Check if speaking turn is made by specified actor
        if actor in turn['actor'].split():

            # Concatenate to running dialogue string for selected actor
            dialogue_string += f"{turn['dialogue']} "
    
    # Return actor's speaking turn count and nonunique word list
    return dialogue_string

In [8]:
# Create function to count number of times each word was used
def actor_word_count(debate, actor, n):

    remove_words = ['the', 'to', 'of', 'in', 'and', 'that', 'a', 'is', 'for', 'it']
    
    turn_count, dialogue_list = get_actor_dialogue(debate, actor)

    dialogue_list_cleaned = [word for word in dialogue_list if word not in remove_words]

    c = Counter(dialogue_list_cleaned).most_common(n)

    return c

In [9]:
# Return fully processed list of lemmatized words given list of turn dialogue for single actor

def processed_dialogue(dialogue_list):
    dialogue_string = ' '.join(dialogue_list)
    

In [10]:
### Collect data for dataframe

# Initialize row data for dataframe to empty list
row_data_list = []

# Initialize list of all debate dialogue
all_dialogue = []

# Loop through all debate data
for debate in all_debates_list:

    # Store date of currently selected debate
    debate_date = get_date(debate)

    # Loop through unique actor list for each debate
    for actor in get_unique_actors(debate):
        
        # Store speaking turn count and nonunique word list for currently selected debate/actor
        speaking_turn_count, actor_dialogue_list = get_actor_dialogue(debate, actor)
        
        actor_dialogue_string = get_actor_dialogue_string(debate, actor)

        # Append dialogue to all dialogue list
        all_dialogue += actor_dialogue_list

        # Store nonunique word count for selected debate/actor
        total_word_count = len(actor_dialogue_list)
    
        # Store unique word count for selected debate/actor
        unique_word_count = len(set(actor_dialogue_list))

        # Top ten words
        top_ten_words = actor_word_count(debate, actor, 10)

        # Create row of relevant data for selected debate/actor
        row_data = [actor, debate_date, speaking_turn_count, total_word_count, unique_word_count, actor_dialogue_string]

        # Append row data to list
        row_data_list.append(row_data)

all_dialogue_counter = Counter(all_dialogue)

In [11]:
all_dialogue_counter.most_common()

[(' ', 661403),
 ('e', 353101),
 ('t', 286244),
 ('o', 233835),
 ('a', 223758),
 ('n', 205934),
 ('i', 187571),
 ('r', 163745),
 ('h', 146947),
 ('s', 120929),
 ('d', 104760),
 ('l', 104049),
 ('u', 84584),
 ('c', 72877),
 ('m', 63170),
 ('w', 59682),
 ('g', 58425),
 ('y', 57971),
 ('p', 53320),
 ('f', 51504),
 ('b', 39434),
 ('v', 33296),
 ('k', 22817),
 ('I', 20367),
 ('A', 12085),
 ('T', 8139),
 ('S', 7854),
 ('W', 6800),
 ('x', 5698),
 ('C', 4571),
 ('B', 4563),
 ('0', 4393),
 ('M', 4270),
 ('j', 4131),
 ('P', 3903),
 ('q', 3345),
 ('N', 2976),
 ('H', 2607),
 ('G', 2552),
 ('R', 2304),
 ('O', 2231),
 ('D', 2205),
 ('L', 2064),
 ('1', 2062),
 ('Y', 2007),
 ('U', 1977),
 ('F', 1458),
 ('2', 1421),
 ('z', 1414),
 ('J', 1410),
 ('K', 1374),
 ('E', 1313),
 ('5', 1179),
 ('V', 1067),
 ('9', 988),
 ('3', 771),
 ('4', 690),
 ('8', 519),
 ('7', 508),
 ('6', 490),
 ('Q', 278),
 ('Z', 31),
 ('X', 25)]

In [12]:
# Create dataframe from collected row data
debate_dialogue_df = pd.DataFrame(row_data_list, columns=['actor', 'date', 'speaking_turn_count','total_word_count', 
                                                          'unique_word_count', 'dialogue_list'])

# Preview dataframe
debate_dialogue_df.head()

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,dialogue_list
0,Wallace,2020-09-29,246,25464,60,Good evening from the Health Education Campus ...
1,Moderator,2020-09-29,1,22,16,Chris Wallace (Fox News)
2,Participants,2020-09-29,1,37,18,Former Vice President Joe Biden (D) and
3,Trump,2020-09-29,341,37450,59,"How are you doing? Thank you very much, Chris...."
4,Biden,2020-09-29,269,33043,61,"How you doing, man? Im well. Well, first of al..."


In [13]:
# Create new column with calculated average number of words per turn
debate_dialogue_df['avg_words_per_turn'] = debate_dialogue_df['total_word_count'] / debate_dialogue_df['speaking_turn_count']

# Create new column with calculated average number of words per turn
debate_dialogue_df['avg_unique_words_per_turn'] = debate_dialogue_df['unique_word_count'] / debate_dialogue_df['speaking_turn_count']

# Fix Reagan's name in 1984
# debate_dialogue_df['actor'].replace({"The President": "Reagan"})

# Preview dataframe
debate_dialogue_df = debate_dialogue_df.sort_values(by='date')

debate_dialogue_df.head()

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,dialogue_list,avg_words_per_turn,avg_unique_words_per_turn
9,Voice,1960-09-26,1,82,20,This will allow three minutes and twenty secon...,82.0,20.0
5,Vanocur,1960-09-26,4,1770,40,"Uh Mr. Vice President, since the question of ...",442.5,10.0
6,Fleming,1960-09-26,2,1947,44,"Senator, the Vice President in his campaign ha...",973.5,22.0
7,Novins,1960-09-26,5,902,37,"Mr. Vice President, your campaign stresses the...",180.4,7.4
8,Smith,1960-09-26,23,3207,46,And now the opening statement by Vice Presiden...,139.434783,2.0


In [14]:
# Export dataframe as CSV
debate_dialogue_df.to_csv('./debate_data.csv', index=False)

In [15]:
debates_wiki_df = pd.read_html("http://en.wikipedia.org/wiki/United_States_presidential_debates")

candidates_table = debates_wiki_df[2]
viewership_table = debates_wiki_df[3]

## Data Cleaning

In [16]:
vp_candidates = pd.DataFrame()
pres_candidates = pd.DataFrame()

candidates_table = candidates_table.rename(columns={"Election":"year",
                                "Presidential debates":"pres_debate_count",
                                "Presidential debates.1":"pres_candidate",
                                "Vice presidential debates":"vp_debate_count",
                                "Vice presidential debates.1":"vp_candidate"})

candidates_cleaned_df = candidates_table.drop(index=2)

pres_candidates[['year', 'debate_count', 'candidate']] = candidates_table[['year', 'pres_debate_count','pres_candidate']]
pres_candidates['type'] = 'P'
vp_candidates[['year', 'debate_count', 'candidate']] = candidates_table[['year', 'vp_debate_count','vp_candidate']]
vp_candidates['type'] = 'VP'

In [17]:
candidates_df = pd.concat([pres_candidates, vp_candidates], ignore_index=True).sort_values(by='year').reset_index(drop=True)

candidates_df.drop(index=[56,57], inplace=True)

In [18]:
candidates_df["debate_count"][candidates_df["year"] == '2020'] = '2'

candidates_df["debate_count"][candidates_df["debate_count"].str.contains('ebate', na=False)] = 0

candidates_df.head()

Unnamed: 0,year,debate_count,candidate,type
0,1960,4,Vice President Richard Nixon (R),P
1,1960,4,Senator John F. Kennedy (D),P
2,1960,0,No debates until 1976,VP
3,1960,0,No debates until 1976,VP
4,1976,3,President Gerald Ford (R),P


In [19]:
candidates_df[['debate_count', 'year']] = candidates_df[['debate_count', 'year']].astype(int)

In [20]:
# Remove rows with string in candidate column
candidates_df = candidates_df[candidates_df["candidate"].str.contains('ebate')==False]

In [21]:
candidates_df[["candidate", "party"]] = candidates_df["candidate"].str.split("(", expand=True)
candidates_df["party"] = candidates_df["party"].str[0]
candidates_df['last_name'] = candidates_df['candidate'].str.split().str[-1]
candidates_df = candidates_df.reset_index(drop=True)


In [22]:
won_election = [False, True, False, True, False, True, False, True, False, False,
                 True, True, False, False, True, True, False, False, True, False,
                 True, False, False, True, False, False, True, True, False, True,
                 False, False, False, True, True, True, False, True, False, False,
                 False, True, True, False, True, True, False, True, False, False,
                 True]

candidates_df.insert(6, 'won_election', won_election)

candidates_df.head()

Unnamed: 0,year,debate_count,candidate,type,party,last_name,won_election
0,1960,4,Vice President Richard Nixon,P,R,Nixon,False
1,1960,4,Senator John F. Kennedy,P,D,Kennedy,True
2,1976,3,President Gerald Ford,P,R,Ford,False
3,1976,3,Former Governor Jimmy Carter,P,D,Carter,True
4,1976,1,Senator Bob Dole,VP,R,Dole,False


In [23]:
# Create list of candidate titles
titles = ['Former Vice President', 'Vice President', 'President',  'Former Senator', 'Senator', 'Former Governor',
          'Governor', 'Congressman', 'Congresswoman', 'Businessman', 'Ret. Vice Admiral', 'Former HUD Secretary',
          'Former Secretary of State']

# Define function to return the title string contained in candidate name
def title_split(a):
    for title in titles:
        if title in a:
            return title

# Create new title column
candidates_df['title'] = candidates_df['candidate'].apply(lambda x: title_split(x))

# Fix existing candidate column name to remove title
candidates_df['candidate'] = candidates_df['candidate'].apply(lambda x: x.split(title_split(x))[1]).str.strip()

In [24]:
debate_dialogue_df['year'] = pd.DatetimeIndex(debate_dialogue_df['date']).year

# debate_dialogue_df = debate_dialogue_df.merge(candidates_df[['last_name', 'won_election']], left_on='actor', right_on='last_name')

debate_dialogue_df.head()

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,dialogue_list,avg_words_per_turn,avg_unique_words_per_turn,year
9,Voice,1960-09-26,1,82,20,This will allow three minutes and twenty secon...,82.0,20.0,1960
5,Vanocur,1960-09-26,4,1770,40,"Uh Mr. Vice President, since the question of ...",442.5,10.0,1960
6,Fleming,1960-09-26,2,1947,44,"Senator, the Vice President in his campaign ha...",973.5,22.0,1960
7,Novins,1960-09-26,5,902,37,"Mr. Vice President, your campaign stresses the...",180.4,7.4,1960
8,Smith,1960-09-26,23,3207,46,And now the opening statement by Vice Presiden...,139.434783,2.0,1960


## Database Normalization

In [25]:
# Create dataframe with all data including debate/candidate/performance IDs
candidate_debates_df = pd.merge(debate_dialogue_df, candidates_df, left_on=['actor', 'year'], right_on=['last_name','year'],how='inner')

# Create dataframe of unique candidates to set unique IDs
unique_candidates = pd.DataFrame(candidate_debates_df.sort_values(by='date')['candidate'].unique())
# Reset index for unique candidate IDs
unique_candidates = unique_candidates.reset_index()
# Rename columns
unique_candidates.columns = ['candidate_id', 'name']
# Merge dataframes back together preserving candidate IDs
candidate_debates_df = candidate_debates_df.merge(unique_candidates, left_on='candidate', right_on='name')

candidate_debates_df.head()

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,dialogue_list,avg_words_per_turn,avg_unique_words_per_turn,year,debate_count,candidate,type,party,last_name,won_election,title,candidate_id,name
0,Kennedy,1960-09-26,17,25110,60,"Mr. Smith, Mr. Nixon. In the election of 1860,...",1477.058824,3.529412,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy
1,Kennedy,1960-10-07,13,23946,61,In the first place Ive never suggested that Cu...,1842.0,4.692308,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy
2,Kennedy,1960-10-13,14,21793,60,"Good evening, Mr. Shadel. Mr. McGee, we have a...",1556.642857,4.285714,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy
3,Kennedy,1960-10-21,11,26548,60,"Good evening, Mr. Howe. Mr. Howe, Mr. Vice Pre...",2413.454545,5.454545,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy
4,Nixon,1960-09-26,10,22040,55,"Mr. Smith, Senator Kennedy. The things that Se...",2204.0,5.5,1960,4,Richard Nixon,P,R,Nixon,False,Vice President,1,Richard Nixon


In [26]:
# Create dataframe of unique debates to set unique IDs
unique_debates = pd.DataFrame(candidate_debates_df['date'].unique())
# Reset index for unique debates IDs
unique_debates = unique_debates.reset_index()
# Rename columns
unique_debates.columns = ['debate_id', 'date']
# Merge dataframes back together preserving candidate IDs
candidate_debates_df = candidate_debates_df.merge(unique_debates, on='date')

candidate_debates_df.head()

Unnamed: 0,actor,date,speaking_turn_count,total_word_count,unique_word_count,dialogue_list,avg_words_per_turn,avg_unique_words_per_turn,year,debate_count,candidate,type,party,last_name,won_election,title,candidate_id,name,debate_id
0,Kennedy,1960-09-26,17,25110,60,"Mr. Smith, Mr. Nixon. In the election of 1860,...",1477.058824,3.529412,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy,0
1,Nixon,1960-09-26,10,22040,55,"Mr. Smith, Senator Kennedy. The things that Se...",2204.0,5.5,1960,4,Richard Nixon,P,R,Nixon,False,Vice President,1,Richard Nixon,0
2,Kennedy,1960-10-07,13,23946,61,In the first place Ive never suggested that Cu...,1842.0,4.692308,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy,1
3,Nixon,1960-10-07,12,23097,57,"Well first of all, I dont agree with Senator K...",1924.75,4.75,1960,4,Richard Nixon,P,R,Nixon,False,Vice President,1,Richard Nixon,1
4,Kennedy,1960-10-13,14,21793,60,"Good evening, Mr. Shadel. Mr. McGee, we have a...",1556.642857,4.285714,1960,4,John F. Kennedy,P,D,Kennedy,True,Senator,0,John F. Kennedy,2


In [27]:
# Reset index to set as unique performance ID
candidate_debates_df = candidate_debates_df.reset_index()
# Rename column
candidate_debates_df = candidate_debates_df.rename(columns={'index':'performance_id'})
# Drop redundant columns
candidate_debates_df = candidate_debates_df.drop(columns=['year', 'actor', 'last_name', 'name', 'avg_words_per_turn', 'avg_unique_words_per_turn'])

# Preview dataframe
candidate_debates_df.head()

Unnamed: 0,performance_id,date,speaking_turn_count,total_word_count,unique_word_count,dialogue_list,debate_count,candidate,type,party,won_election,title,candidate_id,debate_id
0,0,1960-09-26,17,25110,60,"Mr. Smith, Mr. Nixon. In the election of 1860,...",4,John F. Kennedy,P,D,True,Senator,0,0
1,1,1960-09-26,10,22040,55,"Mr. Smith, Senator Kennedy. The things that Se...",4,Richard Nixon,P,R,False,Vice President,1,0
2,2,1960-10-07,13,23946,61,In the first place Ive never suggested that Cu...,4,John F. Kennedy,P,D,True,Senator,0,1
3,3,1960-10-07,12,23097,57,"Well first of all, I dont agree with Senator K...",4,Richard Nixon,P,R,False,Vice President,1,1
4,4,1960-10-13,14,21793,60,"Good evening, Mr. Shadel. Mr. McGee, we have a...",4,John F. Kennedy,P,D,True,Senator,0,2


In [28]:
debates = candidate_debates_df.groupby(['debate_id','date','type'], as_index=False).sum()

debates = debates[['debate_id', 'date', 'type']]

debates.to_csv('./Tables/debates.csv', index=False)

debates.head()

Unnamed: 0,debate_id,date,type
0,0,1960-09-26,P
1,1,1960-10-07,P
2,2,1960-10-13,P
3,3,1960-10-21,P
4,4,1976-09-23,P


In [29]:
candidates_grouped = candidate_debates_df.groupby(['candidate_id','candidate','party'], as_index=False).sum()

candidates = candidates_grouped[['candidate_id', 'candidate', 'party']]

candidates = candidates.rename(columns={'candidate':'name'})

candidates.to_csv('./Tables/candidates.csv', index=False)

candidates.head()

Unnamed: 0,candidate_id,name,party
0,0,John F. Kennedy,D
1,1,Richard Nixon,R
2,2,Gerald Ford,R
3,3,Jimmy Carter,D
4,4,Ronald Reagan,R


In [30]:
# Combine dataframes to get debate ids/types and dialogue stats
performances = candidate_debates_df[['performance_id', 'debate_id', 'candidate_id', 'title', 'won_election',
                                     'speaking_turn_count', 'total_word_count', 'unique_word_count']]

# Create calculated column for avg number of words per turn
performances['words_per_turn'] = performances['total_word_count'] / performances['speaking_turn_count']
performances['unique_words_per_turn'] = performances['unique_word_count'] / performances['speaking_turn_count']

# Export performances table as csv
performances.to_csv('./Tables/performances.csv', index=False)

performances


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,performance_id,debate_id,candidate_id,title,won_election,speaking_turn_count,total_word_count,unique_word_count,words_per_turn,unique_words_per_turn
0,0,0,0,Senator,True,17,25110,60,1477.058824,3.529412
1,1,0,1,Vice President,False,10,22040,55,2204.000000,5.500000
2,2,1,0,Senator,True,13,23946,61,1842.000000,4.692308
3,3,1,1,Vice President,False,12,23097,57,1924.750000,4.750000
4,4,2,0,Senator,True,14,21793,60,1556.642857,4.285714
...,...,...,...,...,...,...,...,...,...,...
89,89,42,29,Businessman,True,128,32205,60,251.601562,0.468750
90,90,43,31,Senator,False,188,40893,61,217.515957,0.324468
91,91,43,32,Governor,True,207,38013,62,183.637681,0.299517
92,92,44,32,Vice President,False,89,35414,60,397.910112,0.674157


In [31]:
viewership_table

Unnamed: 0,Election,Debate,Sponsor,Location,Location.1,Moderators,Viewership,Source
0,1960,First debate,"ABC, CBS, and NBC",WBBM-TV studios,"Chicago, Illinois",Howard K. Smith of CBS,66.4 million,[27]
1,1960,Second debate,"ABC, CBS, and NBC",WRC-TV studios,"Washington, DC",Frank McGee of NBC,61.9 million,[27]
2,1960,Third debate,"ABC, CBS, and NBC",Split-screen telecast with Nixon and panelists...,Split-screen telecast with Nixon and panelists...,Bill Shadel of ABC,63.7 million,[27]
3,1960,Fourth debate,"ABC, CBS, and NBC",ABC Studios,"New York City, New York",Quincy Howe of ABC,60.4 million,[27]
4,1976,First debate,League of Women Voters,Walnut Street Theater,"Philadelphia, Pennsylvania",Edwin Newman of NBC,69.7 million,[28]
5,1976,Second debate,League of Women Voters,Palace of Fine Arts,"San Francisco, California",Pauline Frederick of NPR,63.9 million,[28]
6,1976,Third debate,League of Women Voters,Phi Beta Kappa Memorial Hall at W&M,"Williamsburg, Virginia",Barbara Walters of ABC,62.7 million,[28]
7,1976,VP Debate,League of Women Voters,Alley Theatre,"Houston, Texas",James Hoge of the Chicago Sun-Times,43.2 million,[28]
8,1980,First debate,League of Women Voters,Baltimore Convention Center,"Baltimore, Maryland",Bill Moyers of PBS,,[29]
9,1980,Second debate,League of Women Voters,Public Music Hall,"Cleveland, Ohio",Howard K. Smith of ABC,80.6 million,[29]


In [32]:
debates

Unnamed: 0,debate_id,date,type
0,0,1960-09-26,P
1,1,1960-10-07,P
2,2,1960-10-13,P
3,3,1960-10-21,P
4,4,1976-09-23,P
5,5,1976-10-06,P
6,6,1976-10-22,P
7,7,1980-10-28,P
8,8,1980-09-21,P
9,9,1984-10-07,P


In [33]:
candidate_debates_df

# candidate_debates_df['text'] = candidate_debates_df[['party','dialogue_list']].groupby('party')['dialogue_list'].transform(lambda x: ','.join(x))
# candidate_debates_df[['party','dialogue_list']].drop_duplicates()

dialogue_analysis = candidate_debates_df.groupby(['party', 'won_election'])['dialogue_list'].apply(lambda x: ' '.join(x)).reset_index()

In [34]:
dialogue_analysis

Unnamed: 0,party,won_election,dialogue_list
0,D,False,"Mr. Stone, Ive had to make thousands of decisi..."
1,D,True,"Mr. Smith, Mr. Nixon. In the election of 1860,..."
2,I,False,"fiscal restraint, I think is necessary. Mr. Mo..."
3,R,False,"Mr. Smith, Senator Kennedy. The things that Se..."
4,R,True,"I dont know what the differences might be, bec..."


## Machine Learning

https://goodboychan.github.io/python/machine_learning/natural_language_processing/2020/10/23/01-Text-Classification-with-NLTK.html

In [64]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
label = enc.fit_transform(candidate_debates_df['party'])

text = candidate_debates_df['dialogue_list']

In [65]:
# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = text.str.replace(r'£|\$', 'moneysymb')

processed = processed.str.replace(r'[^\w\d\s]', ' ')


# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

processed = processed.str.lower()

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

lemmatizer = WordNetLemmatizer()

processed = processed.apply(lambda x: ' '.join(lemmatizer.lemmatize(term) for term in x.split()))



  
  after removing the cwd from sys.path.
  
  # This is added back by InteractiveShellApp.init_path()


In [66]:
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

# Print the result
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 11854
Most common words: [('people', 2938), ('think', 2419), ('going', 2359), ('president', 2236), ('thats', 1925), ('year', 1900), ('would', 1797), ('one', 1756), ('want', 1645), ('tax', 1621), ('country', 1551), ('know', 1547), ('get', 1545), ('make', 1539), ('dont', 1526)]


In [67]:

dialogue_analysis['text'] = processed

dialogue_analysis

Unnamed: 0,party,won_election,dialogue_list,text
0,D,False,"Mr. Stone, Ive had to make thousands of decisi...",mr smith mr nixon election 1860 abraham lincol...
1,D,True,"Mr. Smith, Mr. Nixon. In the election of 1860,...",mr smith senator kennedy thing senator kennedy...
2,I,False,"fiscal restraint, I think is necessary. Mr. Mo...",first place ive never suggested cuba lost exce...
3,R,False,"Mr. Smith, Senator Kennedy. The things that Se...",well first dont agree senator kennedy cuba los...
4,R,True,"I dont know what the differences might be, bec...",good evening mr shadel mr mcgee contractual ri...


In [68]:
word_features = [x[0] for x in all_words.most_common(1500)]

In [69]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

In [70]:
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key, value)

people True
think True
going True
president True
thats True
year True
would True
one True
want True
tax True
country True
know True
get True
make True
dont True
well True
said True
american True
state True
say True
im True
u True
time True
weve True
thing True
go True
job True
right True
america True
uh True
let True
way True
need True
world True
like True
believe True
united True
take True
look True
ive True
work True
government True
also True
back True
million True
good True
done True
theyre True
sure True
money True
administration True
mr True
fact True
new True
care True
family True
billion True
program True
made True
last True
put True
come True
percent True
see True
youre True
war True
question True
health True
security True
problem True
give True
every True
two True
business True
nation True
better True
important True
kind True
economy True
help True
policy True
could True
didnt True
there True
much True
school True
great True
number True
mean True
child True
issue True
life Tru

In [71]:
list(features.items())[:10]

[('people', True),
 ('think', True),
 ('going', True),
 ('president', True),
 ('thats', True),
 ('year', True),
 ('would', True),
 ('one', True),
 ('want', True),
 ('tax', True)]

In [72]:
messages = list(zip(processed, label))

np.random.seed(1)
np.random.shuffle(messages)

# Call find_features function for each SMS message
feature_set = [(find_features(text), label) for (text, label) in messages]

In [73]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(feature_set, test_size=0.25, random_state=1)

print(len(training))
print(len(test))

70
24


In [75]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier',
         'Naive Bayes', 'Support Vector Classifier']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, test)
    print("{} model Accuracy: {}".format(name, accuracy))

K Nearest Neighbors model Accuracy: 0.7916666666666666
Decision Tree model Accuracy: 0.4583333333333333
Random Forest model Accuracy: 0.75
Logistic Regression model Accuracy: 0.75
SGD Classifier model Accuracy: 0.75
Naive Bayes model Accuracy: 0.7916666666666666
Support Vector Classifier model Accuracy: 0.75


In [77]:
from sklearn.ensemble import VotingClassifier

# Since VotingClassifier can accept list type of models
models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, test)
print("Voting Classifier model Accuracy: {}".format(accuracy))

Voting Classifier model Accuracy: 0.75
