# Modeling
Data is separated on a per-candidate basis to observe polls before and after a candidate suspends their campaign.

## Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import numpy as np
import pandas as pd
import os
import math

import stats

## Read File Written in 02-DataCleaning

In [2]:
# Candidate metadata indexed by candidate name; later cells read its `date` and `dropped` columns.
candidates = pd.read_csv('candidates.csv', index_col='name')

## Read File Written in 03-Modeling

In [3]:
# Poll results. The `date` column is promoted to the index in the next cell; the averaging
# functions below treat every remaining column as one candidate.
polls = pd.read_csv('bootPolls.csv')

Convert date from string to datetime.

In [4]:
# Parse the poll dates and use them as the row index; `pop` removes the
# original string column from the frame in the same step.
polls.index = pd.DatetimeIndex(polls.pop('date'))

# Suspension dates are parsed as well so they can be compared with poll dates.
candidates['date'] = pd.to_datetime(candidates['date'])

stats.Equals100(polls)

## Data Modeling Function Definitions

### Before Campaign Suspension

Function to get average polling for a single active candidate before a candidate drops out.

In [5]:
def getBeforeAverage(pollGroup, dropout, cand, dropDate=None):
    """Returns average polling for a single candidate in the polls taken on or before the date
       another candidate dropped out of the race.

    Only the leading run of rows dated on or before the suspension date is averaged, so
    pollGroup is expected to be sorted by date in ascending order.

    Parameters
    ----------
    pollGroup : DataFrame
        A snippet of the main DataFrame sliced to only include polls taken a week prior to and a week
        after the dropout suspends their campaign. Indexed by poll date.
    dropout : str
        The name of the candidate that dropped out.
    cand : str
        The name of the candidate whose average polling is being calculated.
    dropDate : datetime, optional
        The date the dropout suspended their campaign. When omitted it is looked up in the
        global ``candidates`` table as ``candidates['date'][dropout]``.

    Returns
    -------
    float
        Mean of the candidate's polling over the qualifying rows. NaN if any qualifying value
        is NaN, or if no poll is dated on or before the suspension date.
    """

    if dropDate is None:
        dropDate = candidates['date'][dropout]

    tot = 0      # numerator of the mean
    count = 0    # number of polls on or before the suspension date; denominator of the mean

    # Walk dates and values together so the loop simply ends when the polls run out,
    # instead of indexing one position past the last row.
    for pollDate, value in zip(pollGroup.index, pollGroup[cand]):
        if pollDate > dropDate:
            break
        tot += value
        count += 1

    if count == 0:    # can't divide by zero; mirrors getAfterAverage when there is nothing to average
        return float('nan')

    return tot / count

Function to create list of average polling for all other candidates before a candidate drops out.

In [6]:
def BeforeAverages(pollGroup, dropout):
    """Returns the pre-dropout average polling of every candidate column in pollGroup.

    Parameters
    ----------
    pollGroup : DataFrame
        A snippet of the main DataFrame sliced to only include polls taken a week prior to and a week
        after the dropout suspends their campaign.
    dropout : str
        The name of the candidate that dropped out.

    Returns
    -------
    list
        One getBeforeAverage result per column, in column order.
    """

    return [getBeforeAverage(pollGroup, dropout, name) for name in pollGroup.columns]

### After Campaign Suspension

Function to retrieve difference in days since date of suspension/dropout.

In [7]:
def dateDiff(dropout, iterDate, dropDate=None):
    """Returns difference in days between date iterator and the date a candidate dropped out of the race.

    Parameters
    ----------
    dropout : str
        The name of the candidate that dropped out.
    iterDate : datetime
        The date of the poll being compared against the suspension date.
    dropDate : datetime, optional
        The date the dropout suspended their campaign. When omitted it is looked up in the
        global ``candidates`` table as ``candidates['date'][dropout]``.

    Returns
    -------
    int
        Whole days from the suspension date to iterDate; negative when iterDate is earlier.
    """
    if dropDate is None:
        dropDate = candidates['date'][dropout]

    return (iterDate - dropDate).days

Function to get weighted average polling for a single active candidate after a candidate drops out. Weight is determined by days after campaign suspension; the further away from the date of suspension, the more weight the polling data holds.

In [8]:
def getAfterAverage(pollGroup, dropout, cand, dropDate=None):
    """Returns the weighted average polling for a candidate in the polls taken after another
       candidate drops out of the race.

    Each poll is weighted by the number of whole days between the suspension date and the poll,
    so polls further from the suspension date count more. The leading run of rows dated on or
    before the suspension date is skipped, so pollGroup is expected to be sorted by date in
    ascending order.

    Parameters
    ----------
    pollGroup : DataFrame
        A snippet of the main DataFrame sliced to only include polls taken a week prior to and a week
        after the dropout suspends their campaign. Indexed by poll date.
    dropout : str
        The name of the candidate that dropped out.
    cand : str
        The name of the candidate whose average polling is being calculated.
    dropDate : datetime, optional
        The date the dropout suspended their campaign. When omitted it is looked up in the
        global ``candidates`` table as ``candidates['date'][dropout]``.

    Returns
    -------
    float
        Day-weighted mean of the candidate's non-NaN polling after the suspension date, or NaN
        when there is nothing to average (no later polls, only NaN values, or zero total weight).
    """

    if dropDate is None:
        dropDate = candidates['date'][dropout]

    dates = pollGroup.index
    values = pollGroup[cand]

    # Advance past every poll dated on or before the suspension date. The bounds check keeps
    # this from running off the end when no poll was conducted afterwards.
    start = 0
    while start < len(dates) and dates[start] <= dropDate:
        start += 1

    tot = 0          # numerator of the weighted mean
    weightSum = 0    # denominator of the weighted mean

    # Iterate through the polls conducted after the candidate dropped out of the race
    for pollDate, value in zip(dates[start:], values.iloc[start:]):
        if math.isnan(value):    # only uses numbers
            continue
        weight = (pollDate - dropDate).days    # same day difference that dateDiff computes
        tot += value * weight                  # gives weight to polls conducted further from the date of dropping
        weightSum += weight

    if weightSum == 0:    # can't divide by zero, so return NaN when there is nothing to average
        return float('nan')

    return tot / weightSum

Function to get weighted average polling for all other candidates after a candidate drops out.

In [9]:
def AfterAverages(pollGroup, dropout):
    """Returns the post-dropout average polling of every candidate column in pollGroup.

    Parameters
    ----------
    pollGroup : DataFrame
        A snippet of the main DataFrame sliced to only include polls taken a week prior to and a week
        after the dropout suspends their campaign.
    dropout : str
        The name of the candidate that dropped out.

    Returns
    -------
    list
        One getAfterAverage result per column, in column order.
    """

    return [getAfterAverage(pollGroup, dropout, name) for name in pollGroup.columns]

Function to create a list of dictionaries holding the candidate name and a table, that is a snippet of all polling data from the week before to the week after a campaign was suspended, for each candidate that has suspended their campaign.

In [10]:
# def BiWeekPolling(cand):
#     """Returns a snippet of the DataFrame holding only polls conducted 7 days prior to the date of dropping to 9 after
#        the date of dropping.
    
#     Parameters
#     ----------
#     cand : str
#         The name of the candidate that dropped out.
#     """
    
#     return polls[(polls.index > candidates['date'][cand] - datetime.timedelta(days=7)) \
#      & (polls.index < candidates['date'][cand] + datetime.timedelta(days=9))]

## Data Modeling Function Applications

Create a list of dictionaries holding the name of each candidate that suspended their campaign and the DataFrames of polling data from a week before to a week after that candidate suspended their campaign.

In [11]:
# Names of the candidates whose `dropped` flag equals True, in `candidates` index order.
pollIndex = []
for name in candidates.index:
    if candidates.dropped[name] == True:
        pollIndex.append(name)

# One record per dropped candidate: the name plus whatever stats.BiWeekPolling returns for them
# (presumably the poll window around their suspension date -- confirm against the stats module).
pollList = [
    {'name': name, 'poll': stats.BiWeekPolling(polls, candidates, name)}
    for name in pollIndex
]

NameError: name 'candidates' is not defined

Create a DataFrame indexed by the candidates that dropped out of the race during the primaries. Each column holds other candidates' average polling numbers in the week prior to a candidate's campaign suspension.

Each cell contains the polling for a candidate in the column the week before the candidate in the row dropped out.

In [None]:
# One list of pre-dropout averages per dropped candidate, in pollList order.
WeekBeforeDrop = [
    BeforeAverages(entry['poll'], entry['name'])
    for entry in pollList
]

# Rows are the dropped candidates, columns are the candidates being polled.
PollingBeforeDrop = pd.DataFrame(WeekBeforeDrop, index=pollIndex, columns=polls.columns)
PollingBeforeDrop.index.names = ['name']
stats.Equals100(PollingBeforeDrop)
PollingBeforeDrop

Create a DataFrame indexed by the candidates that dropped out of the race during the primaries. Each column holds other candidates' average polling numbers in the week after a candidate's campaign suspension.

Each cell contains the polling for a candidate in the column the week after the candidate in the row dropped out.

In [None]:
# One list of post-dropout weighted averages per dropped candidate, in pollList order.
WeekAfterDrop = [
    AfterAverages(entry['poll'], entry['name'])
    for entry in pollList
]

# Rows are the dropped candidates, columns are the candidates being polled.
PollingAfterDrop = pd.DataFrame(WeekAfterDrop, index=pollIndex, columns=polls.columns)
PollingAfterDrop.index.names = ['name']
stats.Equals100(PollingAfterDrop)
PollingAfterDrop

## Testing
Confirm DataFrame snippets are working as expected. Test on Jeb Bush.

Display date of drop.

In [None]:
# Bush's suspension date, as parsed by pd.to_datetime in the date-conversion cell.
candidates['date']['Bush']

Confirm data is saved properly after being pushed to a list of dictionaries.

In [None]:
# Rebuild Bush's window by hand: polls dated strictly between 7 days before and 9 days after
# his suspension date.
BushPolls1 = polls[(polls.index > candidates['date']['Bush'] - datetime.timedelta(days=7))
                   & (polls.index < candidates['date']['Bush'] + datetime.timedelta(days=9))]

# Look the stored window up by name rather than by a hard-coded position in pollList.
BushPoll2 = next(entry['poll'] for entry in pollList if entry['name'] == 'Bush')

# all(DataFrame) iterates the column labels, so comparing all(...) of both frames passes for
# any two frames with non-empty labels. DataFrame.equals compares shape, labels and values
# (NaNs in the same position count as equal).
assert BushPolls1.equals(BushPoll2)

Display polling for candidates a week before to a week after Jeb Bush dropped.

In [None]:
# Bare expression so the notebook renders the hand-built window.
BushPolls1

Confirm polling data sums up to 100 for each poll

In [None]:
# Sum of the non-NaN entries selected for 2016-02-22.
sum(BushPolls1.loc['2016-02-22'].dropna())

In [None]:
# Compare with a tolerance: the totals are floating-point sums, so requiring exact equality
# with 100 can fail on rounding error alone.
for day in ('2016-02-10', '2016-02-22'):
    assert math.isclose(sum(BushPolls1.loc[day].dropna()), 100)

Confirm polling before drop is accurate.

In [None]:
BushBeforeAvgs1 = BeforeAverages(BushPolls1, 'Bush')
BushBeforeAvgs2 = PollingBeforeDrop.loc['Bush']
# NOTE(review): all(...) collapses each sequence to one boolean (is every element truthy?),
# so this compares two booleans, not the averages element by element. It is left unchanged
# because PollingBeforeDrop was passed through stats.Equals100, which may have altered the
# stored values -- confirm before replacing it with an element-wise comparison.
assert all(BushBeforeAvgs1) == all(list(BushBeforeAvgs2))
# Tolerance instead of exact float equality with 100.
assert math.isclose(sum(BushBeforeAvgs2.dropna()), 100)

Display candidates' average polling data in the week before Jeb Bush suspended his campaign.

In [None]:
# Plain list returned by BeforeAverages: one value per column of BushPolls1.
print(BushBeforeAvgs1)

Confirm polling after drop is accurate.

In [None]:
# Recompute Bush's post-dropout averages from the hand-built window and fetch the stored row.
BushAfterAvgs1 = AfterAverages(BushPolls1, 'Bush')
BushAfterAvgs2 = PollingAfterDrop.loc['Bush']
# NOTE(review): all(...) collapses each sequence to one boolean (is every element truthy?),
# so this compares two booleans, not the averages element by element. PollingAfterDrop was
# passed through stats.Equals100, which may have altered the stored values -- confirm before
# tightening this into an element-wise comparison.
assert all(BushAfterAvgs1) == all(list(BushAfterAvgs2))

Display candidates' average polling data in the week after Jeb Bush suspended his campaign.

In [None]:
# Plain list returned by AfterAverages: one value per column of BushPolls1.
print(BushAfterAvgs1)

Polling data sums up to 100.

In [None]:
# Every row of both tables should total 100 once NaN entries are dropped. The totals are
# floating-point sums, so compare with a tolerance rather than exact equality.
for table in (PollingBeforeDrop, PollingAfterDrop):
    for _, row in table.iterrows():
        assert math.isclose(sum(row.dropna()), 100)

Graph Jeb Bush's data.

In [None]:
plt.figure(figsize=(20,10))

# One line per candidate. Labels are passed explicitly so the legend does not depend on
# Matplotlib inferring them from the Series name.
for name in ['Trump', 'Cruz', 'Rubio', 'Kasich', 'Carson', 'Bush', 'Undecided']:
    plt.plot(BushPolls1[name], label=name)

# Vertical marker at the date Bush suspended his campaign.
plt.axvline(candidates['date']['Bush'])

plt.title("GOP Candidate Polling a Week Before/After Bush Dropped", size=20)
plt.xlabel("Date of Poll", size=16)
plt.ylabel("Polling Percentage", size=16)

# y limits are a little greater than needed to display the legend without blocking out data
plt.ylim(0, 55)
plt.legend(fontsize=14)

## Write to File
Write the DataFrames PollingBeforeDrop and PollingAfterDrop to two separate files to be analyzed in 05-Analysis.ipynb.

In [None]:
# Persist both tables; the index (named 'name' above) is written as the first CSV column.
PollingBeforeDrop.to_csv('PollingBeforeDrop.csv')
PollingAfterDrop.to_csv('PollingAfterDrop.csv')