# Process Election Results

In [1]:
import os
# NOTE(review): presumably moves to the repository root so the relative 'data/...' paths
# and the `scripts` imports below resolve — confirm. This is not idempotent: re-running
# the cell climbs one more directory level, so restart the kernel before re-running.
os.chdir('..')

from tqdm.notebook import tqdm
import pandas as pd

from scripts.data_transformations import (
    list_candidates
    , most_recent_smd
    )

from scripts.common import (
    hash_dataframe
    , match_names
)

In [2]:
# DCBOE general election results files, keyed by election year
results_files = {
    2020: 'November_3_2020_General_Election_Certified_Results.csv',
    2022: 'November_8_2022_General_Election_Election_Night_Unofficial_Results.csv',
}

# Read every listed file once. The parsed frame is overwritten on each pass and not
# used afterwards, so this only confirms that each file exists and loads.
for f, filename in results_files.items():
    temp = pd.read_csv(f'data/dcboe/election_results/{filename}')

In [3]:
# Load the 2022 results and rename the columns used below to snake_case
column_names = {
    'Candidate': 'candidate_name',
    'ContestName': 'contest_name',
    'ContestNumber': 'contest_number',
    'Votes': 'votes',
}
df = (
    pd.read_csv(f'data/dcboe/election_results/{results_files[2022]}')
    .rename(columns=column_names)
)

In [4]:
# Topline race: Mayor
is_mayor_row = df['contest_name'] == 'MAYOR OF THE DISTRICT OF COLUMBIA DISTRICT OF COLUMBIA'
topline_results = df.loc[is_mayor_row].groupby('candidate_name')['votes'].sum()

# Append a grand total; it sums every row of the contest, including the
# OVER VOTES, UNDER VOTES and Write-in entries.
topline_results['total'] = topline_results.sum()
topline_results

candidate_name
DEM Muriel E. Bowser    145737
IND Rodney Red Grant     29181
LIB Dennis Sobin          2491
OVER VOTES                 235
REP Stacia R. Hall       11384
UNDER VOTES               7472
Write-in                  6509
total                   203009
Name: votes, dtype: int64

In [5]:
# Keep only the contests whose name contains 'SINGLE MEMBER DISTRICT', as an independent copy
is_smd_contest = df['contest_name'].str.contains('SINGLE MEMBER DISTRICT')
anc = df.loc[is_smd_contest].copy()

In [6]:
# Total votes across every row of the ANC contests
anc['votes'].sum()

203009

In [7]:
# No certified results yet

# cert = pd.read_csv('../data/dcboe/November_3_2020_General_Election_Certified_Results.csv')
# cert = cert.rename(columns={
#     'Candidate': 'candidate_name'
#     , 'ContestName': 'contest_name'
#     , 'ContestNumber': 'contest_number'
#     , 'Votes': 'votes'
# })

# cert_anc = cert[cert['contest_name'].str.contains('SINGLE MEMBER DISTRICT')].copy()

# cert_anc.votes.sum()

In [8]:
# Votes recorded per ANC contest
anc.groupby('contest_name')['votes'].sum()

contest_name
ANC - 1A01 SINGLE MEMBER DISTRICT  01-ANC 1A    584
ANC - 1A02 SINGLE MEMBER DISTRICT  02-ANC 1A    517
ANC - 1A03 SINGLE MEMBER DISTRICT  03-ANC 1A    372
ANC - 1A04 SINGLE MEMBER DISTRICT  04-ANC 1A    635
ANC - 1A05 SINGLE MEMBER DISTRICT  05-ANC 1A    420
                                               ... 
ANC - 8F01 SINGLE MEMBER DISTRICT 01-ANC-8F     567
ANC - 8F02 SINGLE MEMBER DISTRICT 02-ANC-8F     980
ANC - 8F03 SINGLE MEMBER DISTRICT 03-ANC-8F     716
ANC - 8F04 SINGLE MEMBER DISTRICT 04-ANC-8F     995
ANC - 8F05 SINGLE MEMBER DISTRICT 05-ANC-8F     749
Name: votes, Length: 345, dtype: int64

Did ANC vote counts change between Pre-Certified and Certified?

In [9]:
# todo
# sum(anc.groupby('contest_name').votes.sum() == cert_anc.groupby('contest_name').votes.sum())

In [10]:
# Build the smd_id key (for example 'smd_2022_1A01') from the contest name.
# expand=False makes str.extract return a Series for the single capture group,
# rather than the one-column DataFrame it returns by default.
smd_code = anc['contest_name'].str.extract('(?<=ANC - )(.*)(?=SINGLE MEMBER)', expand=False)
anc['smd_id'] = ('smd_2022_' + smd_code).str.strip()

In [11]:
# Check that smd_id parsed correctly: compare the parsed ids against the
# districts table rows whose redistricting_year is 2022, in both directions.
votes_by_anc = anc.groupby(['contest_name', 'smd_id'])['votes'].sum().reset_index()

districts = pd.read_csv('data/districts.csv')
districts = districts[districts['redistricting_year'] == 2022].copy()

votes_by_anc['valid'] = votes_by_anc['smd_id'].isin(districts['smd_id'])
districts['in_results'] = districts['smd_id'].isin(votes_by_anc['smd_id'])

# votes_by_anc.to_clipboard(index=False)

# There should be 345 valid districts
num_in_results = districts['in_results'].sum()
num_valid = votes_by_anc['valid'].sum()
num_in_results == num_valid == 345

True

In [12]:
# Vote totals per candidate within each SMD, leaving out the over/under vote rows
is_tally_row = anc['candidate_name'].isin(['OVER VOTES', 'UNDER VOTES'])
candidates_results = (
    anc.loc[~is_tally_row]
    .groupby(['smd_id', 'contest_number', 'candidate_name'])['votes']
    .sum()
    .reset_index()
)
# candidates

In [13]:
# Upper-case the name, then pass smd_id plus the upper-cased name to hash_dataframe
# to produce the dcboe_hash_id column
name_upper = candidates_results['candidate_name'].str.upper()
candidates_results['candidate_name_upper'] = name_upper
candidates_results['dcboe_hash_id'] = hash_dataframe(
    candidates_results,
    ['smd_id', 'candidate_name_upper'],
)
# candidates.loc[candidates.candidate_name == 'Write-in', 'dcboe_hash_id'] = pd.NA

In [14]:
# Rank candidates within each SMD by votes, highest first. method='first' gives tied
# rows distinct ranks in row order, so exactly one row per SMD gets rank 1.
votes_by_smd = candidates_results.groupby('smd_id')['votes']
candidates_results['ranking'] = votes_by_smd.rank(method='first', ascending=False)
candidates_results['winner'] = candidates_results['ranking'].eq(1)

In [15]:
# Flag SMDs where the combined 'Write-in' row is the top-ranked row
is_write_in = candidates_results['candidate_name'] == 'Write-in'
candidates_results['write_in_winner'] = candidates_results['winner'] & is_write_in

In [16]:
# Order rows by SMD, then by votes descending, so each SMD's top vote-getter comes first
candidates_results = candidates_results.sort_values(
    by=['smd_id', 'votes'],
    ascending=[True, False],
)

In [17]:
# Attach each SMD's total of the votes in candidates_results, then express every
# row's votes as a share of that total
total_votes = candidates_results.groupby('smd_id')['votes'].sum().rename('total_votes')
candidates_results = candidates_results.merge(total_votes, how='inner', on='smd_id')
candidates_results['vote_share'] = candidates_results['votes'].div(candidates_results['total_votes'])

In [18]:
# Create columns showing the number of votes the winner in each SMD received.
# This will be used to calculate the "margin of defeat" for all other candidates.
# The aggregations are named by string ('max'): passing the builtin max to agg is
# deprecated in recent pandas releases and emits a FutureWarning.

winning_votes = candidates_results.groupby('smd_id').agg(
    winning_votes=('votes', 'max')
    , winning_vote_percentage=('vote_share', 'max')
)

candidates_results = pd.merge(candidates_results, winning_votes, how='inner', on='smd_id')

In [19]:
# Put the next row's smd_id, votes and vote_share beside every row as *_shifted columns.
# For an SMD's first-place row these hold the second-place values. For the last row of
# an SMD they hold values from the following SMD, so compare smd_id with smd_id_shifted
# before using them.
shift_one = (
    candidates_results[['smd_id', 'votes', 'vote_share']]
    .shift(-1)
    .add_suffix('_shifted')
)

candidates_results = pd.concat([candidates_results, shift_one], axis=1)

In [20]:
# Calculate the margin of victory - positive for winners, negative for losers.
# Each column is built in one step as float64. (Starting from None columns and filling
# them through .loc leaves object-dtype columns that mix ints and floats.)

# A contested winner is a winning row whose next row belongs to the same SMD
contested_winners = (
    (candidates_results['smd_id'] == candidates_results['smd_id_shifted'])
    & candidates_results['winner']
)

# For contested winners, the margin of victory is their votes minus the second-place votes.
# For every other row, it is their votes minus the first-place votes: negative for losers,
# and zero for a winner that is the only row in its SMD.
candidates_results['margin_of_victory'] = (
    (candidates_results['votes'] - candidates_results['votes_shifted'])
    .where(contested_winners, candidates_results['votes'] - candidates_results['winning_votes'])
    .astype(float)
)

candidates_results['margin_of_victory_percentage'] = (
    (candidates_results['vote_share'] - candidates_results['vote_share_shifted'])
    .where(contested_winners, candidates_results['vote_share'] - candidates_results['winning_vote_percentage'])
    .astype(float)
)

In [21]:
# Number of candidate rows per SMD. All write-ins share a single 'Write-in' row, so they
# count as one candidate. num_candidates is not currently used by the frontend; it is
# here for humans checking the data.

num_candidates = (
    candidates_results.groupby('smd_id')['candidate_name']
    .count()
    .rename('num_candidates')
)
candidates_results = candidates_results.merge(num_candidates, how='inner', on='smd_id')

## Counts

In [22]:
# Winners with the smallest margin of victory (SMDs with more than one candidate row)
is_contested_winner = candidates_results['winner'] & (candidates_results['num_candidates'] > 1)
(
    candidates_results.loc[is_contested_winner]
    .sort_values(by='margin_of_victory_percentage')
    .head(10)
)

Unnamed: 0,smd_id,contest_number,candidate_name,votes,candidate_name_upper,dcboe_hash_id,ranking,winner,write_in_winner,total_votes,vote_share,winning_votes,winning_vote_percentage,smd_id_shifted,votes_shifted,vote_share_shifted,margin_of_victory,margin_of_victory_percentage,num_candidates
269,smd_2022_3F05,144,James Tandaric,253,JAMES TANDARIC,ed7b3f51826d49136959476509153714f23444c9ff6d91...,1.0,True,False,512,0.494141,253,0.494141,smd_2022_3F05,248.0,0.484375,5.0,0.009766,3
350,smd_2022_4E04,188,Carla Ferris,294,CARLA FERRIS,80845f8d7ce9bb471c8d9a6a6052efe3d0b4a1f7075910...,1.0,True,False,594,0.494949,294,0.494949,smd_2022_4E04,288.0,0.484848,6.0,0.010101,3
614,smd_2022_7F07,317,Shirley Thompson-Wright,174,SHIRLEY THOMPSON-WRIGHT,18826c65515fe7fd269c41df3b3b2f2f6615845703e91c...,1.0,True,False,463,0.37581,174,0.37581,smd_2022_7F07,168.0,0.362851,6.0,0.012959,4
559,smd_2022_7C05,289,Shirley A. Boykins,137,SHIRLEY A. BOYKINS,1004c264e49d4f500f9800edef7c8462df6ba610027caf...,1.0,True,False,391,0.350384,137,0.350384,smd_2022_7C05,131.0,0.335038,6.0,0.015345,4
681,smd_2022_8D06,346,"Wendy ""Hope Dealer"" Hamilton",209,"WENDY ""HOPE DEALER"" HAMILTON",6331cdd90969e1238f6f2e60a13b21cbbf71a32ed628b4...,1.0,True,False,420,0.497619,209,0.497619,smd_2022_8D06,202.0,0.480952,7.0,0.016667,3
276,smd_2022_4A01,153,Paula Y. Edwards,507,PAULA Y. EDWARDS,cb65a96724ad7b0e156c83f95df6e97f8542b83cd5a6d1...,1.0,True,False,999,0.507508,507,0.507508,smd_2022_4A01,476.0,0.476476,31.0,0.031031,3
225,smd_2022_3C07,123,Gawain Kripke,344,GAWAIN KRIPKE,80eacd334c0a3c011ec57fae4ad46f4a50df16f3de1b9e...,1.0,True,False,667,0.515742,344,0.515742,smd_2022_3C07,311.0,0.466267,33.0,0.049475,3
416,smd_2022_5D06,219,Kathy Henderson,304,KATHY HENDERSON,9c114f4bf9e8b4dae95428034e012714ce70f41a1bf780...,1.0,True,False,582,0.522337,304,0.522337,smd_2022_5D06,274.0,0.47079,30.0,0.051546,3
219,smd_2022_3C05,121,Sauleh Ahmad Siddiqui,462,SAULEH AHMAD SIDDIQUI,258c83f1bb1a68cb87d470cb7d41c13004a4005fc764a9...,1.0,True,False,883,0.523216,462,0.523216,smd_2022_3C05,413.0,0.467724,49.0,0.055493,3
239,smd_2022_3D05,129,Bernie Horn,224,BERNIE HORN,259254ee26a2a1d6be942b9c0880dd29bf14decf8ca870...,1.0,True,False,427,0.52459,224,0.52459,smd_2022_3D05,198.0,0.4637,26.0,0.06089,3


In [23]:
# Number of SMDs in which the combined 'Write-in' row ranked first
candidates_results['write_in_winner'].sum()

60

In [24]:
# candidates_results[candidates_results['write_in_winner']] #  & (candidates_results['num_candidates'] > 1)]

In [25]:
# Most votes
# candidates_results.sort_values(by='votes', ascending=False).head(10)

In [26]:
# candidates_results[candidates_results.smd_id == 'smd_2022_3F05']

## Match to OpenANC

Match the candidates in the DCBOE results to the OpenANC candidates table by joining on `dcboe_hash_id`, then count how many were matched.

In [27]:
# Candidate table for the 2022 election, loaded by the project helper.
# NOTE(review): later cells read dcboe_hash_id and candidate_id from this table — confirm the helper provides both.
candidates_oa = list_candidates(election_year=2022)

In [28]:
# Left join keeps every results row; rows without a matching hash get missing values
merged = candidates_results.merge(candidates_oa, how='left', on='dcboe_hash_id')

In [29]:
# Number of results rows that found a candidate_id through the hash join
merged['candidate_id'].notna().sum()

362

## District-wide stats

In [30]:
# All ANC contest votes, including the OVER VOTES and UNDER VOTES rows
anc['votes'].sum()

203009

In [31]:
# ANC contest votes after the OVER VOTES and UNDER VOTES rows were dropped
candidates_results['votes'].sum()

151456

In [32]:
# Label every row as OVER VOTES, UNDER VOTES, Write-in, or a named ballot candidate
special_names = ['OVER VOTES', 'UNDER VOTES', 'Write-in']
anc['candidate_type'] = anc['candidate_name'].where(
    anc['candidate_name'].isin(special_names),
    'Ballot Candidate',
)

In [33]:
# Votes by row type across all ANC contests
anc.groupby('candidate_type')['votes'].sum()

candidate_type
Ballot Candidate    140825
OVER VOTES             149
UNDER VOTES          51404
Write-in             10631
Name: votes, dtype: int64

## Match to OpenANC people by name

In [34]:
# Load the people table and pass it through the project's most_recent_smd helper
people = most_recent_smd(pd.read_csv('data/people.csv'))

In [35]:
def match_to_openanc(df):
    """
    Attach the best OpenANC person match to every named candidate row.

    For each row whose candidate_name is not 'Write-in', match_names is called with the
    candidate's name and the full_name / person_id columns of the module-level `people`
    table. The id and score it returns are stored in match_person_id and match_score,
    and that person's full_name and most_recent_smd_id are stored in match_full_name and
    match_smd_id. 'Write-in' rows are left without match values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a candidate_name column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of df with the four match_* columns added.
    """
    match_columns = ['match_score', 'match_person_id', 'match_full_name', 'match_smd_id']

    # Work on a copy so the caller's frame is left untouched
    matched = df.copy()

    # Index people by person_id once instead of filtering the whole table for every row.
    # Keeping the first row per id mirrors taking .iloc[0] of a filtered frame.
    people_by_id = (
        people
        .dropna(subset=['person_id'])
        .drop_duplicates(subset='person_id')
        .set_index('person_id')
    )

    for idx, row in tqdm(df.iterrows(), total=len(df)):

        if row['candidate_name'] == 'Write-in':
            continue

        best_id, best_score = match_names(row['candidate_name'], people['full_name'], people['person_id'])

        matched.loc[idx, 'match_score'] = best_score
        matched.loc[idx, 'match_person_id'] = best_id

        # Only look up person details when the returned id exists in the people table
        if best_id in people_by_id.index:
            matched.loc[idx, 'match_full_name'] = people_by_id.at[best_id, 'full_name']
            matched.loc[idx, 'match_smd_id'] = people_by_id.at[best_id, 'most_recent_smd_id']

    # Make sure every output column exists, even when no row was matched
    for column in match_columns:
        if column not in matched.columns:
            matched[column] = float('nan')

    return matched

In [36]:
# Add the match_* columns for every non-write-in row; this loops row by row and shows a progress bar
candidates_results = match_to_openanc(candidates_results)

  0%|          | 0/720 [00:00<?, ?it/s]

In [37]:
# candidates_results.to_clipboard()

In [38]:
# Inspect the results table, now including the match_* columns
candidates_results

Unnamed: 0,smd_id,contest_number,candidate_name,votes,candidate_name_upper,dcboe_hash_id,ranking,winner,write_in_winner,total_votes,...,smd_id_shifted,votes_shifted,vote_share_shifted,margin_of_victory,margin_of_victory_percentage,num_candidates,match_score,match_person_id,match_full_name,match_smd_id
0,smd_2022_1A01,18,Max Ewart,472,MAX EWART,5562e6cb282ae08f0a57700a9f21b2ac6b13b6082f4088...,1.0,True,False,479,...,smd_2022_1A01,7.0,0.014614,465.0,0.970772,2,100.0,10460.0,Max Ewart,smd_2022_1A01
1,smd_2022_1A01,18,Write-in,7,WRITE-IN,3fe498bd427542a3d129749bd0ad4409c7bf704b911158...,2.0,False,False,479,...,smd_2022_1A02,405.0,0.975904,-465,-0.970772,2,,,,
2,smd_2022_1A02,19,Dieter Lehmann Morales,405,DIETER LEHMANN MORALES,dcca894eb6fbad029f0602aea05dba22af14b0866045b7...,1.0,True,False,415,...,smd_2022_1A02,10.0,0.024096,395.0,0.951807,2,100.0,10285.0,Dieter Lehmann Morales,smd_2022_1A02
3,smd_2022_1A02,19,Write-in,10,WRITE-IN,d150179dc346bc3fda316b31c540fba347400a0d2f5b13...,2.0,False,False,415,...,smd_2022_1A03,288.0,0.972973,-395,-0.951807,2,,,,
4,smd_2022_1A03,20,Carlo Perri,288,CARLO PERRI,f9e78f92331964dbb13882c00022c44b62a83a73673a07...,1.0,True,False,296,...,smd_2022_1A03,8.0,0.027027,280.0,0.945946,2,100.0,11345.0,Carlo Perri,smd_2022_1A03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,smd_2022_8F03,360,Write-in,81,WRITE-IN,7dca68f21c558742e97a5dc5fabcb20de3a7379321e6d5...,2.0,False,False,577,...,smd_2022_8F04,542.0,0.723632,-415,-0.719237,2,,,,
716,smd_2022_8F04,361,Edward Daniels,542,EDWARD DANIELS,03e06e6cea8a0a69c48caf8221946ddbb8b13a6ca8fe28...,1.0,True,False,749,...,smd_2022_8F04,183.0,0.244326,359.0,0.479306,3,100.0,10210.0,Edward Daniels,smd_2022_8F04
717,smd_2022_8F04,361,Jesse Kamzol,183,JESSE KAMZOL,d4969a7e92a7e32a1d60a24a70ca3d8f7542cda8d5a57b...,2.0,False,False,749,...,smd_2022_8F04,24.0,0.032043,-359,-0.479306,3,100.0,11544.0,Jesse Kamzol,smd_2022_8F04
718,smd_2022_8F04,361,Write-in,24,WRITE-IN,59f529f8ecfc06f4da27c18ed88e48425ec28df71f7e1a...,3.0,False,False,749,...,smd_2022_8F05,129.0,1.000000,-518,-0.691589,3,,,,


## Save output

In [39]:
# NOTE(review): every non-write-in row takes its best match's person_id regardless of
# match_score — confirm that low-scoring matches are acceptable in the saved file.
candidates_results['person_id'] = candidates_results['match_person_id']

# Columns written to the output file, in this order
output_columns = [
    'person_id',
    'dcboe_hash_id',
    'smd_id',
    'candidate_name',
    'votes',
    'vote_share',
    'ranking',
    'winner',
    'write_in_winner',
    'margin_of_victory',
    'margin_of_victory_percentage',
    'num_candidates',
    'total_votes',
]

candidates_results[output_columns].to_csv('data/dcboe/candidate_votes_2022.csv', index=False)

In [40]:
# candidates_results.to_csv('candidates_results.csv', index=False)