# Imports, Functions & Testing

In [1]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

In [2]:
def gen_election_cleaner(df, verbose=1):
    """Tidy a single scraped election-results table.

    Operates on a copy, so the frame passed in is left as it was. The layout
    is hard-coded: the table must have exactly five columns, rows labelled
    0, 6 and 8 are discarded, and row 7 is removed at the end. Any row whose
    first column is non-null is turned into a new column named after that
    first-column value and filled with the row's second-column value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw five-column table with row labels 0 through 8 present.
    verbose : int, default 1
        0 shows nothing, 1 shows the starting and final frames, 2 also
        shows the frame after each intermediate step.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    show_ends = verbose in [1, 2]
    show_steps = verbose == 2

    def _report(title, frame):
        ## Shared Q.C. printout: title, divider, then the frame itself
        print(title)
        print('---'*20)
        display(frame)

    cleaned = df.copy()
    if show_ends:
        _report('O.G. Copy:', cleaned)

    ## Manual column headers
    cleaned.columns = ['Null', 'Party', 'Candidate', 'Votes', '%of_Vote']
    if show_steps:
        _report('\nNew Column Headers:', cleaned)

    ## Rows that carry no candidate information
    cleaned = cleaned.drop(index=[0, 6, 8])
    if show_steps:
        _report('\nDropping Rows:', cleaned)

    ## Pivot labelled rows (e.g. turnout) into their own constant column
    labels = cleaned['Null']
    for row_label, label in labels[labels.notna()].items():
        cleaned[label] = cleaned['Party'][row_label]

    ## The labelled row and the label column are no longer needed
    cleaned = cleaned.drop(columns='Null', index=7)
    if show_ends:
        _report('\nFinal Version:', cleaned)

    return cleaned

In [3]:
def wiki_senate_scraper():
    """Scrape the per-year U.S. Senate election pages linked from Wikipedia.

    Fetches the "List of United States Senate elections" page, takes the
    anchors at positions 78-133 of that page (a hand-located slice), and for
    each one downloads the linked page, parses its wikitable/infobox tables
    into DataFrames and collects the table-of-contents anchors that name a
    state (filtered with ``is_state``).

    Prints the starting response, a checkpoint at loops 25 and 50, and the
    total number of pages visited.

    Returns
    -------
    tuple of (dict, dict)
        ``yr_dfs`` maps each link's text to the list of DataFrames parsed
        from that page; ``yr_tocs`` maps the same keys to a list of TOC
        anchor names (href with '#' removed). A page with no ``div#toc``
        gets an empty list.

    Raises
    ------
    ValueError
        Propagated from ``pandas.read_html`` when a page yields no tables.
    """
    from io import StringIO

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    ## Starting page + Q.C. of response
    start_url = 'https://en.wikipedia.org/wiki/List_of_United_States_Senate_elections'
    start_resp = requests.get(start_url)
    print(f'Starting Response: {start_resp}')

    ## Creating soup + pulling all links for senate elections after 17th amendment
    start_soup = BeautifulSoup(start_resp.text, 'html.parser')
    start_links = start_soup.find_all('a')
    ## NOTE(review): positional slice located by hand; it will drift if the
    ## list page's link order changes.
    start_sen_links = start_links[78:134]

    ## Base of url for all senate election pages
    base_url = 'https://en.wikipedia.org'

    ## Loop same process for all links + storage
    yr_dfs = {}
    yr_tocs = {}
    count = 0
    for link in start_sen_links:

        ## Q.C during execute (report the actual loop number)
        count += 1
        if count in [25, 50]:
            print(f'Checkpoint! ({count} loops)')

        ## Collecting strings for use
        end_url = link.get('href')
        year = link.get_text()
        full_url = base_url + end_url

        ## Making soup + collecting all tables
        link_resp = requests.get(full_url)
        link_soup = BeautifulSoup(link_resp.text, 'html.parser')
        link_tables = link_soup.find_all(
            'table', attrs={'class': ['wikitable', 'infobox vevent', 'infobox']}
        )

        ## Collecting list of states with elections in each year.
        ## find() returns None when the page has no div#toc; fall back to an
        ## empty list instead of failing on None.
        toc_div = link_soup.find('div', attrs={'id': 'toc'})
        if toc_div is None:
            toc_anchors = []
        else:
            toc_anchors = toc_div.find_all('a', href=is_state)
        toc_list = [tag.get('href').replace('#', '') for tag in toc_anchors]

        ## Converting to dataframe + storage. The HTML is wrapped in StringIO
        ## because passing literal HTML text to read_html is deprecated.
        elect_df = pd.read_html(StringIO(str(link_tables)))
        yr_dfs[year] = elect_df
        yr_tocs[year] = toc_list

    print(f'Total pages scraped: {count}')
    return yr_dfs, yr_tocs

## https://www.crummy.com/software/BeautifulSoup/bs4/doc/#kinds-of-filters

In [4]:
def is_state(href):
    """Attribute filter for BeautifulSoup: keep hrefs that mention a U.S. state.

    Intended to be passed as ``href=is_state`` to ``find_all``. BeautifulSoup
    hands the filter the attribute value, which is ``None`` for tags that
    have no ``href``, so missing/empty values are rejected up front.

    Parameters
    ----------
    href : str or None
        The ``href`` attribute value of a tag.

    Returns
    -------
    str or None
        ``href`` itself (truthy) when it contains one of the 50 state names
        (multi-word names spelled with underscores), otherwise ``None``.
    """
    states_list = ('Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
                   'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
                   'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
                   'Kentucky', 'Louisiana', 'Maine', 'Maryland',
                   'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
                   'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New_Hampshire',
                   'New_Jersey', 'New_Mexico', 'New_York', 'North_Carolina',
                   'North_Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
                   'Rhode_Island', 'South_Carolina', 'South_Dakota',
                   'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
                   'Washington', 'West_Virginia', 'Wisconsin', 'Wyoming')

    ## Tags without an href reach this filter as None
    if not href:
        return None

    if any(state in href for state in states_list):
        return href
    return None

# Initial Data

> [Link to Dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/PEJ5QU)

In [5]:
## Load the 1976-2018 senate returns CSV (decoded as latin-1) and preview
## the first five rows.
df = pd.read_csv('dataverse_files/1976-2018-senate.csv', encoding='latin-1')
df.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Sam Steiger,republican,False,total,321236,741210,False,20171011.0
1,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Wm. Mathews Feighan,independent,False,total,1565,741210,False,20171011.0
2,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Dennis DeConcini,democrat,False,total,400334,741210,False,20171011.0
3,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Allan Norwitz,libertarian,False,total,7310,741210,False,20171011.0
4,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Bob Field,independent,False,total,10765,741210,False,20171011.0


In [6]:
## Column dtypes and non-null counts (shows which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421 entries, 0 to 3420
Data columns (total 18 columns):
year              3421 non-null int64
state             3421 non-null object
state_po          3421 non-null object
state_fips        3421 non-null int64
state_cen         3421 non-null int64
state_ic          3421 non-null int64
office            3421 non-null object
district          3421 non-null object
stage             3421 non-null object
special           3421 non-null bool
candidate         3011 non-null object
party             2858 non-null object
writein           3421 non-null bool
mode              3421 non-null object
candidatevotes    3421 non-null int64
totalvotes        3421 non-null int64
unofficial        3421 non-null bool
version           3420 non-null float64
dtypes: bool(3), float64(1), int64(6), object(8)
memory usage: 411.0+ KB


In [7]:
## Spot check: rows for a single contest (state_po 'AZ', year 1976)
df[(df['state_po'] == 'AZ') & (df['year'] == 1976)]

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Sam Steiger,republican,False,total,321236,741210,False,20171011.0
1,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Wm. Mathews Feighan,independent,False,total,1565,741210,False,20171011.0
2,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Dennis DeConcini,democrat,False,total,400334,741210,False,20171011.0
3,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Allan Norwitz,libertarian,False,total,7310,741210,False,20171011.0
4,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,False,Bob Field,independent,False,total,10765,741210,False,20171011.0


In [8]:
# len(df[df['candidate'].isna()])

# df['writein'].value_counts()

# for col in df:
#     display(df[col].value_counts())

# pd.Series.value_counts()

# Additional Data

Current set of features includes:
* Year of election
* State (encoded)
* **TODO:** list the remaining features

> [Link to Brookings](https://www.brookings.edu/multi-chapter-report/vital-statistics-on-congress/)

> [Link to BallotPedia](https://ballotpedia.org/Legislative_Branch)

> [Link to WikiPedia](https://en.wikipedia.org/wiki/List_of_United_States_Senate_elections)

> [Link to Wiki-Category (U.S. Senator)](https://commons.wikimedia.org/wiki/Category:Senators_of_the_United_States)

## Testing tables 'infobox vevent'

In [9]:
# url3 = 'https://en.wikipedia.org/wiki/1978_United_States_Senate_elections'
# response3 = requests.get(url3)
# print(response3)

# soup3 = BeautifulSoup(response3.text, 'html.parser')
# wiki_tables3 = soup3.findAll('table', attrs={'class': ['wikitable', 'infobox vevent']})

# print(type(wiki_tables3))

# test_df3 = pd.read_html(str(wiki_tables3))

# tester = test_df3[0]

In [10]:
# https://stackoverflow.com/questions/27156278/index-pandas-dataframe-by-column-numbers-when-column-names-are-integers
# tester.columns = ['a', 'b', 'c']
# display(tester.iloc[6:18,0:3])

# display(test_df3[2])

In [11]:
# url = 'https://en.wikipedia.org/wiki/1976_United_States_Senate_elections'
# response = requests.get(url)
# print(response)

# soup = BeautifulSoup(response.text, 'html.parser')
# wiki_tables = soup.findAll('table', attrs={'class': 'wikitable'})

# type(wiki_tables)

In [12]:
# test_df = pd.read_html(str(wiki_tables))
# display(test_df[0])

# len(test_df)

## Pulling state election tables

In [13]:
# ## Pulling Arizona gen election results
# tester = test_df[5].copy()
# print('O.G. Copy:')
# print('---'*20)
# display(tester)

# ## Setting columns manually + Q.C.
# tester.columns = ['Null', 'Party', 'Candidate', 'Votes', '%of_Vote']
# print('\nNew Column Headers:')
# print('---'*20)
# display(tester)

# ## Dropping irrelevant information + Q.C.
# tester.drop(index=[0, 6, 8], inplace=True)
# print('\nDropping Rows:')
# print('---'*20)
# display(tester)

# ## Flip and place Turnout as its own feature + Q.C.
# for idx, v, k in zip(tester.index, tester.Null, tester.Null.isna()):
#     if not k:
#         #print(idx, v)
#         tester[v] = tester.Party[idx]

# ## Removing the row with turnout info/column with nulls + Q.C.
# try:
#     tester.drop(columns='Null', index=7, inplace=True)
#     print('\nFinal Version:')
#     print('---'*20)
#     display(tester)
# except:
#     print('Something went wrong!')

In [14]:
# type(tester['Votes'][5]) # str

In [15]:
# tester2 = gen_election_cleaner(test_df[6], verbose=2)

In [16]:
# test_df[-1]

In [17]:
# url2 = 'https://en.wikipedia.org/wiki/List_of_United_States_Senate_elections'
# response2 = requests.get(url2)
# print(response2)

In [18]:
# url2 = 'https://en.wikipedia.org/wiki/2018_United_States_Senate_elections'
# response2 = requests.get(url2)
# print(response2)

In [19]:
# soup2 = BeautifulSoup(response2.text, 'html.parser')
# wiki_toc = soup2.find('div', attrs={'id':'toc'})
# wiki_toc = wiki_toc.findAll('a', href=is_state)
# display(wiki_toc[0:5])
# jiji = wiki_toc[0:5]

# jiji = [tag.get('href').replace('#', '') for tag in jiji]
# jiji = jiji
# jiji

## Working

In [20]:
# test = wiki_links[78:134]
# beeb = test[0].get_text()
# geeb = 'https://en.wikipedia.org'
# geeb + beeb

In [21]:
## Run the scraper (network-bound: one request per election page).
## Returns two dicts keyed by the link text of each election page:
## parsed tables and TOC state anchors.
## NOTE(review): the tables dict is bound to `elect_yr_tables` here, while
## some later cells refer to it as `elect_yr_dfs`.
elect_yr_tables, elect_yr_tocs = wiki_senate_scraper()

Starting Response: <Response [200]>
Checkpoint! (25 loops)
Checkpoint! (25 loops)
Total pages scraped: 56


In [None]:
# display(elect_yr_dfs.keys())
# display(elect_yr_tocs.keys())
# display(len(elect_yr_dfs) == len(elect_yr_tocs))
# elect_yr_dfs.keys() != elect_yr_tocs.keys()
# list(elect_yr_dfs.keys())

In [None]:
# elect_yr_dfs['1918']

In [None]:
# elect_yr_tocs['1918']

In [107]:
# year = '1924'

# for i, df in enumerate(elect_yr_dfs[year]):
#     print(i, df.shape)

# display(elect_yr_dfs[year][10])
# elect_yr_dfs[year][10].shape[1] is 6

0 (46, 53)
1 (1, 3)
2 (18, 3)
3 (1, 2)
4 (11, 10)
5 (11, 10)
6 (11, 10)
7 (3, 2)
8 (6, 6)
9 (34, 6)
10 (6, 5)
11 (6, 5)
12 (8, 5)
13 (9, 5)
14 (7, 5)
15 (6, 5)
16 (3, 5)
17 (7, 5)
18 (11, 5)
19 (9, 5)
20 (8, 5)
21 (6, 5)
22 (3, 5)
23 (6, 5)
24 (8, 5)
25 (10, 5)
26 (10, 5)
27 (9, 5)
28 (3, 5)
29 (17, 17)
30 (5, 4)
31 (1, 2)
32 (9, 5)
33 (6, 5)
34 (6, 5)
35 (11, 5)
36 (6, 5)
37 (6, 5)
38 (7, 5)
39 (8, 5)
40 (8, 5)
41 (9, 5)
42 (3, 5)
43 (11, 5)
44 (7, 5)
45 (6, 5)
46 (7, 5)
47 (7, 5)
48 (8, 5)


Unnamed: 0,0,1,2,3,4
0,Party,Candidate,Votes,%,
1,,Democratic,James Thomas Heflin (incumbent),154560,79.52%
2,,Republican,Frank H. Lathrop,39818,20.48%
3,Majority,114742,59.04%,,
4,Turnout,194378,,,
5,,Democratic hold,,,


False

In [92]:
## Small sample of election years for prototyping election_collector.
## The scraped tables are bound to `elect_yr_tables` by the cell that calls
## wiki_senate_scraper(); `elect_yr_dfs` is never assigned in this notebook,
## so reading it fails on a fresh kernel.
sample_years = ['1918', '1924', '1976', '2016', '2018']
koko = {v: elect_yr_tables[v] for v in sample_years}
lplp = {v: elect_yr_tocs[v] for v in sample_years}

* Create a table with candidate names as the index and state as the values, then use `.loc[candidate_name]` to look up a candidate's state.

In [119]:
## Group each sample year's tables with election_collector.
## NOTE: election_collector is defined in a later cell of this notebook;
## that cell has to be executed before this one.
ftft = election_collector(koko, lplp)

In [126]:
## Inspect one year of the collected results. Each value of `ftft` is the
## three-element list built by election_collector:
## [leaders table, summary tables, state tables].
year = '2016'

## Size of each of the three parts
for v in ftft[year]:
    print(len(v))
    
## Element 1: the summary table(s)
ftft[year][1]

23
1
104


[                                  0                   1                  2  \
 0   State(linked to sections below)           Incumbent            Results   
 1                           Senator               Party  Electoral history   
 2                           Alabama      Richard Shelby         Republican   
 3                            Alaska      Lisa Murkowski         Republican   
 4                           Arizona         John McCain         Republican   
 5                          Arkansas        John Boozman         Republican   
 6                        California       Barbara Boxer         Democratic   
 7                          Colorado      Michael Bennet         Democratic   
 8                       Connecticut  Richard Blumenthal         Democratic   
 9                           Florida         Marco Rubio         Republican   
 10                          Georgia      Johnny Isakson         Republican   
 11                           Hawaii        Brian Sc

In [127]:
## Element 2: the list of 4- and 5-column tables kept as state tables
ftft[year][2]

[              0               1             2   3
 0           NaN             NaN           NaN NaN
 1       Nominee  Richard Shelby  Ron Crumpton NaN
 2         Party      Republican    Democratic NaN
 3  Popular vote         1335104        748709 NaN
 4    Percentage           64.0%         35.9% NaN,
              0           1                   2       3       4
 0        Party   Candidate               Votes       %     NaN
 1          NaN  Republican      Richard Shelby  505586  64.91%
 2          NaN  Republican  Jonathan McConnell  214770  27.58%
 3          NaN  Republican         John Martin   23558   3.02%
 4          NaN  Republican       Marcus Bowman   19707   2.53%
 5          NaN  Republican     Shadrack McGill   15230   1.96%
 6  Total votes      778851             100.00%     NaN     NaN,
              0           1             2       3       4
 0        Party   Candidate         Votes       %     NaN
 1          NaN  Democratic  Ron Crumpton  145681  55.97%
 2    

In [118]:
def election_collector(dict_tables, dict_lists):
    """Group each year's scraped tables into leaders / summary / state tables.

    For every year, the table at position 2 is taken as the senate-leaders
    table and the remaining tables (position 3 onward) are scanned by column
    count:

    * the first 6-column table is the summary; if the table right after it
      also has 6 columns, both are kept as the summary;
    * every table with 4 or 5 columns is kept as a state table;
    * anything else (including later 6-column tables) is ignored.

    Parameters
    ----------
    dict_tables : dict
        Year -> list of DataFrames (at least three per year).
    dict_lists : dict
        Year -> list of state anchors. Only its keys are used here; they
        must match the keys of ``dict_tables``.

    Returns
    -------
    dict or str
        Year -> ``[leaders_table, summary_tables, state_tables]``. When the
        two dictionaries have different keys a warning is printed and the
        sentinel string ``'***'*10`` is returned instead.

    Raises
    ------
    IndexError
        If a year has fewer than three tables.
    """
    ## Requirement of proper dictionaries
    if dict_tables.keys() != dict_lists.keys():
        print('**WARNING**')
        print('Keys do not match in dictionaries passed. Adjust and try again.')
        return '***'*10

    ## Container for results
    coll_elects = {}

    ## Looping + storage
    for year in dict_lists:
        yr_tables = dict_tables[year]

        ## Creating containers to further separate data
        yr_sum_ldrs = yr_tables[2]
        most_tables = yr_tables[3:]
        yr_summary = []
        yr_states = []

        for i, table in enumerate(most_tables):
            n_cols = table.shape[1]

            if n_cols == 6:
                ## Only the first 6-column table (plus an adjacent second
                ## one) counts as the summary; later ones are skipped.
                if not yr_summary:
                    yr_summary.append(table)
                    ## Bounds check: the summary may be the very last table
                    has_next = i + 1 < len(most_tables)
                    if has_next and most_tables[i+1].shape[1] == 6:
                        yr_summary.append(most_tables[i+1])
            elif n_cols in (4, 5):
                yr_states.append(table)

        coll_elects[year] = [yr_sum_ldrs, yr_summary, yr_states]

    return coll_elects

In [85]:
## Prototype of the grouping used in election_collector, on plain ints:
## the first 4 (together with a 4 directly after it) goes to `lolo`,
## every remaining 4 goes to `rere`.
jiji = [2,5,4,4,3,4,64,4,26,4,4,4, 5, 4,111]
lolo = []
rere = []
yuyu = 0            ## becomes 1 once the first hit has been recorded
skip_next = False   ## set when the following element was consumed as a pair
for i, k in enumerate(jiji):
    if skip_next:
        skip_next = False
        continue

    if yuyu < 1:
        ## `and`, not `&`: `&` binds tighter than `==`, so the bitwise form
        ## compared k against (4 & jiji[i+1]). The bounds check covers a 4
        ## in the final position.
        if k == 4 and i + 1 < len(jiji) and jiji[i+1] == 4:
            yuyu += 1
            lolo.append(k)
            lolo.append(jiji[i+1])
            ## Skip the partner instead of popping from the list being
            ## iterated (popping also skipped the element after the pair).
            skip_next = True
            continue
        elif k == 4:
            yuyu += 1
            lolo.append(k)

    if k == 4:
        rere.append(k)

lolo, rere

([4, 4], [4, 4, 4, 4, 4, 4])

In [86]:
## Everything after the first collected element
rere[1:]

[4, 4, 4, 4, 4]

* 1924 = 10
* 1926 = 12
* 1928 = 10
* Pull only tables with `shape[1]` of 5 or 6.
* Table 2 is always summary of senate leaders

Why start after the 17th Amendment? It moved Senate elections from state legislatures to a direct popular vote. [Link](https://en.wikipedia.org/wiki/Seventeenth_Amendment_to_the_United_States_Constitution)

From 1920 onward, proceed as with 1976. For earlier years, the first table with general information will need to be used instead.

* **Clean up the NaN-valued table at the top of each page** -- use index 2 to grab the nested table from `'infobox vevent'`.
* Take the list of states from the top tables and map it to the DataFrames.
* Get demographic info from Wikidata.
* EDA: time series of party by state (heatmap).
* NLP on candidate names, etc.
* [link for slider viz](https://medium.com/@sjacks/the-journey-to-an-attractive-visualization-bac019506a49)