In [19]:
import re

import pandas as pd
import pyspark
import requests as rqst
from bs4 import BeautifulSoup as bs
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Spark configuration: run locally using all available cores ('local[*]').
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('election-data')
)

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# The session is built on top of the context's effective configuration.
spark = (
    SparkSession.builder
    .appName('election-data')
    .config(conf=sc.getConf())
    .getOrCreate()
)

24/03/06 10:17:00 WARN Utils: Your hostname, Chases-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.9 instead (on interface en0)
24/03/06 10:17:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/06 10:17:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
class SoupScraper:
    """Scrape the header and body cells of an HTML table from a web page.

    After :meth:`scrape` has run:

    * ``table_head`` is a flat list with the text of every ``<th>`` found in
      the rows of the page's first ``<thead>``.
    * ``table_body`` is a list of rows, each row being the list of ``<td>``
      texts of one ``<tr>`` in the page's first ``<tbody>``.
    """

    def __init__(self, html):
        """Remember the page address; nothing is downloaded here.

        Args:
            html: URL of the page to scrape (it is passed to ``requests.get``).
        """
        self.html = html
        self.table_head = []
        self.table_body = []

    def scrape(self, html=None):
        """Download the page and fill ``table_head`` and ``table_body``.

        Args:
            html: URL to fetch. Defaults to the URL given to the constructor,
                so ``scraper.scrape()`` and ``scraper.scrape(scraper.html)``
                are equivalent.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.Timeout: if the server does not answer within 30 seconds.
        """
        url = self.html if html is None else html

        # Without a timeout requests may wait forever; without the status
        # check an error page would be parsed as if it were the data.
        elections_html = rqst.get(url, timeout=30)
        elections_html.raise_for_status()
        soup = bs(elections_html.content, "html.parser")

        # Remove superscripts from html doc so their text does not end up
        # glued to the cell values.
        for tag in soup.find_all('sup'):
            tag.decompose()

        # Get the headers
        self.table_head = [
            header_value.get_text(strip=True)
            for header in soup.thead.find_all('tr')
            for header_value in header.find_all('th')
        ]

        # Get the table values
        table_values = []
        for row in soup.tbody.find_all('tr'):
            td_tags = row.find_all('td')
            if not td_tags:
                # A row without <td> cells carries no data. Previously such a
                # row re-appended the values of the row before it (or raised
                # NameError when it was the first row).
                continue
            table_values.append([td.get_text(strip=True) for td in td_tags])
        self.table_body = table_values

In [30]:
# Create the scraper for the Britannica presidential-election results page.
# The constructor only stores the URL; nothing is fetched until scrape() runs.
elections_data = SoupScraper("https://www.britannica.com/topic/United-States-Presidential-Election-Results-1788863")

In [34]:
# Download and parse the page; this fills elections_data.table_head and
# elections_data.table_body.
elections_data.scrape(elections_data.html)

In [37]:
def is_year(input_string):
    """Return True when ``input_string`` consists of exactly four digits.

    ``re.fullmatch`` is used instead of ``re.match`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which made a value
    such as ``'1789\\n'`` count as a year.

    Args:
        input_string: the text to test (must be a ``str``).

    Returns:
        bool: True for values like ``'1789'``, False otherwise.
    """
    return re.fullmatch(r'\d{4}', input_string) is not None


def insert_years(table):
    """Prepend the year to every row that does not start with one.

    Only the first row of each group carries the year; the rows that follow
    start directly with the candidate name. The most recently seen year is
    carried forward and inserted at position 0 of those rows.

    The rows are modified in place and the same ``table`` object is returned.
    Calling the function again on an already completed table changes nothing.

    Rows that cannot be completed are left untouched:

    * empty rows (previously an ``IndexError``), and
    * rows that appear before the first year row (previously the row's own
      first cell was duplicated into the year position).

    Args:
        table: list of rows, each row a list of strings.

    Returns:
        The same ``table`` list, after modification.
    """
    current_year = None
    for row in table:
        if not row:
            continue
        if is_year(row[0]):
            current_year = row[0]
        elif current_year is not None:
            row.insert(0, current_year)
    return table

In [39]:
# Format the election data: fill in the missing year on the candidate rows.
# insert_years() edits the row lists in place, so elections_data.table_body
# itself is updated; the returned table is shown as the cell output.
insert_years(elections_data.table_body)

[['1789', 'George Washington', 'no formally organized parties', '69', '', ''],
 ['1789', 'John Adams', '', '34', '', ''],
 ['1789', 'John Jay', '', '9', '', ''],
 ['1789', 'R.H. Harrison', '', '6', '', ''],
 ['1789', 'John Rutledge', '', '6', '', ''],
 ['1789', 'John Hancock', '', '4', '', ''],
 ['1789', 'George Clinton', '', '3', '', ''],
 ['1789', 'Samuel Huntington', '', '2', '', ''],
 ['1789', 'John Milton', '', '2', '', ''],
 ['1789', 'James Armstrong', '', '1', '', ''],
 ['1789', 'Benjamin Lincoln', '', '1', '', ''],
 ['1789', 'Edward Telfair', '', '1', '', ''],
 ['1789', 'not voted', '', '44', '', ''],
 ['1792', 'George Washington', 'Federalist', '132', '', ''],
 ['1792', 'John Adams', 'Federalist', '77', '', ''],
 ['1792', 'George Clinton', 'Democratic-Republican', '50', '', ''],
 ['1792', 'Thomas Jefferson', '', '4', '', ''],
 ['1792', 'Aaron Burr', '', '1', '', ''],
 ['1796', 'John Adams', 'Federalist', '71', '', ''],
 ['1796', 'Thomas Jefferson', 'Democratic-Republican', '68

In [136]:
# TESTING CELLS BELOW
# Step-by-step version of the scraping logic, one stage per cell.
html_page = rqst.get(
    "https://www.britannica.com/topic/United-States-Presidential-Election-Results-1788863",
    timeout=30,  # do not hang the kernel if the server never answers
)
# Stop here on an HTTP error instead of parsing an error page further down.
html_page.raise_for_status()
soup = bs(html_page.content, "html.parser")

In [137]:
# Remove superscripts from html doc
# decompose() deletes each <sup> element from the parsed tree, so its text is
# not included when cell text is extracted later.
sup_tags = soup.find_all('sup')
for tag in sup_tags:
    tag.decompose()

In [138]:
# soup.thead is the first <thead> element of the parsed page.
table_head = soup.thead

In [139]:
# Flatten the header section into one list: the stripped text of every <th>
# cell, visiting the <tr> rows of the <thead> in document order.
row_headers = [
    th_cell.get_text(strip=True)
    for header_row in table_head.find_all('tr')
    for th_cell in header_row.find_all('th')
]

In [140]:
# Show the column names collected from the table header.
print(row_headers)

['year', 'candidate', 'political party', 'electoral votes', 'popular votes', 'popular percentage']


In [141]:
# soup.tbody is the first <tbody> element of the parsed page.
table_body = soup.tbody

In [142]:
# Collect one list of stripped cell texts per table row.
table_values = []
for row in table_body.find_all('tr'):
    td_tags = row.find_all('td')
    if not td_tags:
        # A row without <td> cells carries no data. Previously such a row
        # re-appended the values of the row before it (or raised NameError
        # when it was the first row).
        continue
    # Build the row once; the earlier inner loop rebuilt the same list once
    # per cell.
    table_values.append([td.get_text(strip=True) for td in td_tags])
table_values

[['1789', 'George Washington', 'no formally organized parties', '69', '', ''],
 ['John Adams', '', '34', '', ''],
 ['John Jay', '', '9', '', ''],
 ['R.H. Harrison', '', '6', '', ''],
 ['John Rutledge', '', '6', '', ''],
 ['John Hancock', '', '4', '', ''],
 ['George Clinton', '', '3', '', ''],
 ['Samuel Huntington', '', '2', '', ''],
 ['John Milton', '', '2', '', ''],
 ['James Armstrong', '', '1', '', ''],
 ['Benjamin Lincoln', '', '1', '', ''],
 ['Edward Telfair', '', '1', '', ''],
 ['not voted', '', '44', '', ''],
 ['1792', 'George Washington', 'Federalist', '132', '', ''],
 ['John Adams', 'Federalist', '77', '', ''],
 ['George Clinton', 'Democratic-Republican', '50', '', ''],
 ['Thomas Jefferson', '', '4', '', ''],
 ['Aaron Burr', '', '1', '', ''],
 ['1796', 'John Adams', 'Federalist', '71', '', ''],
 ['Thomas Jefferson', 'Democratic-Republican', '68', '', ''],
 ['Thomas Pinckney', 'Federalist', '59', '', ''],
 ['Aaron Burr', 'Antifederalist', '30', '', ''],
 ['Samuel Adams', 'Democr

In [143]:
# Dump the raw <tbody> markup to inspect the row structure (for example the
# rowspan attribute on the cell that holds the year).
print(table_body)

<tbody>
<tr class="has-rs">
<td class="image-cell" rowspan="13">
<p class="topic-paragraph">
<strong><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/event/United-States-presidential-election-of-1789">1789</a></strong>
</p><div class="w-100"><figure class="md-assembly no-caption print-false" data-asm-type="image" data-assembly-id="205951"><div class="md-assembly-wrapper card-media" data-type="image"><a class="position-relative d-flex align-items-center justify-content-center media-overlay-link" data-href="/media/1/1788863/205951" href="https://cdn.britannica.com/97/73697-003-A365F91A/election-Votes-Results-Candidate-American-Political-Party-1804.jpg" style="min-height: 160px;"><img alt="American presidential election, 1789" data-height="79" data-width="100" loading="eager" src="https://cdn.britannica.com/97/73697-003-A365F91A/election-Votes-Results-Candidate-American-Political-Party-1804.jpg"/><button aria-label="Zoom in" class="magnifying-glass btn btn

In [144]:
# NOTE(review): is_year is also defined in an earlier cell of this notebook;
# this later definition replaces it once the cell is run.
def is_year(input_string):
    """Return True when ``input_string`` consists of exactly four digits.

    ``re.fullmatch`` is used instead of ``re.match`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which made a value
    such as ``'1789\\n'`` count as a year.

    Args:
        input_string: the text to test (must be a ``str``).

    Returns:
        bool: True for values like ``'1789'``, False otherwise.
    """
    return re.fullmatch(r'\d{4}', input_string) is not None

In [145]:
# Fill in the missing year on the candidate rows by reusing insert_years()
# (defined in an earlier cell) instead of repeating its loop inline.
# insert_years() modifies the rows in place and returns the same list, so the
# assignment keeps this cell silent while table_values ends up completed.
table_values = insert_years(table_values)

In [146]:
# Inspect the rows after the year has been filled in on every row.
table_values

[['1789', 'George Washington', 'no formally organized parties', '69', '', ''],
 ['1789', 'John Adams', '', '34', '', ''],
 ['1789', 'John Jay', '', '9', '', ''],
 ['1789', 'R.H. Harrison', '', '6', '', ''],
 ['1789', 'John Rutledge', '', '6', '', ''],
 ['1789', 'John Hancock', '', '4', '', ''],
 ['1789', 'George Clinton', '', '3', '', ''],
 ['1789', 'Samuel Huntington', '', '2', '', ''],
 ['1789', 'John Milton', '', '2', '', ''],
 ['1789', 'James Armstrong', '', '1', '', ''],
 ['1789', 'Benjamin Lincoln', '', '1', '', ''],
 ['1789', 'Edward Telfair', '', '1', '', ''],
 ['1789', 'not voted', '', '44', '', ''],
 ['1792', 'George Washington', 'Federalist', '132', '', ''],
 ['1792', 'John Adams', 'Federalist', '77', '', ''],
 ['1792', 'George Clinton', 'Democratic-Republican', '50', '', ''],
 ['1792', 'Thomas Jefferson', '', '4', '', ''],
 ['1792', 'Aaron Burr', '', '1', '', ''],
 ['1796', 'John Adams', 'Federalist', '71', '', ''],
 ['1796', 'Thomas Jefferson', 'Democratic-Republican', '68

In [147]:
# Build a Spark DataFrame from the completed rows, using the scraped header
# texts as column names. No column types are given, so Spark infers them
# from the data (every value here is a string).
df = spark.createDataFrame(data = table_values, schema = row_headers)

In [148]:
# Check the column names and the types Spark inferred.
df.printSchema()

root
 |-- year: string (nullable = true)
 |-- candidate: string (nullable = true)
 |-- political party: string (nullable = true)
 |-- electoral votes: string (nullable = true)
 |-- popular votes: string (nullable = true)
 |-- popular percentage: string (nullable = true)



In [149]:
# Preview the data; show() prints the first 20 rows by default and
# truncates long strings.
df.show()

+----+-----------------+--------------------+---------------+-------------+------------------+
|year|        candidate|     political party|electoral votes|popular votes|popular percentage|
+----+-----------------+--------------------+---------------+-------------+------------------+
|1789|George Washington|no formally organ...|             69|             |                  |
|1789|       John Adams|                    |             34|             |                  |
|1789|         John Jay|                    |              9|             |                  |
|1789|    R.H. Harrison|                    |              6|             |                  |
|1789|    John Rutledge|                    |              6|             |                  |
|1789|     John Hancock|                    |              4|             |                  |
|1789|   George Clinton|                    |              3|             |                  |
|1789|Samuel Huntington|                    |     

In [150]:
# Build a pandas DataFrame from the same rows and header names that were
# used for the Spark DataFrame. Relies on pandas being imported as `pd`.
df_pandas = pd.DataFrame(table_values, columns=row_headers)

In [151]:
# Display the pandas DataFrame (the rich display abbreviates long frames).
df_pandas

Unnamed: 0,year,candidate,political party,electoral votes,popular votes,popular percentage
0,1789,George Washington,no formally organized parties,69,,
1,1789,John Adams,,34,,
2,1789,John Jay,,9,,
3,1789,R.H. Harrison,,6,,
4,1789,John Rutledge,,6,,
...,...,...,...,...,...,...
197,2016,John Kasich,not a candidate,1,,
198,2016,Ron Paul,not a candidate,1,,
199,2016,Faith Spotted Eagle,not a candidate,1,,
200,2020,Joe Biden,Democratic,306,81268924,51.3


In [152]:
# Save the scraped results to elections.csv in the current working directory.
# NOTE(review): to_csv() writes the DataFrame index as an unnamed first column
# by default; pass index=False if that column is not wanted — confirm intent.
df_pandas.to_csv('elections.csv')