In [8]:
import requests as rqst
from bs4 import BeautifulSoup as bs
import re
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [9]:
# Spark configuration: run locally on all available cores under a fixed app name.
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('election-data')

# getOrCreate reuses the context that is already running in this kernel, so
# re-running this cell does not fail the way a second SparkContext(...) would.
sc = SparkContext.getOrCreate(conf=conf)

# The session is built on top of the context's configuration.
spark = SparkSession.builder \
    .appName('election-data') \
    .config(conf=sc.getConf()) \
    .getOrCreate()

24/03/06 11:23:46 WARN Utils: Your hostname, Chases-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.9 instead (on interface en0)
24/03/06 11:23:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/06 11:23:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
class SoupScraper:
    """Scrape the header and body cells of an HTML table from a web page.

    After :meth:`scrape` has run:

    * ``table_head`` is a flat list with the text of every ``<th>`` found in
      the rows of the page's first ``<thead>``.
    * ``table_body`` is a list of rows, each a list with the text of the
      ``<td>`` cells of one ``<tr>`` in the page's first ``<tbody>``.
    """

    def __init__(self, html):
        """Store the page address; nothing is downloaded here.

        Args:
            html: URL of the page to scrape (it is handed to ``requests.get``,
                so despite the name it is an address, not markup).
        """
        self.html = html
        self.table_head = []
        self.table_body = []

    def scrape(self, html=None, timeout=30):
        """Download the page and populate ``table_head`` and ``table_body``.

        Args:
            html: URL to fetch. Defaults to the URL given to the constructor,
                so ``scraper.scrape()`` is enough; passing it explicitly still
                works as before.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            AttributeError: If the page contains no ``<thead>`` or ``<tbody>``.
        """
        url = self.html if html is None else html

        # Without a timeout a stalled server would hang the notebook forever;
        # without the status check an error page would be parsed as if it were
        # the real document.
        response = rqst.get(url, timeout=timeout)
        response.raise_for_status()
        soup = bs(response.content, "html.parser")

        # Remove superscripts from the html doc so their text does not end up
        # inside the extracted cell values.
        for sup_tag in soup.find_all('sup'):
            sup_tag.decompose()

        # Get the headers
        self.table_head = [
            header_cell.get_text(strip=True)
            for header_row in soup.thead.find_all('tr')
            for header_cell in header_row.find_all('th')
        ]

        # Get the table values
        table_values = []
        for row in soup.tbody.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
            # A row without any <td> (e.g. a header-only row) carries no data;
            # skip it instead of re-appending the previous row's values.
            if cells:
                table_values.append(cells)
        self.table_body = table_values

In [11]:
# Scraper bound to the presidential election results page; nothing is
# downloaded until scrape() is called.
elections_data = SoupScraper(
    html="https://www.britannica.com/topic/United-States-Presidential-Election-Results-1788863",
)

In [12]:
# Fetch the page and fill elections_data.table_head / elections_data.table_body.
elections_data.scrape(html=elections_data.html)

In [13]:
def is_year(input_string):
    """Return True if ``input_string`` is exactly four digits, else False.

    Args:
        input_string: Text of a table cell.

    Returns:
        bool: Whether the whole string is a four-digit number.
    """
    # fullmatch instead of match with '^...$': '$' also matches just before a
    # trailing newline, which would let a value like '1789' + newline pass.
    return re.fullmatch(r'\d{4}', input_string) is not None

def insert_years(table):
    """Prefix rows that lack a leading year with the most recent year seen.

    Rows whose first cell is a year (per ``is_year``) are left as they are and
    set the year that is carried forward onto the rows after them.

    The table is modified in place and nothing is returned. Running the
    function again on an already processed table changes nothing.

    Args:
        table: List of rows, each a list of cell strings.
    """
    current_year = None
    for row in table:
        if not row:
            # An empty row has no first cell to inspect; leave it alone.
            continue
        if is_year(row[0]):
            current_year = row[0]
        elif current_year is not None:
            row.insert(0, current_year)
        # Otherwise no year has been seen yet: leave the row untouched rather
        # than inserting its own first cell as a bogus "year".

In [14]:
# Give every scraped row a leading year (the rows are modified in place).
insert_years(table=elections_data.table_body)

In [15]:
# Build the Spark DataFrame from the scraped rows, using the scraped header
# texts as the column names.
df = spark.createDataFrame(
    data=elections_data.table_body,
    schema=elections_data.table_head,
)

In [16]:
# Inspect the column names and types; as the output below shows, every column
# is still a string at this point.
df.printSchema()

root
 |-- year: string (nullable = true)
 |-- candidate: string (nullable = true)
 |-- political party: string (nullable = true)
 |-- electoral votes: string (nullable = true)
 |-- popular votes: string (nullable = true)
 |-- popular percentage: string (nullable = true)



In [17]:
# Render a preview of the DataFrame as a text table to eyeball the scraped rows.
df.show()

                                                                                

+----+-----------------+--------------------+---------------+-------------+------------------+
|year|        candidate|     political party|electoral votes|popular votes|popular percentage|
+----+-----------------+--------------------+---------------+-------------+------------------+
|1789|George Washington|no formally organ...|             69|             |                  |
|1789|       John Adams|                    |             34|             |                  |
|1789|         John Jay|                    |              9|             |                  |
|1789|    R.H. Harrison|                    |              6|             |                  |
|1789|    John Rutledge|                    |              6|             |                  |
|1789|     John Hancock|                    |              4|             |                  |
|1789|   George Clinton|                    |              3|             |                  |
|1789|Samuel Huntington|                    |     