In [8]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
# CONSTANTS
# NOTE(review): SEASONS is not referenced by the cells visible here — confirm it is used elsewhere.
SEASONS = "Seasons"
# Base URL and article path of The Bachelor on Wikipedia.
# WIKIPEDIA ends with "/" and BACHELOR_WIKI has no leading "/", so the two
# concatenate directly into the full article URL.
WIKIPEDIA = "https://en.wikipedia.org/"
BACHELOR_WIKI = "wiki/The_Bachelor_(American_TV_series)"

# HTML attribute name and the class values used to select the tables to scrape.
CLASS_TAG = "class"
SEASONS_TABLE_CLASS = "wikitable plainrowheaders"
CONTESTANTS_TABLE_CLASS = "wikitable sortable"
# HTML tag names passed to BeautifulSoup lookups.
TABLE_ROW_TAG = "tr"
TABLE_DATA_TAG = "td"
TABLE_HEADER_TAG = "th"
TABLE_TAG = "table"
TABLE_BODY_TAG = "tbody"
HYPERLINK_TAG = "a"

In [10]:
def clean_tag(value):
    """Return ``value.text`` as a string with surrounding whitespace removed."""
    raw_text = value.text
    return str(raw_text).strip()

def get_row_values(table_value):
    """Return the stripped text of every cell in ``table_value``, in order.

    ``table_value`` is any iterable of objects exposing a ``text`` attribute
    (header cells or data cells); the result is a list of strings.
    """
    return [str(cell.text).strip() for cell in table_value]
    
def get_table_headers(rows):
    """Return the header texts of the first row that contains header cells.

    Rows are scanned in order; the first one with at least one
    ``TABLE_HEADER_TAG`` cell supplies the column names. An empty list is
    returned when no row has header cells.
    """
    for row in rows:
        header_cells = row.find_all(TABLE_HEADER_TAG)
        if not header_cells:
            continue
        return get_row_values(header_cells)
    return []

def get_hyper_links(row, season_refs, article_prefix="/wiki/"):
    """Append the row's first link to ``season_refs`` when it is an article link.

    Parameters
    ----------
    row : element exposing ``find_all`` (a table row)
        The row whose hyperlinks are inspected.
    season_refs : list
        Mutated in place; receives the accepted ``href`` string.
    article_prefix : str, optional
        Prefix an ``href`` must start with to be accepted. The default keeps
        site-relative article paths and rejects same-page fragments such as
        ``#cite_note-12``.

    Returns
    -------
    None
    """
    anchors = row.find_all(HYPERLINK_TAG)
    # Rows with two or fewer links are ignored (same threshold as before).
    if len(anchors) <= 2:
        return

    # I ONLY WANT THE FIRST REF BECAUSE IT IS RELATED TO THE SEASON
    # Read the attribute straight off the tag; .get() avoids a KeyError when
    # the anchor has no href and avoids re-parsing the tag's HTML.
    href_value = anchors[0].get("href")

    # Citation/fragment links are not pages that can be scraped later, so
    # they must not end up in the list of season references.
    if href_value and href_value.startswith(article_prefix):
        season_refs.append(href_value)

def get_table_data(rows, season_refs):
    """Collect the cell texts of every data row and record each row's link.

    For each row, ``get_hyper_links`` is given the chance to append to
    ``season_refs``; then the row's ``TABLE_DATA_TAG`` cells are converted to
    text. Only rows yielding more than one value are kept. Returns a list of
    lists of strings.
    """
    collected_rows = []
    for row in rows:
        get_hyper_links(row, season_refs)
        cells = row.find_all(TABLE_DATA_TAG)
        if not cells:
            continue
        values = get_row_values(cells)
        # NOTE: the runners-up values are not completely accurate, which is
        # acceptable because only the winners matter downstream.
        if len(values) > 1:
            collected_rows.append(values)
    return collected_rows

In [11]:
# HERE I AM CREATING THE SEASONS CSV TO LOOK LIKE THE TABLE HERE
# https://en.wikipedia.org/wiki/The_Bachelor_(American_TV_series)#Seasons

# A timeout keeps the cell from hanging forever on a stalled connection.
html = requests.get(WIKIPEDIA+BACHELOR_WIKI, timeout=30)
# Stop here on an HTTP error instead of scraping an error page.
html.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html.text, 'html.parser')
# Filled by get_table_data with one href per qualifying row; the contestants
# cell iterates over it.
season_refs = []

seasons_table = soup(TABLE_TAG, {CLASS_TAG:SEASONS_TABLE_CLASS})

# Each match is already a tag with find_all, so the rows can be gathered
# directly without turning the result set into a string and re-parsing it.
table_rows = [row for table in seasons_table for row in table.find_all(TABLE_ROW_TAG)]
if not table_rows:
    raise ValueError(
        "No table with class '" + SEASONS_TABLE_CLASS + "' found at " + WIKIPEDIA + BACHELOR_WIKI
    )

column_names = get_table_headers(table_rows)
data_list = get_table_data(table_rows, season_refs)

# create data frame from wikipedia article
df = pd.DataFrame(data_list, columns=column_names)
# Make sure the output directory exists on a fresh checkout.
Path('./data').mkdir(parents=True, exist_ok=True)
df.to_csv('./data/seasons.csv', index=False)

In [None]:
# HERE I AM CREATING THE CONTESTANTS CSV TO LOOK LIKE THE TABLE HERE
# https://en.wikipedia.org/wiki/The_Bachelor_(American_season_1)
# for each available season

# Refs are expected to be site-relative article paths such as
# "/wiki/The_Bachelor_(American_season_1)"; the part after this prefix is
# used as the CSV file name.
article_prefix = '/wiki/'
# Make sure the output directory exists on a fresh checkout.
Path('./data').mkdir(parents=True, exist_ok=True)

for ref in season_refs:
    # Skip anything that is not an article path (e.g. "#cite_note-12"),
    # which cannot be fetched as a page or turned into a sensible file name.
    if not ref.startswith(article_prefix):
        continue

    # WIKIPEDIA ends with "/" and ref starts with "/": drop one of them so the
    # URL does not contain a double slash.
    html = requests.get(WIKIPEDIA.rstrip('/') + ref, timeout=30)
    if not html.ok:
        # One missing page should not abort the remaining seasons.
        print('Skipping', ref, '- HTTP status', html.status_code)
        continue
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html.text, 'html.parser')

    contestants_table = soup(TABLE_TAG, {CLASS_TAG:CONTESTANTS_TABLE_CLASS})

    # Each match is already a tag with find_all, so no re-parsing is needed.
    # NOTE(review): rows from every matching table are combined — confirm the
    # season pages only carry one table with this class.
    table_rows = [row for table in contestants_table for row in table.find_all(TABLE_ROW_TAG)]
    if not table_rows:
        # Pages without a contestants table would only produce an empty CSV.
        continue

    column_names = get_table_headers(table_rows)
    # The links found inside contestant rows are not needed, hence the
    # throwaway list.
    data_list = get_table_data(table_rows, [])

    # create data frame from wikipedia article
    contestant_df = pd.DataFrame(data_list, columns=column_names)
    contestant_df.to_csv('./data/'+ref[len(article_prefix):]+'.csv', index=False)

REF  /wiki/The_Bachelor_(American_season_1)
REF  /wiki/The_Bachelor_(American_season_2)
REF  /wiki/The_Bachelor_(American_season_5)
REF  #cite_note-12
REF  /wiki/Charlie_O%27Connell
REF  /wiki/The_Bachelor_(American_season_9)
REF  /wiki/The_Bachelor_(American_season_10)
REF  /wiki/The_Bachelor_(American_season_11)
REF  /wiki/The_Bachelor_(American_season_12)
