# Setup

In [1]:
# Basic Imports
import requests
import datetime
import sys
import re
import unicodedata

# Webscraping
from bs4 import BeautifulSoup

# Data Wrangling
import pandas as pd

In [2]:
# Global variables 
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"
booster_version = []
payload_mass = []
orbit = []
launch_site = []
outcome = []
flights = []
grid_fins = []
reused = []
legs = []
landing_pad = []
block = []
reused_count = []
serial = []
longitude = []
latitude = []

In [3]:
# Global Functions
def get_webscraping_data(url : str) -> dict:
    launch_dict = dict()
    extra_column_names = [
        "Version Booster",
        "Booster landing",
        "Date",
        "Time",
    ]

    response = requests.get(static_url)
    soup = BeautifulSoup(response.content, "html.parser")
    first_launch_table = soup.find("table", "wikitable plainrowheaders collapsible")

    for table_header in first_launch_table.find_all("th"):
        column_name = extract_column_from_header(table_header)
        if (
            column_name is not None
            and len(column_name) > 0
            and column_name != "Date and time ( )"
        ):
            launch_dict[column_name] = list()

    launch_dict.update({name: [] for name in extra_column_names})

    return soup, launch_dict


def date_time(table_cells):
    """
    This function returns the data and time from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]


def booster_version(table_cells):
    """
    This function returns the booster version from the HTML  table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=''.join([booster_version for i,booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
    return out


def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=[i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass


def extract_column_from_header(row):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    colunm_name = ' '.join(row.contents)
    
    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name


In [4]:
def extract_info_from_html_table(soup: BeautifulSoup, launch_dict: dict) -> pd.DataFrame:
    extracted_row = 0

    # Extract each table
    for table_number, table in enumerate(soup.find_all("table", "wikitable plainrowheaders collapsible")):
        # Get table rows
        for row in table.find_all("tr"):
            # Check to see if the first table heading is a number corresponding to launch number
            if row.th:
                if row.th.string:
                    flight_number = row.th.string.strip()
                    flag = flight_number.isdigit()
            else:
                flag = False
            
            # Get table elements
            # `row.find_all("td")` gets all the cells in the current row
            columns = row.find_all("td")
            
            # If it is a number, save cells in a dictionary
            if flag:
                extracted_row += 1
                # Flight Number value
                launch_dict["Flight No."].append(flight_number)
                
                datatimelist = date_time(columns[0])
                
                # Date value
                date = datatimelist[0].strip(",")
                launch_dict["Date"].append(date)
                
                # Time value
                time = datatimelist[1]
                launch_dict["Time"].append(time)
                
                # Booster version
                bv = booster_version(columns[1])
                if not bv:
                    bv = columns[1].a.string if columns[1].a else columns[1].text.strip()
                launch_dict["Version Booster"].append(bv)
                
                # Launch Site
                launch_site = columns[2].a.string if columns[2].a else columns[2].text.strip()
                launch_dict["Launch site"].append(launch_site)
                
                # Payload
                payload = columns[3].a.string if columns[3].a else columns[3].text.strip()
                launch_dict["Payload"].append(payload)
                
                # Payload Mass
                payload_mass = get_mass(columns[4])
                launch_dict["Payload mass"].append(payload_mass)
                
                # Orbit
                orbit = columns[5].a.string if columns[5].a else columns[5].text.strip()
                launch_dict["Orbit"].append(orbit)
                
                # Customer
                customer = columns[6].a.string if columns[6].a else columns[6].text.strip()
                launch_dict["Customer"].append(customer)
                
                # Launch outcome
                launch_outcome = list(columns[7].strings)[0]
                launch_dict["Launch outcome"].append(launch_outcome)
                
                # Booster landing
                booster_landing = landing_status(columns[8])
                launch_dict["Booster landing"].append(booster_landing)
    
    return pd.DataFrame({key:pd.Series(value) for key, value in launch_dict.items()})

# Collecting Data

In [5]:
soup, launch_dict = get_webscraping_data(static_url)
df = extract_info_from_html_table(soup, launch_dict)

In [6]:
display(df)

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success\n,F9 v1.07B0003.18,Failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,Success,F9 v1.07B0004.18,Failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.07B0005.18,No attempt\n,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success\n,F9 v1.07B0006.18,No attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success\n,F9 v1.07B0007.18,No attempt\n,1 March 2013,15:10
...,...,...,...,...,...,...,...,...,...,...,...
116,117,CCSFS,Starlink,"15,600 kg",LEO,SpaceX,Success\n,F9 B5B1051.10657,Success,9 May 2021,06:42
117,118,KSC,Starlink,"~14,000 kg",LEO,SpaceX,Success\n,F9 B5B1058.8660,Success,15 May 2021,22:56
118,119,CCSFS,Starlink,"15,600 kg",LEO,SpaceX,Success\n,F9 B5B1063.2665,Success,26 May 2021,18:59
119,120,KSC,SpaceX CRS-22,"3,328 kg",LEO,NASA,Success\n,F9 B5B1067.1668,Success,3 June 2021,17:29


# Saving Data

In [7]:
df.to_csv("../data/processed/spacex_web_scraped.csv", index=False)