In [65]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [66]:
def date_time(table_cells):
    """Return the first two whitespace-stripped text fragments of a table cell.

    `table_cells` is any object exposing a `.strings` iterable of text
    fragments. The caller treats the two returned items as the date and the
    time. Fewer than two items are returned when the cell holds fewer than two
    fragments.
    """
    cleaned = []
    for fragment in table_cells.strings:
        cleaned.append(fragment.strip())
    return cleaned[:2]

def booster_version(table_cells):
    """Return the booster version text assembled from a table cell.

    The cell's text fragments (`table_cells.strings`) are read, every second
    fragment starting with the first is kept, the last kept fragment is
    dropped, and the remainder is concatenated.

    Returns an empty string when nothing remains, so callers can test the
    result for truthiness and fall back to another source.
    """
    # Use `.strings` (text only), like the other cell helpers: iterating the
    # cell directly also yields child tags, which cannot be joined as text.
    fragments = list(table_cells.strings)
    every_other = fragments[::2]
    return "".join(every_other[:-1])

def landing_status(table_cells):
    """Return the first text fragment of a table cell, unmodified.

    `table_cells` is any object exposing a `.strings` iterable. An IndexError
    is raised when the cell contains no text fragments.
    """
    fragments = list(table_cells.strings)
    return fragments[0]

def get_mass(table_cells):
    """Return the payload mass text of a table cell, cut off after the "kg" unit.

    The cell's `.text` is NFKD-normalized (which, among other things, turns
    non-breaking spaces into plain spaces) and stripped.

    Returns:
        0 when the cell is empty;
        the text up to and including the first "kg" when the unit is present;
        the whole normalized text when no "kg" occurs, so values such as
        "Classified" are kept intact instead of being cut to one character.
    """
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if not mass:
        return 0

    unit_start = mass.find("kg")
    if unit_start == -1:
        # No unit to cut at; slicing with find()'s -1 would keep only the
        # first character.
        return mass

    return mass[: unit_start + len("kg")]

def extract_column_from_header(row):
    """Derive a column name from a table header cell.

    The first `br`, `a` and `sup` child found on `row` (in that order) is
    removed from it via `extract()`, so the passed-in element is modified in
    place. The remaining `row.contents` are joined with single spaces and
    stripped.

    Returns the stripped text, or None when that text consists only of digits.
    """
    for tag_name in ("br", "a", "sup"):
        # Look the tag up only after the previous removal has happened.
        if getattr(row, tag_name):
            getattr(row, tag_name).extract()

    header_text = " ".join(row.contents).strip()

    if header_text.isdigit():
        return None
    return header_text



In [67]:
# Page holding the launch tables that the rest of the notebook scrapes.
url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# requests applies no timeout by default, so a stalled connection would block
# the kernel indefinitely; bound the wait instead.
response = requests.get(url, timeout=30)

# Report the HTTP status so a failed download is visible before parsing.
if response.status_code == 200:
    print(f"Success: {response.status_code}")

else:
    print(f"Failure {response.status_code}")


Success: 200


In [68]:
# Parse the downloaded HTML with the standard library's "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [69]:
# Sanity check that parsing worked: show the text of the page's <title> element.
test = soup.title.string
print(test)

List of Falcon 9 and Falcon Heavy launches - Wikipedia


In [70]:
# Collect every <table> element found in the parsed page.
html_tables = soup.find_all("table")

# Report how many tables were found.
print(f"Found {len(html_tables)} tables")

Found 19 tables


In [71]:
# Pick the table at index 2 as the first launch table and print its HTML for inspection.
# NOTE(review): the index is hard-coded and presumably depends on the page's
# current layout — confirm it still points at a launch table if the page changes.
first_launch_table = html_tables[2]
print(first_launch_table)

<table class="wikitable plainrowheaders collapsible" style="width: 100%;">
<tbody><tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date and<br/>time (<a href="/wiki/Coordinated_Universal_Time" title="Coordinated Universal Time">UTC</a>)
</th>
<th scope="col"><a href="/wiki/List_of_Falcon_9_first-stage_boosters" title="List of Falcon 9 first-stage boosters">Version,<br/>booster</a><sup class="reference" id="cite_ref-booster_16-0"><a href="#cite_note-booster-16">[a]</a></sup>
</th>
<th scope="col">Launch<br/>site
</th>
<th scope="col">Payload<sup class="reference" id="cite_ref-Dragon_17-0"><a href="#cite_note-Dragon-17">[b]</a></sup>
</th>
<th scope="col">Payload mass
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th>
<th scope="col">Launch<br/>outcome
</th>
<th scope="col"><a href="/wiki/Falcon_9_first-stage_landing_tests" title="Falcon 9 first-stage landing tests">Booster<br/>landing</a>
</th></tr>
<tr id="F9-078">
<th rowspan="2" scope="row" style="text-align:center

In [72]:
# Turn every header cell of the first launch table into a column name and keep
# only the usable ones: None (digit-only headers) and empty names are dropped.
# Note that extract_column_from_header removes tags from the header cells it is given.
header_names = (
    extract_column_from_header(header_cell)
    for header_cell in first_launch_table.find_all("th")
)
column_names = [name for name in header_names if name]

print(column_names)






['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [73]:
# One entry per scraped header name, each initially None.
launch_dict = dict.fromkeys(column_names)

# The combined date/time header is replaced by separate "Date" and "Time" columns below.
del launch_dict["Date and time ( )"]

# Give each scraped column its own empty list to collect values into.
launch_dict.update(
    (scraped_name, [])
    for scraped_name in (
        "Flight No.",
        "Launch site",
        "Payload",
        "Payload mass",
        "Orbit",
        "Customer",
        "Launch outcome",
    )
)

# New Columns
launch_dict.update(
    (added_name, [])
    for added_name in ("Version Booster", "Booster landing", "Date", "Time")
)

In [74]:
extract_row = 0

# Start from empty columns so that re-running this cell does not append the
# same launches a second time.
for column in launch_dict:
    launch_dict[column] = []


def _link_text(cell):
    """Return the text of the first link in `cell`, or the cell's stripped text if it has no link."""
    return cell.a.string if cell.a else cell.get_text(strip=True)


for table in soup.find_all("table", "wikitable plainrowheaders collapsible"):

    for rows in table.find_all("tr"):

        # A launch row is one whose header cell holds a bare flight number.
        # Reset the flag for every row so a header without a plain string
        # cannot inherit the previous row's result.
        flag = False
        if rows.th and rows.th.string:
            flight_number = rows.th.string.strip()
            flag = flight_number.isdigit()

        # The cell lookup and the extraction below must run for every row,
        # not only in the branch where the row has no header cell.
        row = rows.find_all("td")

        if flag:

            extract_row += 1

            # Date and time share the first data cell.
            datatimelist = date_time(row[0])
            date = datatimelist[0].strip(",")
            launch_time = datatimelist[1]

            # Booster version: fall back to the cell's link text when the
            # helper yields nothing.
            bv = booster_version(row[1])
            if not bv:
                bv = _link_text(row[1])

            launch_site = _link_text(row[2])
            payload = _link_text(row[3])
            payload_mass = get_mass(row[4])
            orbit = _link_text(row[5])
            customer = _link_text(row[6])
            launch_outcome = list(row[7].strings)[0]
            booster_landing = landing_status(row[8])

            # Append only after every value was extracted, so an error in a
            # malformed row cannot leave the columns with different lengths.
            launch_dict["Flight No."].append(flight_number)
            launch_dict["Date"].append(date)
            launch_dict["Time"].append(launch_time)
            launch_dict["Version Booster"].append(bv)
            launch_dict["Launch site"].append(launch_site)
            launch_dict["Payload"].append(payload)
            launch_dict["Payload mass"].append(payload_mass)
            launch_dict["Orbit"].append(orbit)
            launch_dict["Customer"].append(customer)
            launch_dict["Launch outcome"].append(launch_outcome)
            launch_dict["Booster landing"].append(booster_landing)

In [75]:
# Wrap each column in a Series so that columns of different lengths can still
# be combined; shorter columns are filled with missing values.
df = pd.DataFrame(
    {column: pd.Series(values) for column, values in launch_dict.items()}
)

In [76]:
# Write the scraped launches to a CSV file in the working directory, without the index column.
df.to_csv(path_or_buf="web_scraping.csv", index=False)