In [153]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import unicodedata

In [154]:
def extract_text_column(row, index):
    cells = row.find_all(['td', 'th'])
    
    if len(cells) > index:
        text = cells[index].get_text(separator=" ", strip=True)
        text = " ".join(text.split())  # fazla boşlukları temizler
        return text if text != "" else None
    
    return None


In [155]:
def extract_date_time(row):
    cells = row.find_all(['td', 'th'])
    
    if len(cells) > 1:
        cell = cells[1]
        parts = cell.text.strip().split('\n')
        
        if len(parts) >= 2:
            date = parts[0].strip()
            time = parts[1].strip()
        else:
            date = parts[0].strip()
            time = None
        
        return date, time
    
    return None, None


In [156]:
def extract_version_booster(row):
    """
    Extracts Version Booster from 3rd column
    Returns single string (no separation)
    """
    
    cells = row.find_all(['td', 'th'])
    
    if len(cells) > 2:   # 3. sütun güvenlik kontrolü
        
        cell = cells[2]
        
        text = cell.text.strip()
        
        # newline'ları temizleyip tek satır haline getirelim
        text = " ".join(text.split())
        
        return text
    
    else:
        return None


In [157]:
def extract_launch_site(row):
    #Extracts launch site from a table row.
    
    
    cells = row.find_all(['td', 'th'])
    
    if len(cells) > 3:
        
        # Eğer içinde link varsa
        link = cells[3].find('a')
        
        if link:
            return link.text.strip()
        else:
            return cells[3].text.strip()
    
    else:
        return None


In [158]:
def extract_payload_mass(row):
    cells = row.find_all(['td', 'th'])
    
    if len(cells) > 5:
        text = " ".join(cells[5].get_text(separator=" ", strip=True).split())
        
        match = re.search(r'[\d,]+', text)
        if match:
            return float(match.group().replace(",", ""))
    
    return None


In [159]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36"
}


In [160]:
# use requests.get() method with the provided static_url and headers
# assign the response to a object
response = requests.get(static_url, headers = headers)
# Check if the request was successful
if response.status_code == 200:
    print("Request successful!")
    html_data = response.text
else:
    print(f"Request failed with status code: {response.status_code}")

Request successful!


In [161]:
# Initialize dictionary with required columns
launch_dict = {
    'Flight No.': [], 'Launch site': [], 'Version, Booster': [],
    'Date': [], 'Time': [], 'Payload': [], 'Payload mass': [],
    'Orbit': [], 'Customer': [], 'Launch outcome': [], 'Booster landing': []
}

In [162]:
# Find all tables on the page
html_tables = soup.find_all('table', "wikitable plainrowheaders collapsible")

In [163]:
# Iterate through each table (years 2010-2021)
for table in html_tables:
    # Remove superscript (reference tags) to clean data
    for sup in table.find_all('sup'):
        sup.decompose()
        
    rows = table.find_all('tr')
    
    for row in rows:
        # Check if the first cell is a digit (Flight Number)
        # This filters out header rows and descriptive rows
        flight_no_cell = extract_text_column(row, 0)
        
        if flight_no_cell and flight_no_cell.isdigit():
            # Extract data using your helper functions
            date, time = extract_date_time(row)
            
            launch_dict['Flight No.'].append(flight_no_cell)
            launch_dict['Date'].append(date)
            launch_dict['Time'].append(time)
            launch_dict['Version, Booster'].append(extract_version_booster(row))
            launch_dict['Launch site'].append(extract_launch_site(row))
            launch_dict['Payload'].append(extract_text_column(row, 4))
            launch_dict['Payload mass'].append(extract_payload_mass(row))
            launch_dict['Orbit'].append(extract_text_column(row, 6))
            launch_dict['Customer'].append(extract_text_column(row, 7))
            launch_dict['Launch outcome'].append(extract_text_column(row, 8))
            launch_dict['Booster landing'].append(extract_text_column(row, 9))

# Convert to DataFrame to verify counts
df = pd.DataFrame(launch_dict)
print(f"\nTotal launches extracted: {len(df)}")
print(df.head())


Total launches extracted: 121
  Flight No. Launch site Version, Booster                   Date  Time  \
0          1       CCAFS   F9 v1.0B0003.1      4 June 2010,18:45  None   
1          2       CCAFS   F9 v1.0B0004.1  8 December 2010,15:43  None   
2          3       CCAFS   F9 v1.0B0005.1      22 May 2012,07:44  None   
3          4       CCAFS   F9 v1.0B0006.1   8 October 2012,00:35  None   
4          5       CCAFS   F9 v1.0B0007.1     1 March 2013,15:10  None   

                                Payload  Payload mass        Orbit  \
0  Dragon Spacecraft Qualification Unit           NaN          LEO   
1   Dragon demo flight C1 (Dragon C101)           NaN  LEO ( ISS )   
2  Dragon demo flight C2+ (Dragon C102)         525.0  LEO ( ISS )   
3            SpaceX CRS-1 (Dragon C103)        4700.0  LEO ( ISS )   
4            SpaceX CRS-2 (Dragon C104)        4877.0  LEO ( ISS )   

            Customer Launch outcome      Booster landing  
0             SpaceX        Success  Failure

In [164]:
# Write to CSV
df.to_csv('falcon9_launches.csv', index=False, encoding='utf-8')

print(f"Extraction complete! Launches saved to 'falcon9_launches.csv'.")

Extraction complete! Launches saved to 'falcon9_launches.csv'.
