In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata
import re

# -------------------------------
# 1. Lấy HTML page
# -------------------------------
url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# -------------------------------
# 2. Các hàm hỗ trợ
# -------------------------------
def date_time(table_cell):
    """Trả về [date, time] từ cell"""
    return [data_time.strip() for data_time in list(table_cell.strings)][0:2]

def booster_version(table_cell):
    """Trả về booster version"""
    out=''.join([v for i,v in enumerate(table_cell.strings) if i%2==0][0:-1])
    return out.strip()

def landing_status(table_cell):
    """Trả về landing status"""
    strings_list = list(table_cell.strings)
    return strings_list[0].strip() if strings_list else ''

def get_mass(table_cell):
    mass = unicodedata.normalize("NFKD", table_cell.text).strip()
    if "kg" in mass:
        new_mass = mass[:mass.find("kg")+2]
    else:
        new_mass = mass
    return new_mass

def get_cell_text(cell):
    """Lấy text an toàn từ cell, ưu tiên <a>"""
    if cell.a:
        return cell.a.text.strip()
    else:
        return cell.text.strip()

# -------------------------------
# 3. Khởi tạo launch_dict
# -------------------------------
columns = ['Flight No.', 'Date', 'Time', 'Version Booster', 'Launch Site', 
           'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', 'Booster landing']
launch_dict = {col: [] for col in columns}

# -------------------------------
# 4. Loop qua tất cả table và row
# -------------------------------
for table in soup.find_all('table', class_="wikitable plainrowheaders collapsible"):
    for row in table.find_all('tr'):
        # Chỉ xử lý row có Flight No.
        if row.th and row.th.string and row.th.string.strip().isdigit():
            flight_no = row.th.string.strip()
            
            # Lấy tất cả td
            cells = row.find_all('td')
            
            # Bỏ qua row không đủ 9+ cells
            if len(cells) < 9:
                continue
            
            # Date & Time
            dt_list = date_time(cells[0])
            date = dt_list[0].strip(',') if len(dt_list) > 0 else ''
            time = dt_list[1] if len(dt_list) > 1 else ''
            
            # Booster version
            bv = booster_version(cells[1])
            if not bv:
                bv = get_cell_text(cells[1])
            
            # Launch Site
            launch_site = get_cell_text(cells[2])
            
            # Payload
            payload = get_cell_text(cells[3])
            
            # Payload Mass
            payload_mass = get_mass(cells[4])
            
            # Orbit
            orbit = get_cell_text(cells[5])
            
            # Customer
            customer = get_cell_text(cells[6])
            
            # Launch outcome
            launch_outcome = landing_status(cells[7])
            
            # Booster landing
            booster_landing = landing_status(cells[8])
            
            # Append vào dict
            launch_dict['Flight No.'].append(flight_no)
            launch_dict['Date'].append(date)
            launch_dict['Time'].append(time)
            launch_dict['Version Booster'].append(bv)
            launch_dict['Launch Site'].append(launch_site)
            launch_dict['Payload'].append(payload)
            launch_dict['Payload mass'].append(payload_mass)
            launch_dict['Orbit'].append(orbit)
            launch_dict['Customer'].append(customer)
            launch_dict['Launch outcome'].append(launch_outcome)
            launch_dict['Booster landing'].append(booster_landing)

# -------------------------------
# 5. Tạo DataFrame
# -------------------------------
df_wiki = pd.DataFrame(launch_dict)

# Kiểm tra 5 hàng đầu
print(df_wiki.head())

# -------------------------------
# 6. Lọc Falcon 9 & đếm missing Booster landing
# -------------------------------
df_falcon9 = df_wiki[df_wiki['Version Booster'].str.contains('F9', case=False, na=False)]
num_falcon9 = df_falcon9.shape[0]
num_missing_landing = df_falcon9['Payload mass'].isna().sum() + (df_falcon9[] == '').sum()

print("Number of Falcon 9 launches:", num_falcon9)
print("Number of missing Booster landing values:", num_missing_landing)


SyntaxError: invalid syntax (1641224497.py, line 127)