In [1]:
# Import libraries and configure pandas display options
import requests
import pandas as pd
import numpy as np
import datetime

# Lift pandas' display limits so that printed frames show every column and row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Define functions to extract data from APIs

def getBoosterVersion(data, timeout=30):
    """
    Extract booster name from rocket ID

    Each truthy entry of ``data['rocket']`` is looked up at the SpaceX v4
    ``rockets`` endpoint and the ``name`` field of the JSON reply is collected.
    Every distinct ID is requested only once per call; repeated IDs reuse the
    first answer.

    Parameters
    ----------
    data : mapping or DataFrame
        Object indexable by ``'rocket'`` that yields an iterable of rocket IDs.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot block
        the notebook indefinitely.

    Returns
    -------
    list
        One element per input entry, in input order: the rocket name, or
        ``None`` where the rocket ID is falsy.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    KeyError
        If the JSON reply has no ``name`` field.
    """
    name_by_rocket = {}  # rocket ID -> name, filled lazily
    BoosterVersion = []
    for rocket in data['rocket']:
        if not rocket:
            BoosterVersion.append(None)
            continue
        rocket_id = str(rocket)
        if rocket_id not in name_by_rocket:
            response = requests.get(
                f"https://api.spacexdata.com/v4/rockets/{rocket_id}",
                timeout=timeout,
            )
            # Surface HTTP failures directly instead of a confusing KeyError below
            response.raise_for_status()
            name_by_rocket[rocket_id] = response.json()['name']
        BoosterVersion.append(name_by_rocket[rocket_id])
    return BoosterVersion

def getLaunchSite(data, timeout=30):
    """
    Extract launch site name, longitude, and latitude

    Each truthy entry of ``data['launchpad']`` is looked up at the SpaceX v4
    ``launchpads`` endpoint; the ``name``, ``longitude`` and ``latitude``
    fields of the JSON reply are collected. Every distinct ID is requested
    only once per call.

    Parameters
    ----------
    data : mapping or DataFrame
        Object indexable by ``'launchpad'`` that yields an iterable of
        launchpad IDs.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot block
        the notebook indefinitely.

    Returns
    -------
    tuple of (list, list, list)
        ``(LaunchSite, Longitude, Latitude)``, each with one element per input
        entry in input order; ``None`` where the launchpad ID is falsy.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    KeyError
        If the JSON reply lacks one of the three fields.
    """
    site_by_pad = {}  # launchpad ID -> (name, longitude, latitude)
    LaunchSite = []
    Longitude = []
    Latitude = []
    for launchpad in data['launchpad']:
        if launchpad:
            pad_id = str(launchpad)
            if pad_id not in site_by_pad:
                response = requests.get(
                    f"https://api.spacexdata.com/v4/launchpads/{pad_id}",
                    timeout=timeout,
                )
                # Surface HTTP failures directly instead of a KeyError below
                response.raise_for_status()
                payload = response.json()
                site_by_pad[pad_id] = (
                    payload['name'],
                    payload['longitude'],
                    payload['latitude'],
                )
            name, longitude, latitude = site_by_pad[pad_id]
        else:
            name = longitude = latitude = None
        LaunchSite.append(name)
        Longitude.append(longitude)
        Latitude.append(latitude)
    return LaunchSite, Longitude, Latitude

def getPayloadData(data, timeout=30):
    """
    Extract payload mass and orbit

    Each truthy entry of ``data['payloads']`` is looked up at the SpaceX v4
    ``payloads`` endpoint; the ``mass_kg`` and ``orbit`` fields of the JSON
    reply are collected. Every distinct ID is requested only once per call.

    Parameters
    ----------
    data : mapping or DataFrame
        Object indexable by ``'payloads'`` that yields an iterable of payload
        IDs (one ID per entry).
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot block
        the notebook indefinitely.

    Returns
    -------
    tuple of (list, list)
        ``(PayloadMass, Orbit)``, each with one element per input entry in
        input order; ``None`` where the payload ID is falsy. Values are passed
        through unchanged from the API reply.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    KeyError
        If the JSON reply lacks ``mass_kg`` or ``orbit``.
    """
    details_by_payload = {}  # payload ID -> (mass_kg, orbit)
    PayloadMass = []
    Orbit = []
    for payload in data['payloads']:
        if payload:
            payload_id = str(payload)
            if payload_id not in details_by_payload:
                response = requests.get(
                    f"https://api.spacexdata.com/v4/payloads/{payload_id}",
                    timeout=timeout,
                )
                # Surface HTTP failures directly instead of a KeyError below
                response.raise_for_status()
                reply = response.json()
                details_by_payload[payload_id] = (reply['mass_kg'], reply['orbit'])
            mass, orbit = details_by_payload[payload_id]
        else:
            mass = orbit = None
        PayloadMass.append(mass)
        Orbit.append(orbit)
    return PayloadMass, Orbit

def getCoreData(data, timeout=30):
    """
    Extract core information including landing outcome, flights, etc.

    For every entry of ``data['cores']`` (a dict describing one core of one
    launch) the function:

    * looks up ``core['core']`` at the SpaceX v4 ``cores`` endpoint, when it
      is not ``None``, and collects ``block``, ``reuse_count`` and ``serial``
      from the JSON reply (each distinct core ID is requested only once per
      call);
    * builds an outcome string ``"<landing_success> <landing_type>"``, or the
      literal string ``"None"`` when ``landing_success`` is ``None``;
    * copies ``flight``, ``gridfins``, ``reused``, ``legs`` and ``landpad``
      straight from the entry.

    Parameters
    ----------
    data : mapping or DataFrame
        Object indexable by ``'cores'`` that yields an iterable of dicts with
        the keys ``core``, ``landing_success``, ``landing_type``, ``flight``,
        ``gridfins``, ``reused``, ``legs`` and ``landpad``.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot block
        the notebook indefinitely.

    Returns
    -------
    tuple of nine lists
        ``(Block, ReusedCount, Serial, Outcome, Flights, Gridfins, Reused,
        Legs, LandingPad)``, each with one element per input entry in input
        order. ``Block``, ``ReusedCount`` and ``Serial`` hold ``None`` where
        the entry has no core ID.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    KeyError
        If an entry or a JSON reply lacks one of the fields read above.
    """
    details_by_core = {}  # core ID -> (block, reuse_count, serial)

    Block = []
    ReusedCount = []
    Serial = []
    Outcome = []
    Flights = []
    Gridfins = []
    Reused = []
    Legs = []
    LandingPad = []

    for core in data['cores']:
        core_id = core['core']
        if core_id is not None:
            if core_id not in details_by_core:
                response = requests.get(
                    f"https://api.spacexdata.com/v4/cores/{core_id}",
                    timeout=timeout,
                )
                # Surface HTTP failures directly instead of a KeyError below
                response.raise_for_status()
                reply = response.json()
                details_by_core[core_id] = (
                    reply['block'],
                    reply['reuse_count'],
                    reply['serial'],
                )
            block, reuse_count, serial = details_by_core[core_id]
        else:
            block = reuse_count = serial = None
        Block.append(block)
        ReusedCount.append(reuse_count)
        Serial.append(serial)

        # Create landing outcome string; no landing attempt info -> the text "None"
        landing_success = core['landing_success']
        landing_type = core['landing_type']
        outcome_str = f"{landing_success} {landing_type}" if landing_success is not None else "None"
        Outcome.append(outcome_str)

        Flights.append(core['flight'])
        Gridfins.append(core['gridfins'])
        Reused.append(core['reused'])
        Legs.append(core['legs'])
        LandingPad.append(core['landpad'])

    return Block, ReusedCount, Serial, Outcome, Flights, Gridfins, Reused, Legs, LandingPad

In [3]:
# Request and parse the SpaceX launch data using GET request
print("=== TASK 1: REQUEST AND PARSE SPACEX LAUNCH DATA ===")

# Probe the live API. Only its status code is reported; the body is not parsed
# because the static snapshot requested below is what gets loaded.
# The timeout keeps a stalled connection from hanging the notebook.
spacex_url = "https://api.spacexdata.com/v4/launches/past"
response = requests.get(spacex_url, timeout=30)

print(f"Response status code: {response.status_code}")

# Use static URL for consistent results
static_json_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json"
response = requests.get(static_json_url, timeout=30)

print(f"Static URL status code: {response.status_code}")

# Stop here with a clear HTTP error rather than feeding an error page to the JSON parser
response.raise_for_status()

# Convert JSON to DataFrame
data = pd.json_normalize(response.json())

print(f"DataFrame created successfully! Shape: {data.shape}")

=== TASK 1: REQUEST AND PARSE SPACEX LAUNCH DATA ===
Response status code: 200
Static URL status code: 200
DataFrame created successfully! Shape: (107, 42)


In [4]:
# Preview the top five rows of the freshly parsed launch table
print("First 5 rows of the dataframe:", data.head(5), sep="\n")

First 5 rows of the dataframe:
       static_fire_date_utc  static_fire_date_unix    tbd    net  window  \
0  2006-03-17T00:00:00.000Z           1.142554e+09  False  False     0.0   
1                      None                    NaN  False  False     0.0   
2                      None                    NaN  False  False     0.0   
3  2008-09-20T00:00:00.000Z           1.221869e+09  False  False     0.0   
4                      None                    NaN  False  False     0.0   

                     rocket  success  \
0  5e9d0d95eda69955f709d1eb    False   
1  5e9d0d95eda69955f709d1eb    False   
2  5e9d0d95eda69955f709d1eb    False   
3  5e9d0d95eda69955f709d1eb     True   
4  5e9d0d95eda69955f709d1eb     True   

                                             details crew ships capsules  \
0   Engine failure at 33 seconds and loss of vehicle   []    []       []   
1  Successful first stage burn and transition to ...   []    []       []   
2  Residual stage 1 thrust led to collision

In [5]:
# Data preprocessing and filtering
print("=== DATA PREPROCESSING ===")

# Take a subset of dataframe keeping only the features we want
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]
print(f"After feature selection - Shape: {data.shape}")

# Remove rows with multiple cores and multiple payloads.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments below modify it unambiguously instead of writing to a
# slice of the original (which triggers SettingWithCopyWarning).
data = data[data['cores'].map(len) == 1]
data = data[data['payloads'].map(len) == 1].copy()
print(f"After single core/payload filter - Shape: {data.shape}")

# Extract single values from lists (each list has exactly one element after the filter)
data['cores'] = data['cores'].map(lambda x: x[0])
data['payloads'] = data['payloads'].map(lambda x: x[0])

# Convert date_utc to datetime and extract date
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Restrict dates of launches
data = data[data['date'] <= datetime.date(2020, 11, 13)]
print(f"After date filtering - Final shape: {data.shape}")

print("\nFirst 5 rows after preprocessing:")
print(data.head())

=== DATA PREPROCESSING ===
After feature selection - Shape: (107, 6)
After single core/payload filter - Shape: (95, 6)
After date filtering - Final shape: (94, 7)

First 5 rows after preprocessing:
                     rocket                  payloads  \
0  5e9d0d95eda69955f709d1eb  5eb0e4b5b6c3bb0006eeb1e1   
1  5e9d0d95eda69955f709d1eb  5eb0e4b6b6c3bb0006eeb1e2   
3  5e9d0d95eda69955f709d1eb  5eb0e4b7b6c3bb0006eeb1e5   
4  5e9d0d95eda69955f709d1eb  5eb0e4b7b6c3bb0006eeb1e6   
5  5e9d0d95eda69973a809d1ec  5eb0e4b7b6c3bb0006eeb1e7   

                  launchpad  \
0  5e9e4502f5090995de566f86   
1  5e9e4502f5090995de566f86   
3  5e9e4502f5090995de566f86   
4  5e9e4502f5090995de566f86   
5  5e9e4501f509094ba4566f84   

                                               cores  flight_number  \
0  {'core': '5e9e289df35918033d3b2623', 'flight':...              1   
1  {'core': '5e9e289ef35918416a3b2624', 'flight':...              2   
3  {'core': '5e9e289ef3591855dc3b2626', 'flight':...       

In [6]:
# Enrich the launch table by resolving its IDs through the API helper functions
print("=== APPLYING API FUNCTIONS ===")

# Rocket ID -> booster name
BoosterVersion = getBoosterVersion(data)
print(f"BoosterVersion extracted: {len(BoosterVersion)} values")

# Launchpad ID -> site name and coordinates
(LaunchSite, Longitude, Latitude) = getLaunchSite(data)
print(f"LaunchSite extracted: {len(LaunchSite)} values")

# Payload ID -> mass and orbit
(PayloadMass, Orbit) = getPayloadData(data)
print(f"Payload data extracted: {len(PayloadMass)} values")

# Core record -> core details, landing outcome and hardware flags
(
    Block, ReusedCount, Serial,
    Outcome, Flights, Gridfins,
    Reused, Legs, LandingPad,
) = getCoreData(data)
print(f"Core data extracted: {len(Block)} values")

=== APPLYING API FUNCTIONS ===
BoosterVersion extracted: 94 values
LaunchSite extracted: 94 values
Payload data extracted: 94 values
Core data extracted: 94 values


In [7]:
# Assemble the enriched columns into the final dataset
print("=== CREATING FINAL DATASET ===")

# Column name -> values; keyword order fixes the column order of the frame
launch_dict = dict(
    FlightNumber=list(data['flight_number']),
    Date=list(data['date']),
    BoosterVersion=BoosterVersion,
    PayloadMass=PayloadMass,
    Orbit=Orbit,
    LaunchSite=LaunchSite,
    Outcome=Outcome,
    Flights=Flights,
    GridFins=Gridfins,
    Reused=Reused,
    Legs=Legs,
    LandingPad=LandingPad,
    Block=Block,
    ReusedCount=ReusedCount,
    Serial=Serial,
    Longitude=Longitude,
    Latitude=Latitude,
)

# One row per launch, one column per dictionary key
df = pd.DataFrame(data=launch_dict)

print(f"Final dataset shape: {df.shape}")
print("\nFirst 5 rows of final dataset:", df.head(5), sep="\n")

=== CREATING FINAL DATASET ===
Final dataset shape: (94, 17)

First 5 rows of final dataset:
   FlightNumber        Date BoosterVersion  PayloadMass Orbit  \
0             1  2006-03-24       Falcon 1         20.0   LEO   
1             2  2007-03-21       Falcon 1          NaN   LEO   
2             4  2008-09-28       Falcon 1        165.0   LEO   
3             5  2009-07-13       Falcon 1        200.0   LEO   
4             6  2010-06-04       Falcon 9          NaN   LEO   

        LaunchSite Outcome  Flights  GridFins  Reused   Legs LandingPad  \
0  Kwajalein Atoll    None        1     False   False  False       None   
1  Kwajalein Atoll    None        1     False   False  False       None   
2  Kwajalein Atoll    None        1     False   False  False       None   
3  Kwajalein Atoll    None        1     False   False  False       None   
4     CCSFS SLC 40    None        1     False   False  False       None   

   Block  ReusedCount    Serial   Longitude   Latitude  
0    NaN

In [8]:
# Task 2: keep only the Falcon 9 launches
print("=== TASK 2: FILTER FOR FALCON 9 ONLY ===")

# Rows whose booster name contains 'Falcon 9' (missing names count as no match);
# .copy() gives an independent frame that can be modified below
data_falcon9 = df.loc[df['BoosterVersion'].str.contains('Falcon 9', na=False)].copy()
print(f"After Falcon 9 filter - Shape: {data_falcon9.shape}")

# Renumber the flights consecutively starting at 1
data_falcon9.loc[:, 'FlightNumber'] = range(1, data_falcon9.shape[0] + 1)
print("FlightNumber column reset")

=== TASK 2: FILTER FOR FALCON 9 ONLY ===
After Falcon 9 filter - Shape: (90, 17)
FlightNumber column reset


In [9]:
# Report how many values are missing, per column and per row
print(
    "=== CHECKING MISSING VALUES ===",
    "Missing values in each column:",
    data_falcon9.isnull().sum(),
    sep="\n",
)

# A row counts once no matter how many of its columns are missing
print(f"\nTotal rows with missing values: {data_falcon9.isnull().any(axis=1).sum()}")

=== CHECKING MISSING VALUES ===
Missing values in each column:
FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        5
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

Total rows with missing values: 29


In [10]:
# Task 3: Dealing with Missing Values
print("=== TASK 3: DEALING WITH MISSING VALUES ===")

# Calculate the mean value of PayloadMass column (NaN entries are skipped by mean())
payload_mean = data_falcon9['PayloadMass'].mean()
print(f"Mean PayloadMass: {payload_mean:.2f} kg")

# Replace the np.nan values with the mean value.
# Assign the filled column back explicitly: calling fillna(..., inplace=True) on
# data_falcon9['PayloadMass'] operates on a selected column object, which newer
# pandas versions warn about and which does not update the frame under Copy-on-Write.
data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].fillna(payload_mean)

print("After handling missing values in PayloadMass:")
print(f"Missing values in PayloadMass: {data_falcon9['PayloadMass'].isnull().sum()}")

=== TASK 3: DEALING WITH MISSING VALUES ===
Mean PayloadMass: 6123.55 kg
After handling missing values in PayloadMass:
Missing values in PayloadMass: 0


In [11]:
# Export to CSV for the next section
print("=== EXPORTING DATA ===")

# index=False leaves the row index out of the file
data_falcon9.to_csv("dataset_part_1.csv", index=False)
print("Dataset exported successfully as 'dataset_part_1.csv'")

print(f"Final dataset shape: {data_falcon9.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data_falcon9.info()

=== EXPORTING DATA ===
Dataset exported successfully as 'dataset_part_1.csv'
Final dataset shape: (90, 17)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 90 entries, 4 to 93
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   FlightNumber    90 non-null     int64  
 1   Date            90 non-null     object 
 2   BoosterVersion  90 non-null     object 
 3   PayloadMass     90 non-null     float64
 4   Orbit           90 non-null     object 
 5   LaunchSite      90 non-null     object 
 6   Outcome         90 non-null     object 
 7   Flights         90 non-null     int64  
 8   GridFins        90 non-null     bool   
 9   Reused          90 non-null     bool   
 10  Legs            90 non-null     bool   
 11  LandingPad      64 non-null     object 
 12  Block           90 non-null     float64
 13  ReusedCount     90 non-null     int64  
 14  Serial          90 non-null     object 
 15  Longi

In [12]:
# Print a short overview of the exported Falcon 9 dataset
print(
    "=== FINAL DATASET SUMMARY ===",
    f"Dataset shape: {data_falcon9.shape}",
    f"Columns: {list(data_falcon9.columns)}",
    f"Date range: {data_falcon9['Date'].min()} to {data_falcon9['Date'].max()}",
    f"Number of Falcon 9 launches: {len(data_falcon9)}",
    f"Unique booster versions: {data_falcon9['BoosterVersion'].unique()}",
    sep="\n",
)

=== FINAL DATASET SUMMARY ===
Dataset shape: (90, 17)
Columns: ['FlightNumber', 'Date', 'BoosterVersion', 'PayloadMass', 'Orbit', 'LaunchSite', 'Outcome', 'Flights', 'GridFins', 'Reused', 'Legs', 'LandingPad', 'Block', 'ReusedCount', 'Serial', 'Longitude', 'Latitude']
Date range: 2010-06-04 to 2020-11-05
Number of Falcon 9 launches: 90
Unique booster versions: ['Falcon 9']
