# **SpaceX  Falcon 9 first stage Landing Prediction**


# Part 1: Collecting the data using API requests

Import the following libraries.

In [1]:
import requests

import pandas as pd

import numpy as np

import datetime

import sys

from bs4 import BeautifulSoup

import re

import unicodedata

# Setting this option will print all collumns of a dataframe
pd.set_option('display.max_columns', None)

# Setting this option will print all of the data in a feature
pd.set_option('display.max_colwidth', None)

Below are a series of functions that will help us use the API to extract information using identification numbers in the launch data.

From the <code>rocket</code> column we would like to learn the booster name.

In [2]:
# Takes the dataset and uses the rocket column to call the API and append the data to the list
def getBoosterVersion(data):
    for x in data['rocket']:
        if x:
            response = requests.get("https://api.spacexdata.com/v4/rockets/"+str(x)).json()
            BoosterVersion.append(response['name'])

From the <code>launchpad</code> we would like to learn the following:
    the name of the launch site being used;
    the logitude;
    the latitude

In [3]:
# Takes the dataset and uses the launchpad column to call the API and append the data to the list
def getLaunchSite(data):
    for x in data['launchpad']:
        if x:
            response = requests.get("https://api.spacexdata.com/v4/launchpads/"+str(x)).json() 
            Longitude.append(response['longitude']) 
            Latitude.append(response['latitude']) 
            LaunchSite.append(response['name'])

From the <code>payload</code> we would like to learn the following: 
    mass of the payload;
    the orbit that it is going to

In [4]:
# Takes the dataset and uses the payloads column to call the API and append the data to the lists
def getPayloadData(data):
    for load in data['payloads']:
        if load:
            response = requests.get("https://api.spacexdata.com/v4/payloads/"+load).json()
            PayloadMass.append(response['mass_kg'])
            Orbit.append(response['orbit'])

From <code>cores</code> we would like to learn the following: 
    outcome of the landing; 
    the type of the landing; 
    number of flights with that core; 
    whether gridfins were used; 
    wheter the core is reused; 
    wheter legs were used; 
    the landing pad used; 
    the block of the core which is a number used to seperate version of cores; 
    the number of times this specific core has been reused; 
    the serial of the core; 

In [5]:
# Takes the dataset and uses the cores column to call the API and append the data to the lists
def getCoreData(data):
    for core in data['cores']:
        if core['core'] != None:
            response = requests.get("https://api.spacexdata.com/v4/cores/"+core['core']).json()
            Block.append(response['block'])
            ReusedCount.append(response['reuse_count'])
            Serial.append(response['serial'])
        else:
            Block.append(None)
            ReusedCount.append(None)
            Serial.append(None)
        Outcome.append(str(core['landing_success'])+' '+str(core['landing_type']))
        Flights.append(core['flight'])
        GridFins.append(core['gridfins'])
        Reused.append(core['reused'])
        Legs.append(core['legs'])
        LandingPad.append(core['landpad'])

Use the below URL to request data from the SpaceX API.

In [6]:
spacex_url="https://api.spacexdata.com/v4/launches/past"

In [7]:
response = requests.get(spacex_url)

(Optional) Check the content of the response

In [8]:
# Uncomment the below line to see the response from the API request. 
#print(response.content)

### Step 1: Request and parse the SpaceX launch data using the GET request

Check the status of the request (200 means request was successful).

In [9]:
response.status_code

200

Decode the response content as a Json using <code>.json()</code> and turn it into a Pandas dataframe using <code>.json_normalize()</code>

In [10]:
# Use json_normalize method to convert the json result into a dataframe
data = pd.json_normalize(response.json())

(Optional): Using the dataframe <code>data</code> print the first 5 rows

In [11]:
# Uncomment the below line to see the first 5 rows of the dataframe
#data.head()

Majority of the data are IDs. For example the rocket column has no information about the rocket just an identification number, so we need to use the API to get more information. Specifically <code>rocket</code>, <code>payloads</code>, <code>launchpad</code>, and <code>cores</code>.

In [12]:
# Lets take a subset of our dataframe keeping only the features we want and the flight number, and date_utc.
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# We will remove rows with multiple cores because those are falcon rockets with 2 extra rocket boosters and rows that have multiple payloads in a single rocket.
data = data[data['cores'].map(len)==1]
data = data[data['payloads'].map(len)==1]

# Since payloads and cores are lists of size 1 we will also extract the single value in the list and replace the feature.
data['cores'] = data['cores'].map(lambda x : x[0])
data['payloads'] = data['payloads'].map(lambda x : x[0])

# We also want to convert the date_utc to a datetime datatype and then extracting the date leaving the time
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Using the date we will restrict the dates of the launches
data = data[data['date'] <= datetime.date(2020, 11, 13)]

The data from these requests will be stored in lists and will be used to create a new dataframe.

In [13]:
#Global variables 
BoosterVersion = []
PayloadMass = []
Orbit = []
LaunchSite = []
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []
Longitude = []
Latitude = []

The predefined functions from the top of the notebook will supply the outputs to the globally defined variables above.

In [14]:
# Call getLaunchSite
getLaunchSite(data)

In [15]:
#Call getBoosterVersion
getBoosterVersion(data)

In [16]:
# Call getPayloadData
getPayloadData(data)

In [17]:
# Call getCoreData
getCoreData(data)

Create a dictionay using the obtained data.

In [18]:
launch_dict = {'FlightNumber': list(data['flight_number']),
'Date': list(data['date']),
'BoosterVersion':BoosterVersion,
'PayloadMass':PayloadMass,
'Orbit':Orbit,
'LaunchSite':LaunchSite,
'Outcome':Outcome,
'Flights':Flights,
'GridFins':GridFins,
'Reused':Reused,
'Legs':Legs,
'LandingPad':LandingPad,
'Block':Block,
'ReusedCount':ReusedCount,
'Serial':Serial,
'Longitude': Longitude,
'Latitude': Latitude}


Create a Pandas data frame from the above dictionary.

In [19]:
# Create a data from launch_dict
data_2 = pd.DataFrame(launch_dict)

(Optional): Show the first 5 rows of the dataframe.

In [20]:
# Uncomment the below line to show the head of the dataframe
#data_2.head()

### Step 2: Filter the dataframe to only include `Falcon 9` launches

We are only interested in the Falcon 9 launches.

In [21]:
# Filter out Falcon 1 launches
data_falcon9 = data_2[data_2['BoosterVersion']!='Falcon 1']

We need to reset the FlightNumber column.

In [22]:
# Renumber the FlightNumber column 
data_falcon9.loc[:,'FlightNumber'] = list(range(1, data_falcon9.shape[0]+1))

# Part 2: Data Wrangling

### Step 1: Dealing with Missing Values

We need to check for missing values in our new dataset.

In [23]:
# Sum all of the null values in each column of the dataset
data_falcon9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        5
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

We are missing data from the <code>PayLoadMass</code> column, and the <code>LandingPad</code> column. <code>LandingPad</code> missing values show no use of a landing pad after a launch. No need to change those values.

We are going to replace the missing <code>PayLoadMass</code> values with the average or mean of the <code>PayLoadMass</code>.

In [24]:
# Calculate the mean value of PayloadMass column
payload_mean = data_falcon9['PayloadMass'].mean()

# Replace the np.nan values with its mean value
data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].replace(np.nan,payload_mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].replace(np.nan,payload_mean)


We can now export it to a <b>CSV</b>.

In [25]:
data_falcon9.to_csv('dataset_part_1.csv', index=False)

### Step 2: Classify Outcomes

Using the <code>Outcome</code> column, we can classify failures and successes using One Hot Encoding.

In [26]:
# Use landing_outcomes variable to see all possible outcomes
landing_outcomes = data_falcon9['Outcome'].value_counts()
for i,outcome in enumerate(landing_outcomes.keys()):
    print(i,outcome)

0 True ASDS
1 None None
2 True RTLS
3 False ASDS
4 True Ocean
5 False Ocean
6 None ASDS
7 False RTLS


In [27]:
# Assing failures to bad_outcomes
bad_outcomes=set(landing_outcomes.keys()[[1,3,5,6,7]])

In [28]:
# Assign classification to new variable landing_class
# landing_class = 0 if bad_outcome
# landing_class = 1 otherwise
landing_class = data_falcon9['Outcome'].map(lambda x: 0 if x in bad_outcomes else 1)

In [29]:
# Add a new column with outcome classifications
data_falcon9['Class']=landing_class

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_falcon9['Class']=landing_class


We can now export it to a <b>CSV</b>.

In [30]:
data_falcon9.to_csv("dataset_part_2.csv", index=False)

# Part 3: Features Engineering

We will select the features that will be used in success prediction.

In [31]:
features = data_falcon9[['FlightNumber', 'PayloadMass', 'Orbit', 'LaunchSite', 'Flights', 'GridFins', 'Reused', 'Legs', 'LandingPad', 'Block', 'ReusedCount', 'Serial']]

### Step  1: Create dummy variables to categorical columns

In [32]:
# Use get_dummies() function on the categorical columns
features_one_hot = pd.get_dummies(features, columns=['Orbit', 'LaunchSite','LandingPad', 'Serial'], drop_first=False)

### Step  2: Cast all numeric columns to `float64`

In [33]:
# Use astype function
features_one_hot = features_one_hot.astype('float64', errors='ignore')

We can now export it to a <b>CSV</b>.

In [34]:
features_one_hot.to_csv('dataset_part_3.csv', index=False)