In [1]:
import pandas as pd
import src.wrangle as wrangle
import src.features as features
import src.preprocessing as preprocessing

In [2]:
# Load the launch records through the project's wrangle helper; the preview
# in the next cell shows the frame indexed by date_time.
df = wrangle.get_space_data()

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0_level_0,company_name,location,rocket_type,rocket_status,mission_cost,mission_status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-08-07 05:12:00+00:00,SpaceX,"LC-39A, Kennedy Space Center, Florida, USA",Falcon 9 Block 5 | Starlink V1 L9 & BlackSky,active,50.0,Success
2020-08-06 04:01:00+00:00,CASC,"Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...",Long March 2D | Gaofen-9 04 & Q-SAT,active,29.75,Success
2020-08-04 23:57:00+00:00,SpaceX,"Pad A, Boca Chica, Texas, USA",Starship Prototype | 150 Meter Hop,active,0.0,Success
2020-07-30 21:25:00+00:00,Roscosmos,"Site 200/39, Baikonur Cosmodrome, Kazakhstan",Proton-M/Briz-M | Ekspress-80 & Ekspress-103,active,65.0,Success
2020-07-30 11:50:00+00:00,ULA,"SLC-41, Cape Canaveral AFS, Florida, USA",Atlas V 541 | Perseverance,active,145.0,Success


In [4]:
# Class balance of the raw outcome labels.
df['mission_status'].value_counts()

Success              3879
Failure               339
Partial Failure       102
Prelaunch Failure       4
Name: mission_status, dtype: int64

> Let's add a new column, `mission_result`, that encodes `mission_status` as 1 (success) or 0 (failure).

In [5]:
# Add the binary mission_result target derived from mission_status.
df = features.mission_result(df)

In [6]:
# Confirm the new mission_result column is present.
df.head(n=5)

Unnamed: 0_level_0,company_name,location,rocket_type,rocket_status,mission_cost,mission_status,mission_result
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-08-07 05:12:00+00:00,SpaceX,"LC-39A, Kennedy Space Center, Florida, USA",Falcon 9 Block 5 | Starlink V1 L9 & BlackSky,active,50.0,Success,1
2020-08-06 04:01:00+00:00,CASC,"Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...",Long March 2D | Gaofen-9 04 & Q-SAT,active,29.75,Success,1
2020-08-04 23:57:00+00:00,SpaceX,"Pad A, Boca Chica, Texas, USA",Starship Prototype | 150 Meter Hop,active,0.0,Success,1
2020-07-30 21:25:00+00:00,Roscosmos,"Site 200/39, Baikonur Cosmodrome, Kazakhstan",Proton-M/Briz-M | Ekspress-80 & Ekspress-103,active,65.0,Success,1
2020-07-30 11:50:00+00:00,ULA,"SLC-41, Cape Canaveral AFS, Florida, USA",Atlas V 541 | Perseverance,active,145.0,Success,1


# Preprocessing

## Splitting the data

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=123,
)

In [9]:
# Report (rows, columns) for the train split, then the test split.
for split_frame in (train, test):
    print(split_frame.shape)

(3459, 7)
(865, 7)


In [10]:
# Move the date_time index into a regular column and give train a fresh RangeIndex.
# Note: only train is reset here; test keeps its date_time index.
train = train.reset_index()

## Encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Text-valued columns that are converted to integer codes before modeling.
cols_to_encode = [
    'company_name',
    'location',
    'rocket_type',
]

In [13]:
# Integer-encode each categorical column on the full frame so that any later
# split shares one category -> code mapping.
# The fitted encoders are kept in `encoders` (keyed by column name) so codes can
# be mapped back to their labels via `encoders[col].classes_` /
# `encoders[col].inverse_transform(...)`, or reused on new data.
encoders = {}
for col in cols_to_encode:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

In [14]:
train, split = preprocessing.split_data(df)

In [15]:
train = preprocessing.encode_data(train)
test = preprocessing.encode_data(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = encoder.fit_transform(df[col])


# Modeling

In [16]:
# One shared feature list keeps the train and test matrices column-aligned.
feature_cols = ['company_name', 'location', 'rocket_type', 'mission_cost']

X_train, y_train = train[feature_cols], train['mission_result']
X_test, y_test = test[feature_cols], test['mission_result']

## Logistic Regression

In [68]:
from sklearn.linear_model import LogisticRegression

In [69]:
log = LogisticRegression(random_state=123)

In [70]:
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
# Mean accuracy on the training rows.
log.score(X=X_train, y=y_train)

0.897080080948251

In [72]:
# Mean accuracy on the held-out rows.
log.score(X=X_test, y=y_test)

0.8855491329479769

## Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
tree = DecisionTreeClassifier(max_depth=3, random_state=123)

In [19]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [20]:
# Mean accuracy on the training rows.
tree.score(X=X_train, y=y_train)

0.8993928881179531

In [21]:
# Mean accuracy on the held-out rows.
tree.score(X=X_test, y=y_test)

0.8855491329479769

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
forest = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=123)

In [25]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [26]:
# Mean accuracy on the training rows.
forest.score(X=X_train, y=y_train)

0.8993928881179531

In [27]:
# Mean accuracy on the held-out rows.
forest.score(X=X_test, y=y_test)

0.8855491329479769

## K Nearest Neighbors

In [62]:
from sklearn.neighbors import KNeighborsClassifier

In [63]:
knn = KNeighborsClassifier(n_neighbors=7)

In [64]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [65]:
# Mean accuracy on the training rows.
knn.score(X=X_train, y=y_train)

0.9193408499566349

In [66]:
# Mean accuracy on the held-out rows.
knn.score(X=X_test, y=y_test)

0.838150289017341