### Using the Titanic data, in your classification-exercises repository, create a notebook, model.ipynb, where you will do the following:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as warn
import sys
import env
import numpy as np

# Add the directory named by env.util_repo to the import search path.
# NOTE(review): presumably this is where the `acquire` module imported below lives — confirm.
sys.path.append(env.util_repo)

from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from scipy import stats

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/sklearn warnings are hidden too;
# consider narrowing it to the specific warning categories that are known to be noise.
warn.filterwarnings("ignore")

### 1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [28]:
# Load the Titanic passengers through the project's acquire helper.
# NOTE(review): the frame arrives with an 'Unnamed: 0' column, which looks like a saved
# index from a cached CSV — confirm in acquire.get_titanic_data.
titanic_df = get_titanic_data()
# Preview the first rows to sanity-check the load.
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [29]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
titanic_df.shape

(891, 14)

In [30]:
# Per-column dtypes and non-null counts; used to spot columns with missing values.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   passenger_id  891 non-null    int64  
 2   survived      891 non-null    int64  
 3   pclass        891 non-null    int64  
 4   sex           891 non-null    object 
 5   age           714 non-null    float64
 6   sibsp         891 non-null    int64  
 7   parch         891 non-null    int64  
 8   fare          891 non-null    float64
 9   embarked      889 non-null    object 
 10  class         891 non-null    object 
 11  deck          203 non-null    object 
 12  embark_town   889 non-null    object 
 13  alone         891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [31]:
# Drop columns that are redundant, mostly missing, or irrelevant to the model:
#   - 'Unnamed: 0', 'passenger_id': row identifiers, not passenger attributes
#   - 'pclass', 'embarked':        encoded duplicates of 'class' and 'embark_town'
#   - 'age', 'deck':               incomplete (714 and 203 non-null out of 891)
#   - 'sibsp', 'parch':            left out of this analysis
#     (NOTE(review): presumably summarised by 'alone' — confirm)
#
# The result is re-bound instead of mutated in place, and errors="ignore" skips
# columns that are already gone, so re-running this cell does not raise KeyError.
columns_to_drop = ['Unnamed: 0', 'passenger_id', 'age', 'sibsp', 'parch', 'deck', 'embarked', 'pclass']
titanic_df = titanic_df.drop(columns=columns_to_drop, errors="ignore")
titanic_df.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone
0,0,male,7.25,Third,Southampton,0
1,1,female,71.2833,First,Cherbourg,0
2,1,female,7.925,Third,Southampton,1
3,1,female,53.1,First,Southampton,0
4,0,male,8.05,Third,Southampton,1


In [38]:
# Fill the missing embark_town values with "Southampton", the most common town.
# The fill is limited to this one column: a frame-wide fillna would also write the
# town name into any other column that happens to contain NaN.
titanic_df["embark_town"] = titanic_df["embark_town"].fillna("Southampton")
# Confirm the counts after imputation.
titanic_df.embark_town.value_counts()

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [32]:
# Two-stage stratified split on the target:
#   1) hold out 20% of all rows as `test`
#   2) carve 30% of the remainder off as `validate`; the rest is `train`
split_seed = 1414
train_and_validate, test = train_test_split(
    titanic_df,
    test_size=.2,
    random_state=split_seed,
    stratify=titanic_df['survived'],
)
train, validate = train_test_split(
    train_and_validate,
    test_size=.3,
    random_state=split_seed,
    stratify=train_and_validate['survived'],
)

In [33]:
# Share of training passengers who survived. A value below 0.5 means the
# majority class is 0 (did not survive).
train.survived.mean()

0.38353413654618473

In [34]:
# Baseline prediction = the most prevalent class in the training split (the mode).
# It is derived from the data instead of hard-coded, so it stays correct if the
# split or the dataset changes. For this split the mode is 0 (did NOT survive).
baseline_prediction = train.survived.mode()[0]
train['baseline_survived'] = baseline_prediction
train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,baseline_survived
824,0,male,39.6875,Third,Southampton,0,0
822,0,male,0.0,First,Southampton,1,0
149,0,male,13.0,Second,Southampton,1,0
752,0,male,9.5,Third,Southampton,1,0
94,0,male,7.25,Third,Southampton,1,0


In [35]:
# Baseline accuracy: fraction of training rows where the constant baseline
# prediction matches the actual outcome.
baseline_is_correct = train.baseline_survived == train.survived
baseline_accuracy = baseline_is_correct.mean()
print(f"baseline accuracy:  {baseline_accuracy:.2%}")

baseline accuracy:  61.65%


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [21]:
# Prepare inputs (X) and targets (y) from the training split.
# 'baseline_survived' is the constant baseline-prediction column added to `train`
# earlier. It is not a passenger attribute, so it is removed along with the target
# to keep it out of the feature matrix.
# NOTE(review): these variables hold TRAINING data despite the *_test names. The
# names are kept so later cells that reference them still run; consider renaming
# to X_train / y_train throughout the notebook.
# NOTE(review): 'sex', 'class' and 'embark_town' are still strings here and will
# presumably need encoding before a scikit-learn estimator can be fit — confirm.
X_test = train.drop(columns=['survived', 'baseline_survived'])
y_test = train.survived



(     pclass     sex   age  sibsp  parch     fare embarked   class deck  \
 824       3    male   2.0      4      1  39.6875        S   Third  NaN   
 822       1    male  38.0      0      0   0.0000        S   First  NaN   
 149       2    male  42.0      0      0  13.0000        S  Second  NaN   
 752       3    male  33.0      0      0   9.5000        S   Third  NaN   
 94        3    male  59.0      0      0   7.2500        S   Third  NaN   
 ..      ...     ...   ...    ...    ...      ...      ...     ...  ...   
 512       1    male  36.0      0      0  26.2875        S   First    E   
 775       3    male  18.0      0      0   7.7500        S   Third  NaN   
 533       3  female   NaN      0      2  22.3583        C   Third  NaN   
 823       3  female  27.0      0      1  12.4750        S   Third    E   
 172       3  female   1.0      1      1  11.1333        S   Third  NaN   
 
      embark_town  alone  baseline_survived  
 824  Southampton      0                  0  
 822  