In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

from acquire import get_titanic_data
from prepare import prep_titanic
import warnings
warnings.filterwarnings("ignore")

In [None]:
train, validate, test = prep_titanic()
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')
train.head()

### 1. Start by defining your baseline model.

In [None]:
train.dropna(inplace=True)

In [None]:
train.survived.value_counts()

In [None]:
train['baseline_prediction'] = 'not survived'
train.head()

In [None]:
pd.crosstab(train.baseline_prediction, train.survived)

In [None]:
baseline_accuracy = 1- (train.survived).mean()
baseline_accuracy

### 2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [None]:
X_train = train[['pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['pclass','age','fare']]
y_test = test[['survived']]

In [None]:
logit = LogisticRegression()

In [None]:
logit = logit.fit(X_train, y_train)

In [None]:
print(logit.coef_)

print(logit.intercept_)

In [None]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [None]:
y_pred_proba

In [None]:
# accuracy
logit.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### I calculated a baseline accuracy 62% and the model showed a accuracy of 72%

### 3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [None]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [None]:
logit2 = LogisticRegression()

In [None]:
logit2 = logit.fit(X_train, y_train)

In [None]:
print(logit2.coef_)

print(logit2.intercept_)

In [None]:
y_pred = logit2.predict(X_train)
y_pred_proba = logit2.predict_proba(X_train)

In [None]:
# accuracy
logit2.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### Accuracy is at 80% now with sex included

### 4. Try out other combinations of features and models.

In [None]:
# Creating a model with alone embarked Q & S & fare

In [None]:
train.head()

In [None]:
X_train = train[['alone','embarked_Q','embarked_S','fare']]
y_train = train[['survived']]
X_validate = validate[['alone','embarked_Q','embarked_S','fare']]
y_validate = validate[['survived']]
X_test = test[['alone','embarked_Q','embarked_S','fare']]
y_test = test[['survived']]

In [None]:
logit3 = LogisticRegression()

In [None]:
logit3 = logit3.fit(X_train, y_train)

In [None]:
print(logit3.coef_)

print(logit3.intercept_)

In [None]:
# accuracy
logit3.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### Going to do 3. the model set with sex and change some of the 

In [None]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [None]:
logit4 = LogisticRegression(C=1000, class_weight={0:1, 1:3}, random_state=123, max_iter=500, solver='lbfgs')


In [None]:
logit4.fit(X_train, y_train)

In [None]:
LogisticRegression(C=1000, random_state=123)


In [None]:
print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)

In [None]:
y_pred = logit4.predict(X_train)

In [None]:
y_pred_proba = logit4.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train, y_train)))

With a low c ~.1 accuracy went down 1 hundredth .01 to .79. I increased C to 1500 and accuracy stayed at 80%. When weights were added class_weight={0: 1, 1: 99}, the accuracy went down tremendously to .38. Adjusted the class_weight={0: 1, 1: 3} and got 73%


In [None]:
X_train = train[['sex_male','pclass','sibsp','age']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','sibsp','age']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','sibsp','age']]
y_test = test[['survived']]

In [None]:
logit5 = LogisticRegression()

In [None]:
logit5 = logit5.fit(X_train, y_train)

In [None]:
print(logit5.coef_)

print(logit5.intercept_)

In [None]:
# accuracy
logit5.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### 5. Use you best 3 models to predict and evaluate on your validate sample. Logit 2, 4, & 5

In [None]:
y_pred2 = logit2.predict(X_validate)
y_pred4 = logit4.predict(X_validate)
y_pred5 = logit5.predict(X_validate)

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred2))

print(classification_report(y_validate, y_pred2))

print("Model 4: solver = lbfgs, c = 1000 ,class_weight={0:1, 1:3}, random_state=123, max_iter=500")

print('Accuracy: {:.2f}'.format(logit4.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred4))

print(classification_report(y_validate, y_pred4))

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

### 6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [None]:
y_pred = logit5.predict(X_test)
y_pred_proba = logit5.predict_proba(X_test)

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_test, y_test)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

In [None]:
#My model got an accuracy of .79 on test, .76 on validate and .81 on test

In [None]:
!git status

# Decision Tree Exercises

## Working through the curric example

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd

df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [None]:
# Cleaning up the data to get rid of capitalized letters and periods
df.columns = [col.lower().replace('.', '_') for col in df]
df.head()

In [None]:
# Dropping species because that is the variable our decision tree is going to try and predict
X = df.drop(['species'], axis = 1)
y = df[['species']]

In [None]:
# Split into train, validate, and test datasets
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)


In [None]:
# Creating the decision tree object
# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# Fitting the data to the trained data
clf.fit(X_train, y_train)

In [None]:
DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# Estimating the species
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
y_train.species.value_counts()

In [None]:
labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

### 1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [3]:
train, validate, test = prep_titanic()

In [4]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.0,0,0,40.125,1,0,0,1
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,0,1,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


In [5]:
# Decision tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [6]:
# Split the data for the train set
X_train1 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [7]:
# Fit the model to the training data
clf.fit(X_train1, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [8]:
DecisionTreeClassifier(max_depth=3, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [9]:
# array of not survived and survived
y_pred1 = clf.predict(X_train1)

In [10]:
# Estimate the probability of not survived
y_pred_proba = clf.predict_proba(X_train1)

### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [11]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train1, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [12]:
confusion_matrix(y_pred1, y_train)

array([[279,  62],
       [ 28, 128]])

In [13]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred1, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,279,62
Model Survived,28,128


In [14]:
print("Model1 report:\n", classification_report(y_train, y_pred1))

Model1 report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [15]:
TP = 279
TN = 128
FP = 62
FN = 28

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [16]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Accuracy: 0.82, Precision: 0.82, Recall: 0.91
f1_score: 0.86,Support0: 307, Support1: 190


### 4. Run through steps 2-4 using a different max_depth value. Used 6 for this example

In [17]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [18]:
X_train2 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [19]:
clf.fit(X_train2, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [20]:
DecisionTreeClassifier(max_depth=6, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [21]:
y_pred2 = clf.predict(X_train2)

In [22]:
y_pred_proba = clf.predict_proba(X_train2)

In [23]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train2, y_train)))

Accuracy of Decision Tree classifier on training set: 0.86


In [24]:
confusion_matrix(y_pred2, y_train)

array([[282,  44],
       [ 25, 146]])

In [25]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred2, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,282,44
Model Survived,25,146


In [26]:
print("Model2 report:\n", classification_report(y_train, y_pred2))

Model2 report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89       307
           1       0.85      0.77      0.81       190

    accuracy                           0.86       497
   macro avg       0.86      0.84      0.85       497
weighted avg       0.86      0.86      0.86       497



In [27]:
TP = 282
TN = 146
FP = 44
FN = 25

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [28]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Accuracy: 0.86, Precision: 0.87, Recall: 0.92
f1_score: 0.9,Support0: 307, Support1: 190


### 4. Last model with max depth of 9.

In [29]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [30]:
X_train3 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [31]:
clf.fit(X_train3, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [32]:
DecisionTreeClassifier(max_depth=9, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [33]:
y_pred3 = clf.predict(X_train3)

In [34]:
y_pred_proba = clf.predict_proba(X_train3)

In [35]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train3, y_train)))

Accuracy of Decision Tree classifier on training set: 0.91


In [36]:
confusion_matrix(y_pred3, y_train)

array([[294,  30],
       [ 13, 160]])

In [37]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,294,30
Model Survived,13,160


In [38]:
print("Model3 report:\n", classification_report(y_train, y_pred3))

Model3 report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.93       307
           1       0.92      0.84      0.88       190

    accuracy                           0.91       497
   macro avg       0.92      0.90      0.91       497
weighted avg       0.91      0.91      0.91       497



In [39]:
TP = 294
TN = 160
FP = 30
FN = 13

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [40]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Accuracy: 0.91, Precision: 0.91, Recall: 0.96
f1_score: 0.94,Support0: 307, Support1: 190


In [41]:
# Now going to validate my top 3

In [42]:
# Decision tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [45]:
X_validate1 = validate.drop(['survived'], axis=1)
y_validate = validate[['survived']]

In [46]:
clf.fit(X_validate1, y_validate)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [47]:
DecisionTreeClassifier(max_depth=3, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [48]:
y_pred1 = clf.predict(X_validate1)

In [49]:
y_pred_proba = clf.predict_proba(X_validate1)

In [50]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate1, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.83


In [51]:
# Model 2

In [52]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [53]:
X_validate2 = validate.drop(['survived'], axis=1)

In [54]:
clf.fit(X_validate2, y_validate)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [55]:
DecisionTreeClassifier(max_depth=6, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [56]:
y_pred2 = clf.predict(X_validate2)

In [57]:
y_pred_proba = clf.predict_proba(X_validate2)

In [58]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate2, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.89


In [59]:
# Model 3

In [60]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [61]:
X_validate3 = validate.drop(['survived'], axis=1)

In [62]:
clf.fit(X_validate3, y_validate)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [63]:
DecisionTreeClassifier(max_depth=9, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [64]:
y_pred3 = clf.predict(X_validate3)

In [65]:
y_pred_proba = clf.predict_proba(X_validate3)

In [66]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate3, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.96


In [68]:
confusion_matrix(y_pred3, y_validate)

array([[132,   8],
       [  0,  74]])

In [69]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_validate), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,132,8
Model Survived,0,74


In [71]:
print("Model3 validate report:\n", classification_report(y_validate, y_pred3))

Model3 validate report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       132
           1       1.00      0.90      0.95        82

    accuracy                           0.96       214
   macro avg       0.97      0.95      0.96       214
weighted avg       0.96      0.96      0.96       214



In [72]:
TP = 132
TN = 74
FP = 8
FN = 0

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [74]:
print(f"Model 3 validate \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Model 3 validate 
Accuracy: 0.96, Precision: 0.94, Recall: 1.0
f1_score: 0.97,Support0: 132, Support1: 82


In [75]:
validate.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
610,0,3,39.0,1,5,31.275,0,0,1,0
424,0,3,18.0,1,1,20.2125,0,0,1,1
568,0,3,29.916875,0,0,7.2292,1,0,0,1
701,1,1,35.0,0,0,26.2875,1,0,1,1
101,0,3,29.916875,0,0,7.8958,1,0,1,1


In [None]:
# Now doing the test on Model 3

In [None]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [77]:
X_test3 = test.drop(['survived'], axis=1)
y_test = test[['survived']]

In [78]:
clf.fit(X_test3, y_test)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [79]:
DecisionTreeClassifier(max_depth=9, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [80]:
y_pred3 = clf.predict(X_test3)

In [81]:
y_pred_proba = clf.predict_proba(X_test3)

In [84]:
print('Test 3 has an accuracy of: {:.2f}'
     .format(clf.score(X_test3, y_test)))

Test 3 has an accuracy of: 0.99


In [85]:
confusion_matrix(y_pred3, y_test)

array([[110,   1],
       [  0,  67]])

In [86]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_test), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,110,1
Model Survived,0,67


In [87]:
TP = 110
TN = 67
FP = 1
FN = 0

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [88]:
print(f"Model 3 Test \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Model 3 Test 
Accuracy: 0.99, Precision: 0.99, Recall: 1.0
f1_score: 0.99,Support0: 110, Support1: 68


In conclustion, my train3 had the best accuracy of 86%. On validate it had an accuracy of 92% and tested at 99% on the test model. 

In [102]:
test = prep_titanic()

clf = DecisionTreeClassifier()
clf = clf.fit(X_test3, y_test)

import graphviz

from graphviz import Graph

dot_data = export_graphviz(clf, feature_names= X_test3.columns, class_names= {0:'Died', 1:'Survived'}, rounded=True, filled=True, out_file=None)

graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)


'titanic_decision_tree.pdf'

In [89]:
!git status

On branch master
Your branch and 'origin/master' have diverged,
and have 1 and 1 different commits each, respectively.
  (use "git pull" to merge the remote branch into yours)

All conflicts fixed but you are still merging.
  (use "git commit" to conclude merge)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   model.ipynb[m
	[31mmodified:   prepare.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31miris_decision_tree[m



In [90]:
!git add -A

In [91]:
!git commit -m "Finished the exercise"

[master aab3dbf] Finished the exercise


In [92]:
!git push

Enumerating objects: 18, done.
Counting objects: 100% (15/15), done.
Delta compression using up to 8 threads
Compressing objects: 100% (10/10), done.
Writing objects: 100% (10/10), 8.65 KiB | 2.88 MiB/s, done.
Total 10 (delta 7), reused 0 (delta 0)
remote: Resolving deltas: 100% (7/7), completed with 4 local objects.[K
To https://github.com/george887/classification_exercises.git
   eed1b9f..aab3dbf  master -> master
