In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

from acquire import get_titanic_data
from prepare import prep_titanic
import warnings
warnings.filterwarnings("ignore")

In [2]:
train, validate, test = prep_titanic()
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')
train.head()

train -> (497, 10)
validate -> (214, 10)
test -> (178, 10)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.0,0,0,40.125,1,0,0,1
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,0,1,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


### 1. Start by defining your baseline model.

In [3]:
train.dropna(inplace=True)

In [4]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [5]:
train['baseline_prediction'] = 'not survived'
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male,baseline_prediction
583,0,1,36.0,0,0,40.125,1,0,0,1,not survived
337,1,1,41.0,0,0,134.5,1,0,0,0,not survived
50,0,3,7.0,4,1,39.6875,0,0,1,1,not survived
218,1,1,32.0,0,0,76.2917,1,0,0,0,not survived
31,1,1,29.916875,1,0,146.5208,0,0,0,0,not survived


In [6]:
pd.crosstab(train.baseline_prediction, train.survived)

survived,0,1
baseline_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
not survived,307,190


In [7]:
baseline_accuracy = 1- (train.survived).mean()
baseline_accuracy

0.6177062374245472

In [8]:
X_train = train.drop(columns = ["survived", "baseline_prediction"])
y_train = train.survived

In [9]:
logit1 = LogisticRegression()

In [10]:
logit1 = logit1.fit(X_train, y_train)

In [11]:
print(logit1.coef_)
print(logit1.intercept_)

[[-1.08378928e+00 -3.06325650e-02 -4.87905273e-01 -2.50746444e-01
   1.80619176e-03 -9.12847169e-01  9.03534779e-01  2.64568017e-01
  -2.48352720e+00]]
[4.95340416]


In [12]:
y_pred = logit1.predict(X_train)

In [13]:
logit1.score(X_train, y_train)

0.8028169014084507

### I did not use this model when comparing all other models. Compared models from problem 2 to 4

### 2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [None]:
X_train = train[['pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['pclass','age','fare']]
y_test = test[['survived']]

In [None]:
logit = LogisticRegression()

In [None]:
logit = logit.fit(X_train, y_train)

In [None]:
print(logit.coef_)

print(logit.intercept_)

In [None]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [None]:
y_pred_proba

In [None]:
# accuracy
logit.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### I calculated a baseline accuracy 62% and the model showed a accuracy of 72%

### 3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [None]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [None]:
logit2 = LogisticRegression()

In [None]:
logit2 = logit.fit(X_train, y_train)

In [None]:
print(logit2.coef_)

print(logit2.intercept_)

In [None]:
y_pred = logit2.predict(X_train)
y_pred_proba = logit2.predict_proba(X_train)

In [None]:
# accuracy
logit2.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### Accuracy is at 80% now with sex included

### 4. Try out other combinations of features and models.

In [None]:
# Creating a model with alone embarked Q & S & fare

In [None]:
train.head()

In [None]:
X_train = train[['alone','embarked_Q','embarked_S','fare']]
y_train = train[['survived']]
X_validate = validate[['alone','embarked_Q','embarked_S','fare']]
y_validate = validate[['survived']]
X_test = test[['alone','embarked_Q','embarked_S','fare']]
y_test = test[['survived']]

In [None]:
logit3 = LogisticRegression()

In [None]:
logit3 = logit3.fit(X_train, y_train)

In [None]:
print(logit3.coef_)

print(logit3.intercept_)

In [None]:
# accuracy
logit3.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### Going to do 3. the model set with sex and change some of the 

In [None]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [None]:
logit4 = LogisticRegression(C=1000, class_weight={0:1, 1:3}, random_state=123, max_iter=500, solver='lbfgs')


In [None]:
logit4.fit(X_train, y_train)

In [None]:
LogisticRegression(C=1000, random_state=123)


In [None]:
print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)

In [None]:
y_pred = logit4.predict(X_train)

In [None]:
y_pred_proba = logit4.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train, y_train)))

With a low c ~.1 accuracy went down 1 hundredth .01 to .79. I increased C to 1500 and accuracy stayed at 80%. When weights were added class_weight={0: 1, 1: 99}, the accuracy went down tremendously to .38. Adjusted the class_weight={0: 1, 1: 3} and got 73%


In [None]:
X_train = train[['sex_male','pclass','sibsp','age']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','sibsp','age']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','sibsp','age']]
y_test = test[['survived']]

In [None]:
logit5 = LogisticRegression()

In [None]:
logit5 = logit5.fit(X_train, y_train)

In [None]:
print(logit5.coef_)

print(logit5.intercept_)

In [None]:
# accuracy
logit5.score(X_train, y_train)

In [None]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute percision and support
print(classification_report(y_train, y_pred))

### 5. Use you best 3 models to predict and evaluate on your validate sample. Logit 2, 4, & 5

In [None]:
y_pred2 = logit2.predict(X_validate)
y_pred4 = logit4.predict(X_validate)
y_pred5 = logit5.predict(X_validate)

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred2))

print(classification_report(y_validate, y_pred2))

print("Model 4: solver = lbfgs, c = 1000 ,class_weight={0:1, 1:3}, random_state=123, max_iter=500")

print('Accuracy: {:.2f}'.format(logit4.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred4))

print(classification_report(y_validate, y_pred4))

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

### 6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [None]:
y_pred = logit5.predict(X_test)
y_pred_proba = logit5.predict_proba(X_test)

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_test, y_test)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

In [None]:
#My model got an accuracy of .79 on test, .76 on validate and .81 on test

In [None]:
!git status

# Decision Tree Exercises

## Working through the curric example

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd

df = data('iris')
df.head()

In [None]:
# Cleaning up the data to get rid of capitalized letters and periods
df.columns = [col.lower().replace('.', '_') for col in df]
df.head()

In [None]:
# Dropping species because that is the variable our decision tree is going to try and predict
X = df.drop(['species'], axis = 1)
y = df[['species']]

In [None]:
# Split into train, validate, and test datasets
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)


In [None]:
# Creating the decision tree object
# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# Fitting the data to the trained data
clf.fit(X_train, y_train)

In [None]:
DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# Estimating the species
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
y_train.species.value_counts()

In [None]:
labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

### 1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
train, validate, test = prep_titanic()

In [None]:
train.head()

In [None]:
# Decision tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# Split the data for the train set
X_train1 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [None]:
# Fit the model to the training data
clf.fit(X_train1, y_train)

In [None]:
DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# array of not survived and survived
y_pred1 = clf.predict(X_train1)

In [None]:
# Estimate the probability of not survived
y_pred_proba = clf.predict_proba(X_train1)

### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train1, y_train)))

In [None]:
confusion_matrix(y_pred1, y_train)

In [None]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred1, y_train), index=label1, columns=label)

In [None]:
print("Model1 report:\n", classification_report(y_train, y_pred1))

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
TP = 279
TN = 128
FP = 62
FN = 28

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [None]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

### 4. Run through steps 2-4 using a different max_depth value. Used 6 for this example

In [None]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [None]:
X_train2 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [None]:
clf.fit(X_train2, y_train)

In [None]:
DecisionTreeClassifier(max_depth=6, random_state=123)

In [None]:
y_pred2 = clf.predict(X_train2)

In [None]:
y_pred_proba = clf.predict_proba(X_train2)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train2, y_train)))

In [None]:
confusion_matrix(y_pred2, y_train)

In [None]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred2, y_train), index=label1, columns=label)

In [None]:
print("Model2 report:\n", classification_report(y_train, y_pred2))

In [None]:
TP = 282
TN = 146
FP = 44
FN = 25

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [None]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

### 4. Last model with max depth of 9.

In [None]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [None]:
X_train3 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [None]:
clf.fit(X_train3, y_train)

In [None]:
DecisionTreeClassifier(max_depth=9, random_state=123)

In [None]:
y_pred3 = clf.predict(X_train3)

In [None]:
y_pred_proba = clf.predict_proba(X_train3)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train3, y_train)))

In [None]:
confusion_matrix(y_pred3, y_train)

In [None]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_train), index=label1, columns=label)

In [None]:
print("Model3 report:\n", classification_report(y_train, y_pred3))

In [None]:
TP = 294
TN = 160
FP = 30
FN = 13

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [None]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

In [None]:
# Now going to validate my top 3

In [None]:
# Decision tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
X_validate1 = validate.drop(['survived'], axis=1)
y_validate = validate[['survived']]

In [None]:
clf.fit(X_validate1, y_validate)

In [None]:
DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
y_pred1 = clf.predict(X_validate1)

In [None]:
y_pred_proba = clf.predict_proba(X_validate1)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate1, y_validate)))

In [None]:
# Model 2

In [None]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [None]:
X_validate2 = validate.drop(['survived'], axis=1)

In [None]:
clf.fit(X_validate2, y_validate)

In [None]:
DecisionTreeClassifier(max_depth=6, random_state=123)

In [None]:
y_pred2 = clf.predict(X_validate2)

In [None]:
y_pred_proba = clf.predict_proba(X_validate2)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate2, y_validate)))

In [None]:
# Model 3

In [None]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [None]:
X_validate3 = validate.drop(['survived'], axis=1)

In [None]:
clf.fit(X_validate3, y_validate)

In [None]:
DecisionTreeClassifier(max_depth=9, random_state=123)

In [None]:
y_pred3 = clf.predict(X_validate3)

In [None]:
y_pred_proba = clf.predict_proba(X_validate3)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate3, y_validate)))

In [None]:
confusion_matrix(y_pred3, y_validate)

In [None]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_validate), index=label1, columns=label)

In [None]:
print("Model3 validate report:\n", classification_report(y_validate, y_pred3))

In [None]:
TP = 132
TN = 74
FP = 8
FN = 0

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [None]:
print(f"Model 3 validate \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

In [None]:
validate.head()

In [None]:
# Now doing the test on Model 3

In [None]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [None]:
X_test3 = test.drop(['survived'], axis=1)
y_test = test[['survived']]

In [None]:
clf.fit(X_test3, y_test)

In [None]:
DecisionTreeClassifier(max_depth=9, random_state=123)

In [None]:
y_pred3 = clf.predict(X_test3)

In [None]:
y_pred_proba = clf.predict_proba(X_test3)

In [None]:
print('Test 3 has an accuracy of: {:.2f}'
     .format(clf.score(X_test3, y_test)))

In [None]:
confusion_matrix(y_pred3, y_test)

In [None]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_test), index=label1, columns=label)

In [None]:
TP = 110
TN = 67
FP = 1
FN = 0

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [None]:
print(f"Model 3 Test \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

In conclustion, my train3 had the best accuracy of 86%. On validate it had an accuracy of 92% and tested at 99% on the test model. 

In [None]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_test3, y_test)

import graphviz

from graphviz import Graph

dot_data = export_graphviz(clf, feature_names= X_test3.columns, class_names= {0:'Died', 1:'Survived'}, rounded=True, filled=True, out_file=None)

graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)


In [None]:
!git status

In [None]:
!git add model.ipynb

In [None]:
!git commit -m "Added graphviz tree"

In [None]:
!git push