In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

from acquire import get_titanic_data
from prepare import prep_titanic
import warnings
warnings.filterwarnings("ignore")

In [2]:
train, validate, test = prep_titanic()
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')
train.head()

train -> (497, 10)
validate -> (214, 10)
test -> (178, 10)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.0,0,0,40.125,1,0,0,1
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,0,1,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


### 1. Start by defining your baseline model.

In [3]:
train.dropna(inplace=True)

In [4]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [5]:
train['baseline_prediction'] = 'not survived'
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male,baseline_prediction
583,0,1,36.0,0,0,40.125,1,0,0,1,not survived
337,1,1,41.0,0,0,134.5,1,0,0,0,not survived
50,0,3,7.0,4,1,39.6875,0,0,1,1,not survived
218,1,1,32.0,0,0,76.2917,1,0,0,0,not survived
31,1,1,29.916875,1,0,146.5208,0,0,0,0,not survived


In [6]:
pd.crosstab(train.baseline_prediction, train.survived)

survived,0,1
baseline_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
not survived,307,190


In [7]:
baseline_accuracy = 1- (train.survived).mean()
baseline_accuracy

0.6177062374245472

In [8]:
X_train = train.drop(columns = ["survived", "baseline_prediction"])
y_train = train.survived

In [9]:
logit1 = LogisticRegression()

In [10]:
logit1 = logit1.fit(X_train, y_train)

In [11]:
print(logit1.coef_)
print(logit1.intercept_)

[[-1.08378928e+00 -3.06325650e-02 -4.87905273e-01 -2.50746444e-01
   1.80619176e-03 -9.12847169e-01  9.03534779e-01  2.64568017e-01
  -2.48352720e+00]]
[4.95340416]


In [12]:
y_pred = logit1.predict(X_train)

In [13]:
logit1.score(X_train, y_train)

0.8028169014084507

### I did not use this model when comparing all other models. Compared models from problem 2 to 4

### 2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [14]:
X_train = train[['pclass','age','fare']]
y_train = train[['survived']]
# Don't need to create validate and test until you need it
X_validate = validate[['pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['pclass','age','fare']]
y_test = test[['survived']]

In [15]:
logit = LogisticRegression()

In [16]:
logit = logit.fit(X_train, y_train)

In [17]:
print(logit.coef_)

print(logit.intercept_)

[[-0.97983178 -0.03051881  0.00266519]]
[2.52970125]


In [18]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [19]:
y_pred_proba

array([[0.36397951, 0.63602049],
       [0.34139883, 0.65860117],
       [0.6265983 , 0.3734017 ],
       [0.31505319, 0.68494681],
       [0.26359851, 0.73640149],
       [0.56992912, 0.43007088],
       [0.66202017, 0.33797983],
       [0.56377681, 0.43622319],
       [0.78619933, 0.21380067],
       [0.75721249, 0.24278751],
       [0.83302159, 0.16697841],
       [0.5524311 , 0.4475689 ],
       [0.26852561, 0.73147439],
       [0.44097115, 0.55902885],
       [0.65470646, 0.34529354],
       [0.69246728, 0.30753272],
       [0.78606491, 0.21393509],
       [0.20972644, 0.79027356],
       [0.60657891, 0.39342109],
       [0.75981083, 0.24018917],
       [0.67296533, 0.32703467],
       [0.78618813, 0.21381187],
       [0.44717612, 0.55282388],
       [0.70630063, 0.29369937],
       [0.43849213, 0.56150787],
       [0.74851029, 0.25148971],
       [0.44449993, 0.55550007],
       [0.47490776, 0.52509224],
       [0.33049719, 0.66950281],
       [0.55461193, 0.44538807],
       [0.

In [20]:
# accuracy
logit.score(X_train, y_train)

0.716297786720322

In [21]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[265  42]
 [ 99  91]]


In [22]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



### I calculated a baseline accuracy 62% and the model showed a accuracy of 72%

### 3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [23]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [24]:
logit2 = LogisticRegression()

In [25]:
logit2 = logit.fit(X_train, y_train)

In [26]:
print(logit2.coef_)

print(logit2.intercept_)

[[-2.45878213e+00 -1.11402368e+00 -2.66594879e-02  9.02716903e-04]]
[4.30664987]


In [27]:
y_pred = logit2.predict(X_train)
y_pred_proba = logit2.predict_proba(X_train)

In [28]:
# accuracy
logit2.score(X_train, y_train)

0.7987927565392354

In [29]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[263  44]
 [ 56 134]]


In [30]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



### Accuracy is at 80% now with sex included

### 4. Try out other combinations of features and models.

In [31]:
# Creating a model with alone embarked Q & S & fare

In [32]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male,baseline_prediction
583,0,1,36.0,0,0,40.125,1,0,0,1,not survived
337,1,1,41.0,0,0,134.5,1,0,0,0,not survived
50,0,3,7.0,4,1,39.6875,0,0,1,1,not survived
218,1,1,32.0,0,0,76.2917,1,0,0,0,not survived
31,1,1,29.916875,1,0,146.5208,0,0,0,0,not survived


In [33]:
X_train = train[['alone','embarked_Q','embarked_S','fare']]
y_train = train[['survived']]
X_validate = validate[['alone','embarked_Q','embarked_S','fare']]
y_validate = validate[['survived']]
X_test = test[['alone','embarked_Q','embarked_S','fare']]
y_test = test[['survived']]

In [34]:
logit3 = LogisticRegression()

In [35]:
logit3 = logit3.fit(X_train, y_train)

In [36]:
print(logit3.coef_)

print(logit3.intercept_)

[[-0.82913469  0.20969254 -0.26724887  0.00935761]]
[-0.12836476]


In [37]:
# accuracy
logit3.score(X_train, y_train)

0.6720321931589537

In [38]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[263  44]
 [ 56 134]]


In [39]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



### Going to do 3. the model set with sex and change some of the 

In [40]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [41]:
logit4 = LogisticRegression(C=1000, class_weight={0:1, 1:3}, random_state=123, max_iter=500, solver='lbfgs')


In [42]:
logit4.fit(X_train, y_train)

LogisticRegression(C=1000, class_weight={0: 1, 1: 3}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
LogisticRegression(C=1000, random_state=123)


LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)

Coefficient: 
 [[-2.48146317e+00 -1.04576529e+00 -2.88485000e-02  1.36953490e-03]]
Intercept: 
 [5.31245845]


In [45]:
y_pred = logit4.predict(X_train)

In [46]:
y_pred_proba = logit4.predict_proba(X_train)

In [47]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.73


With a low c ~.1 accuracy went down 1 hundredth .01 to .79. I increased C to 1500 and accuracy stayed at 80%. When weights were added class_weight={0: 1, 1: 99}, the accuracy went down tremendously to .38. Adjusted the class_weight={0: 1, 1: 3} and got 73%


In [48]:
X_train = train[['sex_male','pclass','sibsp','age']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','sibsp','age']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','sibsp','age']]
y_test = test[['survived']]

In [49]:
logit5 = LogisticRegression()

In [50]:
logit5 = logit5.fit(X_train, y_train)

In [51]:
print(logit5.coef_)

print(logit5.intercept_)

[[-2.57815189 -1.14702511 -0.29479208 -0.03177517]]
[4.79231522]


In [52]:
# accuracy
logit5.score(X_train, y_train)

0.7867203219315896

In [53]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[205 102]
 [ 31 159]]


In [54]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.67      0.76       307
           1       0.61      0.84      0.71       190

    accuracy                           0.73       497
   macro avg       0.74      0.75      0.73       497
weighted avg       0.77      0.73      0.74       497



### 5. Use you best 3 models to predict and evaluate on your validate sample. Logit 2, 4, & 5

In [55]:
y_pred2 = logit2.predict(X_validate)
y_pred4 = logit4.predict(X_validate)
y_pred5 = logit5.predict(X_validate)

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred2))

print(classification_report(y_validate, y_pred2))

print("Model 4: solver = lbfgs, c = 1000 ,class_weight={0:1, 1:3}, random_state=123, max_iter=500")

print('Accuracy: {:.2f}'.format(logit4.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred4))

print(classification_report(y_validate, y_pred4))

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

Model 2: solver = lbfgs, c = 1
Accuracy: 0.74
[[94 38]
 [18 64]]
              precision    recall  f1-score   support

           0       0.84      0.71      0.77       132
           1       0.63      0.78      0.70        82

    accuracy                           0.74       214
   macro avg       0.73      0.75      0.73       214
weighted avg       0.76      0.74      0.74       214

Model 4: solver = lbfgs, c = 1000 ,class_weight={0:1, 1:3}, random_state=123, max_iter=500
Accuracy: 0.68
[[76 56]
 [13 69]]
              precision    recall  f1-score   support

           0       0.85      0.58      0.69       132
           1       0.55      0.84      0.67        82

    accuracy                           0.68       214
   macro avg       0.70      0.71      0.68       214
weighted avg       0.74      0.68      0.68       214

Model 5: solver = lbfgs, c = 1
Accuracy: 0.76
[[109  23]
 [ 28  54]]
              precision    recall  f1-score   support

           0       0.80      0.8

### 6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [56]:
y_pred = logit5.predict(X_test)
y_pred_proba = logit5.predict_proba(X_test)

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_test, y_test)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

Model 5: solver = lbfgs, c = 1
Accuracy: 0.81
[[109  23]
 [ 28  54]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



In [57]:
#My model got an accuracy of .79 on train, .76 on validate and .81 on test

In [58]:
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   model.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mtitanic_decision_tree[m
	[31mtitanic_decision_tree.pdf[m

no changes added to commit (use "git add" and/or "git commit -a")


# Decision Tree Exercises

## Working through the curric example

In [59]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd

df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [60]:
# Cleaning up the data to get rid of capitalized letters and periods
df.columns = [col.lower().replace('.', '_') for col in df]
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [61]:
# Dropping species because that is the variable our decision tree is going to try and predict
X = df.drop(['species'], axis = 1)
y = df[['species']]

In [62]:
# Split into train, validate, and test datasets
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)


In [63]:
# Creating the decision tree object
# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [64]:
# Fitting the data to the trained data
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [65]:
DecisionTreeClassifier(max_depth=3, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [66]:
# Estimating the species
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['versicolor', 'setosa', 'virginica', 'setosa', 'virginica'],
      dtype=object)

In [67]:
y_pred_proba = clf.predict_proba(X_train)

In [68]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.98


In [69]:
confusion_matrix(y_train, y_pred)

array([[26,  0,  0],
       [ 0, 29,  2],
       [ 0,  0, 27]])

In [70]:
y_train.species.value_counts()

versicolor    31
virginica     27
setosa        26
Name: species, dtype: int64

In [71]:
labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,26,0,0
versicolor,0,29,2
virginica,0,0,27


In [72]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        26
  versicolor       1.00      0.94      0.97        31
   virginica       0.93      1.00      0.96        27

    accuracy                           0.98        84
   macro avg       0.98      0.98      0.98        84
weighted avg       0.98      0.98      0.98        84



### 1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [73]:
from acquire import get_titanic_data
from prepare import prep_titanic
import warnings
warnings.filterwarnings("ignore")

In [74]:
train, validate, test = prep_titanic()

In [75]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.0,0,0,40.125,1,0,0,1
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,0,1,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


In [76]:
# Decision tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [77]:
# Split the data for the train set
X_train1 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [78]:
# Fit the model to the training data
clf.fit(X_train1, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [79]:
DecisionTreeClassifier(max_depth=3, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [80]:
# array of not survived and survived
y_pred1 = clf.predict(X_train1)

In [81]:
# Estimate the probability of not survived
y_pred_proba = clf.predict_proba(X_train1)

### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [82]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train1, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [83]:
confusion_matrix(y_pred1, y_train)

array([[279,  62],
       [ 28, 128]])

In [84]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred1, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,279,62
Model Survived,28,128


In [85]:
print("Model1 report:\n", classification_report(y_train, y_pred1))

Model1 report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [86]:
TP = 279
TN = 128
FP = 62
FN = 28

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [87]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Accuracy: 0.82, Precision: 0.82, Recall: 0.91
f1_score: 0.86,Support0: 307, Support1: 190


### 4. Run through steps 2-4 using a different max_depth value. Used 6 for this example

In [88]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [89]:
X_train2 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [90]:
clf.fit(X_train2, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [91]:
DecisionTreeClassifier(max_depth=6, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [92]:
y_pred2 = clf.predict(X_train2)

In [93]:
y_pred_proba = clf.predict_proba(X_train2)

In [94]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train2, y_train)))

Accuracy of Decision Tree classifier on training set: 0.86


In [95]:
confusion_matrix(y_pred2, y_train)

array([[282,  44],
       [ 25, 146]])

In [96]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred2, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,282,44
Model Survived,25,146


In [97]:
print("Model2 report:\n", classification_report(y_train, y_pred2))

Model2 report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89       307
           1       0.85      0.77      0.81       190

    accuracy                           0.86       497
   macro avg       0.86      0.84      0.85       497
weighted avg       0.86      0.86      0.86       497



In [98]:
TP = 282
TN = 146
FP = 44
FN = 25

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [99]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Accuracy: 0.86, Precision: 0.87, Recall: 0.92
f1_score: 0.9,Support0: 307, Support1: 190


### 4. Last model with max depth of 9.

In [100]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [101]:
X_train3 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [102]:
clf.fit(X_train3, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [103]:
DecisionTreeClassifier(max_depth=9, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [104]:
y_pred3 = clf.predict(X_train3)

In [105]:
y_pred_proba = clf.predict_proba(X_train3)

In [106]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train3, y_train)))

Accuracy of Decision Tree classifier on training set: 0.91


In [107]:
confusion_matrix(y_pred3, y_train)

array([[294,  30],
       [ 13, 160]])

In [108]:
tn, fp, fn, tp = confusion_matrix(y_pred3, y_train).ravel()

In [109]:
tn, fp, fn, tp

(294, 30, 13, 160)

In [110]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,294,30
Model Survived,13,160


In [111]:
print("Model3 report:\n", classification_report(y_train, y_pred3))

Model3 report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.93       307
           1       0.92      0.84      0.88       190

    accuracy                           0.91       497
   macro avg       0.92      0.90      0.91       497
weighted avg       0.91      0.91      0.91       497



In [112]:
TP = 294
TN = 160
FP = 30
FN = 13

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [113]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Accuracy: 0.91, Precision: 0.91, Recall: 0.96
f1_score: 0.94,Support0: 307, Support1: 190


In [114]:
# Now going to validate my top 3

In [115]:
# Decision tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [116]:
X_validate1 = validate.drop(['survived'], axis=1)
y_validate = validate[['survived']]

In [117]:
clf.fit(X_validate1, y_validate)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [118]:
DecisionTreeClassifier(max_depth=3, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [119]:
y_pred1 = clf.predict(X_validate1)

In [120]:
y_pred_proba = clf.predict_proba(X_validate1)

In [121]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate1, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.83


In [122]:
# Model 2

In [123]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [124]:
X_validate2 = validate.drop(['survived'], axis=1)

In [125]:
clf.fit(X_validate2, y_validate)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [126]:
DecisionTreeClassifier(max_depth=6, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [127]:
y_pred2 = clf.predict(X_validate2)

In [128]:
y_pred_proba = clf.predict_proba(X_validate2)

In [129]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate2, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.89


In [130]:
# Model 3

In [131]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [132]:
X_validate3 = validate.drop(['survived'], axis=1)

In [133]:
clf.fit(X_validate3, y_validate)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [134]:
DecisionTreeClassifier(max_depth=9, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [135]:
y_pred3 = clf.predict(X_validate3)

In [136]:
y_pred_proba = clf.predict_proba(X_validate3)

In [137]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate3, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.96


In [138]:
confusion_matrix(y_pred3, y_validate)

array([[132,   8],
       [  0,  74]])

In [139]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_validate), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,132,8
Model Survived,0,74


In [140]:
print("Model3 validate report:\n", classification_report(y_validate, y_pred3))

Model3 validate report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       132
           1       1.00      0.90      0.95        82

    accuracy                           0.96       214
   macro avg       0.97      0.95      0.96       214
weighted avg       0.96      0.96      0.96       214



In [141]:
TP = 132
TN = 74
FP = 8
FN = 0

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [142]:
print(f"Model 3 validate \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Model 3 validate 
Accuracy: 0.96, Precision: 0.94, Recall: 1.0
f1_score: 0.97,Support0: 132, Support1: 82


In [143]:
validate.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
610,0,3,39.0,1,5,31.275,0,0,1,0
424,0,3,18.0,1,1,20.2125,0,0,1,1
568,0,3,29.916875,0,0,7.2292,1,0,0,1
701,1,1,35.0,0,0,26.2875,1,0,1,1
101,0,3,29.916875,0,0,7.8958,1,0,1,1


In [144]:
# Now doing the test on Model 3

In [145]:
clf = DecisionTreeClassifier(max_depth=9, random_state=123)

In [146]:
X_test3 = test.drop(['survived'], axis=1)
y_test = test[['survived']]

In [147]:
clf.fit(X_test3, y_test)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [148]:
DecisionTreeClassifier(max_depth=9, random_state=123)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [149]:
y_pred3 = clf.predict(X_test3)

In [150]:
y_pred_proba = clf.predict_proba(X_test3)

In [151]:
print('Test 3 has an accuracy of: {:.2f}'
     .format(clf.score(X_test3, y_test)))

Test 3 has an accuracy of: 0.99


In [152]:
confusion_matrix(y_pred3, y_test)

array([[110,   1],
       [  0,  67]])

In [153]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred3, y_test), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,110,1
Model Survived,0,67


In [154]:
TP = 110
TN = 67
FP = 1
FN = 0

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [155]:
print(f"Model 3 Test \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Model 3 Test 
Accuracy: 0.99, Precision: 0.99, Recall: 1.0
f1_score: 0.99,Support0: 110, Support1: 68


In conclustion, my train3 had the best accuracy of 86%. On validate it had an accuracy of 92% and tested at 99% on the test model. 

In [156]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_test3, y_test)

import graphviz

from graphviz import Graph

dot_data = export_graphviz(clf, feature_names= X_test3.columns, class_names= {0:'Died', 1:'Survived'}, rounded=True, filled=True, out_file=None)

graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)


'titanic_decision_tree.pdf'

# Random Forest Exercises

In [157]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic
import pandas as pd

In [158]:
train, validate, test = prep_titanic()
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.0,0,0,40.125,1,0,0,1
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,0,1,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


## Using default settings on Random forest classifier

In [159]:
# Random forest object
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=123)

In [160]:
# Setting up X_train and y_train
X_train = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [161]:
# Fitting the model
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [162]:
RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [163]:
# Estimate whether or not a passenger would survive, using the training data
y_pred = rf.predict(X_train)

In [164]:
# Estimate the probability of a passenger surviving, using the training data
y_pred_proba = rf.predict_proba(X_train)

In [165]:
# Accuracy
rf.score(X_train, y_train)

0.8329979879275654

In [166]:
confusion_matrix(y_train, y_pred)

array([[289,  18],
       [ 65, 125]])

In [167]:
pd.DataFrame(confusion_matrix(y_train, y_pred, labels = [0,1]), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,289,18
Model Survived,65,125


In [168]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(289, 18, 65, 125)

In [169]:
print("Train model report:\n", classification_report(y_train, y_pred))

Train model report:
               precision    recall  f1-score   support

           0       0.82      0.94      0.87       307
           1       0.87      0.66      0.75       190

    accuracy                           0.83       497
   macro avg       0.85      0.80      0.81       497
weighted avg       0.84      0.83      0.83       497



In [170]:
TP = 125
TN = 289
FP = 18
FN = 65

accuracy = round((TP + TN)/ (TP + TN + FP + FN),2)
precision = round(TP/(TP + FP),2)
recall = round(TP/(TP + FN),2)
f1_score = round((precision + recall)/2,2)
support0 = TP + FN
support1 = TN + FP

In [171]:
print(f"Train model \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Train model 
Accuracy: 0.83, Precision: 0.87, Recall: 0.66
f1_score: 0.77,Support0: 190, Support1: 307


## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [172]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf = 1, random_state=123)

In [173]:
# Setting up X_train and y_train
X_train1 = train.drop(['survived'], axis=1)
y_train = train[['survived']]

In [174]:
# Fitting the model
rf.fit(X_train1, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [175]:
RandomForestClassifier(max_depth=20, min_samples_leaf=1, random_state=123)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [176]:
# Estimate whether or not a passenger would survive, using the training data
y_pred1 = rf.predict(X_train1)

In [177]:
# Estimate the probability of a passenger surviving, using the training data
y_pred_proba = rf.predict_proba(X_train1)

In [178]:
# Accuracy
rf.score(X_train1, y_train)

0.9879275653923542

In [179]:
confusion_matrix(y_pred1, y_train)

array([[305,   4],
       [  2, 186]])

In [180]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred1).ravel()
tn, fp, fn, tp

(305, 2, 4, 186)

In [181]:
label = ["Actual Died", "Actual Survived"]
label1 = ["Model Died", " Model Survived"]
pd.DataFrame(confusion_matrix(y_pred1, y_train), index=label1, columns=label)

Unnamed: 0,Actual Died,Actual Survived
Model Died,305,4
Model Survived,2,186


In [182]:
print("Train model with max_dept= 20 & min_samples_leaf=1 report:\n", classification_report(y_pred1, y_train))

Train model with max_dept= 20 & min_samples_leaf=1 report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       309
           1       0.98      0.99      0.98       188

    accuracy                           0.99       497
   macro avg       0.99      0.99      0.99       497
weighted avg       0.99      0.99      0.99       497



## 4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [183]:
rf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_leaf = 5, random_state=123)

In [184]:
# Setting up X_train and y_train
X_train2 = train.drop(['survived'], axis=1)

In [185]:
# Fitting the model
rf.fit(X_train2, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [186]:
RandomForestClassifier(max_depth=3, min_samples_leaf=5, random_state=123)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [187]:
# Estimate whether or not a passenger would survive, using the training data
y_pred2 = rf.predict(X_train2)

In [188]:
# Estimate the probability of a passenger surviving, using the training data
y_pred_proba = rf.predict_proba(X_train2)

In [189]:
# Accuracy
rf.score(X_train2, y_train)

0.8309859154929577

In [190]:
confusion_matrix(y_pred2, y_train)

array([[288,  65],
       [ 19, 125]])

In [191]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred2).ravel()
tn, fp, fn, tp

(288, 19, 65, 125)

In [192]:
print("Train model with max_dept= 3 & min_samples_leaf=5 report:\n", classification_report(y_pred, y_train))

Train model with max_dept= 3 & min_samples_leaf=5 report:
               precision    recall  f1-score   support

           0       0.94      0.82      0.87       354
           1       0.66      0.87      0.75       143

    accuracy                           0.83       497
   macro avg       0.80      0.85      0.81       497
weighted avg       0.86      0.83      0.84       497



## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?    

My model that had min_samples_leaf = 1 and max_depth = 20 had the best accuracy of 99% because it was over fit with a max depth of 20

min_sample_leaf: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

## Validate samples

In [193]:
# Random forest object with default settings
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=123)

In [194]:
X_validate = validate.drop(['survived'], axis=1)
y_validate = validate.survived

In [195]:
# Fitting the model
rf.fit(X_validate, y_validate)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [196]:
rf.score(X_validate, y_validate)

0.8271028037383178

In [197]:
# Random forest with min_samples_leaf = 1 and max_depth = 20
rf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf = 1, random_state=123)

In [198]:
X_validate2 = validate.drop(['survived'], axis=1)

In [199]:
# Fitting the model
rf.fit(X_validate2, y_validate)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [200]:
rf.score(X_validate2, y_validate)

0.9906542056074766

In [205]:
# min_samples_leaf to 5 and decreasing your max_depth to 3.
rf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_leaf = 5, random_state=123)

In [206]:
X_validate3 = validate.drop(['survived'], axis=1)

In [207]:
# Fitting the model
rf.fit(X_validate3, y_validate)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [208]:
rf.score(X_validate3, y_validate)

0.8130841121495327

In [None]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf = 1, random_state=123)

In [209]:
X_test = test.drop(['survived'], axis=1)
y_test = test.survived

In [210]:
# Fitting the model
rf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [212]:
rf.score(X_test, y_test)

0.8651685393258427

In [None]:
# Test accuracy of 86.51%

# KNN Exercise

### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [213]:
from sklearn.neighbors import KNeighborsClassifier

In [214]:
knn = KNeighborsClassifier(n_neighbors=5)

In [215]:
knn = knn.fit(X_train, y_train)

In [216]:
knn.score(X_train, y_train)

0.7746478873239436

In [217]:
y_pred_knn = knn.predict(X_train)

In [221]:
print(classification_report(y_train, y_pred_knn))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       307
           1       0.72      0.68      0.70       190

    accuracy                           0.77       497
   macro avg       0.76      0.76      0.76       497
weighted avg       0.77      0.77      0.77       497



In [222]:
print(confusion_matrix(y_train, y_pred_knn))

[[256  51]
 [ 61 129]]


In [225]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_knn).ravel()
tn, fp, fn, tp

(256, 51, 61, 129)

In [226]:
accuracy = round((fp + tn)/ (tp + tn + fp + fn),2)
precision = round(tp/(tp + fp),2)
recall = round(tp/(tp + fn),2)

In [227]:
print(f"Train model \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Train model 
Accuracy: 0.62, Precision: 0.72, Recall: 0.68
f1_score: 0.77,Support0: 190, Support1: 307


## 4. Run through steps 2-4 setting k to 10

In [228]:
knn = KNeighborsClassifier(n_neighbors=10)

In [230]:
knn1 = knn.fit(X_train, y_train)

In [231]:
knn1.score(X_train, y_train)

0.7605633802816901

In [232]:
y_pred_knn = knn1.predict(X_train)

In [233]:
print(classification_report(y_train, y_pred_knn))

              precision    recall  f1-score   support

           0       0.75      0.92      0.83       307
           1       0.79      0.51      0.62       190

    accuracy                           0.76       497
   macro avg       0.77      0.71      0.72       497
weighted avg       0.77      0.76      0.75       497



In [234]:
print(confusion_matrix(y_train, y_pred_knn))

[[282  25]
 [ 94  96]]


In [235]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_knn).ravel()
tn, fp, fn, tp

(282, 25, 94, 96)

In [236]:
accuracy = round((fp + tn)/ (tp + tn + fp + fn),2)
precision = round(tp/(tp + fp),2)
recall = round(tp/(tp + fn),2)

In [238]:
print(f"Train model with k set to 10\nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}\nf1_score: {f1_score},Support0: {support0}, Support1: {support1}")

Train model with k set to 10
Accuracy: 0.62, Precision: 0.79, Recall: 0.51
f1_score: 0.77,Support0: 190, Support1: 307


# K to 20

In [239]:
knn = KNeighborsClassifier(n_neighbors=20)

In [240]:
knn2 = knn.fit(X_train, y_train)

In [241]:
knn2.score(X_train, y_train)

0.7183098591549296

The default settings work the had the best accuracy of 77%. With k at 10 I got an accuracy of 76%. 

In [201]:
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   model.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mtitanic_decision_tree[m
	[31mtitanic_decision_tree.pdf[m

no changes added to commit (use "git add" and/or "git commit -a")


In [202]:
!git add model.ipynb

In [203]:
!git commit -m "Added a model comparing all vars in the Logistic Regression exercise"

[master 79a9677] Added a model comparing all vars in the Logistic Regression exercise
 1 file changed, 1400 insertions(+), 341 deletions(-)


In [204]:
!git push

Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 4.36 KiB | 893.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/george887/classification_exercises.git
   dcf00ac..79a9677  master -> master
