# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import graphviz

from graphviz import Graph
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.datasets import load_iris



import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_titanic_data(get_titanic_data())
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [2]:
# Handle missing values in the 'age' column
df.dropna(inplace=True)

In [3]:
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
60,3,22.0,7.2292,0,0
348,3,3.0,15.9,1,1
606,3,30.0,7.8958,0,0
195,1,58.0,146.5208,0,0
56,2,21.0,10.5,0,0


In [4]:
# 1. Make the thing
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [5]:
# 2. Fit
logit.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
# 3. Do stuff
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03183343 -0.00819677  0.01882492 -0.02030296  0.01317287]]
Intercept: 
 [0.00230414]


### 1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [7]:
# Estimate whether a passenger would survive
y_pred = logit.predict(X_train)

# Estimate the probability of a passenger surviving
y_pred_proba = logit.predict_proba(X_train)

### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [8]:
# Model score
print(f'Accuracy of the Logistic Regression classifier on\
        training set (Model Score):\
        {logit.score(X_train, y_train)}')

Accuracy of the Logistic Regression classifier on        training set (Model Score):        0.6472945891783567


In [9]:
# Confusion matrix
print(confusion_matrix(y_train, y_pred))

cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
                 columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
# cm

[[181 112]
 [ 64 142]]


In [10]:
# Classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.62      0.67       293
           1       0.56      0.69      0.62       206

   micro avg       0.65      0.65      0.65       499
   macro avg       0.65      0.65      0.65       499
weighted avg       0.66      0.65      0.65       499



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [11]:
tp = cm.loc['Actual +', 'Pred +']
fn = cm.loc['Actual +', 'Pred -']
fp = cm.loc['Actual -', 'Pred +']
tn = cm.loc['Actual -', 'Pred -']

recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1 = (recall + precision) / 2
support = tp + fn

print(f'Recall: \t {recall}')
print(f'Precision: \t {precision}')
print(f'F1: \t\t {f1}')
print(f'Support: \t {support}')

Recall: 	 0.6893203883495146
Precision: 	 0.5590551181102362
F1: 		 0.6241877532298754
Support: 	 206


In [12]:
# Model score
# print(f'Accuracy of the Logistic Regression classifier on\
#         test set (Model Score):\
#         {logit.score(X_test, y_test)}')

### 4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

In [13]:
# Use liblinear instead of saga

### 5. Run through steps 2-4 using another solver (from question 5)

In [14]:
# 1. Make the thing
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='liblinear')

# 2. Fit
logit.fit(X_train, y_train)

# 3. Do stuff
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

# Estimate whether a passenger would survive
y_pred = logit.predict(X_train)

# Estimate the probability of a passenger surviving
y_pred_proba = logit.predict_proba(X_train)

# Model score
print(f'Accuracy of the Logistic Regression classifier on\
        training set (Model Score):\
        {logit.score(X_train, y_train)}')

# Confusion matrix
print(confusion_matrix(y_train, y_pred))

cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
                 columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
cm

Coefficient: 
 [[-0.87476501 -0.02505375  0.00588046 -0.28755904  0.37926043]]
Intercept: 
 [2.76511655]
Accuracy of the Logistic Regression classifier on        training set (Model Score):        0.6993987975951904
[[194  99]
 [ 51 155]]


  y = column_or_1d(y, warn=True)


Unnamed: 0,Pred -,Pred +
Actual -,194,99
Actual +,51,155


In [15]:
# Classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.66      0.72       293
           1       0.61      0.75      0.67       206

   micro avg       0.70      0.70      0.70       499
   macro avg       0.70      0.71      0.70       499
weighted avg       0.72      0.70      0.70       499



In [16]:
tp = cm.loc['Actual +', 'Pred +']
fn = cm.loc['Actual +', 'Pred -']
fp = cm.loc['Actual -', 'Pred +']
tn = cm.loc['Actual -', 'Pred -']

recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1 = (recall + precision) / 2
support = tp + fn

print(f'Recall: \t {recall}')
print(f'Precision: \t {precision}')
print(f'F1: \t\t {f1}')
print(f'Support: \t {support}')

Recall: 	 0.7524271844660194
Precision: 	 0.610236220472441
F1: 		 0.6813317024692302
Support: 	 206


### 6. Which performs better on your in-sample data?

In [17]:
# liblinear

### 7. Save the best model in logit_fit

In [18]:
logit_fit = logit

In [19]:
# # Model score
# print(f'Accuracy of the Logistic Regression classifier on\
#         test set (Model Score):\
#         {logit.score(X_test, y_test)}')

# Decision Tree - Iris Dataset

In [20]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [21]:
df.columns = [col.lower().replace('.', '_') for col in df]

In [22]:
X = df.drop(['species'],axis=1)
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
115,5.8,2.8,5.1,2.4
137,6.3,3.4,5.6,2.4
54,5.5,2.3,4.0,1.3
20,5.1,3.8,1.5,0.3
39,4.4,3.0,1.3,0.2


### Fit the decision tree classifier to your training sample and transform, i.e. make predictions on the training sample

In [91]:
# 1. Make the thing
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)

In [92]:
# 2. Fit
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [93]:
# 3. Do stuff

# Estimate species
y_pred = clf.predict(X_train)
y_pred[0:5]

# Estimate the probability of a species
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [0.

### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [95]:
# Model score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

# Confusion matrix
print(confusion_matrix(y_train, y_pred))

labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

# Classification report
print(classification_report(y_train, y_pred))
clf.score(X_train, y_train)

Accuracy of Decision Tree classifier on training set: 0.98
[[32  0  0]
 [ 0 40  0]
 [ 0  2 31]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.95      1.00      0.98        40
           2       1.00      0.94      0.97        33

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



0.9809523809523809

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [27]:
print(f'Recall: \t {recall}')
print(f'Precision: \t {precision}')
print(f'F1: \t\t {f1}')
print(f'Support: \t {support}')

Recall: 	 0.7524271844660194
Precision: 	 0.610236220472441
F1: 		 0.6813317024692302
Support: 	 206


### 4. Run through steps 2-4 using entropy as your measure of impurity.

In [28]:
# 1. Make the thing
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)

# 2. Fit
clf.fit(X_train, y_train)

# 3. Do stuff

# Estimate species
y_pred = clf.predict(X_train)
y_pred[0:5]

# Estimate the probability of a species
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

# Model score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

# Confusion matrix
print(confusion_matrix(y_train, y_pred))

labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

# Classification report
print(classification_report(y_train, y_pred))


Accuracy of Decision Tree classifier on training set: 0.98
[[32  0  0]
 [ 0 40  0]
 [ 0  2 31]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.95      1.00      0.98        40
   virginica       1.00      0.94      0.97        33

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



### 6. Save the best model in tree_fit

In [29]:
tree_fit = clf

In [30]:
# Model score on TEST
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.93


In [31]:
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

In [32]:
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 

graph.render('iris_decision_tree', view=True)

'iris_decision_tree.pdf'

# Decision Tree - Titanic Dataset

In [104]:
from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_titanic_data(get_titanic_data())

In [105]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [106]:
df.dropna(inplace=True)

In [107]:
df['sex'] = df['sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [108]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

train, test = train_test_split(df)

In [109]:
# 1. make the thing
scaler = MinMaxScaler()

# 2. fit the thing
scaler.fit(train[['age', 'fare']])

# 3. use the thing
train[['age', 'fare']] = scaler.transform(train[['age', 'fare']])
test[['age', 'fare']] = scaler.transform(test[['age', 'fare']])

In [110]:
train.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
673,673,1,2,1,0.415602,0,0,0.025374,S,Second,Southampton,1,3
163,163,0,3,1,0.225333,0,0,0.016908,S,Third,Southampton,1,3
850,850,0,3,1,0.048655,4,2,0.061045,S,Third,Southampton,0,3
432,432,1,2,0,0.565099,1,0,0.050749,S,Second,Southampton,0,3
99,99,0,2,1,0.456374,1,0,0.050749,S,Second,Southampton,0,3


In [111]:
X = df.drop(['passenger_id', 'embarked', 'class', 'embark_town', 'survived'],axis=1)
y = df[['survived']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,alone,embarked_encode
60,3,1,22.0,0,0,7.2292,1,0
348,3,1,3.0,1,1,15.9,0,3
606,3,1,30.0,0,0,7.8958,1,3
195,1,0,58.0,0,0,146.5208,1,0
56,2,0,21.0,0,0,10.5,1,3


In [112]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)

In [113]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [114]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 1, 1])

In [115]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.90134529, 0.09865471],
       [0.45833333, 0.54166667],
       [0.90134529, 0.09865471],
       [0.04545455, 0.95454545],
       [0.04545455, 0.95454545],
       [0.90134529, 0.09865471],
       [0.90134529, 0.09865471],
       [0.60294118, 0.39705882],
       [0.60294118, 0.39705882],
       [0.90134529, 0.09865471],
       [0.60294118, 0.39705882],
       [0.42372881, 0.57627119],
       [0.04545455, 0.95454545],
       [1.        , 0.        ],
       [0.04545455, 0.95454545],
       [0.45833333, 0.54166667],
       [1.        , 0.        ],
       [0.45833333, 0.54166667],
       [0.90134529, 0.09865471],
       [0.        , 1.        ],
       [0.90134529, 0.09865471],
       [0.90134529, 0.09865471],
       [0.90134529, 0.09865471],
       [0.42372881, 0.57627119],
       [0.90134529, 0.09865471],
       [0.90134529, 0.09865471],
       [0.90134529, 0.09865471],
       [0.60294118, 0.39705882],
       [0.60294118, 0.39705882],
       [0.90134529, 0.09865471],
       [1.

In [116]:
print(f'Accuracy of Decision Tree classifier on training set: {clf.score(X_train, y_train)}')

Accuracy of Decision Tree classifier on training set: 0.8176352705410822


In [46]:
confusion_matrix(y_train, y_pred)

array([[252,  41],
       [ 50, 156]])

In [47]:
y_train.survived.value_counts()

0    293
1    206
Name: survived, dtype: int64

In [48]:
import pandas as pd

labels = sorted(y_train.survived.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,252,41
1,50,156


In [49]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.80


In [50]:
# Classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       293
           1       0.79      0.76      0.77       206

   micro avg       0.82      0.82      0.82       499
   macro avg       0.81      0.81      0.81       499
weighted avg       0.82      0.82      0.82       499



# KNN - Titanic Dataset

In [51]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_titanic_data(get_titanic_data())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id       891 non-null int64
survived           891 non-null int64
pclass             891 non-null int64
sex                891 non-null object
age                714 non-null float64
sibsp              891 non-null int64
parch              891 non-null int64
fare               891 non-null float64
embarked           891 non-null object
class              891 non-null object
embark_town        891 non-null object
alone              891 non-null int64
embarked_encode    891 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 90.6+ KB


In [52]:
df.dropna(inplace=True) # handle missing age values

X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
60,3,22.0,7.2292,0,0
348,3,3.0,15.9,1,1
606,3,30.0,7.8958,0,0
195,1,58.0,146.5208,0,0
56,2,21.0,10.5,0,0


### 1. Fit the K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [53]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [54]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [55]:
y_pred = knn.predict(X_train)

In [56]:
y_pred_proba = knn.predict_proba(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [57]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.76


In [58]:
print(confusion_matrix(y_train, y_pred))

[[239  54]
 [ 65 141]]


In [59]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80       293
           1       0.72      0.68      0.70       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.75      0.75      0.75       499
weighted avg       0.76      0.76      0.76       499



### 4. Run through steps 2-4 setting k to 10

In [60]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

Accuracy of KNN classifier on training set: 0.71
[[252  41]
 [103 103]]
              precision    recall  f1-score   support

           0       0.71      0.86      0.78       293
           1       0.72      0.50      0.59       206

   micro avg       0.71      0.71      0.71       499
   macro avg       0.71      0.68      0.68       499
weighted avg       0.71      0.71      0.70       499



### 5. Run through setps 2-4 setting k to 20

In [61]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

Accuracy of KNN classifier on training set: 0.71
[[249  44]
 [100 106]]
              precision    recall  f1-score   support

           0       0.71      0.85      0.78       293
           1       0.71      0.51      0.60       206

   micro avg       0.71      0.71      0.71       499
   macro avg       0.71      0.68      0.69       499
weighted avg       0.71      0.71      0.70       499



### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

x

### 7. Save the best model in knn_fit

In [62]:
knn_fit = knn

# KNN - Iris Dataset

In [100]:
# # ignore warnings
# import warnings
# warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from acquire import get_iris_data
from prepare import prep_iris_data

df = prep_iris_data(get_iris_data())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
species         150 non-null int64
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
dtypes: float64(4), int64(1)
memory usage: 5.9 KB


In [102]:
df.dropna(inplace=True)
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))
knn.score(X_test, y_test)

Accuracy of KNN classifier on training set: 0.98
[[32  0  0]
 [ 0 39  1]
 [ 0  1 32]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.97      0.97      0.97        40
           2       0.97      0.97      0.97        33

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105

Accuracy of KNN classifier on test set: 0.98


0.9777777777777777

In [65]:
df.dropna(inplace=True)
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on training set: 0.97
[[32  0  0]
 [ 0 39  1]
 [ 0  2 31]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.95      0.97      0.96        40
           2       0.97      0.94      0.95        33

   micro avg       0.97      0.97      0.97       105
   macro avg       0.97      0.97      0.97       105
weighted avg       0.97      0.97      0.97       105

Accuracy of KNN classifier on test set: 0.96


In [66]:
df.dropna(inplace=True)
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on training set: 0.96
[[32  0  0]
 [ 0 39  1]
 [ 0  3 30]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.93      0.97      0.95        40
           2       0.97      0.91      0.94        33

   micro avg       0.96      0.96      0.96       105
   macro avg       0.97      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105

Accuracy of KNN classifier on test set: 0.91


# Random Forest - Titanic Dataset

In [67]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_titanic_data(get_titanic_data())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id       891 non-null int64
survived           891 non-null int64
pclass             891 non-null int64
sex                891 non-null object
age                714 non-null float64
sibsp              891 non-null int64
parch              891 non-null int64
fare               891 non-null float64
embarked           891 non-null object
class              891 non-null object
embark_town        891 non-null object
alone              891 non-null int64
embarked_encode    891 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 90.6+ KB


In [68]:
# Handle missing age values
df.dropna(inplace=True)

X = df[['pclass','age','fare','sibsp','parch']]
y = df.survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
60,3,22.0,7.2292,0,0
348,3,3.0,15.9,1,1
606,3,30.0,7.8958,0,0
195,1,58.0,146.5208,0,0
56,2,21.0,10.5,0,0


In [69]:
df.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_encode    0
dtype: int64

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [70]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [71]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [72]:
print(rf.feature_importances_)

[0.10439494 0.39013442 0.38148822 0.06701136 0.05697105]


In [73]:
y_pred = rf.predict(X_train)
y_pred[0:10]

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0])

In [74]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba[0:10]

array([[0.72104762, 0.27895238],
       [0.03      , 0.97      ],
       [0.822     , 0.178     ],
       [0.04      , 0.96      ],
       [0.26666667, 0.73333333],
       [0.802     , 0.198     ],
       [0.96333333, 0.03666667],
       [0.63      , 0.37      ],
       [0.79666667, 0.20333333],
       [0.74      , 0.26      ]])

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [75]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.98


In [76]:
print(confusion_matrix(y_train, y_pred))

[[291   2]
 [  6 200]]


In [77]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       293
           1       0.99      0.97      0.98       206

   micro avg       0.98      0.98      0.98       499
   macro avg       0.98      0.98      0.98       499
weighted avg       0.98      0.98      0.98       499



In [78]:
print('Accuracy of random forest classifier on TEST set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on TEST set: 0.71


### 4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [79]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [80]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [81]:
print(rf.feature_importances_)

[0.31756957 0.13479889 0.39019831 0.07086815 0.08656508]


In [82]:
y_pred = rf.predict(X_train)

In [83]:
y_pred_proba = rf.predict_proba(X_train)

In [84]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.75


In [85]:
print(confusion_matrix(y_train, y_pred))

[[247  46]
 [ 79 127]]


In [86]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80       293
           1       0.73      0.62      0.67       206

   micro avg       0.75      0.75      0.75       499
   macro avg       0.75      0.73      0.73       499
weighted avg       0.75      0.75      0.75       499



### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Min Leaf of 1 and Max depth of 20 performed better.

### 6. Save the best model in forest_fit

In [87]:
forest_fit = rf

# Random Forest - Iris Dataset

In [98]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic_data

df = prep_iris_data(get_iris_data())
df.dropna(inplace=True)
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

rf.fit(X_train, y_train)
print(rf.feature_importances_)
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

[0.08980597 0.0198138  0.44366243 0.44671781]
Accuracy of random forest classifier on training set: 1.00
[[32  0  0]
 [ 0 40  0]
 [ 0  0 33]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        40
           2       1.00      1.00      1.00        33

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105

Accuracy of random forest classifier on test set: 0.93


In [99]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

rf.fit(X_train, y_train)
print(rf.feature_importances_)
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

[0.08647229 0.00786147 0.45520172 0.45046453]
Accuracy of random forest classifier on training set: 0.96
[[32  0  0]
 [ 0 37  3]
 [ 0  1 32]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.97      0.93      0.95        40
           2       0.91      0.97      0.94        33

   micro avg       0.96      0.96      0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105

Accuracy of random forest classifier on test set: 0.96


# Test

## Titanic Dataset

In [117]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.80


## Iris Dataset

In [103]:
# Model score on TEST
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.93
