# Day 4 Exercises - Logistic Regression

#### Set up environment

In [44]:
#standard imports
import pandas as pd
import numpy as np
import pydataset
import acquire
import warnings
warnings.filterwarnings('ignore')

# plotting imports
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

#modeling imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import graphviz
from graphviz import Graph

### Acquire and Prep Data

In [2]:
iris = acquire.get_iris_data()
iris.drop(columns=['species_id', 'measurement_id'], inplace = True)
iris.rename(columns={'species_name':'species'}, inplace = True)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### Split Data

In [40]:
X=iris[['sepal_length','sepal_width','petal_length','petal_width']]
y=iris[['species']]
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=.70,random_state=123)

### 1.) Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample.

In [4]:
# Create
logit=LogisticRegression(random_state =123)
# Fit
logit.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [5]:
# Just previewing the coefficients and y-intercept
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.38421538  1.32718255 -2.11307588 -0.94269552]
 [ 0.43099717 -1.34596217  0.4506587  -1.07117492]
 [-1.517952   -1.52141607  2.26046444  2.12613123]]
Intercept: 
 [ 0.25726194  0.58107381 -0.87235291]


In [6]:
# Predict
y_pred = logit.predict(X_train)
y_pred

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor', 'virginica',
       'setosa', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'versicolor', 'virginica', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'setosa', 'setosa', 'versicolor', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'setosa', 'setosa',
       'versicolor', 'setosa', 'setosa', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'virginica', 'setosa', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'se

In [7]:
# Check probability of predictions
y_pred_proba=logit.predict_proba(X_train)
y_pred_proba

array([[8.82438477e-04, 2.27909291e-01, 7.71208271e-01],
       [9.04840755e-04, 1.89452140e-01, 8.09643019e-01],
       [1.61592108e-02, 6.53564224e-01, 3.30276565e-01],
       [8.89718868e-01, 1.10252519e-01, 2.86122251e-05],
       [8.02996773e-01, 1.96869865e-01, 1.33362269e-04],
       [4.01764007e-03, 3.66910909e-01, 6.29071451e-01],
       [8.27624301e-01, 1.72231115e-01, 1.44583818e-04],
       [7.68419373e-01, 2.31515414e-01, 6.52132008e-05],
       [2.05897310e-02, 7.67212016e-01, 2.12198253e-01],
       [1.73946587e-02, 6.62952984e-01, 3.19652358e-01],
       [2.43782930e-02, 6.69593647e-01, 3.06028060e-01],
       [1.28790640e-01, 8.21403635e-01, 4.98057249e-02],
       [1.16916979e-03, 3.32890868e-01, 6.65939962e-01],
       [4.72232958e-02, 8.47334030e-01, 1.05442674e-01],
       [1.54834217e-03, 2.44290941e-01, 7.54160717e-01],
       [6.75565848e-01, 3.24060721e-01, 3.73431196e-04],
       [9.74637491e-04, 3.03672885e-01, 6.95352477e-01],
       [4.98965898e-02, 7.39623

### 2.) Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [16]:
print('Accuracy of Logistic regression on training set: {:.2f}'.format(logit.score(X_train,y_train)))

Accuracy of Logistic regression on training set: 0.95


In [18]:
cm = confusion_matrix(y_train,y_pred)
labels=sorted(y_train.species.unique())
pretty_cr=pd.DataFrame(confusion_matrix(y_train,y_pred),index=labels,columns=labels)
print('Confusion Matrix: \n\n', pretty_cr)

Confusion Matrix: 

             setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          36          4
virginica        0           1         32


In [19]:
cr=(classification_report(y_train,y_pred,output_dict=False))
print("Classification Report: \n\n",cr)

Classification Report: 

               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



### 3.) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [36]:
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

f1_score=(TPR+PPV)/2

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

In [37]:
print("Accuracy: ", ACC)
print("True Positive Rate: ", TPR)
print("False Positive Rate: ", FPR)
print("True Negative Rate: ", TNR)
print("False Negative Rate: ", FNR)
print('Precision: ', PPV)
print("Recall: ", TPR)
print("F1-Score: ", f1_score)

Accuracy:  [1.         0.95238095 0.95238095]
True Positive Rate:  [1.         0.9        0.96969697]
False Positive Rate:  [0.         0.01538462 0.05555556]
True Negative Rate:  [1.         0.98461538 0.94444444]
False Negative Rate:  [0.         0.1        0.03030303]
Precision:  [1.         0.97297297 0.88888889]
Recall:  [1.         0.9        0.96969697]
F1-Score:  [1.         0.93648649 0.92929293]


### 4.) Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

In [39]:
print('Accuracy of Logistic regression on training set: {:.2f}'.format(logit.score(X_test,y_test)))

Accuracy of Logistic regression on training set: 0.98


### 5.) Run through steps 2-4 using another solver (from question 5)

### 6.) Which performs better on your in-sample data?

# Day 5 Exercises - Decision Tree

### 1.) Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [50]:
# Create the decision tree object
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)
# Fit
clf.fit(X_train, y_train)
# Predict species
y_pred_tree = clf.predict(X_train)
# Preview
y_pred_tree

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor', 'virginica',
       'setosa', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'versicolor', 'virginica', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'setosa', 'virginica',
       'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'setosa', 'setosa', 'versicolor', 'setosa', 'versicolor',
       'virginica', 'setosa', 'virginica', 'setosa', 'setosa',
       'versicolor', 'setosa', 'setosa', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'virginica', 'setosa', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'se

In [51]:
# Check the probability 
y_pred_proba_tree = clf.predict_proba(X_train)
y_pred_proba_tree

array([[0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [0.

### 2.) Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [52]:
# Model Score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.98


In [54]:
# Confusion Matrix
cmt = confusion_matrix(y_train,y_pred)
labels=sorted(y_train.species.unique())
pretty_cr=pd.DataFrame(cmt,index=labels,columns=labels)
print('Confusion Matrix: \n\n', pretty_cr)

Confusion Matrix: 

             setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          36          4
virginica        0           1         32


In [55]:
# Classification Report
crt=(classification_report(y_train,y_pred, output_dict=False))
print("Classification Report: \n\n",crt)

Classification Report: 

               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



### 3.) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### 4.) Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

### 5.) Run through steps 2-4 using another solver (from question 5)

### 6.) Which performs better on your in-sample data?