In [83]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd 

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

2. Given the following confusion matrix, evaluate (by hand) the model's performance.


|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |


- In the context of this problem, what is a false positive?

Answer: Treating cat as the positive class, the false positives are the 7 pictures that were predicted as cat but were actually dogs.

- In the context of this problem, what is a false negative?


Answer: Treating cat as the positive class, the false negatives are the 13 pictures that were predicted as dog but were actually cats.

- How would you describe this model?


accuracy: (TP + TN) / (TP + TN + FP + FN)

In [80]:
# Confusion-matrix counts from the table above, with "cat" as the positive class.
tp, tn = 34, 46  # actual cat predicted cat / actual dog predicted dog
fp, fn = 7, 13   # actual dog predicted cat / actual cat predicted dog

# accuracy = correct predictions / all predictions
accuracy = (tp + tn) / (tp + tn + fp + fn)
accuracy

0.8

precision = (TP) / (TP + FP)

In [81]:
# With "cat" as the positive class: 34 true positives, 7 false positives
# (actual dogs that were predicted as cat).
tp, fp = 34, 7

# precision = TP / (TP + FP)
precision = tp / (tp + fp)
precision

0.8292682926829268

Answer: I would describe this model as a cat classifier with 80% accuracy and 83% precision.

3. You are working as a data scientist for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.

    Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions can be found here.



    Use the predictions dataset and pandas to help answer the following questions:

In [6]:
# Read the duck predictions: an `actual` label column plus one prediction column per model.
c3 = pd.read_csv(filepath_or_buffer='c3.csv')
c3

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect
...,...,...,...,...
195,No Defect,No Defect,Defect,Defect
196,Defect,Defect,No Defect,No Defect
197,No Defect,No Defect,No Defect,No Defect
198,No Defect,No Defect,Defect,Defect


- An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [7]:
# positive: ducks that have a defect 
# negative: ducks that have no defect

In [8]:
# want to find ducks that have defects

In [9]:
# Cost of FN is greater,
# because that's sending a duck with defects into the world 
# and it would be better to throw away a good duck occasionally than 
# send out a duck with defects. 

In [13]:
# metric to use is Recall (minimizing false negatives)
# recall: TP / (TP + FN)

In [39]:
# the best model is the 3rd one because it has the highest recall (81.25%)
# The 3rd model would produce fewer FNs than the other two

In [16]:
# Class balance of the true labels.
c3['actual'].value_counts()


No Defect    184
Defect        16
Name: actual, dtype: int64

In [23]:
# Keep only the ducks that really are defective; recall is measured on these rows.
subset = c3.loc[c3['actual'].eq('Defect')]
subset

Unnamed: 0,actual,model1,model2,model3,baseline_prediction
13,Defect,No Defect,Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect,Defect
65,Defect,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect,Defect
74,Defect,No Defect,No Defect,Defect,Defect
87,Defect,No Defect,Defect,Defect,Defect
118,Defect,No Defect,Defect,No Defect,Defect
135,Defect,Defect,No Defect,Defect,Defect
140,Defect,No Defect,Defect,Defect,Defect
147,Defect,Defect,No Defect,Defect,Defect


In [36]:
# Model 1 recall: share of rows in `subset` where model1 matches the actual label.
model_recall = subset['model1'].eq(subset['actual']).mean()

In [31]:
# Show whatever `model_recall` currently holds as a percentage with two decimals.
print(f'model recall: {model_recall:.2%}')

model recall: 50.00%


In [37]:
# Model 2 recall: share of rows in `subset` where model2 matches the actual label.
model_recall = subset['model2'].eq(subset['actual']).mean()

In [33]:
# Show whatever `model_recall` currently holds as a percentage with two decimals.
print(f'model recall: {model_recall:.2%}')

model recall: 56.25%


In [38]:
# Model 3 recall: share of rows in `subset` where model3 matches the actual label.
model_recall = subset['model3'].eq(subset['actual']).mean()

In [35]:
# Show whatever `model_recall` currently holds as a percentage with two decimals.
print(f'model recall: {model_recall:.2%}')

model recall: 81.25%


- Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you they really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [54]:
# In this case the cost of a FP is higher because the company would be giving
# out a vacation package to someone whose duck was actually fine.
# The metric to use is precision (appropriate when FPs are the costlier error).
# precision: TP / (TP + FP)
# Model 1 has the highest precision (80.00%, vs. 10.00% and 13.13%), so it is the one to use to minimize FPs.

In [46]:
# Model 1: rows it flagged as Defect (the denominator for precision).
subset = c3.loc[c3['model1'].eq('Defect')]
subset

Unnamed: 0,actual,model1,model2,model3,baseline_prediction
3,No Defect,Defect,Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect,Defect
62,No Defect,Defect,No Defect,No Defect,Defect
65,Defect,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect,Defect
135,Defect,Defect,No Defect,Defect,Defect
147,Defect,Defect,No Defect,Defect,Defect
163,Defect,Defect,Defect,Defect,Defect
194,Defect,Defect,No Defect,Defect,Defect
196,Defect,Defect,No Defect,No Defect,Defect


In [48]:
# Precision: among the rows in `subset`, the share whose actual label matches model1.
model1_precision = subset['model1'].eq(subset['actual']).mean()
print(f'model1 precision: {model1_precision:.2%}')

model1 precision: 80.00%


In [50]:
# Model 2: rows it flagged as Defect (the denominator for precision).
subset = c3.loc[c3['model2'].eq('Defect')]
subset

Unnamed: 0,actual,model1,model2,model3,baseline_prediction
0,No Defect,No Defect,Defect,No Defect,Defect
1,No Defect,No Defect,Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect,Defect
3,No Defect,Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect,Defect
...,...,...,...,...,...
183,No Defect,No Defect,Defect,No Defect,Defect
185,No Defect,No Defect,Defect,No Defect,Defect
193,No Defect,No Defect,Defect,Defect,Defect
195,No Defect,No Defect,Defect,Defect,Defect


In [51]:
# Precision: among the rows in `subset`, the share whose actual label matches model2.
model2_precision = subset['model2'].eq(subset['actual']).mean()
print(f'model2 precision: {model2_precision:.2%}')

model2 precision: 10.00%


In [52]:
# Model 3: rows it flagged as Defect (the denominator for precision).
subset = c3.loc[c3['model3'].eq('Defect')]
subset

Unnamed: 0,actual,model1,model2,model3,baseline_prediction
1,No Defect,No Defect,Defect,Defect,Defect
3,No Defect,Defect,Defect,Defect,Defect
5,No Defect,No Defect,No Defect,Defect,Defect
9,No Defect,No Defect,No Defect,Defect,Defect
13,Defect,No Defect,Defect,Defect,Defect
...,...,...,...,...,...
193,No Defect,No Defect,Defect,Defect,Defect
194,Defect,Defect,No Defect,Defect,Defect
195,No Defect,No Defect,Defect,Defect,Defect
198,No Defect,No Defect,Defect,Defect,Defect


In [53]:
# Precision: among the rows in `subset`, the share whose actual label matches model3.
model3_precision = subset['model3'].eq(subset['actual']).mean()
print(f'model3 precision: {model3_precision:.2%}')

model3 precision: 13.13%


4. You are working as a data scientist for Gives You Paws ™, a subscription based service that shows you cute pictures of dogs or cats (or both for an additional fee).

    At Gives You Paws, anyone can upload pictures of their cats or dogs. The photos are then put through a two step process. First an automated algorithm tags pictures as either a cat or a dog (Phase I). Next, the photos that have been initially identified are put through another round of review, possibly with some human oversight, before being presented to the users (Phase II).
    
    Several models have already been developed with the data, and you can find their results here.

    Given this dataset, use pandas to create a baseline model (i.e. a model that just predicts the most common class) and answer the following questions:


In [56]:
# create a baseline model

In [57]:
# Read the cat/dog predictions and preview the first five rows.
paws = pd.read_csv(filepath_or_buffer='gives_you_paws.csv')
paws.head(5)

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [58]:
# Class balance of the true labels; the most common one drives the baseline.
paws['actual'].value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [59]:
# Baseline model: always predict the most frequent actual label.
# Derived from the data instead of hard-coding 'dog', so the cell stays correct
# if the input file changes.
paws['baseline_prediction'] = paws['actual'].value_counts().idxmax()

In [60]:
# Confirm the baseline_prediction column was added.
paws.head(5)

Unnamed: 0,actual,model1,model2,model3,model4,baseline_prediction
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


        a. In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?


Answer: Models 1 (80.74%) and 4 (74.26%) are better than the baseline (65.08%); models 2 (63.04%) and 3 (50.96%) are not.

In [64]:
# for model 1
# Accuracy = share of rows where the prediction equals the actual label.
model1_accuracy = (paws.actual == paws.model1).mean()
baseline_accuracy = (paws.actual == paws.baseline_prediction).mean()

# Print the value computed in this cell. The original referenced `model_accuracy`,
# which is never defined in the notebook and raises NameError on a fresh kernel.
print(f'model1 accuracy: {model1_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

model1 accuracy: 80.74%
baseline accuracy: 65.08%


In [67]:
# Model 2 against the baseline: accuracy is the share of rows that match `actual`.
model2_accuracy = paws['model2'].eq(paws['actual']).mean()
baseline_accuracy = paws['baseline_prediction'].eq(paws['actual']).mean()

print(
    f'model2 accuracy: {model2_accuracy:.2%}',
    f'baseline accuracy: {baseline_accuracy:.2%}',
    sep='\n',
)

model2 accuracy: 63.04%
baseline accuracy: 65.08%


In [68]:
# for model 3
# Accuracy = share of rows where the prediction equals the actual label.
model3_accuracy = (paws.actual == paws.model3).mean()
baseline_accuracy = (paws.actual == paws.baseline_prediction).mean()

# Label fixed: this cell reports model 3 (the original printed "model1").
print(f'model3 accuracy: {model3_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

model1 accuracy: 50.96%
baseline accuracy: 65.08%


In [69]:
# for model 4
# Accuracy = share of rows where the prediction equals the actual label.
model4_accuracy = (paws.actual == paws.model4).mean()
baseline_accuracy = (paws.actual == paws.baseline_prediction).mean()

# Label fixed: this cell reports model 4 (the original printed "model1").
print(f'model4 accuracy: {model4_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

model1 accuracy: 74.26%
baseline accuracy: 65.08%


In [None]:
# Alternative way with cool code: 

In [77]:
# make a list of the models + baseline_prediction
# Exclude 'actual' by name rather than by position, so the list stays correct
# even if 'actual' is not the first column.
models = [col for col in paws.columns if col != 'actual']
print(models)

['model1', 'model2', 'model3', 'model4', 'baseline_prediction']


In [78]:
# Accuracy per prediction column: share of rows where it equals `actual`.
# Built in one pass with a dict comprehension instead of update() calls in a loop.
accuracy_out = {model: (paws.actual == paws[model]).mean() for model in models}
print(accuracy_out)

{'model1': 0.8074, 'model2': 0.6304, 'model3': 0.5096, 'model4': 0.7426, 'baseline_prediction': 0.6508}


In [79]:
# Tabulate the accuracy results: one row per (model, accuracy) pair.
pd.DataFrame(data=list(accuracy_out.items()), columns=['model', 'accuracy'])

Unnamed: 0,model,accuracy
0,model1,0.8074
1,model2,0.6304
2,model3,0.5096
3,model4,0.7426
4,baseline_prediction,0.6508


        b. Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recommend for Phase I? For Phase II?

Answer: 

Phase 1: Use model 4 for recall (96%).

Phase 2: Use model 2 for precision (89%).

In [85]:
# For phase 1
# For phase one, dealing with dog pictures, we don't want any false negatives
# (dogs coming back as cats),
# so we will use recall for phase 1

In [87]:
# Rows whose true label is dog: the denominator for dog recall.
subset = paws.loc[paws['actual'].eq('dog')]
subset.head(5)

Unnamed: 0,actual,model1,model2,model3,model4,baseline_prediction
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
5,dog,dog,dog,dog,dog,dog
8,dog,dog,cat,dog,dog,dog


In [88]:
# Model 1 recall: share of rows in `subset` where model1 matches the actual label.
subset['model1'].eq(subset['actual']).mean()

0.803318992009834

In [90]:
# Model 2 recall: share of rows in `subset` where model2 matches the actual label.
subset['model2'].eq(subset['actual']).mean()

0.49078057775046097

In [92]:
# Model 3 recall: share of rows in `subset` where model3 matches the actual label.
subset['model3'].eq(subset['actual']).mean()

0.5086047940995697

In [93]:
# Model 4 recall: share of rows in `subset` where model4 matches the actual label.
subset['model4'].eq(subset['actual']).mean()

0.9557467732022127

In [112]:
# Model 4 has the highest recall for dogs; cross-tabulate its predictions (rows)
# against the actual labels (columns) to see the underlying counts.
pd.crosstab(paws['model4'], paws['actual'])

actual,cat,dog
model4,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,603,144
dog,1143,3110


In [97]:
# For phase 2
# Phase 2 is the last round before it goes to customers so we want to minimize FPs. 
# We will use precision to minimize FPs. 

In [99]:
# For each model, keep the rows it predicted as dog (the denominator for dog precision).
subset1, subset2, subset3, subset4 = (
    paws[paws[f'model{number}'] == 'dog'] for number in range(1, 5)
)

In [100]:
# Model 1 precision: share of rows in subset1 whose actual label matches model1.
subset1['model1'].eq(subset1['actual']).mean()

0.8900238338440586

In [102]:
# Model 2 precision: share of rows in subset2 whose actual label matches model2.
subset2['model2'].eq(subset2['actual']).mean()

0.8931767337807607

In [103]:
# Model 3 precision: share of rows in subset3 whose actual label matches model3.
subset3['model3'].eq(subset3['actual']).mean()

0.6598883572567783

In [130]:
# Model 4 precision: share of rows in subset4 whose actual label matches model4.
# NOTE(review): the saved output for this cell (0.8072...) equals model 4's *cat*
# precision in the classification report further down, which lists dog precision
# as 0.7312. The cell appears to have been re-run after subset4 was rebuilt for
# cats; re-run the notebook in order to refresh it.
subset4['model4'].eq(subset4['actual']).mean()

0.8072289156626506

In [106]:
# Model 2 has the highest precision for dogs; cross-tabulate its predictions (rows)
# against the actual labels (columns) to see the underlying counts.
pd.crosstab(paws['model2'], paws['actual'])

actual,cat,dog
model2,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,1555,1657
dog,191,1597


        c. Suppose you are working on a team that solely deals with cat pictures. Which of these models would you recommend for Phase I? For Phase II?


Answer: 

Phase 1: Use model 2 for recall (89%).

Phase 2: Use model 4 for precision (81%).

In [121]:
# Same steps as for dog, but with cat as the positive class this time

In [123]:
# phase 1 - recall

In [115]:
# Rows whose true label is cat: the denominator for cat recall.
subset = paws.loc[paws['actual'].eq('cat')]
subset.head(5)

Unnamed: 0,actual,model1,model2,model3,model4,baseline_prediction
0,cat,cat,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog
6,cat,cat,cat,cat,dog,dog
7,cat,dog,cat,cat,dog,dog
11,cat,cat,dog,cat,cat,dog


In [116]:
# Model 1 recall: share of rows in `subset` where model1 matches the actual label.
subset['actual'].eq(subset['model1']).mean()

0.8150057273768614

In [117]:
# Model 2 recall: share of rows in `subset` where model2 matches the actual label.
subset['actual'].eq(subset['model2']).mean()

0.8906071019473081

In [118]:
# Model 3 recall: share of rows in `subset` where model3 matches the actual label.
subset['actual'].eq(subset['model3']).mean()

0.5114547537227949

In [119]:
# Model 4 recall: share of rows in `subset` where model4 matches the actual label.
subset['actual'].eq(subset['model4']).mean()

0.34536082474226804

In [120]:
# Model 2 has the highest recall for cats; cross-tabulate its predictions (rows)
# against the actual labels (columns) to see the underlying counts.
pd.crosstab(paws['model2'], paws['actual'])

actual,cat,dog
model2,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,1555,1657
dog,191,1597


In [125]:
# phase 2 - precision

In [127]:
# For each model, keep the rows it predicted as cat (the denominator for cat precision).
subset1 = paws.loc[paws['model1'].eq('cat')]
subset2 = paws.loc[paws['model2'].eq('cat')]
subset3 = paws.loc[paws['model3'].eq('cat')]
subset4 = paws.loc[paws['model4'].eq('cat')]

In [128]:
# Model 1 precision: share of rows in subset1 whose actual label matches model1.
subset1['actual'].eq(subset1['model1']).mean()

0.6897721764420747

In [129]:
# Model 2 precision: share of rows in subset2 whose actual label matches model2.
subset2['actual'].eq(subset2['model2']).mean()

0.4841220423412204

In [131]:
# Model 3 precision: share of rows in subset3 whose actual label matches model3.
subset3['actual'].eq(subset3['model3']).mean()

0.358346709470305

In [132]:
# Model 4 precision: share of rows in subset4 whose actual label matches model4.
subset4['actual'].eq(subset4['model4']).mean()

0.8072289156626506

In [133]:
# Model 4 has the highest precision for cats; cross-tabulate its predictions (rows)
# against the actual labels (columns) to see the underlying counts.
pd.crosstab(paws['model4'], paws['actual'])

actual,cat,dog
model4,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,603,144
dog,1143,3110


5. Follow the links below to read the documentation about each function, then apply those functions to the data from the previous problem.

    - sklearn.metrics.accuracy_score
    - sklearn.metrics.precision_score
    - sklearn.metrics.recall_score
    - sklearn.metrics.classification_report

In [137]:
# cats still the positive class

In [136]:
# sklearn.metrics.classification_report

In [134]:
# Per-class precision/recall/F1 for model 1, returned as a nested dict.
x = classification_report(
    y_true=paws['actual'],
    y_pred=paws['model1'],
    labels=['cat', 'dog'],
    output_dict=True,
)

In [135]:
# Turn the report dict into a table with one row per class/average.
pd.DataFrame(data=x).T

Unnamed: 0,precision,recall,f1-score,support
cat,0.689772,0.815006,0.747178,1746.0
dog,0.890024,0.803319,0.844452,3254.0
accuracy,0.8074,0.8074,0.8074,0.8074
macro avg,0.789898,0.809162,0.795815,5000.0
weighted avg,0.820096,0.8074,0.810484,5000.0


In [138]:
# Model 1: classification report as a table (one row per class/average).
print("Model 1")
pd.DataFrame(
    classification_report(
        y_true=paws['actual'],
        y_pred=paws['model1'],
        labels=['cat', 'dog'],
        output_dict=True,
    )
).T


Model 1


Unnamed: 0,precision,recall,f1-score,support
cat,0.689772,0.815006,0.747178,1746.0
dog,0.890024,0.803319,0.844452,3254.0
accuracy,0.8074,0.8074,0.8074,0.8074
macro avg,0.789898,0.809162,0.795815,5000.0
weighted avg,0.820096,0.8074,0.810484,5000.0


In [139]:
# Model 2: classification report as a table (one row per class/average).
print("Model 2")
pd.DataFrame(
    classification_report(
        y_true=paws['actual'],
        y_pred=paws['model2'],
        labels=['cat', 'dog'],
        output_dict=True,
    )
).T


Model 2


Unnamed: 0,precision,recall,f1-score,support
cat,0.484122,0.890607,0.627269,1746.0
dog,0.893177,0.490781,0.633479,3254.0
accuracy,0.6304,0.6304,0.6304,0.6304
macro avg,0.688649,0.690694,0.630374,5000.0
weighted avg,0.750335,0.6304,0.63131,5000.0


In [140]:
# Model 3: classification report as a table (one row per class/average).
print("Model 3")
pd.DataFrame(
    classification_report(
        y_true=paws['actual'],
        y_pred=paws['model3'],
        labels=['cat', 'dog'],
        output_dict=True,
    )
).T


Model 3


Unnamed: 0,precision,recall,f1-score,support
cat,0.358347,0.511455,0.421425,1746.0
dog,0.659888,0.508605,0.574453,3254.0
accuracy,0.5096,0.5096,0.5096,0.5096
macro avg,0.509118,0.51003,0.497939,5000.0
weighted avg,0.55459,0.5096,0.521016,5000.0


In [141]:
# Model 4: classification report as a table (one row per class/average).
print("Model 4")
pd.DataFrame(
    classification_report(
        y_true=paws['actual'],
        y_pred=paws['model4'],
        labels=['cat', 'dog'],
        output_dict=True,
    )
).T


Model 4


Unnamed: 0,precision,recall,f1-score,support
cat,0.807229,0.345361,0.483755,1746.0
dog,0.731249,0.955747,0.82856,3254.0
accuracy,0.7426,0.7426,0.7426,0.7426
macro avg,0.769239,0.650554,0.656157,5000.0
weighted avg,0.757781,0.7426,0.708154,5000.0
