In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as warn
import sys
import env
import numpy as np

# Extend the import search path with env.util_repo before the remaining imports run
# (presumably the directory that holds the `utilities` module — confirm).
sys.path.append(env.util_repo)

from pydataset import data
from sklearn.model_selection import train_test_split
from scipy import stats
from utilities import evaluate_hypothesis_ttest, evaluate_hypothesis_pcorrelation

# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas warnings raised by later cells.
warn.filterwarnings("ignore")

### 1. Create a new file named model_evaluation.py or model_evaluation.ipynb for these exercises.

### 2. Given the following confusion matrix, evaluate (by hand) the model's performance.

|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |

In [8]:
# 'Dog' is treated as the positive class; the four counts come straight from the
# confusion matrix above.
true_pos, false_neg = 46, 7     # actual dog row:  pred dog, pred cat
false_pos, true_neg = 13, 34    # actual cat row:  pred dog, pred cat
total_obs = true_pos + false_neg + false_pos + true_neg

# Share of all predictions that matched the actual label.
accuracy = (true_pos + true_neg) / total_obs
# Share of 'dog' predictions that were actually dogs.
precision = true_pos / (true_pos + false_pos)
# Share of actual dogs that were predicted as dogs.
recall = true_pos / (true_pos + false_neg)

print(f"Accuracy:  {round(accuracy * 100, 2)}%")
print(f"Precision:  {round(precision * 100, 2)}%")
print(f"Recall:  {round(recall * 100, 2)}%")

Accuracy:  80.0%
Precision:  77.97%
Recall:  86.79%


<li>In the context of this problem, what is a false positive?
<p>If we set dogs as the 'positive' outcome, false positives are when we predict dog, but actually get a cat. There were 13 false positives.

<li>In the context of this problem, what is a false negative?
<p>A false negative is when we predict cat, but actually get a dog. There were 7 false negatives.

<li>How would you describe this model?
<p>This model is highly accurate: it predicted the correct outcome 80% of the time, meaning 80% of all predictions matched the actual result. It is also highly precise when predicting the positive outcome: its precision of 78% means that 78% of the dog predictions were actually dogs. The recall is also high at 87%, meaning that 87% of the actual dogs were correctly predicted as dogs.

### 3. You are working as a data scientist for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.

#### Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions can be found here.

#### Use the predictions dataset and pandas to help answer the following questions:

In [11]:
# Load the model-prediction export from the data directory configured in env.
predictions_csv = env.data_path + "/c3.csv"
ducks_df = pd.read_csv(predictions_csv)

# Preview the first rows as the cell output.
ducks_df.head()

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


In [12]:
# Display the (rows, columns) dimensions of the loaded predictions frame.
ducks_df.shape

(200, 4)

In [18]:
# Pair the actual labels with model1's predictions under a common 'predict' name.
# Selecting and renaming in one chained expression builds a new frame rather than
# renaming a column subset in place, which is the pattern that raises pandas'
# SettingWithCopyWarning and makes the cell depend on mutated state.
model_1 = ducks_df[['actual', 'model1']].rename(columns={'model1': 'predict'})

# Confusion matrix: rows are the predicted label, columns are the actual label.
pd.crosstab(model_1.predict, model_1.actual)

actual,Defect,No Defect
predict,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,8,2
No Defect,8,182


In [19]:
# Pair the actual labels with model2's predictions under a common 'predict' name.
# Selecting and renaming in one chained expression builds a new frame rather than
# renaming a column subset in place, which is the pattern that raises pandas'
# SettingWithCopyWarning and makes the cell depend on mutated state.
model_2 = ducks_df[['actual', 'model2']].rename(columns={'model2': 'predict'})

# Confusion matrix: rows are the predicted label, columns are the actual label.
pd.crosstab(model_2.predict, model_2.actual)

actual,Defect,No Defect
predict,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,9,81
No Defect,7,103


In [20]:
# Pair the actual labels with model3's predictions under a common 'predict' name.
# Selecting and renaming in one chained expression builds a new frame rather than
# renaming a column subset in place, which is the pattern that raises pandas'
# SettingWithCopyWarning and makes the cell depend on mutated state.
model_3 = ducks_df[['actual', 'model3']].rename(columns={'model3': 'predict'})

# Confusion matrix: rows are the predicted label, columns are the actual label.
pd.crosstab(model_3.predict, model_3.actual)

actual,Defect,No Defect
predict,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,13,86
No Defect,3,98


<ul>
    <li>An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?
    <li>Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you they really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?