In [13]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Toy dataset: one row per person, pairing the actual preference with the
# model's predicted preference.
df = pd.DataFrame(
    dict(
        actual=['coffee', 'no coffee', 'no coffee', 'coffee', 'coffee', 'coffee', 'no coffee', 'coffee'],
        prediction=['no coffee', 'no coffee', 'coffee', 'coffee', 'coffee', 'coffee', 'no coffee', 'no coffee'],
    )
)
df


Unnamed: 0,actual,prediction
0,coffee,no coffee
1,no coffee,no coffee
2,no coffee,coffee
3,coffee,coffee
4,coffee,coffee
5,coffee,coffee
6,no coffee,no coffee
7,coffee,no coffee


In [14]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
pd.crosstab(index=df['actual'], columns=df['prediction'])

prediction,coffee,no coffee
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
coffee,3,2
no coffee,1,2


In [15]:
# Confusion matrix with an explicit label order: 'no coffee' first, then 'coffee'.
confusion_matrix(
    y_true=df['actual'],
    y_pred=df['prediction'],
    labels=('no coffee', 'coffee'),
)

array([[2, 1],
       [2, 3]])

In [16]:
# Count how often each actual label occurs; the most frequent one drives the baseline.
df['actual'].value_counts()

coffee       5
no coffee    3
Name: actual, dtype: int64

In [17]:
# Baseline model: always predict 'coffee', the most frequent actual label (5 of 8 rows).
df['baseline_prediction'] = 'coffee'

In [18]:
# Display the frame again to confirm the new baseline_prediction column.
df

Unnamed: 0,actual,prediction,baseline_prediction
0,coffee,no coffee,coffee
1,no coffee,no coffee,coffee
2,no coffee,coffee,coffee
3,coffee,coffee,coffee
4,coffee,coffee,coffee
5,coffee,coffee,coffee
6,no coffee,no coffee,coffee
7,coffee,no coffee,coffee


In [19]:
# ACCURACY: the share of all rows where the predicted label equals the actual label.
model_accuracy = df.prediction.eq(df.actual).mean()
baseline_accuracy = df.baseline_prediction.eq(df.actual).mean()

print(f'   model accuracy: {model_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

   model accuracy: 62.50%
baseline accuracy: 62.50%


### Recall is how well we do on actually positive cases. Here we'll define positive as preferring coffee. First we'll subset the dataframe so that we are only looking at the rows where we have the positive case. Then we'll evaluate how well our model's predictions do.

In [20]:
# RECALL: keep only the rows that are actually positive ('coffee'), then measure
# the share of those rows that each set of predictions labels correctly.
subset = df.loc[df.actual == 'coffee']

model_recall = subset.prediction.eq(subset.actual).mean()
baseline_recall = subset.baseline_prediction.eq(subset.actual).mean()

print(f'   model recall: {model_recall:.2%}')
print(f'baseline recall: {baseline_recall:.2%}')

   model recall: 60.00%
baseline recall: 100.00%


### Notice here that our baseline model has 100% recall. This is because the baseline is to always predict the person prefers coffee, so we'll never miss an actually positive case.

### Next we'll calculate precision. Precision is based on just the times that the model predicts the positive class (coffee in this case). Because the predictions for our model and the baseline differ, we'll need to create 2 separate subsets here.

In [21]:
# PRECISION: of the rows predicted positive ('coffee'), what share is actually positive?
# The model and the baseline predict 'coffee' on different rows, so each gets its own subset.
subset = df.loc[df.prediction == 'coffee']
model_precision = subset.actual.eq(subset.prediction).mean()

subset = df.loc[df.baseline_prediction == 'coffee']
baseline_precision = subset.actual.eq(subset.baseline_prediction).mean()

print(f'model precision: {model_precision:.2%}')
print(f'baseline precision: {baseline_precision:.2%}')

model precision: 75.00%
baseline precision: 62.50%


In [22]:
# Model accuracy: fraction of rows where the prediction equals the actual label.
df.prediction.eq(df.actual).mean()

0.625

In [23]:
# Baseline accuracy: fraction of rows where the constant baseline equals the actual label.
df.baseline_prediction.eq(df.actual).mean()

0.625

### Exercises

### 2. Given the following confusion matrix, evaluate (by hand) the model's performance.

In [27]:

#|               | pred dog   | pred cat   |
#|:------------  |-----------:|-----------:|
#| actual dog    |         46 |         7  |
#| actual cat    |         13 |         34 |


#### In the context of this problem, what is a false positive?
- Treating "dog" as the positive class, a false positive is when we predict it's a dog, but it's actually a cat.

#### In the context of this problem, what is a false negative?
- Treating "dog" as the positive class, a false negative is when we predict it's a cat, but it's actually a dog.

#### How would you describe this model?

### 3. You are working as a data scientist for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant. Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions can be found here.

### Use the predictions dataset and pandas to help answer the following questions:

In [30]:
# Load the C3 model predictions from a local CSV file and display the frame.
c3_df = pd.read_csv('c3.csv')
c3_df

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect
...,...,...,...,...
195,No Defect,No Defect,Defect,Defect
196,Defect,Defect,No Defect,No Defect
197,No Defect,No Defect,No Defect,No Defect
198,No Defect,No Defect,Defect,Defect


In [44]:
# Confusion matrix for model 1, with labels ordered 'No Defect' then 'Defect'.
confusion_matrix(
    y_true=c3_df['actual'],
    y_pred=c3_df['model1'],
    labels=('No Defect', 'Defect'),
)

array([[182,   2],
       [  8,   8]])

In [45]:
# Confusion matrix for model 2, with labels ordered 'No Defect' then 'Defect'.
confusion_matrix(
    y_true=c3_df['actual'],
    y_pred=c3_df['model2'],
    labels=('No Defect', 'Defect'),
)

array([[103,  81],
       [  7,   9]])

In [46]:
# Confusion matrix for model 3, with labels ordered 'No Defect' then 'Defect'.
confusion_matrix(
    y_true=c3_df['actual'],
    y_pred=c3_df['model3'],
    labels=('No Defect', 'Defect'),
)

array([[98, 86],
       [ 3, 13]])

### An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which model would be the best fit for this use case?

#### Which evaluation metric would be appropriate here?

In [34]:
# Recall is the most appropriate metric here because the team wants to minimize false negatives
# (a false negative is predicting "No Defect" when the duck actually has a defect).

#### Which model would be the best fit for this use case?

In [None]:
# Baseline for the C3 data: always predict whichever label is most frequent in
# c3_df.actual. The original cell was a copy-paste leftover that re-assigned the
# 'coffee' baseline on the unrelated coffee frame `df`.
# Kept as a standalone Series (same index as c3_df) so c3_df itself is not
# mutated and the frames displayed in other cells keep their columns.
c3_baseline_prediction = pd.Series(
    c3_df.actual.value_counts().idxmax(),
    index=c3_df.index,
    name='baseline_prediction',
)

In [35]:
# Actual positives: keep only the rows whose actual label is 'Defect'.
# Recall for each model is computed on this subset.
subset = c3_df.loc[c3_df.actual == 'Defect']
subset

Unnamed: 0,actual,model1,model2,model3
13,Defect,No Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect
65,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect
74,Defect,No Defect,No Defect,Defect
87,Defect,No Defect,Defect,Defect
118,Defect,No Defect,Defect,No Defect
135,Defect,Defect,No Defect,Defect
140,Defect,No Defect,Defect,Defect
147,Defect,Defect,No Defect,Defect


In [37]:
# Model 1 recall: share of actually defective ducks that model1 labels 'Defect'.
model_recall = subset.model1.eq(subset.actual).mean()
model_recall

0.5

In [39]:
# Model 2 recall: share of actually defective ducks that model2 labels 'Defect'.
model_recall = subset.model2.eq(subset.actual).mean()
model_recall

0.5625

In [41]:
# Model 3 recall: share of actually defective ducks that model3 labels 'Defect'.
model_recall = subset.model3.eq(subset.actual).mean()
model_recall

0.8125

In [42]:
#### Conclusion: Model 3 appears to have the highest recall of the three models

### Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you they really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. Which model would be the best fit for this use case?

#### Which evaluation metric would be appropriate here? 

In [None]:
#Precision
