# Applying Metrics to your POI Identifier

In [1]:
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation
from feature_format import featureFormat, targetFeatureSplit

# Load the project dataset. The pickle is opened in binary mode inside a
# `with` block so the file handle is closed as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load a trusted file.
with open('final_project_dataset.pkl', 'rb') as dataset_file:
    data_dict = pickle.load(dataset_file)

# 'poi' comes first; presumably targetFeatureSplit treats the first entry as
# the label and the rest as features -- confirm against feature_format.
features_list = ['poi', 'salary']

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

# Hold out 30% of the samples for testing; random_state pins the split.
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

# NOTE(review): the tree itself has no random_state, so the fitted model
# (and therefore the score below) may vary between runs.
clf = DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)

# Score of the fitted tree on the held-out test split.
print(clf.score(features_test, labels_test))

0.724137931034




# Number of POIs Predicted for the Test Set

In [2]:
# Predict a label for every test sample, then add the predictions up to
# print their total.
pred = clf.predict(features_test)
predicted_poi_total = sum(pred)
print(predicted_poi_total)

4.0


# Number of People in Test Set

In [3]:
# Size of the held-out test split.
test_set_size = len(features_test)
print(test_set_size)

29


# Accuracy of a Biased Identifier

In [4]:
from sklearn.metrics import accuracy_score

In [9]:
# Baseline "identifier" that labels every test sample 0.0.
bad_pred = [0.0] * len(features_test)

# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
# Accuracy is symmetric, so the printed value is the same either way.
print(accuracy_score(labels_test, bad_pred))

0.862068965517


# Number of True Positives

In [14]:
# Fit a fresh tree on the training split and predict the test split.
clf = DecisionTreeClassifier().fit(features_train, labels_train)
pred = clf.predict(features_test)

# Keep every (prediction, label) pair where the prediction is 1.0 and
# matches the label.
true_positives = []
for guess, truth in zip(pred, labels_test):
    if guess == 1.0 and guess == truth:
        true_positives.append((guess, truth))

# Joined with an extra space to reproduce the original two-item print output.
print("True positives on the Overfitted model: " + " " + str(len(true_positives)))

True positives on the Overfitted model:  0


# Unpacking Into Precision and Recall

In [19]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# precision = true_positives / (true_positives + false_positives)
# precision_score is documented as (y_true, y_pred). Passing the predictions
# first swaps the roles of false positives and false negatives, which
# computes recall instead of precision.
precision_score(labels_test, pred)

0.0

# Recall of Your POI Identifier

In [20]:
# recall = true_positives / (true_positives + false_negatives)
# recall_score is documented as (y_true, y_pred). Passing the predictions
# first swaps the roles of false negatives and false positives, which
# computes precision instead of recall.
recall_score(labels_test, pred)

0.0

# How Many True Positives?

In [36]:
predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

# True positive: predicted 1 and the true label is 1.
true_positives = [pair for pair in zip(predictions, true_labels) if pair == (1, 1)]
print(len(true_positives))

6


# How Many True Negatives?

In [37]:
predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

# True negative: predicted 0 and the true label is 0.
true_negatives = [pair for pair in zip(predictions, true_labels) if pair == (0, 0)]
print(len(true_negatives))

9


# How Many False Positives?

In [38]:
predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

# False positive: predicted 1 while the true label is 0.
false_positives = [pair for pair in zip(predictions, true_labels) if pair == (1, 0)]
print(len(false_positives))

3


# How Many False Negatives?

In [39]:
predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

# False negative: predicted 0 while the true label is 1.
false_negatives = [pair for pair in zip(predictions, true_labels) if pair == (0, 1)]
print(len(false_negatives))

2


# Precision

In [45]:
# precision = true_positives / (true_positives + false_positives)
tp_count = len(true_positives)
fp_count = len(false_positives)
# Cast to float so the division is not truncated to an integer.
print(float(tp_count) / float(tp_count + fp_count))

0.666666666667


# Recall

In [46]:
# recall = true_positives / (true_positives + false_negatives)
tp_count = len(true_positives)
fn_count = len(false_negatives)
# Cast to float so the division is not truncated to an integer.
print(float(tp_count) / float(tp_count + fn_count))

0.75
