In [1]:
import sys
import pickle
sys.path.append("../tools/")
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import numpy as np
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi'] 
#copied all features in from Udacity course
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 
                     'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 
                     'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees'] 

email_features = ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 
                  'shared_receipt_with_poi']
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

all_features = features_list + email_features + financial_features 
all_features.remove('email_address') 



In [2]:
### Task 2: Remove outliers

'''Removing outlying keys based on manual examination of the dataset above- including The Travel Agency... and 'Total'. 
Additionaly, removing email addresses as they will not add prediction value.'''
email_features.remove('email_address') 
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')
data_dict.pop('TOTAL')

{'bonus': 97343619,
 'deferral_payments': 32083396,
 'deferred_income': -27992891,
 'director_fees': 1398517,
 'email_address': 'NaN',
 'exercised_stock_options': 311764000,
 'expenses': 5235198,
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 83925000,
 'long_term_incentive': 48521928,
 'other': 42667589,
 'poi': False,
 'restricted_stock': 130322299,
 'restricted_stock_deferred': -7576788,
 'salary': 26704229,
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 309886585,
 'total_stock_value': 434509511}

In [3]:
#Print minimums and maximums to look for outliers
for feature in all_features:
    print feature
    feature = [item[feature] for k, item in 
    data_dict.iteritems() if not item[feature] == "NaN"]
    print ('min is: %d' % min(feature))
    print ('max is: %d' % max(feature))

poi
min is: 0
max is: 1
to_messages
min is: 57
max is: 15149
from_poi_to_this_person
min is: 0
max is: 528
from_messages
min is: 12
max is: 14368
from_this_person_to_poi
min is: 0
max is: 609
shared_receipt_with_poi
min is: 2
max is: 5521
salary
min is: 477
max is: 1111258
deferral_payments
min is: -102500
max is: 6426990
total_payments
min is: 148
max is: 103559793
loan_advances
min is: 400000
max is: 81525000
bonus
min is: 70000
max is: 8000000
restricted_stock_deferred
min is: -1787380
max is: 15456290
deferred_income
min is: -3504386
max is: -833
total_stock_value
min is: -44093
max is: 49110078
expenses
min is: 148
max is: 228763
exercised_stock_options
min is: 3285
max is: 34348384
other
min is: 2
max is: 10359729
long_term_incentive
min is: 69223
max is: 5145434
restricted_stock
min is: -2604490
max is: 14761694
director_fees
min is: 3285
max is: 137864


In [4]:
### Task 3: Create new feature(s)
def calcluatePercent(messages, allMessages):
    percent = 0
    if (messages == 'NaN' or allMessages == 'NaN'):
        return percent
    percent = messages / float(allMessages)
    return percent


def createNewFeatures(data_dict):
    for poi_name in data_dict:
        new_dict = data_dict[poi_name]
        new_dict['from_poi_to_this_person_ratio'] = calcluatePercent(new_dict['from_poi_to_this_person'],
                                                                   new_dict['to_messages'])
        new_dict['from_this_person_to_poi_ratio'] = calcluatePercent(new_dict['from_this_person_to_poi'],
                                                                   new_dict['from_messages'])
    return new_dict, ['from_poi_to_this_person_ratio', 'from_this_person_to_poi_ratio']



for entry in data_dict:

    data_point = data_dict[entry]

    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    percent_from_poi = calcluatePercent(from_poi_to_this_person, to_messages )
    data_point["percent_from_poi"] = percent_from_poi


    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    percent_to_poi = calcluatePercent( from_this_person_to_poi, from_messages )
    data_point["percent_to_poi"] = percent_to_poi
features_list_n = all_features
features_list_n =  features_list_n + ['percent_from_poi', 'percent_to_poi']
pprint.pprint (features_list_n)


### Store to my_dataset for easy export below.
my_dataset = data_dict






['poi',
 'to_messages',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'shared_receipt_with_poi',
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees',
 'percent_from_poi',
 'percent_to_poi']


In [5]:
#Accidentally pulled in Email Address again, removing:
#features_list_n.remove('email_address') 
pprint.pprint (features_list_n)

['poi',
 'to_messages',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'shared_receipt_with_poi',
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees',
 'percent_from_poi',
 'percent_to_poi']


In [6]:
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.DataFrame.from_dict(data_dict, orient='index',
                       columns=['poi',
 'to_messages',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'shared_receipt_with_poi',
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees',
 'percent_from_poi',
 'percent_to_poi'])

Unnamed: 0,poi,to_messages,from_poi_to_this_person,from_messages,from_this_person_to_poi,shared_receipt_with_poi,salary,deferral_payments,total_payments,loan_advances,...,deferred_income,total_stock_value,expenses,exercised_stock_options,other,long_term_incentive,restricted_stock,director_fees,percent_from_poi,percent_to_poi
ALLEN PHILLIP K,False,2902,47,2195,65,1407,201955,2869717,4484442,,...,-3081055,1729541,13868,1729541,152,304805,126027,,0.016196,0.029613
BADUM JAMES P,False,,,,,,,178980,182466,,...,,257817,3486,257817,,,,,0.000000,0.000000
BANNANTINE JAMES M,False,566,39,29,0,465,477,,916197,,...,-5104,5243487,56301,4046157,864523,,1757552,,0.068905,0.000000
BAXTER JOHN C,False,,,,,,267102,1295738,5634343,,...,-1386055,10623258,11200,6680544,2660303,1586055,3942714,,0.000000,0.000000
BAY FRANKLIN R,False,,,,,,239671,260455,827696,,...,-201641,63014,129142,,69,,145796,,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WINOKUR JR. HERBERT S,False,,,,,,,,84992,,...,-25000,,1413,,,,,108579,0.000000,0.000000
WODRASKA JOHN,False,,,,,,,,189583,,...,,,,,189583,,,,0.000000,0.000000
WROBEL BRUCE,False,,,,,,,,,,...,,139130,,139130,,,,,0.000000,0.000000
YEAGER F SCOTT,True,,,,,,158403,,360300,,...,,11884758,53947,8308552,147950,,3576206,,0.000000,0.000000


In [7]:
for feature in features_list_n:
    print feature
    feature = [item[feature] for k, item in 
    data_dict.iteritems() if not item[feature] == "NaN"]
    print ('min is: %d' % min(feature))
    print ('max is: %d' % max(feature))

poi
min is: 0
max is: 1
to_messages
min is: 57
max is: 15149
from_poi_to_this_person
min is: 0
max is: 528
from_messages
min is: 12
max is: 14368
from_this_person_to_poi
min is: 0
max is: 609
shared_receipt_with_poi
min is: 2
max is: 5521
salary
min is: 477
max is: 1111258
deferral_payments
min is: -102500
max is: 6426990
total_payments
min is: 148
max is: 103559793
loan_advances
min is: 400000
max is: 81525000
bonus
min is: 70000
max is: 8000000
restricted_stock_deferred
min is: -1787380
max is: 15456290
deferred_income
min is: -3504386
max is: -833
total_stock_value
min is: -44093
max is: 49110078
expenses
min is: 148
max is: 228763
exercised_stock_options
min is: 3285
max is: 34348384
other
min is: 2
max is: 10359729
long_term_incentive
min is: 69223
max is: 5145434
restricted_stock
min is: -2604490
max is: 14761694
director_fees
min is: 3285
max is: 137864
percent_from_poi
min is: 0
max is: 0
percent_to_poi
min is: 0
max is: 1


In [8]:
'''
def findKbestFeatures(data_dict, features_list_n, k):
    from sklearn.feature_selection import f_classif
    data = featureFormat(data_dict, features_list_n)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(f_classif, k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    return k_best_features
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    print("sorted_pairs", sorted_pairs)
    k_best_features = dict(sorted_pairs[:k])
'''

    


'\ndef findKbestFeatures(data_dict, features_list_n, k):\n    from sklearn.feature_selection import f_classif\n    data = featureFormat(data_dict, features_list_n)\n    labels, features = targetFeatureSplit(data)\n\n    k_best = SelectKBest(f_classif, k=k)\n    k_best.fit(features, labels)\n    scores = k_best.scores_\n    return k_best_features\n    unsorted_pairs = zip(features_list[1:], scores)\n    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))\n    print("sorted_pairs", sorted_pairs)\n    k_best_features = dict(sorted_pairs[:k])\n'

In [12]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list_n, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [18]:
def skipOne(elem):
    return elem[1]
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k = 5)
selector.fit(features, labels)
scores = zip(features_list_n[1:], selector.scores_)
sorted_scores = sorted(scores, key = skipOne, reverse = True)
pprint.pprint('SelectKBest scores: ')
pprint.pprint( sorted_scores)
all_features =  features_list + [(i[0]) for i in sorted_scores[0:20]]
pprint.pprint( all_features)
kBest_features = features_list + [(i[0]) for i in sorted_scores[0:10]]
pprint.pprint( 'KBest')
pprint.pprint( kBest_features)



'SelectKBest scores: '
[('exercised_stock_options', 24.815079733218194),
 ('total_stock_value', 24.18289867856688),
 ('bonus', 20.792252047181535),
 ('salary', 18.289684043404513),
 ('percent_to_poi', 16.40971254803579),
 ('deferred_income', 11.458476579280369),
 ('long_term_incentive', 9.922186013189823),
 ('restricted_stock', 9.2128106219771),
 ('total_payments', 8.772777730091676),
 ('shared_receipt_with_poi', 8.589420731682381),
 ('loan_advances', 7.184055658288725),
 ('expenses', 6.094173310638945),
 ('from_poi_to_this_person', 5.243449713374958),
 ('other', 4.187477506995375),
 ('percent_from_poi', 3.128091748156719),
 ('from_this_person_to_poi', 2.382612108227674),
 ('director_fees', 2.1263278020077054),
 ('to_messages', 1.6463411294420076),
 ('deferral_payments', 0.2246112747360099),
 ('from_messages', 0.16970094762175533),
 ('restricted_stock_deferred', 0.06549965290994214)]
['poi',
 'exercised_stock_options',
 'total_stock_value',
 'bonus',
 'salary',
 'percent_to_poi',
 'def

In [14]:

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)


In [15]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Example starting point. Try investigating other evaluation techniques!

from time import time

def naive_bayes_clf(features_train, features_test, labels_train, labels_test):
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    # train
    t0 = time()
    clf.fit(features_train, labels_train)
    print "\ntraining time:", round(time()-t0, 3), "s"

    # predict
    t0 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time()-t0, 3), "s"
    accuracy = accuracy_score(pred, labels_test)
    print '\naccuracy = {0}'.format(accuracy)

    return clf


def svm_clf(features_train, features_test, labels_train, labels_test):
    from sklearn.svm import SVC
    clf = SVC(kernel="linear", C=1000)
    # train
    t0 = time()
    clf.fit(features_train, labels_train)
    print "\ntraining time:", round(time()-t0, 3), "s"

    # predict
    t0 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time()-t0, 3), "s"
    accuracy = accuracy_score(pred, labels_test)
    print '\naccuracy = {0}'.format(accuracy)

    return clf


def decision_tree_clf(features_train, features_test, labels_train, labels_test):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    # train
    t0 = time()
    clf.fit(features_train, labels_train)
    print "\ntraining time:", round(time()-t0, 3), "s"

    # predict
    t0 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time()-t0, 3), "s"
    accuracy = accuracy_score(pred, labels_test)
    print '\naccuracy = {0}'.format(accuracy)

    return clf

def adaboost_clf(features_train, features_test, labels_train, labels_test):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(learning_rate=1, algorithm='SAMME', n_estimators=23)
    # train
    t0 = time()
    clf.fit(features_train, labels_train)
    print "\ntraining time:", round(time()-t0, 3), "s"

    # predict
    t0 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time()-t0, 3), "s"
    accuracy = accuracy_score(pred, labels_test)
    print '\naccuracy = {0}'.format(accuracy)

    return clf

In [16]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
#!/usr/bin/pickle

#clf = naive_bayes_clf(features_train, features_test, labels_train, labels_test)
#clf = svm_clf(features_train, features_test, labels_train, labels_test)
clf = decision_tree_clf(features_train, features_test, labels_train, labels_test)
#clf = adaboost_clf(features_train, features_test, labels_train, labels_test)


training time: 0.003 s
predicting time: 0.002 s

accuracy = 0.860465116279


In [17]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list_n)