In [46]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict
from time import time

# Import classifiers to test
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Import metrics to analyze results
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Import functions for cross validation and parameter optimization
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

In [25]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

features_list = ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments',
       'exercised_stock_options', 'bonus', 'restricted_stock',
       'shared_receipt_with_poi', 'restricted_stock_deferred',
       'total_stock_value', 'expenses', 'loan_advances', 'from_messages',
       'other', 'from_this_person_to_poi', 'poi', 'director_fees',
       'deferred_income', 'long_term_incentive',
       'from_poi_to_this_person']

In [63]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [38]:
len(data_dict.keys())

146

In [39]:
for item in data_dict['METTS MARK'].keys():
    print item
print len(data_dict['METTS MARK'].keys())

salary
to_messages
deferral_payments
total_payments
exercised_stock_options
bonus
restricted_stock
shared_receipt_with_poi
restricted_stock_deferred
total_stock_value
expenses
loan_advances
from_messages
other
from_this_person_to_poi
poi
director_fees
deferred_income
long_term_incentive
email_address
from_poi_to_this_person
21


### Number of data points
Number of POI vs. Non-POI

In [64]:
print 'DATA POINTS'
print '==========='
print len(data_dict), 'total data points'
count_poi = 0
for person in data_dict:
	if data_dict[person]['poi'] == 1:
		count_poi += 1
print count_poi, 'POI;	', len(data_dict) - count_poi, 'non-POI'

DATA POINTS
146 total data points
18 POI;	128 non-POI


### Convert all 'str' and 'int' types to 'float' type

In [66]:
df = pd.DataFrame.from_dict(data_dict, orient='index')
df = df.replace('NaN',np.nan)
df = df.drop('email_address', 1)
df.head()
data_dict = df.T.to_dict()

### Number of Features
Split by type

In [81]:
print 'FEATURES'
print '========'
first_key = data_dict.keys()[0]
print len(data_dict[first_key].keys()), 'features total'
print
print 'Feature breakdown:'
data_types = defaultdict(list)
for key in data_dict[first_key].keys():
	data_type = type(data_dict[first_key][key])
	data_types[data_type].append(key)
print
print 'Data Types:'
for key in data_types:
	print key,':'
#	for item in data_types[key]:
#		print '   ', item
	print data_types[key]
	print '------------'

print np.isnan(data_dict['LOCKHART EUGENE E']['salary'])

FEATURES
20 features total

Feature breakdown:

Data Types:
<type 'float'> :
['salary', 'to_messages', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'total_stock_value', 'shared_receipt_with_poi', 'long_term_incentive', 'exercised_stock_options', 'from_messages', 'other', 'from_poi_to_this_person', 'from_this_person_to_poi', 'deferred_income', 'expenses', 'restricted_stock', 'director_fees']
------------
<type 'bool'> :
['poi']
------------
True


### Number of NaN values

In [82]:
print 'MISSING DATA (NaN)'
print '=================='
print 'Count of NaN Values (for each feature) :'
print
# Initialize Counts:
count_nan = defaultdict(int)
for key in data_dict[first_key].keys():
	count_nan[key] = 0
# Increment Counter
for person in data_dict:
	for key in data_dict[person].keys():
		if np.isnan(data_dict[person][key]):
			count_nan[key] += 1
# Sort dictionary by value and print values
for key in sorted(count_nan, key=count_nan.get, reverse=True):
  print key, count_nan[key]

print 
print
print 'NaN by Data Point'
print '================='
count_nan_ind = defaultdict(int)
for person in data_dict:
	cnt = 0
	for key in data_dict[person].keys():
		if np.isnan(data_dict[person][key]):
			cnt += 1
	count_nan_ind[person] = cnt

for key in sorted(count_nan_ind, key = count_nan_ind.get, reverse=True)[0:14]:
	print key, count_nan_ind[key]

MISSING DATA (NaN)
Count of NaN Values (for each feature) :

loan_advances 142
director_fees 129
restricted_stock_deferred 128
deferral_payments 107
deferred_income 97
long_term_incentive 80
bonus 64
to_messages 60
shared_receipt_with_poi 60
from_messages 60
from_this_person_to_poi 60
from_poi_to_this_person 60
other 53
salary 51
expenses 51
exercised_stock_options 44
restricted_stock 36
total_payments 21
total_stock_value 20
poi 0


NaN by Data Point
LOCKHART EUGENE E 19
WODRASKA JOHN 17
WHALEY DAVID A 17
WROBEL BRUCE 17
THE TRAVEL AGENCY IN THE PARK 17
SCRIMSHAW MATTHEW 17
GRAMM WENDY L 17
CLINE KENNETH W 16
WAKEHAM JOHN 16
GILLIS JOHN 16
CHRISTODOULOU DIOMEDES 16
SAVAGE FRANK 16
LOWRY CHARLES P 15
CHAN RONNIE 15


### Remove Outliers
'TOTAL'

'THE TRAVEL AGENCY IN THE PARK'

'LOCKHART EUGENE E'

In [83]:
outliers = ['TOTAL','THE TRAVEL AGENCY IN THE PARK','LOCKHART EUGENE E']
print 'Initial Length:', len(data_dict)
for name in outliers:
    data_dict.pop(name)
print 'Length after Outlier Removal:', len(data_dict)

Initial Length: 146
Length after Outlier Removal: 143


### Apply Imputer to remove NaN values

In [88]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values = np.nan, strategy = 'median', axis = 0)
imp
imp.fit(data_dict)
#data_dict = imp.transform(data_dict)

TypeError: float() argument must be a string or a number

In [71]:
df = pd.DataFrame.from_dict(data_dict, orient='index')
df.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,total_stock_value,shared_receipt_with_poi,long_term_incentive,exercised_stock_options,from_messages,other,from_poi_to_this_person,from_this_person_to_poi,poi,deferred_income,expenses,restricted_stock,director_fees
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442,,4175000.0,-126027.0,1729541,1407.0,304805.0,1729541.0,2195.0,152.0,47.0,65.0,False,-3081055.0,13868,126027.0,
BADUM JAMES P,,,178980.0,182466,,,,257817,,,257817.0,,,,,False,,3486,,
BANNANTINE JAMES M,477.0,566.0,,916197,,,-560222.0,5243487,465.0,,4046157.0,29.0,864523.0,39.0,0.0,False,-5104.0,56301,1757552.0,
BAXTER JOHN C,267102.0,,1295738.0,5634343,,1200000.0,,10623258,,1586055.0,6680544.0,,2660303.0,,,False,-1386055.0,11200,3942714.0,
BAY FRANKLIN R,239671.0,,260455.0,827696,,400000.0,-82782.0,63014,,,,,69.0,,,False,-201641.0,129142,145796.0,


In [60]:
### Fill NaN Values
df = df.apply(lambda x: x.fillna(x.mean()), axis=0)
df.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,from_poi_to_this_person
ALLEN PHILLIP K,201955,2902,2869717.0,4484442,1729541,4175000,126027,1407.0,-126027,1729541,13868,2000000,2195,152,65,False,106164.5,-3081055,304805,47
BADUM JAMES P,258741,1211,178980.0,182466,257817,750000,441096,740.5,-140264,257817,3486,2000000,41,51587,8,False,106164.5,-151927,422158,35
BANNANTINE JAMES M,477,566,221063.5,916197,4046157,750000,1757552,465.0,-560222,5243487,56301,2000000,29,864523,0,False,106164.5,-5104,422158,39
BAXTER JOHN C,267102,1211,1295738.0,5634343,6680544,1200000,3942714,740.5,-140264,10623258,11200,2000000,41,2660303,8,False,106164.5,-1386055,1586055,35
BAY FRANKLIN R,239671,1211,260455.0,827696,1297049,400000,145796,740.5,-82782,63014,129142,2000000,41,69,8,False,106164.5,-201641,422158,35
