# Just the Code

In [2]:
import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
import numpy as np
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB 
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

In [3]:
# Load the raw project dataset into data_dict.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a dataset file you trust.
DATASET_PATH = "final_project_dataset.pkl"
with open(DATASET_PATH, "r") as data_file:
    data_dict = pickle.load(data_file)

In [4]:
#Create Pandas Dataframe
enron_df = pd.DataFrame.from_records(list(data_dict.values()))
employees=np.array(data_dict.keys())
enron_df.set_index(employees, inplace=True)

#Investigation of Entry Names
print "Number of Entries:", len(enron_df) 

for person in enron_df.iterrows():
    print person[0]


Number of Entries: 146
METTS MARK
BAXTER JOHN C
ELLIOTT STEVEN
CORDES WILLIAM R
HANNON KEVIN P
MORDAUNT KRISTINA M
MEYER ROCKFORD G
MCMAHON JEFFREY
HORTON STANLEY C
PIPER GREGORY F
HUMPHREY GENE E
UMANOFF ADAM S
BLACHMAN JEREMY M
SUNDE MARTIN
GIBBS DANA R
LOWRY CHARLES P
COLWELL WESLEY
MULLER MARK S
JACKSON CHARLENE R
WESTFAHL RICHARD K
WALTERS GARETH W
WALLS JR ROBERT H
KITCHEN LOUISE
CHAN RONNIE
BELFER ROBERT
SHANKMAN JEFFREY A
WODRASKA JOHN
BERGSIEKER RICHARD P
URQUHART JOHN A
BIBI PHILIPPE A
RIEKER PAULA H
WHALEY DAVID A
BECK SALLY W
HAUG DAVID L
ECHOLS JOHN B
MENDELSOHN JOHN
HICKERSON GARY J
CLINE KENNETH W
LEWIS RICHARD
HAYES ROBERT E
MCCARTY DANNY J
KOPPER MICHAEL J
LEFF DANIEL P
LAVORATO JOHN J
BERBERIAN DAVID
DETMERING TIMOTHY J
WAKEHAM JOHN
POWERS WILLIAM
GOLD JOSEPH
BANNANTINE JAMES M
DUNCAN JOHN H
SHAPIRO RICHARD S
SHERRIFF JOHN R
SHELBY REX
LEMAISTRE CHARLES
DEFFNER JOSEPH M
KISHKILL JOSEPH G
WHALLEY LAWRENCE G
MCCONNELL MICHAEL S
PIRO JIM
DELAINEY DAVID W
SULLIVAN-SHAKLOV

Clearly there are two separate names that are issues:<br>
'THE TRAVEL AGENCY IN THE PARK' <br>
'TOTAL'<br>
These entries will be deleted as outliers. 

In [5]:
#Number of POIs
count=0
for person in data_dict:
    if data_dict[person]["poi"]==True:
        count+=1      
print "POI's:",count

POI's: 18


In [6]:
# people with too many Nans
for person in data_dict:
    NaN=0
    for feature in data_dict[person]:
        if data_dict[person][feature] == "NaN":
            NaN += 1
    if NaN >= 18:
        print person, NaN

WHALEY DAVID A 18
WROBEL BRUCE 18
LOCKHART EUGENE E 20
THE TRAVEL AGENCY IN THE PARK 18
GRAMM WENDY L 18


In [7]:
#Take a look at Eugene Lockhart
print enron_df.loc['LOCKHART EUGENE E']

bonus                          NaN
deferral_payments              NaN
deferred_income                NaN
director_fees                  NaN
email_address                  NaN
exercised_stock_options        NaN
expenses                       NaN
from_messages                  NaN
from_poi_to_this_person        NaN
from_this_person_to_poi        NaN
loan_advances                  NaN
long_term_incentive            NaN
other                          NaN
poi                          False
restricted_stock               NaN
restricted_stock_deferred      NaN
salary                         NaN
shared_receipt_with_poi        NaN
to_messages                    NaN
total_payments                 NaN
total_stock_value              NaN
Name: LOCKHART EUGENE E, dtype: object


'LOCKHART EUGENE E' has only one populated field -- `poi` is False -- and no other data.

In [8]:
###Remove outliers
try:
    enron_df.drop('THE TRAVEL AGENCY IN THE PARK', inplace=True)
    enron_df.drop('TOTAL', inplace=True)
    enron_df.drop('LOCKHART EUGENE E', inplace=True)
except:
    print "Outliers Already Removed"'\n'

In [9]:
#Number of Features and their Types
features=0
for person in data_dict:
    for feature in data_dict[person]:
        print feature, type(feature)
        features+=1
    break
print "Number of Feautres:", features
print''

#Number of NaN's for Each Feature
for feature in data_dict['METTS MARK'].keys():
    Nans=0    
    for person in data_dict:
        if data_dict[person][feature]=='NaN':
            Nans += 1
    print "Number of", feature, "NaNs: ", Nans

salary <type 'str'>
to_messages <type 'str'>
deferral_payments <type 'str'>
total_payments <type 'str'>
exercised_stock_options <type 'str'>
bonus <type 'str'>
restricted_stock <type 'str'>
shared_receipt_with_poi <type 'str'>
restricted_stock_deferred <type 'str'>
total_stock_value <type 'str'>
expenses <type 'str'>
loan_advances <type 'str'>
from_messages <type 'str'>
other <type 'str'>
from_this_person_to_poi <type 'str'>
poi <type 'str'>
director_fees <type 'str'>
deferred_income <type 'str'>
long_term_incentive <type 'str'>
email_address <type 'str'>
from_poi_to_this_person <type 'str'>
Number of Feautres: 21

Number of salary NaNs:  51
Number of to_messages NaNs:  60
Number of deferral_payments NaNs:  107
Number of total_payments NaNs:  21
Number of exercised_stock_options NaNs:  44
Number of bonus NaNs:  64
Number of restricted_stock NaNs:  36
Number of shared_receipt_with_poi NaNs:  60
Number of restricted_stock_deferred NaNs:  128
Number of total_stock_value NaNs:  20
Number o

In [10]:
### Create new feature(s)
# POI_From_To_Ratio is just the # of emails to a poi divided by emails from a poi to this person
enron_df['combined_emails'] = (enron_df['from_messages'].astype(float)) + (enron_df['from_poi_to_this_person'].astype(float)) +\
(enron_df['to_messages'].astype(float)) + (enron_df['from_this_person_to_poi'].astype(float)) 
   
enron_df['combined_emails'].fillna(value=0, inplace=True)
        
print enron_df.keys()

Index([u'bonus', u'deferral_payments', u'deferred_income', u'director_fees',
       u'email_address', u'exercised_stock_options', u'expenses',
       u'from_messages', u'from_poi_to_this_person',
       u'from_this_person_to_poi', u'loan_advances', u'long_term_incentive',
       u'other', u'poi', u'restricted_stock', u'restricted_stock_deferred',
       u'salary', u'shared_receipt_with_poi', u'to_messages',
       u'total_payments', u'total_stock_value', u'combined_emails'],
      dtype='object')


In [11]:
# Preview the first five values of the new feature.
enron_df['combined_emails'].head(5)

METTS MARK           875.0
BAXTER JOHN C          0.0
ELLIOTT STEVEN         0.0
CORDES WILLIAM R     786.0
HANNON KEVIN P      1130.0
Name: combined_emails, dtype: float64

In [12]:
### RETURN to DICTIONARY
data_dict=enron_df.to_dict('index')  

### Store to my_dataset for easy export below.
my_dataset = data_dict

print len(my_dataset)
print my_dataset['METTS MARK'].keys()

143
['to_messages', 'deferral_payments', 'expenses', 'poi', 'long_term_incentive', 'email_address', 'from_poi_to_this_person', 'deferred_income', 'combined_emails', 'restricted_stock_deferred', 'shared_receipt_with_poi', 'loan_advances', 'from_messages', 'other', 'director_fees', 'bonus', 'total_stock_value', 'from_this_person_to_poi', 'restricted_stock', 'salary', 'total_payments', 'exercised_stock_options']


In [13]:
### KBEst to select what features to use.

temp_features = ['poi','to_messages', 
                 'deferral_payments','combined_emails', 'expenses',
                 'long_term_incentive', 'from_poi_to_this_person',
                 'deferred_income','restricted_stock_deferred', 
                 'shared_receipt_with_poi', 'loan_advances', 
                 'from_messages', 'other','director_fees', 
                 'bonus', 'total_stock_value', 
                 'from_this_person_to_poi', 'restricted_stock',
                 'salary', 'total_payments', 'exercised_stock_options']

print "Starting Feature Length:", len(temp_features)
# Create Data 
data = featureFormat(data_dict, temp_features, sort_keys = True)
labels, features = targetFeatureSplit(data)

features_list=['poi']
from sklearn.cross_validation import StratifiedShuffleSplit
feature_counts={}
kf=StratifiedShuffleSplit(labels, n_iter=5, random_state=50)


for train_indices, test_indices in kf:
    features_train= [features[ii] for ii in train_indices]
    features_test= [features[ii] for ii in test_indices]
    labels_train= [labels[ii] for ii in train_indices]
    labels_test= [labels[ii] for ii in test_indices]
    
    #Select Features
    kbest=SelectKBest(k=14)
    scaler=MinMaxScaler()
    features_scaled = scaler.fit_transform(features_train, labels_train )
    kbest.fit(features_scaled, labels_train )
    features_to_use=kbest.get_support()
    for i, item in enumerate(features_to_use):
        if item:
            feature=temp_features[i + 1 ]
            if feature in feature_counts:
                feature_counts[feature]+=1
            else:
                feature_counts[feature]=1

for feature in feature_counts:
    if feature_counts[feature] > 1:
        features_list.append(feature)

print "\n Features List Now is: \n",len(features_list), features_list

Starting Feature Length: 21

 Features List Now is: 
16 ['poi', 'salary', 'total_payments', 'exercised_stock_options', 'bonus', 'director_fees', 'deferred_income', 'shared_receipt_with_poi', 'loan_advances', 'other', 'total_stock_value', 'long_term_incentive', 'expenses', 'restricted_stock', 'from_poi_to_this_person', 'from_this_person_to_poi']


In [14]:
# Rebuild the data array from my_dataset using the current features_list,
# then split it into labels and features for the classifier cells below.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

In [15]:
### Try a varity of classifiers
### Definition of Classifier Tester Function
def test_class(clssf):
    kf=StratifiedShuffleSplit(labels, test_size=0.1,n_iter=20)
    accuracy=0
    precision=0
    recall=0
    for train_indices, test_indices in kf:
        features_train= [features[ii] for ii in train_indices]
        features_test= [features[ii] for ii in test_indices]
        labels_train= [labels[ii] for ii in train_indices]
        labels_test= [labels[ii] for ii in test_indices]
        
        pca=PCA(n_components= 6)
        pipe=Pipeline([
            ("Classifier",clssf)
        ])

        pipe.fit(features_train,labels_train)
        pred=pipe.predict(features_test)

        acc = accuracy_score(labels_test, pred)
        prec = precision_score(labels_test, pred)
        recl = recall_score(labels_test, pred)

        accuracy += acc
        precision += prec
        recall += recl

    accuracy= accuracy/len(kf)
    precision= precision/len(kf)
    recall= recall/len(kf)
    print "Accuracy:", accuracy
    print "Precision:", precision
    print "Recall:", recall
    return

In [16]:
# Baseline: AdaBoost constructed with no arguments.
clssf = AdaBoostClassifier()
test_class(clssf=clssf)

  'precision', 'predicted', average, warn_for)


Accuracy: 0.85
Precision: 0.241666666667
Recall: 0.225


In [17]:
# Baseline: decision tree constructed with no arguments.
clssf = DecisionTreeClassifier()
test_class(clssf=clssf)

Accuracy: 0.826666666667
Precision: 0.258333333333
Recall: 0.25


In [18]:
# Baseline: Gaussian naive Bayes constructed with no arguments.
clssf = GaussianNB()
test_class(clssf=clssf)

Accuracy: 0.73
Precision: 0.269597069597
Recall: 0.275


In [19]:
# Baseline: support vector classifier constructed with no arguments.
# NOTE(review): the features are passed in unscaled here -- confirm that is
# intended for an SVC.
clssf = SVC()
test_class(clssf=clssf)

Accuracy: 0.866666666667
Precision: 0.0
Recall: 0.0


In [20]:
# Baseline: random forest constructed with no arguments.
clssf = RandomForestClassifier()
test_class(clssf=clssf)

Accuracy: 0.84
Precision: 0.1
Recall: 0.05


In [28]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
tree_params = {"Classifier__criterion":['gini','entropy'], 
               "Classifier__splitter": ["best", "random"],
               "Classifier__min_samples_split":[2,5,18],
               'Classifier__max_leaf_nodes':[5,10,30],
               'Classifier__max_depth':[3,4,5,6],
               'PCA__n_components': [4,6,8,10]
              }

pca=PCA()
clssf=DecisionTreeClassifier()
pipe=Pipeline([
            ("PCA", pca),("Classifier",clssf)
        ])
CV = StratifiedShuffleSplit(labels, test_size=0.1, n_iter=100)


gs= GridSearchCV(pipe, tree_params , cv=CV, scoring='recall')

gs.fit(features, labels)
clssf=gs.best_estimator_

test_class(clssf)
print clssf

Accuracy: 0.816666666667
Precision: 0.318333333333
Recall: 0.275
Pipeline(steps=[('PCA', PCA(copy=True, n_components=8, whiten=False)), ('Classifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=30, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])


In [29]:
gaussian_params = {
               'PCA__n_components': range(2,15)
              }

pca=PCA()
clssf=GaussianNB()
pipe=Pipeline([
            ("PCA", pca),("Classifier",clssf)
        ])

CV = StratifiedShuffleSplit(labels, test_size=0.1, n_iter=100)

gs= GridSearchCV(pipe, gaussian_params , cv=CV, scoring='recall')

gs.fit(features, labels)
clssf=gs.best_estimator_

test_class(clssf)
print clssf

Accuracy: 0.826666666667
Precision: 0.191666666667
Recall: 0.15
Pipeline(steps=[('PCA', PCA(copy=True, n_components=8, whiten=False)), ('Classifier', GaussianNB())])


In [31]:
features_list=['poi', 'salary', 'total_payments', 'exercised_stock_options', 'bonus', 'director_fees', 'deferred_income', 'shared_receipt_with_poi', 'loan_advances', 'other', 'total_stock_value', 'long_term_incentive', 'expenses', 'restricted_stock', 'from_poi_to_this_person', 'from_this_person_to_poi']

clf = Pipeline(steps=[('PCA', PCA(copy=True, n_components=8, whiten=False)), 
                      ('Classifier', DecisionTreeClassifier(
                class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=30, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

from tester import test_classifier
print "Tester Classification report" 
test_classifier(clf, my_dataset, features_list)

Tester Classification report
Pipeline(steps=[('PCA', PCA(copy=True, n_components=8, whiten=False)), ('Classifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=30, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.83053	Precision: 0.35752	Recall: 0.34000	F1: 0.34854	F2: 0.34336
	Total predictions: 15000	True positives:  680	False positives: 1222	False negatives: 1320	True negatives: 11778

