# Project: Identify Fraud from Enron Email

### Features

In [1]:
# %load poi_id.py
#!/usr/bin/python

import sys
import pickle
import matplotlib
sys.path.append("../tools/")
import numpy as np
import math

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from tester import test_classifier

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary_log', 'long_term_incentive_log', 'total_payments_log', 'shared_receipt_with_poi',
                 'to_poi_ratio', 'from_poi_ratio', 'messages_total', 'to_message_ratio',
                 'from_message_ratio' ] # You will need to use more features

### Load the dictionary containing the dataset
# Pickle streams are binary data, so the file is opened in binary mode.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load a dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)


# Store all names (keys of dictionary) in a list
name_list = list(data_dict)

#print name_list
#print len(name_list) # 146 names in this dataset

#print data_dict['METTS MARK']

# Store all available features in a list (taken from one entry of the dataset)
all_features = list(data_dict['METTS MARK'])

#print all_features

#print len(all_features) # 21 features in total in the beginning
#print len(data_dict['METTS MARK'])




In [2]:
# Get the number of POIs in the dataset and their names

poi_list = []

for i in data_dict:
    if data_dict[i]['poi'] == 1:
        poi_list.append(i)

print poi_list # names of the POIs
print "POIs: ", len(poi_list) # 18 POIs in the dataset
print "Total number: ", len(data_dict) # Total number of people in the dataset

['HANNON KEVIN P', 'COLWELL WESLEY', 'RIEKER PAULA H', 'KOPPER MICHAEL J', 'SHELBY REX', 'DELAINEY DAVID W', 'LAY KENNETH L', 'BOWEN JR RAYMOND M', 'BELDEN TIMOTHY N', 'FASTOW ANDREW S', 'CALGER CHRISTOPHER F', 'RICE KENNETH D', 'SKILLING JEFFREY K', 'YEAGER F SCOTT', 'HIRKO JOSEPH', 'KOENIG MARK E', 'CAUSEY RICHARD A', 'GLISAN JR BEN F']
POIs:  18
Total number:  146


### Removing outliers

In [3]:
### Task 2: Remove outliers
# Number of entries (keys) currently stored in data_dict
no_people = len(data_dict)

#print no_people

# 146 keys in the data_dict -> 2 outliers: Total and the travel agency in the park -> remove



In [4]:
# remove outlier entries "TOTAL" and "THE TRAVEL AGENCY IN THE PARK"

# Work on a shallow copy: a plain assignment would only alias data_dict, so the
# pops below would also strip the raw data and re-running this cell would raise
# KeyError because the keys are already gone.
data_dict_clean = dict(data_dict)

outlier_1 = data_dict_clean.pop("TOTAL")
outlier_2 = data_dict_clean.pop("THE TRAVEL AGENCY IN THE PARK")

# Number of entries left after removing the two outliers
people_clean = len(data_dict_clean)

#print people_clean

In [5]:
# identify entries where more than a certain percentage of features has value "NaN"

my_dataset_clean = data_dict_clean

total_features = []

for m in my_dataset_clean['METTS MARK']:
    total_features.append(m)

cutoff = 0.75

outlier_list = []
    
    
for person in my_dataset_clean:
    nan_features = []
    for feat in my_dataset_clean[person]:
        if my_dataset_clean[person][feat] == "NaN" or my_dataset_clean[person][feat] == np.nan:
            nan_features.append(feat)
    #print nan_features
    nan_ratio = float(len(nan_features))/float(len(total_features))
    #print nan_ratio
    if nan_ratio >= cutoff:
        outlier_list.append(person)
    
    
#print total_features
#print len(total_features)
print "\nOutliers: ", outlier_list
print len(outlier_list)

poi_list_2 = []

for out in outlier_list:
    if my_dataset_clean[out]['poi'] == 1:
        poi_list_2.append(out)

print "POIs in outliers: ", poi_list_2

# at a cutoff of 0.75 there are 22 people where >= 75% of features have a value of "NaN", none of those 22 people is a
# known POI



Outliers:  ['LOWRY CHARLES P', 'CHAN RONNIE', 'WODRASKA JOHN', 'URQUHART JOHN A', 'WHALEY DAVID A', 'MENDELSOHN JOHN', 'CLINE KENNETH W', 'WAKEHAM JOHN', 'WROBEL BRUCE', 'MEYER JEROME J', 'SCRIMSHAW MATTHEW', 'GATHMANN WILLIAM D', 'GILLIS JOHN', 'LOCKHART EUGENE E', 'PEREIRA PAULO V. FERRAZ', 'BLAKE JR. NORMAN P', 'CHRISTODOULOU DIOMEDES', 'WINOKUR JR. HERBERT S', 'YEAP SOON', 'FUGH JOHN L', 'SAVAGE FRANK', 'GRAMM WENDY L']
22
POIs in outliers:  []


In [6]:
# remove entries where more than a certain percentage of features has value "NaN"


def remove_outliers(data, outliers):
    """Return a copy of ``data`` without the entries named in ``outliers``.

    Parameters:
        data: dict to filter. It is not modified; the result is a new dict
            (shallow copy, so the values are shared with ``data``).
        outliers: iterable of keys to drop. Keys that are not present in
            ``data`` are ignored.

    Returns:
        A new dict holding every item of ``data`` whose key is not listed in
        ``outliers``.
    """
    # dict(data) instead of "data_out = data": the latter only creates a second
    # name for the same object and would delete the entries from the caller's dict.
    data_out = dict(data)
    for name in outliers:
        data_out.pop(name, None)
    return data_out
    

In [7]:
# Drop every person collected in outlier_list (NaN ratio >= cutoff) from the dataset
cleaned_data = remove_outliers(my_dataset_clean, outlier_list)

#print cleaned_data
#print len(cleaned_data)

### New features

In [8]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.

# function for transforming feature into log10

# Plain assignment: my_dataset refers to the same dict object as cleaned_data (no copy is made)
my_dataset = cleaned_data
# Features that are passed to feature_log10 to get a '<name>_log' companion feature
feat_list = ['salary', 'total_payments', 'long_term_incentive']

def feature_log10(dataset, features):
    """Add a log10-transformed companion for each listed feature to every entry.

    For every key in ``dataset`` and every name in ``features`` a new item
    ``<name>_log`` is stored in that key's feature dict. The value is
    ``math.log10`` of the original value, or the string ``'NaN'`` when the
    original value is ``'NaN'`` or not positive (log10 is undefined there and
    ``math.log10`` would raise ValueError).

    Parameters:
        dataset: dict mapping a key to a dict of feature values. The inner
            dicts are modified in place.
        features: sequence of feature names to transform.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    for feature in features:
        log_name = str(feature) + '_log'

        for key in dataset:
            # Read from the dataset argument, not from a module-level variable,
            # so the function works for any dataset it is given.
            raw_value = dataset[key][feature]
            if raw_value == 'NaN':
                dataset[key][log_name] = 'NaN'
                continue

            number = float(raw_value)
            dataset[key][log_name] = math.log10(number) if number > 0 else 'NaN'
    return dataset
            
# Add the '<name>_log' features for every name in feat_list
dataset_log10 = feature_log10(my_dataset, feat_list)

#print dataset_log10['METTS MARK']
# Feature count is read from a single entry ('METTS MARK')
print "No. of features in dataset: ", len(dataset_log10['METTS MARK'])

No. of features in dataset:  24


In [9]:
# new feature: ratio of from_messages / to_messages from total messages

my_dataset_v1 = dataset_log10

for key in my_dataset_v1:
    #print key
    if my_dataset_v1[key]['to_messages'] != 'NaN' and my_dataset_v1[key]['from_messages'] != 'NaN':
        messages_total = float(my_dataset_v1[key]['to_messages'] + my_dataset_v1[key]['from_messages'])
        to_message_ratio = my_dataset_v1[key]['to_messages'] / messages_total
        from_message_ratio = my_dataset_v1[key]['from_messages'] / messages_total
        
        my_dataset_v1[key]['messages_total'] = messages_total
        my_dataset_v1[key]['to_message_ratio'] = to_message_ratio
        my_dataset_v1[key]['from_message_ratio'] = from_message_ratio
    
    else: 
        my_dataset_v1[key]['messages_total'] = 'NaN'
        my_dataset_v1[key]['to_message_ratio'] = 'NaN'
        my_dataset_v1[key]['from_message_ratio'] = 'NaN'

#print my_dataset_v1['METTS MARK']
print "No. of features in dataset: ", len(my_dataset_v1['METTS MARK'])

No. of features in dataset:  27


In [10]:
# new feature: ratio of messages from/to POI in to/from messages

my_dataset_v2 = my_dataset_v1

for key in my_dataset_v2:
    #print key
    if my_dataset_v2[key]['to_messages'] != 'NaN' and my_dataset_v2[key]['from_messages'] != 'NaN':
        if my_dataset_v2[key]['from_poi_to_this_person'] != 'NaN' and my_dataset_v2[key]['from_this_person_to_poi'] != 'NaN':
            if my_dataset_v2[key]['to_messages'] != 0 and my_dataset_v2[key]['from_messages'] != 0:
                to_poi_ratio = my_dataset_v2[key]['from_poi_to_this_person'] / float(my_dataset_v2[key]['to_messages'])
                from_poi_ratio = my_dataset_v2[key]['from_this_person_to_poi'] / float(my_dataset_v2[key]['from_messages'])

                
                my_dataset_v2[key]['to_poi_ratio'] = to_poi_ratio
                my_dataset_v2[key]['from_poi_ratio'] = from_poi_ratio
            else:
                my_dataset_v2[key]['to_poi_ratio'] = 'NaN'
                my_dataset_v2[key]['from_poi_ratio'] = 'NaN'
                
        else:
                my_dataset_v2[key]['to_poi_ratio'] = 'NaN'
                my_dataset_v2[key]['from_poi_ratio'] = 'NaN'
    else:
                my_dataset_v2[key]['to_poi_ratio'] = 'NaN'
                my_dataset_v2[key]['from_poi_ratio'] = 'NaN'

#print my_dataset_v2['METTS MARK']
print "No. of features in dataset: ", len(my_dataset_v2['METTS MARK'])

No. of features in dataset:  29


In [11]:
# new feature: long_term_incentive / total_payments -> incentive_total_ratio

my_dataset_v3 = my_dataset_v2


for key in my_dataset_v3:
    #print key
    #print "incentive: ", my_dataset_v3[key]['long_term_incentive']
    #print type(my_dataset_v3[key]['long_term_incentive'])
    #print "payments: ", my_dataset_v3[key]['total_payments']
    #print type(my_dataset_v3[key]['total_payments'])
    if my_dataset_v3[key]['long_term_incentive'] != 'NaN' and my_dataset_v3[key]['total_payments'] != 'NaN':
        if my_dataset_v3[key]['long_term_incentive'] !=0 and my_dataset_v3[key]['total_payments'] != 0:
            
            incentive_total_ratio = float(my_dataset_v3[key]['long_term_incentive']) / float(my_dataset_v3[key]['total_payments'])
            my_dataset_v3[key]['incentive_total_ratio'] = incentive_total_ratio
            
        else:
            my_dataset_v3[key]['incentive_total_ratio'] = 'NaN'
               
    else:
        my_dataset_v3[key]['incentive_total_ratio'] = 'NaN'
        
       
    #print my_dataset_v3[key]['incentive_total_ratio']
    #print type(my_dataset_v3[key]['incentive_total_ratio']) 

#print my_dataset_v3['METTS MARK']
print "No. of features in dataset: ", len(my_dataset_v3['METTS MARK'])

No. of features in dataset:  30


In [12]:
# print all existing features

#for feature in features_list_all:
    #print "\n" + feature
    
# new feature: bonus / total_payments -> bonus_total_ratio

my_dataset_v4 = my_dataset_v3

for key in my_dataset_v4:
    #print key
    if my_dataset_v4[key]['bonus'] != 'NaN' and my_dataset_v4[key]['total_payments'] != 'NaN':
        if my_dataset_v4[key]['bonus'] != 0 and my_dataset_v4[key]['total_payments'] != 0:
            
            bonus_total_ratio = float(my_dataset_v4[key]['bonus']) / float(my_dataset_v4[key]['total_payments'])
            my_dataset_v4[key]['bonus_total_ratio'] = bonus_total_ratio
            #print math.isinf(bonus_total_ratio)
            
        else:
            my_dataset_v4[key]['bonus_total_ratio'] = 'NaN'
           
    else:
        my_dataset_v4[key]['bonus_total_ratio'] = 'NaN'
                

#print my_dataset_v4['COLWELL WESLEY']
print "No. of features in dataset: ", len(my_dataset_v4['COLWELL WESLEY'])

No. of features in dataset:  31


In [13]:
# print all existing features

#for feature in features_list_all:
    #print "\n" + feature
    
# new feature: bonus / salary -> bonus_salary_ratio

my_dataset_v5 = my_dataset_v4

for key in my_dataset_v5:
    #print key
    if my_dataset_v5[key]['bonus'] != 'NaN' and my_dataset_v5[key]['salary'] != 'NaN':
        if my_dataset_v5[key]['bonus'] != 0 and my_dataset_v5[key]['salary'] != 0:
            
            bonus_salary_ratio = float(my_dataset_v5[key]['bonus']) / float(my_dataset_v5[key]['salary'])
            my_dataset_v5[key]['bonus_salary_ratio'] = bonus_salary_ratio
            #print math.isinf(bonus_salary_ratio)
            
        else:
            my_dataset_v5[key]['bonus_salary_ratio'] = 'NaN'
           
    else:
        my_dataset_v5[key]['bonus_salary_ratio'] = 'NaN'


#print my_dataset_v5['COLWELL WESLEY']
print "No. of features in dataset: ", len(my_dataset_v5['COLWELL WESLEY'])

No. of features in dataset:  32


### Identify features with a lot of missing values

In [14]:
# For each feature, identify the number of people, where this feature has a value of "NaN"

my_dataset_clean_2 = my_dataset_v5

total_features = list(my_dataset_clean['METTS MARK'])

thresh = 0.75

# create a dict where each feature is a key and count its missing values in one pass

feature_dict = {}

for person in my_dataset_clean_2:
    for feat, value in my_dataset_clean_2[person].items():
        counts = feature_dict.setdefault(feat, {'NaNs': 0})

        # Missing values are stored as the string "NaN". A real float NaN never
        # compares equal to np.nan, so it is detected with math.isnan instead.
        if value == "NaN" or (isinstance(value, float) and math.isnan(value)):
            counts['NaNs'] += 1

#print feature_dict

# feature_dict holds the name of the feature and the number of times this feature has a value of "NaN"

In [15]:
# add ratio of NaNs for each feature

dataset = my_dataset_v5

#print len(my_dataset_v5)

for feature in feature_dict:
    feature_dict[feature]['NaN_ratio'] = float(feature_dict[feature]['NaNs'])/float(len(dataset))
    
#print feature_dict

for feature in feature_dict:
    print "\nFeature: ", feature
    print "NaN ratio: ", feature_dict[feature]['NaN_ratio']


Feature:  to_messages
NaN ratio:  0.295081967213

Feature:  deferral_payments
NaN ratio:  0.696721311475

Feature:  expenses
NaN ratio:  0.295081967213

Feature:  poi
NaN ratio:  0.0

Feature:  deferred_income
NaN ratio:  0.672131147541

Feature:  bonus_salary_ratio
NaN ratio:  0.33606557377

Feature:  from_poi_to_this_person
NaN ratio:  0.295081967213

Feature:  from_message_ratio
NaN ratio:  0.295081967213

Feature:  salary_log
NaN ratio:  0.229508196721

Feature:  email_address
NaN ratio:  0.114754098361

Feature:  restricted_stock_deferred
NaN ratio:  0.893442622951

Feature:  messages_total
NaN ratio:  0.295081967213

Feature:  shared_receipt_with_poi
NaN ratio:  0.295081967213

Feature:  loan_advances
NaN ratio:  0.975409836066

Feature:  from_messages
NaN ratio:  0.295081967213

Feature:  other
NaN ratio:  0.262295081967

Feature:  to_poi_ratio
NaN ratio:  0.295081967213

Feature:  to_message_ratio
NaN ratio:  0.295081967213

Feature:  director_fees
NaN ratio:  0.950819672131



### Split dataset

In [16]:
from sklearn.cross_validation import StratifiedShuffleSplit

# Dataset used for formatting and for the tester calls; same object as my_dataset_v5 (no copy)
my_dataset_t = my_dataset_v5

# All numeric features (original and engineered); 'poi' must stay first.
features_list_all = ['poi', 'to_messages', 'deferral_payments', 'expenses', 'deferred_income',  
                     'long_term_incentive', 'from_message_ratio', 'salary_log', 'restricted_stock_deferred', 
                     'messages_total', 'shared_receipt_with_poi', 'loan_advances', 'from_messages', 'other', 
                     'to_poi_ratio', 'to_message_ratio', 'director_fees', 'bonus', 'total_stock_value', 
                     'from_poi_to_this_person', 'from_this_person_to_poi', 'total_payments_log', 
                     'from_poi_ratio', 'restricted_stock', 'long_term_incentive_log', 'salary', 'total_payments', 
                     'exercised_stock_options', 'bonus_salary_ratio', 'incentive_total_ratio', 'bonus_total_ratio']

# Candidate feature subsets. The naming scheme (number, 'm' and 'c' suffixes) is not
# explained in this file -- presumably the k passed to SelectKBest and the scoring
# function that was used; TODO confirm.
# features_list_selector_9 is the list handed to featureFormat further down in this cell.
features_list_selector_9 = ['poi', 'from_message_ratio', 'messages_total', 'total_stock_value', 'bonus', 
                             'total_payments_log', 
                            'long_term_incentive_log', 'bonus_total_ratio', 'exercised_stock_options', 
                            'incentive_total_ratio']

features_list_selector_3c = ['poi','deferred_income', 'loan_advances', 'bonus', 'total_stock_value', 'from_poi_ratio', 
                             'salary', 'exercised_stock_options', 'bonus_salary_ratio', 'bonus_total_ratio']

features_list_selector_4 = ['poi','total_stock_value', 'bonus', 'total_payments_log', 'exercised_stock_options']

features_list_selector_4m = ['poi','expenses', 'shared_receipt_with_poi', 'other', 'bonus']

features_list_selector_6 = ['poi','bonus', 'total_stock_value', 'from_poi_ratio', 'exercised_stock_options',
                            'bonus_salary_ratio', 'bonus_total_ratio']

features_list_selector_6m = ['poi','deferral_payments', 'expenses', 'shared_receipt_with_poi', 'other', 'bonus',
                             'from_poi_ratio']


features_list_selector_9m = ['poi','deferral_payments', 'expenses', 'restricted_stock_deferred', 'shared_receipt_with_poi',
                             'other', 'to_poi_ratio', 'bonus', 'from_poi_ratio', 'long_term_incentive_log']

features_list_selector_12 = ['poi','deferred_income', 'shared_receipt_with_poi', 'loan_advances', 'bonus', 
                             'total_stock_value', 'from_poi_ratio', 'salary', 'total_payments', 
                             'exercised_stock_options', 'bonus_salary_ratio', 'incentive_total_ratio', 'bonus_total_ratio']

features_list_selector_12m = ['poi','deferral_payments', 'expenses', 'restricted_stock_deferred', 'shared_receipt_with_poi',
                              'other', 'to_poi_ratio', 'director_fees', 'bonus', 'total_stock_value', 'from_poi_ratio',
                              'restricted_stock', 'bonus_salary_ratio']


features_list_selector_15 = ['poi','expenses', 'deferred_income', 'long_term_incentive', 'salary_log', 
                             'shared_receipt_with_poi', 'loan_advances', 'bonus', 'total_stock_value', 
                             'from_poi_ratio', 'salary', 'total_payments', 'exercised_stock_options', 
                             'bonus_salary_ratio', 'incentive_total_ratio', 'bonus_total_ratio']

features_list_selector_15m = ['poi','expenses', 'deferred_income', 'shared_receipt_with_poi', 'other', 'to_poi_ratio', 
                              'bonus', 'total_stock_value', 'from_this_person_to_poi', 'from_poi_ratio', 
                              'restricted_stock', 'total_payments', 'exercised_stock_options', 'bonus_salary_ratio',
                              'incentive_total_ratio', 'bonus_total_ratio']



# Turn the dataset into a numeric array restricted to the chosen feature list,
# then separate the 'poi' label (first column) from the features.
data_dt = featureFormat(my_dataset_t, features_list_selector_9, sort_keys = True)
labels, features = targetFeatureSplit(data_dt)


# Split into training and test data

folds = 1000
cv = StratifiedShuffleSplit(labels, folds, random_state = 25,
        test_size = 0.15)

# Only the indices of the LAST split are used below. The iterator is still
# exhausted so that the resulting split is the same one as before, but the
# train/test lists are now built once instead of being rebuilt and thrown away
# on every one of the `folds` iterations.
for train_idx, test_idx in cv:
    pass

features_train = [features[ii] for ii in train_idx]
labels_train   = [labels[ii] for ii in train_idx]
features_test  = [features[jj] for jj in test_idx]
labels_test    = [labels[jj] for jj in test_idx]


### SelectKBest features from all features 


In [None]:
# select K-best features

# list of all (current) features

my_features = []

for k in my_dataset_v5['METTS MARK']:
    my_features.append(k)

#print my_features
#print len(my_features)

# important: 'poi' has to be the first feature!! 
# remove email adress as this feature is a string and causes problems in the featureFormat fucntion
features_list_all = ['poi', 'to_messages', 'deferral_payments', 'expenses', 'deferred_income',  
                     'long_term_incentive', 'from_message_ratio', 'salary_log', 'restricted_stock_deferred', 
                     'messages_total', 'shared_receipt_with_poi', 'loan_advances', 'from_messages', 'other', 
                     'to_poi_ratio', 'to_message_ratio', 'director_fees', 'bonus', 'total_stock_value', 
                     'from_poi_to_this_person', 'from_this_person_to_poi', 'total_payments_log', 
                     'from_poi_ratio', 'restricted_stock', 'long_term_incentive_log', 'salary', 'total_payments', 
                     'exercised_stock_options', 'bonus_salary_ratio', 'incentive_total_ratio', 'bonus_total_ratio']


#print len(features_list_all)
# feature selection

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# create the selector

selector = SelectKBest(f_classif, k = 9)

selector.fit(features_train, labels_train)

#print "\n", selector.scores_
#print len(selector.scores_)

# get the selected k best features by first getting their indices calling .get_support
# get the features from list with all features by selecting their index 
indexes = selector.get_support(indices=True)

#print indexes


k_features = [features_list_all[i+1] for i in indexes]

print "\nK-Best features: ", k_features


#print "features_train: ", features_train[0]

# reduce features_train to the k best features by transforming
features_train_transformed = selector.transform(features_train)

#print "\nfeatures_train_transformed: ", features_train_transformed[0]

# reduce features_test to the k best features by transforming
features_test_transformed = selector.transform(features_test)



### Decision Tree Classifier

In [17]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


# Decision Tree

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.cross_validation import StratifiedShuffleSplit


#########################################################
### your code goes here ###

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=11)
# try different parameters
clf_dt_1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_split=11, splitter='best')

t0 = time()
clf_dt_1.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s" # taking time part 2

t1 = time() # timing the prediction
pred = clf_dt_1.predict(features_test)
print "Prediction time:", round(time()-t0, 3), "s" # prediction time part 2

from sklearn.metrics import accuracy_score

acc = accuracy_score(pred, labels_test) 

print "Accuracy: ", acc

no_features = len(features_train[0])
print "No. of features: ", no_features


features_weight = clf_dt_1.feature_importances_

print "feature weight: ", features_weight

#print "\ntester result: ", test_classifier(clf_dt_1, my_dataset_t, features_list_selector_3c, folds=1000)



training time: 0.001 s
Prediction time: 0.002 s
Accuracy:  0.8421052631578947
No. of features:  9
feature weight:  [0.         0.         0.35866571 0.37107195 0.21631066 0.
 0.         0.05395168 0.        ]


In [18]:
print "\ntester result: ", test_classifier(clf_dt_1, my_dataset_t, features_list_selector_9, folds=1000)


tester result:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=11,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
	Accuracy: 0.82400	Precision: 0.41252	Recall: 0.33950	F1: 0.37246	F2: 0.35196
	Total predictions: 13000	True positives:  679	False positives:  967	False negatives: 1321	True negatives: 10033

None


In [None]:
# Decision Tree with GridsearchCV

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
#features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

from sklearn import tree
from sklearn.model_selection import GridSearchCV

parameters = {'min_samples_split': [2,3,5,7,9,11,13,15,17,19],
             'max_features': [2,3,4,5,6,7,8,9] }

parameters_2 = {'min_samples_split': [2,3,5,7,9,11,13,15,17,19]}

parameters_3 = {'max_features': [2,3,4,5,6,7,8,9]}

# pass scoring parameters by which gridsearch evaluates the performance
scoring = {'prec': 'precision', 'rec': 'recall'}

tree = tree.DecisionTreeClassifier()

clf = GridSearchCV(tree, parameters_3, scoring = scoring, refit = 'prec')

t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s" # taking time part 2

t1 = time() # timing the prediction
pred = clf.predict(features_test)
print "Prediction time:", round(time()-t0, 3), "s" # prediction time part 2

from sklearn.metrics import accuracy_score

acc = accuracy_score(pred, labels_test) 

print "Accuracy: ", acc

no_features = len(features_train[0])
print "No. of features: ", no_features

best_params = clf.best_params_
print best_params

print "\ntester result: ", test_classifier(clf, my_dataset_t, features_list_selector_9, folds=1000)



### Tune classifier and features

In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# get stats (min, max, mean, median) of each feature (column)

import matplotlib.pyplot as plt

features_train_info = features_train

#for a in features_info:
    #print a

def feature_stats(features):
    # turn features (list of arrays) into an array
    
    feature_array = np.empty((0,len(features[0])), float) # create empty array with the correct dimensions
    
    #print feature_array.shape
    
    for i in range(0, len(features)):
        #print features[i].shape
        # reshape 1D array into 2D array with 1 row and No. of features columns
        reshape = np.reshape(features[i], (1,len(features[0]))) 
        
        feature_array = np.vstack([feature_array, reshape])
        
        #np.append(feature_array, reshape, axis=0)
    
    #print feature_array
    
    # create empty dict of dict for storing min, max, mean, median per feature (column)
    
    #print "\n", feature_array[:,1]
    
    stat_dict = {}
    
    for x in range (0, len(feature_array[0,:])):
        #print len(feature_array[0,:])
        temp_dict = {}
        # use nanmax to ignore NaNs in the columns
        temp_dict["max"] = np.nanmax(feature_array[:,x])
        temp_dict["min"] = np.nanmin(feature_array[:,x])
        temp_dict["mean"] = np.nanmean(feature_array[:,x])
        temp_dict["median"] = np.nanmedian(feature_array[:,x])
        
        stat_dict[x] = temp_dict
        
        plt.hist(feature_array[:,x], bins = 'auto', range = (np.nanmin(feature_array[:,x]), np.nanmax(feature_array[:,x]) ))
        plt.title("Histogram of feature {} with 10 bins".format(x))
        plt.show()
        #print "\nFeature ", features_list[x+1]
        #print "Max: ", rescaled_max
        #print "Min: ", rescaled_min
        #print "Average: ", rescaled_mean
        #print "Median: ", rescaled_median
    
    # returns a dict of dict. The feature number (index) is the key to a dict containing the max, min, mean
    # and median (keys) values for this feature
    return stat_dict
    
print feature_stats(features_train_info)
#print "\n", feature_stats(features_train)

For features whose distribution has its mode (or a very high histogram peak) at 0, the 0 values are imputed with the mean of that feature. The mean is used instead of the median because the median is itself 0 for some of these features.

In [None]:
# impute 0 values in array for certain features with median or mean
# use scikit learn preprocessing Imputer 
# -> http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values

# only use for features where it makes sense e.g salaray, total payments...
# first replace 0 in columns of interest with NaN for NaN to be replaced by the Imputer

# The featureFormat function replaces "NaN" in the dataset with 0 -> so the 0s in features_train etc have probably been "NaN"
# in the original dataset, but there is no way to be sure. Does transforming them back to "NaN" and then
# imputing them make sense?



#print len(features_train)

# Alias of features_train (no copy); it is rebound to the converted data further down in this cell
features_train_nan = features_train

#print "Features before: ", features_train_nan

# Column positions (within one feature row) whose 0 values are treated as missing
features_index = [1,3,4,5,6,7,8]

# NOTE(review): col_array is not referenced anywhere else in the visible part of the file
col_array = np.array([[]])


def nan_in_features_of_interest(features, feature_index_list):
    """Mark zeros in selected columns as missing (preparation for the Imputer).

    ``features`` is a sequence of rows that support ``.copy()`` and item
    assignment (e.g. features_train, a list of numpy arrays). Every row is
    copied, and in the copy each position listed in ``feature_index_list``
    that equals 0 is replaced with ``np.nan``. The input rows are left
    untouched; the copies are returned as a list.
    """
    def _zeros_to_nan(row):
        patched = row.copy()
        for column in feature_index_list:
            if patched[column] == 0.:
                patched[column] = np.nan
        return patched

    return [_zeros_to_nan(features[position]) for position in range(0, len(features))]
                           

#print "\n", features_nan 

#print "\n", nan_in_features_of_interest(features_train, features_index)

# Replace the 0 values in the columns listed in features_index with np.nan (on copies of the rows)
features_train_nan = nan_in_features_of_interest(features_train_nan, features_index)

print "\nFeatures with NaN: ", features_train_nan
# Impute NaN with median value for that column





In [None]:
# Impute - replace NaN in columns of interest with median of that column

from sklearn.preprocessing import Imputer

imputer_median = Imputer(missing_values='NaN', strategy='median', axis=0, copy=True)

features_train_impute_median = imputer_median.fit_transform(features_nan)

#print features_impute_median

imputer_mean = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=True)

features_train_impute_mean = imputer_mean.fit_transform(features_nan)

print features_impute_mean

In [None]:
# Impute function -> replace 'NaN' with the mean of that feature

from sklearn.preprocessing import Imputer

# NOTE(review): these three settings are not read by imputer_mean below, which
# passes the same values as literals.
missing_values = 'NaN'
strategy = 'mean'
axis = 0

def imputer_mean(features_in):
    """Fit a mean-strategy Imputer on ``features_in`` and return the imputed data.

    A fresh Imputer (missing_values='NaN', axis=0) is created on every call,
    so the replacement values are computed from ``features_in`` itself.
    """
    return Imputer(missing_values='NaN', strategy='mean', axis=0, copy=True).fit_transform(features_in)

#features_train_impute_mean = imputer_mean(features_train_nan)

### Run DT classifier again with imputed features

In [None]:
# Decision Tree

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess



features_dt_train = features_train    
labels_dt_train = labels_train   
features_dt_test = features_test   
labels_dt_test = labels_test  


# replace 0 in features where most of the values are 0 (see features_stat function) with 'NaN'
features_index = [1,3,4,5,6,7,8]
features_train_nan = nan_in_features_of_interest(features_dt_train, features_index)
features_test_nan = nan_in_features_of_interest(features_dt_test, features_index)

# impute 'NaN' in features (see above) with the mean value of that feature
features_train_impute_mean = imputer_mean(features_train_nan)
features_test_impute_mean = imputer_mean(features_test_nan)


#########################################################
### your code goes here ###

from sklearn import tree
clf_imp = tree.DecisionTreeClassifier(min_samples_split=10)

t0 = time()
clf_imp.fit(features_train_impute_mean, labels_train)
print "training time:", round(time()-t0, 3), "s" # taking time part 2

t1 = time() # timing the prediction
pred = clf_imp.predict(features_test_impute_mean)
print "Prediction time:", round(time()-t0, 3), "s" # prediction time part 2

from sklearn.metrics import accuracy_score

acc = accuracy_score(pred, labels_test) 

print "Accuracy: ", acc

no_features = len(features_train_impute_mean[0])
print "No. of features: ", no_features


features_weight = clf_imp.feature_importances_

print "feature weight: ", features_weight

#tree.export_graphviz(clf, out_file='tree.dot') 

print "\ntester result: ", test_classifier(clf_imp, my_dataset_t, features_list_selector_9, folds=1000)

# impute seems to have a small negative effect

In [None]:
#print features_test

feature_dict = {}

# features_test is a list of numpy arrays, the len of one array is the number of features used
feat_len = len(features_test[0])

# create an empty list for each feature in the empty feature dictionary
for a in range(0, feat_len):
    feature_dict[a] = []

# append all the values to the corresponding feature list
for feature in features_test:
    for i in range(0, len(feature)):
        feature_dict[i].append(feature[i])
        
# get the min/max, average values for each of the features
for key in feature_dict:
    print "\nFeature ", features_list[key+1]
    print "max: ", max(feature_dict[key])
    print "min: ", min(feature_dict[key])
    print "average: ", sum(feature_dict[key])/len(feature_dict[key])
    print "median: ", np.median(feature_dict[key])
#print feature_dict
#print len(feature_dict[0])



In [None]:
# Scale the features

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

def scale_features(features):
    """Rescale the given features with a freshly created MinMaxScaler.

    features -- the data to rescale (e.g. features_test, a list of numpy
                arrays); the scaler is fitted on this very input.

    Returns the result of MinMaxScaler.fit_transform for the input.
    """
    return MinMaxScaler().fit_transform(features)

rescaled = scale_features(features_test)
#print rescaled

#print len(rescaled[0,:])

# get min, max, average for each of the rescaled columns

c_index = 0

for x in range (0, len(rescaled[0,:])):
    rescaled_max = np.max(rescaled[:,x])
    rescaled_min = np.min(rescaled[:,x])
    rescaled_mean = np.mean(rescaled[:,x])
    rescaled_median = np.median(rescaled[:,x])
    hist = np.histogram(rescaled[:,x])
    
    print "\nFeature ", features_list[x+1]
    print "Max: ", rescaled_max
    print "Min: ", rescaled_min
    print "Average: ", rescaled_mean
    print "Median: ", rescaled_median
    print "\n"
    plt.hist(rescaled[:,x], bins = 10)
    plt.title("Histogram of feature with 10 bins")
    plt.show()


### Naive Bayes Classifier

In [None]:
# Naive Bayes


# replace 0 in features where most of the values are 0 (see features_stat function) with 'NaN'
features_index = [1,3,4,5,6,7,8]
features_train_nan_n = nan_in_features_of_interest(features_train, features_index)
features_test_nan_n = nan_in_features_of_interest(features_test, features_index)

# impute 'NaN' in features (see above) with the mean value of that feature
features_train_impute_mean_n = imputer_mean(features_train_nan_n)
features_test_impute_mean_n = imputer_mean(features_test_nan_n)

#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB

clf_nb = GaussianNB()


# rescale features

#scaled_train = scale_features(features_nb_train)
#scaled_test = scale_features(features_nb_test)

t0 = time() # use checking how long fitting the classifier takes
clf_nb.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s" # taking time part 2

t1 = time() # timing the prediction
pred = clf_nb.predict(features_test)
print "Prediction time:", round(time()-t0, 3), "s" # prediction time part 2

#print pred
from sklearn.metrics import accuracy_score

acc = accuracy_score(pred, labels_test) # accuracy goes down from 98,4% to 88,4% when slicing training set to 1%

print acc

print "\ntester result: ", test_classifier(clf_nb, my_dataset_t, features_list_selector_9, folds=1000)

### PCA + Naive Bayes or Decision Tree using a pipeline

In [None]:
# create a pipeline with PCA and classifier (Naive Bayes or Decision tree)

from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

# PCA
pca = PCA(n_components=3)

# Classifier
clf_dt_p = tree.DecisionTreeClassifier(min_samples_split=10)

clf_NB_p = GaussianNB()

estimators = [('PCA', pca), ('clf', clf_dt_p)]
pipe = Pipeline(estimators)
pipe.fit(features_train, labels_train)

pred = pipe.predict(features_test)


print "\nPCA - explained variance: ", pca.explained_variance_ratio_

first_pc = pca.components_[0]

#print "\nFirst PC: ", first_pc

print "\ntester result: ", test_classifier(pipe, my_dataset_t, features_list_all, folds=1000)

# Best settings for NB: PCA - n_components = 8
# Best settings for DT: min_samples_split = 10; PCA - n_components = 3
#           

### Dump classifier

In [19]:
# Hand the decision tree classifier, its dataset and its feature list
# to dump_classifier_and_data

clf_sub, my_dataset_sub, features_list_sub = (
    clf_dt_1,
    my_dataset_v5,
    features_list_selector_9,
)

dump_classifier_and_data(clf_sub, my_dataset_sub, features_list_sub)

In [None]:
# Hand the pipeline, the dataset and the full feature list to
# dump_classifier_and_data
# NOTE(review): presumably this writes to the same output as any earlier
# dump_classifier_and_data call - confirm which dump should be the final one
clf_sub2, my_dataset_sub2, features_list_sub2 = (
    pipe,
    my_dataset_v5,
    features_list_all,
)

dump_classifier_and_data(clf_sub2, my_dataset_sub2, features_list_sub2)

### References

- https://machinelearningmastery.com/handle-missing-data-python/
- http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values
- https://machinelearningmastery.com/index-slice-reshape-numpy-arrays-machine-learning-python/
- https://stackoverflow.com/questions/22392497/how-to-add-a-new-row-to-an-empty-numpy-array/22394181
- https://docs.scipy.org/doc/numpy/reference/generated/numpy.nanmax.html
- https://matplotlib.org/gallery/statistics/histogram_features.html
- https://stackoverflow.com/questions/3179106/python-select-subset-from-list-based-on-index-set
- https://stats.stackexchange.com/questions/244507/what-algorithms-need-feature-scaling-beside-from-svm