In [112]:
import pickle
import pandas as pd

# Columns to load as numeric data: the 'poi' label first, followed by the
# financial and e-mail fields.
features_list = ['poi','salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus',
                   'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses',
                   'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock',
                   'director_fees', 'to_messages', 'from_poi_to_this_person', 'from_messages',
                   'from_this_person_to_poi', 'shared_receipt_with_poi']

# Pickle streams are bytes, so the file is opened in binary mode: text mode
# fails on Python 3 and may corrupt the stream on platforms that translate
# line endings.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at a
# trusted dataset file.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Drop the 'TOTAL' entry, which is treated as an outlier.
data_dict.pop('TOTAL')

# Transpose so that each top-level key of data_dict becomes a row and each
# field becomes a column; the e-mail address column is not used afterwards.
df = pd.DataFrame(data_dict).transpose()
df = df.drop('email_address', axis=1)

# Coerce the listed columns to numbers; anything unparseable becomes NaN.
df[features_list] = df[features_list].apply(pd.to_numeric, errors='coerce')

# Column names of the engineered e-mail ratio features.
from_poi_rate = 'from_poi_rate'
shared_poi_rate = 'shared_poi_rate'
to_poi_rate = 'to_poi_rate'

# A message count of zero would turn a ratio into +/-inf, which the fillna(0)
# further down does not replace. Masking zero counts as NaN makes those rows
# fall back to 0 like every other undefined ratio.
sent_total = df['from_messages'].replace(0, float('nan'))
received_total = df['to_messages'].replace(0, float('nan'))

# NOTE(review): the names read as swapped -- 'from_poi_rate' is built from
# from_this_person_to_poi and 'to_poi_rate' from from_poi_to_this_person.
# They are kept unchanged so existing column names and outputs stay valid.
df[from_poi_rate] = df['from_this_person_to_poi'] / sent_total
df[shared_poi_rate] = df['shared_receipt_with_poi'] / received_total
df[to_poi_rate] = df['from_poi_to_this_person'] / received_total

# Every remaining NaN (missing inputs and undefined ratios) becomes 0.
df = df.fillna(0)

# One dict per row, keyed by the DataFrame index.
my_dataset = df.to_dict(orient='index')
features_list.extend([from_poi_rate, shared_poi_rate, to_poi_rate])

my_dataset

{'ALLEN PHILLIP K': {'bonus': 4175000.0,
  'deferral_payments': 2869717.0,
  'deferred_income': -3081055.0,
  'director_fees': 0.0,
  'exercised_stock_options': 1729541.0,
  'expenses': 13868.0,
  'from_messages': 2195.0,
  'from_poi_rate': 0.029612756264236904,
  'from_poi_to_this_person': 47.0,
  'from_this_person_to_poi': 65.0,
  'loan_advances': 0.0,
  'long_term_incentive': 304805.0,
  'other': 152.0,
  'poi': False,
  'restricted_stock': 126027.0,
  'restricted_stock_deferred': -126027.0,
  'salary': 201955.0,
  'shared_poi_rate': 0.4848380427291523,
  'shared_receipt_with_poi': 1407.0,
  'to_messages': 2902.0,
  'to_poi_rate': 0.016195727084769126,
  'total_payments': 4484442.0,
  'total_stock_value': 1729541.0},
 'BADUM JAMES P': {'bonus': 0.0,
  'deferral_payments': 178980.0,
  'deferred_income': 0.0,
  'director_fees': 0.0,
  'exercised_stock_options': 257817.0,
  'expenses': 3486.0,
  'from_messages': 0.0,
  'from_poi_rate': 0.0,
  'from_poi_to_this_person': 0.0,
  'from_thi

In [106]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
# Convert the dataset dict into an array via the project helper, then split
# it into labels and features.
# NOTE(review): presumably the first name in features_list ('poi') ends up as
# the label -- confirm against feature_format.
data = featureFormat(
    my_dataset,
    features_list,
    sort_keys=True,
    remove_NaN=True,
)
labels, features = targetFeatureSplit(data)

# Header names for the feature table: a copy of features_list without its
# first entry.
feature_columns = features_list[:]
del feature_columns[0]

pd.DataFrame(data=features, columns=feature_columns).head(3)

Unnamed: 0,salary,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income,total_stock_value,expenses,exercised_stock_options,...,restricted_stock,director_fees,to_messages,from_poi_to_this_person,from_messages,from_this_person_to_poi,shared_receipt_with_poi,from_poi_rate,shared_poi_rate,to_poi_rate
0,201955.0,2869717.0,4484442.0,0.0,4175000.0,-126027.0,-3081055.0,1729541.0,13868.0,1729541.0,...,126027.0,0.0,2902.0,47.0,2195.0,65.0,1407.0,0.029613,0.484838,0.016196
1,0.0,178980.0,182466.0,0.0,0.0,0.0,0.0,257817.0,3486.0,257817.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,477.0,0.0,916197.0,0.0,0.0,-560222.0,-5104.0,5243487.0,56301.0,4046157.0,...,1757552.0,0.0,566.0,39.0,29.0,0.0,465.0,0.0,0.821555,0.068905


In [107]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Rescale the feature matrix with MinMaxScaler. StandardScaler is imported as
# an alternative but is not used here.
scaler = MinMaxScaler()
scaler.fit(features)
feat_scale = scaler.transform(features)

# Labelled copy of the scaled matrix, used only for the summary below.
feat_scale_pd = pd.DataFrame(data=feat_scale, columns=feature_columns)

feat_scale_pd.describe()

Unnamed: 0,salary,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income,total_stock_value,expenses,exercised_stock_options,...,restricted_stock,director_fees,to_messages,from_poi_to_this_person,from_messages,from_this_person_to_poi,shared_receipt_with_poi,from_poi_rate,shared_poi_rate,to_poi_rate
count,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,...,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0
mean,0.166879,0.049711,0.021814,0.007149,0.0845,0.107912,0.944731,0.060094,0.154638,0.060434,...,0.199988,0.072392,0.081758,0.073403,0.025305,0.040435,0.127262,0.109922,0.363327,0.104317
std,0.177314,0.115492,0.085425,0.083342,0.154144,0.075505,0.172929,0.12591,0.198062,0.139614,...,0.116121,0.22704,0.147704,0.140676,0.100966,0.130999,0.195126,0.185935,0.372292,0.167558
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.015698,0.000962,0.0,0.0,0.103654,0.989417,0.005868,0.0,0.0,...,0.151377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.189511,0.015698,0.00909,0.0,0.0375,0.103654,1.0,0.020549,0.088222,0.01771,...,0.170735,0.0,0.022939,0.007576,0.001218,0.0,0.020648,0.0,0.279559,0.022782
75%,0.242669,0.017005,0.018788,0.0,0.1,0.103654,1.0,0.04759,0.233116,0.049015,...,0.19244,0.0,0.107136,0.078125,0.003689,0.022989,0.169127,0.198827,0.726273,0.137655
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [111]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np

# Keep the 7 features that score highest on the chi-squared test against the
# labels, using the scaled feature matrix.
#
# Notes recorded in this cell (NOTE(review): they may be stale -- the table
# displayed for this cell can list different columns, so re-check them after
# re-running):
#   * "selected features": salary, bonus, total_stock_value,
#     exercised_stock_options
#   * SelectKBest(chi2, k=7): salary, total_payments, loan_advances, bonus,
#     total_stock_value, exercised_stock_options, long_term_incentive
#   * alternative that was tried and then commented out --
#     SelectFromModel(LinearSVC(C=0.1, dual=False), prefit=True,
#     threshold='1.5*median'): salary, deferral_payments, bonus,
#     deferred_income, total_stock_value, expenses, exercised_stock_options
skb = SelectKBest(score_func=chi2, k=7)
skb.fit(feat_scale, labels)
selected_features = skb.transform(feat_scale)
feature_idx = skb.get_support()

# Map the selection mask back to column names. The label name ('poi') is split
# off first so the remaining names line up with the mask.
poi = features_list[0]
feat_names = features_list[1:]
features_np = np.array(feat_names)[feature_idx]

selected_features_pd = pd.DataFrame(data=selected_features, columns=features_np)
selected_features_pd.head()

Unnamed: 0,salary,loan_advances,bonus,total_stock_value,exercised_stock_options,from_poi_rate,shared_poi_rate
0,0.181735,0.0,0.521875,0.036083,0.050353,0.029613,0.484283
1,0.0,0.0,0.0,0.006142,0.007506,0.0,0.0
2,0.000429,0.0,0.0,0.107571,0.117798,0.0,0.820615
3,0.24036,0.0,0.15,0.217018,0.194494,0.0,0.0
4,0.215675,0.0,0.05,0.002179,0.0,0.0,0.0
