In [1]:
import numpy as np
import pandas as pd

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



In [2]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [3]:
enron_data = pd.DataFrame.from_dict(data_dict, orient = 'index')

In [4]:
enron_data.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,email_address,restricted_stock_deferred,deferred_income,total_stock_value,...,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442,,4175000.0,phillip.allen@enron.com,-126027.0,-3081055.0,1729541,...,47.0,1729541.0,2195.0,152.0,65.0,False,304805.0,1407.0,126027.0,
BADUM JAMES P,,,178980.0,182466,,,,,,257817,...,,257817.0,,,,False,,,,
BANNANTINE JAMES M,477.0,566.0,,916197,,,james.bannantine@enron.com,-560222.0,-5104.0,5243487,...,39.0,4046157.0,29.0,864523.0,0.0,False,,465.0,1757552.0,
BAXTER JOHN C,267102.0,,1295738.0,5634343,,1200000.0,,,-1386055.0,10623258,...,,6680544.0,,2660303.0,,False,1586055.0,,3942714.0,
BAY FRANKLIN R,239671.0,,260455.0,827696,,400000.0,frank.bay@enron.com,-82782.0,-201641.0,63014,...,,,,69.0,,False,,,145796.0,


In [5]:
enron_data.describe().transpose()

Unnamed: 0,count,unique,top,freq
salary,146,95,,51
to_messages,146,87,,60
deferral_payments,146,40,,107
total_payments,146,126,,21
loan_advances,146,5,,142
bonus,146,42,,64
email_address,146,112,,35
restricted_stock_deferred,146,19,,128
deferred_income,146,45,,97
total_stock_value,146,125,,20


In [6]:
enron_data.replace(to_replace='NaN', value=0.0, inplace=True)

In [7]:
# Import plotly and and set your credentials
from plotly import tools
tools.set_credentials_file(username='imuser', api_key='2mVFrAILwP5A8KCcxG0f')

In [8]:
import plotly.plotly as py
import plotly.graph_objs as go

# Make scatterplot before outlier removal
trace0 = go.Scatter(
    x=enron_data.salary,
    y=enron_data.bonus,
    text = enron_data.index,
    mode = 'markers'
)

# Remove Outlier
enron_data.drop(['TOTAL'], axis = 0, inplace= True)

# Make scatterplot after outlier removal
trace1 = go.Scatter(
    x=enron_data.salary,
    y=enron_data.bonus,
    text = enron_data.index,
    mode = 'markers'
)


# Layout the plots together side by side
fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Before outlier removal', 'After outlier removal'))

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)

fig['layout']['xaxis1'].update(title='salary')
fig['layout']['xaxis2'].update(title='salary')

fig['layout']['yaxis1'].update(title='bonus')
fig['layout']['yaxis2'].update(title='bonus')

py.iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



PlotlyRequestError: No message

In [48]:
enron_data.drop(['THE TRAVEL AGENCY IN THE PARK'], axis = 0, inplace= True)

In [49]:
#Create new feature(s)
enron_data["fraction_from_poi"] = enron_data["from_poi_to_this_person"].\
divide(enron_data["to_messages"], fill_value = 0)

enron_data["fraction_to_poi"] = enron_data["from_this_person_to_poi"].\
divide(enron_data["from_messages"], fill_value = 0)

In [50]:
enron_data["fraction_from_poi"] = enron_data["fraction_from_poi"].fillna(0.0)
enron_data["fraction_to_poi"] = enron_data["fraction_to_poi"].fillna(0.0)

In [51]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import row
from bokeh.models import ColumnDataSource

output_notebook()

colormap = {False: 'blue', True: 'red'}
colors = [colormap[x] for x in enron_data['poi']]

labelmap = {False: 'Non-POI', True: 'POI'}
labels = [labelmap[x] for x in enron_data['poi']]

source = ColumnDataSource(dict(
    x1=enron_data["from_poi_to_this_person"],
    y1=enron_data["from_this_person_to_poi"],
    x2=enron_data["fraction_from_poi"],
    y2=enron_data["fraction_to_poi"],
    color=colors,
    label=labels
))

# Before feature engineering
s1 = figure(plot_width=450, plot_height=400)
s1.xaxis.axis_label = 'no. of emails from POI to this person'
s1.yaxis.axis_label = 'no. of emails from this person to POI'

s1.circle('x1', 'y1', size = 10, alpha = 0.5, 
          color='color', legend = 'label', source = source)

# After feature engineering
s2 = figure(plot_width=450, plot_height=400)
s2.xaxis.axis_label = 'fraction of emails this person gets from POI'
s2.yaxis.axis_label = 'fraction of emails this person sends to POI'

s2.circle('x2', 'y2', size = 10, alpha = 0.5, 
          color='color', legend = 'label', source = source)

show(row(s1, s2))

In [52]:
# Store to my_dataset for easy export below.
# create a dictionary from the dataframe
my_dataset = enron_data.to_dict('index')

In [53]:
from sklearn.feature_selection import SelectKBest, f_classif

features_list = ["poi", "bonus", "exercised_stock_options", "expenses", "other", "restricted_stock", "salary", 
                  "shared_receipt_with_poi", "total_payments", "total_stock_value", "fraction_to_poi",
                 "fraction_from_poi"]

data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(features, labels)

# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)

In [54]:
features_list = ["poi", "bonus", "exercised_stock_options", 
                 "fraction_to_poi", "shared_receipt_with_poi"]
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [55]:
# These are feature importance I obtained from running my final classiifer and testing it with tester.py.
# Check Algorithm Performance section below.
feature_importances = [0.13442256, 0.0433088, 0.48651064, 0.335758]

data = {'importance':feature_importances, 'features':features_list[1:]}


In [9]:
# Not restricting the maximum width in characters of a column
pd.options.display.max_colwidth = 0
data = {"Algorithms":["DecisionTreeClassifier", 
                      "RandomForestClassifier",
                      "AdaBoostClassifier",
                      "GaussianNB"],
        "Parameters":["criterion='entropy', max_depth =2, min_samples_split=2, min_samples_leaf=6", 
                      "n_estimators=150, min_samples_split=5", 
                      "n_estimators=150",
                      "Default"],
        "Accuracy":[0.83962,0.85031,0.81823,0.82769], 
        "Precision":[ 0.47848,0.52246,0.40066,0.41215], 
        "Recall":[0.47250,0.31400,0.36600,0.28150], 
        "F1":[0.47547,0.39225,0.38255,0.33452], 
        "F2":[0.47368,0.34123,0.37244,0.30056]}
algorithms = pd.DataFrame(data, columns = ["Algorithms", "Parameters", "Accuracy", "Precision", "Recall", "F1", "F2"])
algorithms


Unnamed: 0,Algorithms,Parameters,Accuracy,Precision,Recall,F1,F2
0,DecisionTreeClassifier,"criterion='entropy', max_depth =2, min_samples_split=2, min_samples_leaf=6",0.83962,0.47848,0.4725,0.47547,0.47368
1,RandomForestClassifier,"n_estimators=150, min_samples_split=5",0.85031,0.52246,0.314,0.39225,0.34123
2,AdaBoostClassifier,n_estimators=150,0.81823,0.40066,0.366,0.38255,0.37244
3,GaussianNB,Default,0.82769,0.41215,0.2815,0.33452,0.30056


In [24]:
# Specify parameters of the algorithm
clf_params= {'criterion': ['gini', 'entropy'],
             'max_depth': [None, 1, 2, 5, 10],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8]
            }

# Specify algorithm
dt = DecisionTreeClassifier()

# GridSearchCV
cv = StratifiedShuffleSplit(labels, n_iter = 1000,random_state = 42)
clf = GridSearchCV(dt, param_grid = clf_params,cv = cv, scoring = 'f1')
clf.fit(features,labels)

# pick a winner
best_clf = clf.best_estimator_
print (best_clf)

NameError: name 'DecisionTreeClassifier' is not defined

In [56]:
max_depth = [1,2,5,10,15,20]
F1_score = [0.06125, 0.21550, 0.39598, 0.41575, 0.41767, 0.42029]

data = {'max_depth':max_depth, 'F1-score':F1_score}


In [57]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='entropy', max_depth =2, min_samples_split=2, min_samples_leaf=6)


In [27]:
dump_classifier_and_data(clf, my_dataset, features_list)

TypeError: write() argument must be str, not bytes

In [58]:
# Get confusion matrix (cm)
from tester import test_classifier
import pandas as pd

cm = pd.Series(test_classifier(clf, my_dataset, features_list))


# Seaborn and matplotlib library
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# View confusion matrix with a heatmap
sns.heatmap(cm, annot = True, fmt = 'd', cmap='Reds', xticklabels=['no', 'yes'], yticklabels=['no', 'yes'])
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion matrix for:\n{}'.format(clf.__class__.__name__));

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
	Accuracy: 0.83962	Precision: 0.47846	Recall: 0.47200	F1: 0.47521	F2: 0.47328
	Total predictions: 13000	True positives:  944	False positives: 1029	False negatives: 1056	True negatives: 9971



IndexError: Inconsistent shape between the condition and the input (got (0, 1) and (0,))

In [None]:
mm = [[2,3,4,5,6,7,8], [4,6,87,9]]

sns.heatmap(mm, annot = True, fmt = 'd', cmap='Reds', xticklabels=['no', 'yes'], yticklabels=['no', 'yes'])
