In [18]:
import pickle
import pandas as pd
import numpy as np

# importing our classification steps
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectFromModel
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support 

# importing our models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [90]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

del data_dict["TOTAL"]

# transpose rows and columns
df = pd.DataFrame(data_dict).transpose()

# list of columns we want as numeric
numeric_columns = ["bonus", "deferral_payments", "deferred_income", "director_fees",\
   "exercised_stock_options", "expenses", "from_messages", "from_poi_to_this_person",\
   "from_this_person_to_poi", "loan_advances", "long_term_incentive", "other", "restricted_stock",\
    "restricted_stock_deferred", "salary", "shared_receipt_with_poi", "to_messages",\
    "total_payments", "total_stock_value"]

df[numeric_columns] = df[numeric_columns].apply(lambda x: pd.to_numeric(x, errors="coerce"))

# imputation
df.fillna(0, inplace=True)

# creating our new features
# df["payout"] = df["bonus"] + df["salary"]
# df["stock"] = df["exercised_stock_options"] + df["loan_advances"] + df["long_term_incentive"]
# df["poi_conversation"] = df["from_poi_to_this_person"] + df["shared_receipt_with_poi"]

# Changing our poi field from object to int so it appears in our correlation table
labels = np.where(df["poi"] == True, 1, 0)

# Drop unnecessary columns
del df["email_address"]
del df["total_payments"]
del df["total_stock_value"]
del df["loan_advances"]

columns = df.columns

# Scale features
scaler = MinMaxScaler(feature_range=(0,1))
df = scaler.fit_transform(df)
df = pd.DataFrame(df, columns=columns)

df.to_csv("EnronData.csv")

# Creating our training data
df_train = df[list(df.columns)]
df_label = df_train["poi"]
del df_train["poi"]

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.25)

for train_ix, test_ix in sss.split(df_train, df_label):
    x_train, x_test = df_train.iloc[train_ix], df_train.iloc[test_ix]
    y_train, y_test = df_label.iloc[train_ix], df_label.iloc[test_ix]

In [79]:
def custom_score(clf, x, y):
    accuracy = clf.score(x, y)
    y_true = y
    y_pred = clf.predict(x)
    return  precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=[0,1], average="macro")[:-1]

In [80]:
def get_features_used(data, k_best_estimator):
    return list(df_train.columns[k_best.get_support()])

In [81]:
def report_performance(clf, x, y, data, k_best):
    features = get_features_used(data, k_best)
    accuracy = clf.score(x,y)
    precision, recall, f1 = custom_score(clf, x, y)
    return "Features used: {}\nAccuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(features, accuracy, precision/2, recall/2, f1/2)

### Decision Tree

In [93]:
dtc = DecisionTreeClassifier()

k_best = SelectKBest(score_func=chi2, k=3)

clf_param = {"criterion": ("gini", "entropy")}
clf_grid = GridSearchCV(dtc, clf_param)

clf = Pipeline([
        ("feature_selection", k_best),
        ("classifier", clf_grid)
    ])

clf.fit(x_train, y_train)
print report_performance(clf, x_test, y_test, df_train, k_best)

Features used: ['bonus', 'exercised_stock_options', 'shared_receipt_with_poi']
Accuracy: 0.837837837838
Precision: 0.340476190476
Recall: 0.36875
F1: 0.350806451613


### Random Forest

In [99]:
rfc = RandomForestClassifier()

k_best = SelectKBest(score_func=chi2, k=3)

clf_param = {"n_estimators": [1, 10, 100, 1000], "criterion": ("gini", "entropy")}
clf_grid = GridSearchCV(rfc, clf_param)

clf = Pipeline([
        ("feature_selection", k_best),
        ("classifier", clf_grid)
    ])

clf.fit(x_train, y_train)
print report_performance(clf, x_test, y_test, df_train, k_best)

Features used: ['bonus', 'exercised_stock_options', 'shared_receipt_with_poi']
Accuracy: 0.918918918919
Precision: 0.422348484848
Recall: 0.3921875
F1: 0.405128205128


In [100]:
clf.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('feature_selection', SelectKBest(k=3, score_func=<function chi2 at 0x000000000A568A58>)), ('classifier', GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', ma...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0))])>

### Naive Bayes

In [95]:
nb = GaussianNB()

k_best = SelectKBest(score_func=chi2, k=3)

clf = Pipeline([
        ("feature_selection", k_best),
        ("classifier", nb)
    ])

clf.fit(x_train, y_train)
print report_performance(clf, x_test, y_test, df_train, k_best)

Features used: ['bonus', 'exercised_stock_options', 'shared_receipt_with_poi']
Accuracy: 0.783783783784
Precision: 0.26875
Recall: 0.26875
F1: 0.26875


### SVM SVC

In [96]:
svc = SVC()

k_best = SelectKBest(score_func=chi2, k=3)

clf_param = {"kernel": ("linear", "rbf"), "C": [1, 10, 100, 1000]}
clf_grid = GridSearchCV(svc, clf_param)

clf = Pipeline([
        ("feature_selection", k_best),
        ("classifier", clf_grid)
    ])

clf.fit(x_train, y_train)
print report_performance(clf, x_test, y_test, df_train, k_best)

Features used: ['bonus', 'exercised_stock_options', 'shared_receipt_with_poi']
Accuracy: 0.891891891892
Precision: 0.472222222222
Recall: 0.3
F1: 0.31862745098


### Logistic Regression

In [97]:
lr = LogisticRegression()

k_best = SelectKBest(score_func=chi2, k=3)

clf_param = {"C": [1, 10, 100, 1000]}
clf_grid = GridSearchCV(lr, clf_param)

clf = Pipeline([
        ("feature_selection", k_best),
        ("classifier", clf_grid)
    ])

clf.fit(x_train, y_train)
print report_performance(clf, x_test, y_test, df_train, k_best)

Features used: ['bonus', 'exercised_stock_options', 'shared_receipt_with_poi']
Accuracy: 0.864864864865
Precision: 0.346428571429
Recall: 0.2921875
F1: 0.302771855011
