In [1]:
import pandas as pd
import re
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load the competition files: labelled training rows, unlabelled test rows and
# the sample submission template.
train = pd.read_csv("/Users/chloe/Google Drive/Kaggle_Data/Kaggle_3rd_ML/train_data.csv")
test = pd.read_csv("/Users/chloe/Google Drive/Kaggle_Data/Kaggle_3rd_ML/test_features.csv")
sub = pd.read_csv("/Users/chloe/Google Drive/Kaggle_Data/Kaggle_3rd_ML/sample_submission.csv")

# Pull the label out of the training frame and encode it as 1 (== True) / 0.
target = train.pop("poi").map(lambda flag: int(flag == True))

# The "name" column is removed from both frames; the test names are kept aside.
train.pop("name")
test_name = test.pop("name")

# Stack train on top of test so later preprocessing is applied to both at once,
# renumber the rows from 0 and discard the e-mail address column.
data = (
    pd.concat([train, test])
    .reset_index(drop=True)
    .drop(columns=["email_address"])
)

In [3]:
# Impute missing values: gaps in "salary" receive the column mean, every other
# remaining gap is replaced with 0.
# NOTE(review): the mean is taken over whatever rows `data` holds at this point
# (train and test stacked together, if the load cell above was run) - confirm
# that this is intended.
salary_mean = data["salary"].mean()
data = data.fillna({"salary": salary_mean}).fillna(0)

In [4]:
# Check whether the target classes are balanced by counting each label value.
# The recorded output below shows far fewer 1s than 0s, i.e. the target is
# imbalanced.
target.value_counts()

0    100
1     13
Name: poi, dtype: int64

In [5]:
# Feature engineering: derive ratio and aggregate features from the raw columns.


def _safe_ratio(numerator, denominator):
    """Divide two aligned Series element-wise, yielding 0 where the divisor is 0.

    Args:
        numerator: pandas Series of dividends.
        denominator: pandas Series of divisors, aligned with ``numerator``.

    Returns:
        pandas Series with ``numerator / denominator`` in every position whose
        divisor is non-zero and 0 in every position whose divisor equals 0.
    """
    quotient = numerator.div(denominator)
    # pandas produces inf/NaN for a zero divisor; overwrite those slots with 0.
    return quotient.where(denominator != 0, 0)


# Author's note (not verified here):
# total_stock_value = exercised_stock_options + restricted_stock + restricted_stock_deferred

# Columns are created in this exact order because the order determines the
# column layout of the feature matrix built from `data` afterwards.
data["% of stock exercise"] = _safe_ratio(data["exercised_stock_options"], data["total_stock_value"])
data["% of restricted stock"] = _safe_ratio(data["restricted_stock"], data["total_stock_value"])
data["total_earned"] = data["bonus"] + data["salary"]
data["% of message sent to poi"] = _safe_ratio(data["from_this_person_to_poi"], data["from_messages"])
data["% of message reply to poi"] = _safe_ratio(data["from_this_person_to_poi"], data["from_poi_to_this_person"])
data["% of loan advance from salary"] = _safe_ratio(data["loan_advances"], data["salary"])
# The absolute value of deferred_income is added, so its sign does not matter.
data["all_deferral"] = data["deferral_payments"] + data["deferred_income"].abs()

# ---------------------------------------------------------------------------
# Disabled candidate features (left commented out by the author).
# ---------------------------------------------------------------------------
#data["extra"] = data["total_payments"]-data["expenses"]-data["other"]-data["deferral_payments"]
#data["% of extra to payment"] = data.apply(lambda x: x["extra"]/x["total_payments"] if x["total_payments"]!=0 else 0, axis=1)


# data["has_loan_advance"] = data["loan_advances"].map(lambda x: 1 if x>0 else 0)
# data["inv_deferral_income"] = data.apply(lambda x: x["deferred_income"]**-1 if x["deferred_income"]!=0 else 0, axis=1)

# data["all_cash_flow"] = data["bonus"]+data["deferral_payments"]+data['deferred_income']+data["director_fees"]+data["exercised_stock_options"] + data['expenses']+data['long_term_incentive']+data['other']+data['restricted_stock']+data["restricted_stock_deferred"]+data["salary"]


# data["% of share receipt"] = data.apply(lambda x: x["shared_receipt_with_poi"]/x["to_messages"] if x["to_messages"]!=0 else 0, axis=1)
# data["% expense"] = data.apply(lambda x: x['expenses']/x['total_payments'] if x['total_payments']!=0 else 0, axis=1)
# data["% of message from poi"] = data.apply(lambda x: x["from_poi_to_this_person"]/x["to_messages"] if x["to_messages"]!=0 else 0, axis=1)

# data["% of restricted stock deferred"] = data.apply(lambda x: abs(x["restricted_stock_deferred"])/x["restricted_stock"] if x["restricted_stock"]!=0 else 0, axis=1)
# data["% of bonus"] = data.apply(lambda x: x["bonus"]/x["total_earned"] if x["total_earned"]!=0 else 0, axis=1)
# data["all_incentives"] = data["long_term_incentive"] + data['director_fees']
# data["% of deferral payment"] = data.apply(lambda x: x["deferral_payments"]/x["total_payments"] if x["total_payments"]!=0 else 0, axis=1)
# data["final payment"] = data["deferral_payments"]+data["total_payments"]

# data["exercise_stock_x_salary"] = data["exercised_stock_options"]*data["salary"]

In [6]:
# # feature importance
# train = data[:len(train)]
# test_scale = data[len(train):]

# X_train, X_test, y_train, y_test = train_test_split(train, target.values, test_size=0.2, random_state=42)

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# rf.score(X_test, y_test)
# #zip(rf.feature_importances_
# feature_importances = pd.DataFrame(rf.feature_importances_,
#                                    index = train.columns,
#                                     columns=['importance']).sort_values('importance', ascending=False)
# feature_importances

In [7]:
# Scale the numerical data and split it back into train / test matrices.

# Record the feature names and the number of training rows BEFORE anything is
# rebound: `fit_transform` hands back a plain array without column labels, and
# `train` is replaced by an array slice further down. Taking the names from
# `data` (rather than from the raw `train` frame) keeps `columns` aligned with
# the columns of the scaled matrix.
columns = data.columns
n_train = len(train)

#scaler = StandardScaler()
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on every row currently in `data`; if that
# includes the test rows, test-set statistics influence the scaling - confirm
# this is acceptable.
data = scaler.fit_transform(data)

# split data to train and test: the first `n_train` rows are the training rows,
# the remainder are the test rows.
train = data[:n_train]
test_scale = data[n_train:]

In [8]:
# # feature importance
# X_train, X_test, y_train, y_test = train_test_split(train, target.values, test_size=0.2, random_state=42)

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# rf.score(X_test, y_test)
# #zip(rf.feature_importances_
# feature_importances = pd.DataFrame(rf.feature_importances_,
#                                    index = range(0,24),
#                                     columns=['importance']).sort_values('importance', ascending=False)
# feature_importances

In [9]:
# Baseline model: logistic regression with balanced class weights, evaluated
# with stratified 5-fold cross-validation (ROC AUC on each held-out fold).

# Earlier single hold-out experiment, left disabled by the author:
# X_train, X_test, y_train, y_test = train_test_split(train, target.values, test_size=0.2, random_state=42)
# clf = LogisticRegression(random_state=0, solver='lbfgs', class_weight='balanced', C=0.5)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# y_pred_prob = clf.predict_proba(X_test)[:,1]
# lr_prob = clf.predict_proba(test_scale)[:,1]
# print("auc score: ", roc_auc_score(y_test, y_pred_prob))

cv_score = []
# Running SUM (not mean) of the per-fold class-1 probabilities for the test rows.
lr_cv_pred = np.zeros(len(test_scale))
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(train, target.values):
    X_train, X_test = train[train_index], train[test_index]
    # The splitter yields row POSITIONS, so the labels are indexed by position
    # on the underlying array instead of by Series label lookup.
    y_train, y_test = target.values[train_index], target.values[test_index]
    clf = LogisticRegression(random_state=0, solver='lbfgs', class_weight='balanced', C=10)
    clf.fit(X_train, y_train)
    # Column 1 of predict_proba is taken as the probability of the positive class.
    y_pred_prob = clf.predict_proba(X_test)[:,1]
    lr_prob = clf.predict_proba(test_scale)[:,1]
    lr_cv_pred+=lr_prob
    cv_score.append(roc_auc_score(y_test, y_pred_prob))
print('straified score: ', np.mean(cv_score))


straified score:  0.8466666666666667


In [10]:
# Random forest (100 trees), evaluated with stratified 5-fold cross-validation
# (ROC AUC on each held-out fold).

# Earlier single hold-out experiment, left disabled by the author:
# X_train, X_test, y_train, y_test = train_test_split(train, target.values, test_size=0.2, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, random_state=0)
# rf.fit(X_train, y_train)
# y_pred_prob = rf.predict_proba(X_test)[:,1]
# rf_prob = rf.predict_proba(test_scale)[:,1]
# print("auc score: ", roc_auc_score(y_test, y_pred_prob))


cv_score = []
# Running SUM (not mean) of the per-fold class-1 probabilities for the test rows.
rf_cv_pred = np.zeros(len(test_scale))
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(train, target.values):
    X_train, X_test = train[train_index], train[test_index]
    # The splitter yields row POSITIONS, so the labels are indexed by position
    # on the underlying array instead of by Series label lookup.
    y_train, y_test = target.values[train_index], target.values[test_index]
    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf.fit(X_train, y_train)
    # Column 1 of predict_proba is taken as the probability of the positive class.
    y_pred_prob = rf.predict_proba(X_test)[:,1]
    rf_prob = rf.predict_proba(test_scale)[:,1]
    #rf_cv_pred.append(rf_prob)
    rf_cv_pred+=rf_prob
    cv_score.append(roc_auc_score(y_test, y_pred_prob))
print('straified score: ', np.mean(cv_score))

straified score:  0.8266666666666665


In [11]:
# Gradient boosting (300 estimators), evaluated with stratified 5-fold
# cross-validation (ROC AUC on each held-out fold).


# Earlier single hold-out experiment, left disabled by the author:
# X_train, X_test, y_train, y_test = train_test_split(train, target.values, test_size=0.2, random_state=42)
# gb = GradientBoostingClassifier(n_estimators=100, random_state=0)
# gb.fit(X_train, y_train)
# y_pred_prob = gb.predict_proba(X_test)[:,1]
# gb_prob = gb.predict_proba(test_scale)[:,1]
# print("auc score: ", roc_auc_score(y_test, y_pred_prob))

cv_score = []
# Running SUM (not mean) of the per-fold class-1 probabilities for the test rows.
gb_cv_pred = np.zeros(len(test_scale))
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(train, target.values):
    X_train, X_test = train[train_index], train[test_index]
    # The splitter yields row POSITIONS, so the labels are indexed by position
    # on the underlying array instead of by Series label lookup.
    y_train, y_test = target.values[train_index], target.values[test_index]
    gb = GradientBoostingClassifier(n_estimators=300, random_state=0)
    gb.fit(X_train, y_train)
    # Column 1 of predict_proba is taken as the probability of the positive class.
    y_pred_prob = gb.predict_proba(X_test)[:,1]
    gb_prob = gb.predict_proba(test_scale)[:,1]
    gb_cv_pred+=gb_prob
    cv_score.append(roc_auc_score(y_test, y_pred_prob))
print('straified score: ', np.mean(cv_score))

straified score:  0.7733333333333332


In [12]:
# Write the submission file from the random-forest cross-validation predictions.

# `rf_cv_pred` is a sum over folds; dividing by the fold count turns it into the
# mean predicted probability per test row.
# NOTE(review): 5 is assumed to equal the n_splits used when rf_cv_pred was
# accumulated - keep the two in sync.
n_folds = 5
sub["poi"] = rf_cv_pred / n_folds

# `output` is the same object as `sub` (no copy is made).
output = sub
#output["poi_lr"] = lr_prob
#output["poi_rf"] = rf_prob
#output["poi_gb"] = gb_prob
output.to_csv("rf_stratified_engineering.csv", index=False)