https://www.kaggle.com/c/ml100/overview

In [227]:
import pandas as pd
import numpy as np
import copy, time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

data_path = 'dataML100Midterm/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_features.csv')
#Get Label
all_label = df_train['poi'] 
#Get list of test names for submission
names = df_test['name'] 
df_train = df_train.drop(['name', 'email_address', 'poi'] , axis=1)
df_test = df_test.drop(['name', 'email_address'] , axis=1)
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [228]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_this_person_to_poi,41.09589
from_poi_to_this_person,41.09589
from_messages,41.09589


In [229]:
#fill 0 in following cols
zero_cols = ['loan_advances', 'director_fees', 'restricted_stock_deferred', 'deferral_payments', 'deferred_income', 'long_term_incentive', 'bonus', 'from_this_person_to_poi', 'from_poi_to_this_person', 'from_messages', 'to_messages', 'shared_receipt_with_poi', 'other', 'salary', 'expenses', 'exercised_stock_options', 'restricted_stock', 'total_payments', 'total_stock_value']
for n in zero_cols:
    df[n] = df[n].fillna(0)

In [230]:
# 做完各種補缺值, 確認一下有沒有遺漏
na_check(df)

Unnamed: 0,Missing Ratio


In [231]:
df = MinMaxScaler().fit_transform(df)
# split data 
label_num = all_label.shape[0]
all_fea = df[:label_num] # all feature for training
test = df[label_num:] # feature for testing


In [232]:
lr = LogisticRegression()
cross_val_score(lr, all_fea, all_label, cv=7).mean()

0.8854341736694679

In [233]:
dtc = DecisionTreeClassifier()
cross_val_score(dtc, all_fea, all_label, cv=7).mean()


0.8323879551820729

In [234]:
rfc = RandomForestClassifier()
cross_val_score(rfc, all_fea, all_label, cv=7).mean()


0.8849089635854341

In [235]:
gbc = GradientBoostingClassifier()
cross_val_score(gbc, all_fea, all_label, cv=7).mean()

0.8854341736694679

In [236]:
# define submit function (output predicted result to csv file)
def submit(filename, model):
    pred = model.predict_proba(test)[:, 1]
    sub = pd.DataFrame({'name' : names, 'poi' : pred})
    sub.to_csv(filename, index=False)

In [237]:
# lr.fit(all_fea, all_label)
# submit("LogisticRegression.csv", lr)
# rfc.fit(all_fea, all_label)
# submit("RandomForestClassifier.csv", rfc)
# gbc.fit(all_fea, all_label)
# submit("GradientBoostingClassifier.csv", gbc)
# dtc2 = DecisionTreeClassifier()
# dtc2.fit(all_fea, all_label)
# submit("DecisionTreeClassifier.csv", dtc2)

In [238]:
#grid search 
#hyper-parameter dict
n_estimators = [100, 200, 300]
max_dep = [1, 3, 5, ]
param_grid = dict(n_estimators=n_estimators, max_depth=max_dep)
#search best parameter
#create search object
para_search = GridSearchCV(gbc, param_grid, n_jobs=-1)
#start search
search_result = para_search.fit(all_fea, all_label)

In [239]:
#create GradientBoostingClassifier with the parameter
gbc_bestpara = GradientBoostingClassifier(n_estimators=search_result.best_params_['n_estimators'], max_depth=search_result.best_params_['max_depth'])
cross_val_score(gbc_bestpara, all_fea, all_label, cv=7).mean()

0.8670518207282913

In [240]:
# gbc_bestpara.fit(all_fea, all_label)
# submit("GradientBoostingClassifier_bestpara.csv", gbc_bestpara)