https://www.kaggle.com/c/ml100/overview

In [23]:
import pandas as pd
import numpy as np
import copy, time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# Read the training set and the test features from the competition folder.
data_path = 'dataML100Midterm/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_features.csv')

# Names of the test rows, kept for the submission file.
names = df_test['name']
# Target column of the training set.
train_label = df_train['poi']

# Remove the identifier columns (and the target from the training set), then
# stack the training rows on top of the test rows in a single frame.
df_train = df_train.drop(columns=['name', 'email_address', 'poi'])
df_test = df_test.drop(columns=['name', 'email_address'])
df = pd.concat([df_train, df_test], axis=0)
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [24]:
# Check the missing-value status of a DataFrame.
def na_check(df_data):
    """Display the columns of ``df_data`` with the largest share of missing values.

    The share is expressed as a percentage of the number of rows. Columns whose
    share is exactly zero are left out, the rest are sorted from most to least
    missing, and at most the first ten are displayed. Nothing is returned.
    """
    null_counts = df_data.isnull().sum()
    missing_pct = (null_counts / len(df_data)) * 100
    complete_cols = missing_pct.index[missing_pct == 0]
    missing_pct = missing_pct.drop(complete_cols)
    missing_pct = missing_pct.sort_values(ascending=False)
    missing_data = missing_pct.to_frame(name='Missing Ratio')
    display(missing_data.head(10))
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_this_person_to_poi,41.09589
from_poi_to_this_person,41.09589
from_messages,41.09589


In [25]:
# Both categorical and numeric features have to be included, so use the
# simplest possible feature engineering: fill missing values with -1,
# label-encode object columns, then min-max scale every column.
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for col in df.columns:
    filled = df[col].fillna(-1)
    if filled.dtype == 'object':
        # The values are handed over as a plain list, exactly as before
        # (presumably so mixed -1/string values are coerced to one type -- TODO confirm).
        col_values = LEncoder.fit_transform(list(filled.values))
    else:
        col_values = filled.values
    # reshape(-1, 1) gives the scaler a single-column 2-D array.
    df[col] = MMEncoder.fit_transform(col_values.reshape(-1, 1))
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,0.017978,0.003185,0.874812,0.0,0.063491,0.008968,0.001322,0.081285,0.008197,0.0,0.033325,0.004098,0.040269,0.328952,0.015752,0.156646,0.059802,0.00163,0.051977
1,0.057528,0.003185,1.0,0.0,0.061745,0.005604,0.007586,0.168242,0.05082,0.0,0.03957,0.000518,0.071078,0.328952,0.041614,0.369975,0.239472,0.028019,0.060149
2,0.002055,0.003185,0.999851,0.0,0.00521,0.004371,0.002784,0.026465,0.02459,0.0,0.0,0.036874,0.026133,0.328952,0.007933,0.016661,0.014917,0.006467,0.00584
3,0.008218,0.003185,1.0,0.0,0.0,0.022566,0.0,0.0,0.0,0.0,0.012421,0.021269,0.027004,0.328952,0.0084,0.0,0.0,0.00856,0.002368
4,0.012841,0.003185,0.990623,0.0,0.0,0.006842,0.010091,0.378072,0.042623,0.0,0.007735,1.1e-05,0.020542,0.328952,0.008994,0.396414,0.171551,0.00529,0.000391


In [26]:
# After filling in the missing values, confirm that none were missed.
na_check(df)

Unnamed: 0,Missing Ratio


In [27]:
# Split the preprocessed frame back into its training and test parts.
# The training rows were stacked first, so the first `label_num` rows
# (one per training label) are the training features and the rest are test rows.
label_num = train_label.shape[0]
train_fea = df.iloc[:label_num]  # all features for training
# `test` must be defined: submit() reads it when writing predictions.
test = df.iloc[label_num:]  # features for testing


In [28]:
# Fit a random forest, then sort the features by importance from high to low.
# A fixed random_state keeps the ranking identical across kernel restarts;
# without it the importances change on every run.
estimator = RandomForestClassifier(random_state=42)
estimator.fit(train_fea.values, train_label)
feats = pd.Series(data=estimator.feature_importances_, index=train_fea.columns)
feats = feats.sort_values(ascending=False)
feats

exercised_stock_options      0.154652
salary                       0.124519
expenses                     0.099102
other                        0.097237
total_payments               0.082740
total_stock_value            0.071808
restricted_stock             0.070335
bonus                        0.061507
from_poi_to_this_person      0.046112
deferred_income              0.042061
from_messages                0.039969
from_this_person_to_poi      0.036488
long_term_incentive          0.027230
deferral_payments            0.021034
shared_receipt_with_poi      0.013317
to_messages                  0.011887
restricted_stock_deferred    0.000000
director_fees                0.000000
loan_advances                0.000000
dtype: float64

In [None]:
# Baseline: mean of the 7-fold cross-validation scores of a default logistic regression.
lr = LogisticRegression()
np.mean(cross_val_score(estimator=lr, X=train_fea, y=train_label, cv=7))

In [None]:
# Mean of the 7-fold cross-validation scores of a decision tree.
# random_state is fixed so the score is reproducible between runs.
dtc = DecisionTreeClassifier(random_state=42)
cross_val_score(dtc, train_fea, train_label, cv=7).mean()


In [None]:
# Mean of the 7-fold cross-validation scores of a random forest.
# random_state is fixed so the score is reproducible between runs.
rfc = RandomForestClassifier(random_state=42)
cross_val_score(rfc, train_fea, train_label, cv=7).mean()


In [None]:
# Mean of the 7-fold cross-validation scores of gradient boosting.
# random_state is fixed so the score (and the grid search that reuses `gbc`)
# is reproducible between runs.
gbc = GradientBoostingClassifier(random_state=42)
cross_val_score(gbc, train_fea, train_label, cv=7).mean()

In [None]:
# define submit function (output predicted result to csv file)
def submit(filename, model, features=None):
    """Write the positive-class probabilities predicted by ``model`` to a CSV file.

    Parameters
    ----------
    filename : str
        Path of the CSV file to write.
    model : fitted classifier
        Must provide ``predict_proba``; column 1 of its output is written as ``poi``.
    features : DataFrame, optional
        Rows to predict on. When omitted, the rows of the notebook-level ``df``
        that follow the first ``label_num`` (training) rows are used, so the
        function no longer depends on a separate global ``test`` being defined.

    The output has one row per entry of the notebook-level ``names`` series,
    with columns ``name`` and ``poi``; the index is not written.
    """
    if features is None:
        # Same slice the split cell describes as "feature for testing".
        features = df.iloc[label_num:]
    pred = model.predict_proba(features)[:, 1]
    sub = pd.DataFrame({'name' : names, 'poi' : pred})
    sub.to_csv(filename, index=False)

In [None]:
# lr.fit(train_fea, train_label)
# submit("LogisticRegression.csv", lr)
# rfc.fit(train_fea, train_label)
# submit("RandomForestClassifier.csv", rfc)
# gbc.fit(train_fea, train_label)
# submit("GradientBoostingClassifier.csv", gbc)
# dtc2 = DecisionTreeClassifier()
# dtc2.fit(train_fea, train_label)
# submit("DecisionTreeClassifier.csv", dtc2)

In [None]:
# Grid search over the gradient-boosting hyper-parameters.
# Candidate values for each hyper-parameter.
n_estimators = [100, 200, 300]
max_dep = [1, 3, 5]
param_grid = {'n_estimators': n_estimators, 'max_depth': max_dep}
# Build the search object around the `gbc` estimator.
para_search = GridSearchCV(estimator=gbc, param_grid=param_grid, n_jobs=-1)
# Run the search on the training features and labels.
search_result = para_search.fit(train_fea, train_label)

In [None]:
# Create a GradientBoostingClassifier with the best parameters found by the
# grid search (n_estimators and max_depth) and report the mean of its 7-fold
# cross-validation scores. random_state is fixed so the score is reproducible.
gbc_bestpara = GradientBoostingClassifier(random_state=42, **search_result.best_params_)
cross_val_score(gbc_bestpara, train_fea, train_label, cv=7).mean()

In [None]:
# gbc_bestpara.fit(train_fea, train_label)
# submit("GradientBoostingClassifier_bestpara.csv", gbc_bestpara)