In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [4]:
def grid_search(clf, parameters, X_train, y_train):
    """Run a 5-fold, accuracy-scored grid search and return the best estimator.

    Args:
        clf: Estimator handed to ``GridSearchCV``.
        parameters: Parameter grid handed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training labels; must expose ``.values`` because the labels
            are flattened with ``.values.ravel()`` before fitting.

    Returns:
        The ``best_estimator_`` attribute of the fitted search.
    """
    search = GridSearchCV(
        clf,
        parameters,
        scoring=make_scorer(accuracy_score),
        n_jobs=-1,
        cv=5,
    )
    flat_labels = y_train.values.ravel()
    return search.fit(X_train, flat_labels).best_estimator_
    
    
def classifier(clf):
    """Tune ``clf`` by grid search and return its predictions for ``x_test``.

    Reads the notebook-level globals ``x_train``, ``y_train`` and ``x_test``.

    Args:
        clf: Estimator instance; its class name selects the parameter grid
            through ``parameter_set``.

    Returns:
        Predictions for ``x_test``: a plain list when ``clf`` is an
        ``XGBClassifier``, otherwise whatever ``predict`` returns.
    """
    clf_name = clf.__class__.__name__
    parameters = parameter_set(clf_name)
    print(parameters)

    # grid_search returns GridSearchCV.best_estimator_, which GridSearchCV has
    # already refit on the whole training set (refit defaults to True), so
    # fitting it a second time here would only repeat that work.
    best_clf = grid_search(clf, parameters, x_train, y_train)

    predictions = best_clf.predict(x_test)
    if clf_name == 'XGBClassifier':
        predictions = list(predictions)
    return predictions

def parameter_set(clf_name):
    """Return the hyper-parameter grid for the classifier class named ``clf_name``.

    Args:
        clf_name: Class name of the estimator, e.g. ``'RandomForestClassifier'``.

    Returns:
        dict mapping parameter names to the candidate values to search.

    Raises:
        ValueError: If no grid is defined for ``clf_name``. (Previously this
            case failed with an ``UnboundLocalError`` on the return line.)
    """
    if clf_name == 'RandomForestClassifier':
        return {
            'n_estimators': [5, 10, 50, 100, 150, 200],
            'max_features': ['log2', 'sqrt', 'auto'],
            'criterion': ['entropy', 'gini'],
            # Optional extra dimensions, currently disabled:
            # 'max_depth': list(range(2,10)),
            # 'min_samples_split': list(range(2,5)),
            # 'min_samples_leaf': list(range(1,5)),
            'verbose': [0],
        }
    elif clf_name == 'DecisionTreeClassifier':
        return {
            'max_depth': list(range(2, 10)),
            'min_samples_split': list(range(2, 10)),
        }
    elif clf_name == 'AdaBoostClassifier':
        return {
            "n_estimators": [5, 10, 50, 100, 150, 200],
            "algorithm": ["SAMME", "SAMME.R"],
            'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.5, 0.7],
        }
    elif clf_name == 'GradientBoostingClassifier':
        return {
            "loss": ["deviance"],
            "learning_rate": [0.075, 0.1, 0.15, 0.2, 0.3, 0.5, 0.7],
            # Optional extra dimensions, currently disabled:
            # "min_samples_split": list(range(2,5)),
            # "min_samples_leaf": list(range(1,5)),
            "max_depth": [3, 5, 8],
            "max_features": ["log2", "sqrt"],
            "criterion": ["friedman_mse", "mae"],
            "subsample": [0.5, 0.8, 0.9, 1.0],
            "n_estimators": [5, 10, 50, 100, 150, 200],
        }
    elif clf_name == 'XGBClassifier':
        return {
            'learning_rate': np.linspace(0.01, 0.5, 9),
            # Optional extra dimensions, currently disabled:
            # 'max_depth': list(range(5,10)),
            # 'min_child_weight': list(range(3,10)),
            'gamma': np.linspace(0, 0.5, 11),
            # 'subsample': [0.8, 0.9],
            # 'colsample_bytree': [0.3, 0.4, 0.5 , 0.7, 0.8, 0.9],
            'objective': ['binary:logistic'],
        }

    # Fail loudly instead of falling through to an unbound local variable.
    raise ValueError(
        "No parameter grid defined for classifier {!r}".format(clf_name)
    )

In [73]:
# Set up the data paths and read the train / test CSV files
dir_data = './data/'
train_data, test_data = (
    os.path.join(dir_data, file_name)
    for file_name in ('adult.data.csv', 'adult.test.csv')
)
x_train, x_test = pd.read_csv(train_data), pd.read_csv(test_data)
# Only the last expression of a cell is rendered, so just the test preview is shown.
x_train.head()
x_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationshiprace,race,sex,capital-gain,capital-loss,hours-per-week,native-country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [6]:
# Assign the same column names to both frames, then print their summaries
# NOTE(review): 'relationshiprace' looks like a typo for 'relationship' — confirm before renaming.
col_label = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationshiprace', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'wage_class',
]
x_train.columns, x_test.columns = col_label, col_label
x_train.info()
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                 32561 non-null int64
workclass           32561 non-null object
fnlwgt              32561 non-null int64
education           32561 non-null object
education-num       32561 non-null int64
marital-status      32561 non-null object
occupation          32561 non-null object
relationshiprace    32561 non-null object
race                32561 non-null object
sex                 32561 non-null object
capital-gain        32561 non-null int64
capital-loss        32561 non-null int64
hours-per-week      32561 non-null int64
native-country      32561 non-null object
wage_class          32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
age                 16281 non-null int64
workclass           16281 non-null object
fnlwgt              16281

In [7]:
# Missing-value handling: turn the ' ?' placeholder into NaN, then drop every row containing NaN.
# Shapes are printed before and after so the number of dropped rows is visible.
print(x_train.shape, x_test.shape, sep='\n')
x_train = x_train.replace(' ?', np.nan).dropna()
x_test = x_test.replace(' ?', np.nan).dropna()
print(x_train.shape, x_test.shape, sep='\n')

(32561, 15)
(16281, 15)
(30162, 15)
(15060, 15)


In [13]:
# Make the test-set wage_class labels match the training set by stripping the
# trailing '.', using a dict-based .replace.
print(x_train['wage_class'].unique())
# [' <=50K' ' >50K']
print(x_test['wage_class'].unique())
# [' <=50K.' ' >50K.']  (before the fix)
x_test['wage_class'] = x_test['wage_class'].replace(
    {' <=50K.': ' <=50K', ' >50K.': ' >50K'}
)
print(x_train['wage_class'].unique())
# [' <=50K' ' >50K']
print(x_test['wage_class'].unique())
# [' <=50K' ' >50K']

[' <=50K' ' >50K']
[' <=50K.' ' >50K.']
[' <=50K' ' >50K']
[' <=50K' ' >50K']


In [14]:
# Inspect how a feature relates to the target class.
# Here: education level vs. wage_class (the expectation being that a higher
# education level goes with a higher chance of the >50K class).
print(x_train['education'].unique())
print(
    pd.crosstab(
        x_train.wage_class,
        x_train.education,
        rownames=['wage_class'],
    )
)

[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
education    10th   11th   12th   1st-4th   5th-6th   7th-8th   9th  \
wage_class                                                            
 <=50K        871   1115    400       162       317       606   487   
 >50K          62     60     33         6        16        40    27   

education    Assoc-acdm   Assoc-voc   Bachelors   Doctorate   HS-grad  \
wage_class                                                              
 <=50K              802        1021        3134         107      8826   
 >50K               265         361        2221         306      1675   

education    Masters   Preschool   Prof-school   Some-college  
wage_class                                                     
 <=50K           764          51           153           5904  
 >50K            959           0           

In [15]:
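# The table shows that a comparatively large share of Masters graduates are in the >50K class, whereas people with only Preschool education have essentially no >50K entries.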

In [16]:
# Convert string columns to numeric codes. To guarantee that train and test share
# one encoding, both frames are concatenated first, encoded together, and split
# back into the original frames afterwards.
combined_set = pd.concat([x_train, x_test], axis=0)

# Replace every object-dtype column with its integer category codes.
for feature, dtype in combined_set.dtypes.items():
    if dtype == 'object':
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes

In [17]:
# Split the encoded frame back into the training and test sets: the first
# len(x_train) rows are the training rows, the remainder the test rows.
x_train = combined_set.iloc[:len(x_train)]
x_test = combined_set.iloc[len(x_train):]

# Show the resulting codes for education and wage_class.
print(x_train['education'].unique())
print(x_train['wage_class'].unique())

[ 9 11  1 12  6 15  7  8  5 10 14  4  0  3 13  2]
[0 1]


In [18]:
# Separate the target column from the features (pop removes it from the frames in place).
y_train, y_test = x_train.pop('wage_class'), x_test.pop('wage_class')


In [19]:
# Sanity check: feature and label shapes for both splits
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')

(32561, 14)
(32561,)
(16281, 14)
(16281,)


In [20]:
# Decision-tree baseline (max_depth=10): fit on the training split, evaluate on the test split.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
# The name `rf` is kept (although it holds a decision tree) because the following cells read it.
rf = DecisionTreeClassifier(max_depth=10).fit(x_train, y_train)
Y_pred = rf.predict(x_test)
print(Y_pred, confusion_matrix(y_test, Y_pred), sep='\n')
print("準確率", rf.score(x_test, y_test))

[0 0 0 ... 1 0 1]
[[11725   710]
 [ 1651  2195]]
準確率 0.8549843375714022


In [21]:
# Confusion matrix of the decision-tree predictions (Y_pred) against the test labels
print (confusion_matrix(y_test, Y_pred))


[[11725   710]
 [ 1651  2195]]


In [22]:
# Accuracy of the fitted model `rf` on the test split (the printed label means "accuracy")
print("準確率",rf.score(x_test,y_test))

準確率 0.8549843375714022


In [23]:
# RandomForestClassifier: fit and validate
def rfc_fit_test(X_train, X_test, Y_train, Y_test):
    """Fit a ``RandomForestClassifier(n_jobs=2)`` and print its classification report.

    Args:
        X_train: Training features.
        X_test: Test features to predict on.
        Y_train: Training labels.
        Y_test: True test labels the predictions are compared against.

    Returns:
        None. The report is printed, not returned.
    """
    forest = RandomForestClassifier(n_jobs=2)
    test_pred = forest.fit(X_train, Y_train).predict(X_test)
    print(classification_report(Y_test, test_pred))

In [24]:
# Run the random-forest fit / report helper on the prepared train and test splits
rfc_fit_test(x_train,x_test,y_train,y_test)



              precision    recall  f1-score   support

           0       0.87      0.93      0.90     12435
           1       0.72      0.56      0.63      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.75      0.77     16281
weighted avg       0.84      0.84      0.84     16281



In [25]:
# XGBoost baseline with default hyper-parameters
from xgboost import XGBClassifier
xgbc = XGBClassifier().fit(x_train, y_train)
print(
    'The accuracy of eXtreme Gradient Boosting Classifier on testing set:',
    xgbc.score(x_test, y_test),
)

The accuracy of eXtreme Gradient Boosting Classifier on testing set: 0.8661630120999939


#https://www.ycc.idv.tw/confusion-matrix.html

In [26]:
# Estimators compared below, all created with default hyper-parameters
classifiers = [DecisionTreeClassifier(), RandomForestClassifier(), xgb.XGBClassifier()]

In [27]:
# Logging for visual comparison: fit every classifier with its default
# hyper-parameters and record its accuracy on the test split.
log_cols = ["Classifier", "Accuracy"]
log_rows = []

for clf in classifiers:

    name = clf.__class__.__name__
    clf.fit(x_train, y_train.values.ravel())
    print("="*30)
    print(name)

    # Predictions are made on the held-out test split.
    test_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.2%}".format(acc))

    log_rows.append([name, acc*100])

print("="*30)

# DataFrame.append was removed in pandas 2.0 and grows the frame quadratically;
# collect the rows first and build the frame once.
log = pd.DataFrame(log_rows, columns=log_cols)

DecisionTreeClassifier
Accuracy: 80.94%




RandomForestClassifier
Accuracy: 84.99%
XGBClassifier
Accuracy: 86.62%


In [28]:
# Grid search: tune every classifier, then record its test accuracy in `log`
# under the name "<ClassName>Grid".
grid_rows = []
for clf in classifiers:
    name = clf.__class__.__name__ + 'Grid'
    print("="*30)
    print(name)
    # classifier() returns predictions for the test split.
    test_predictions = classifier(clf)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.2%}".format(acc))
    grid_rows.append([name, acc*100])
print("="*30)

# DataFrame.append was removed in pandas 2.0; build the new rows once and
# concatenate them onto the existing log instead of growing it row by row.
log = pd.concat(
    [log, pd.DataFrame(grid_rows, columns=log_cols)],
    ignore_index=True,
)

DecisionTreeClassifierGrid
{'max_depth': [2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9]}
Accuracy: 85.39%
RandomForestClassifierGrid
{'n_estimators': [5, 10, 50, 100, 150, 200], 'max_features': ['log2', 'sqrt', 'auto'], 'criterion': ['entropy', 'gini'], 'verbose': [0]}
Accuracy: 85.49%
XGBClassifierGrid
{'learning_rate': array([0.01   , 0.07125, 0.1325 , 0.19375, 0.255  , 0.31625, 0.3775 ,
       0.43875, 0.5    ]), 'gamma': array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ]), 'objective': ['binary:logistic']}
Accuracy: 87.44%


In [None]:
# Horizontal bar chart of the logged accuracies, one bar per classifier
sns.set_color_codes("muted")
g = sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

plt.title('Classifier Accuracy')
plt.xlabel('Accuracy %')

# Write each bar's value just past its right-hand end.
for p in g.patches:
    g.annotate(
        "{:.2f} %".format(p.get_width()),
        (
            p.get_x() + p.get_width() + .3,
            p.get_y() + p.get_height()/2 + .1,
        ),
    )

plt.show()

In [102]:
# Print the number of records per column, excluding missing values
x_train = x_train.replace(' ?', np.nan)
for col in x_train.columns:
    # count() tallies the non-NaN entries of the column
    print(col, ':', x_train[col].count())


age : 32561
workclass : 30725
fnlwgt : 32561
education : 32561
education-num : 32561
marital-status : 32561
occupation : 30718
relationshiprace : 32561
race : 32561
sex : 32561
capital-gain : 32561
capital-loss : 32561
hours-per-week : 32561
native-country : 31978
wage_class : 32561


In [94]:
# Count the NaN rows of the 'workclass' column.
# The previous expression, sum(value_counts()), returned the number of NON-null
# rows; tallying the null mask gives the NaN count (True) next to the non-null
# count (False).
x_train = x_train.replace(' ?', np.nan)
x_train['workclass'].isnull().value_counts()

False    30725
True      1836
Name: workclass, dtype: int64