# ADULT Dataset 分析及預測報告
                                                                                                               2019-09-02 
                                                                                                                   吳明宗  

# 說明：
* 資料來源：使用UCI Adult Data Set。 http://archive.ics.uci.edu/ml/datasets/Adult <br>
* 分析流程：<br>
    1.資料檢視<br>
    2.資料工程<br>
    3.建立模型<br>

# 零、事前準備

# Import

In [8]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, make_scorer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

from keras import layers
from keras import optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, SGD, RMSprop, Adagrad

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [9]:
def plot_history(history):
    """Plot training/validation accuracy and loss curves side by side.

    Args:
        history: object exposing a ``history`` dict of per-epoch metric
            lists (e.g. what Keras ``Model.fit`` returns). The dict must
            hold ``loss``/``val_loss`` and either ``acc``/``val_acc`` or
            ``accuracy``/``val_accuracy``.

    Raises:
        KeyError: if a required metric is missing from ``history.history``.
    """
    metrics = history.history
    # The accuracy key follows the metric name used when the model was
    # compiled, so accept both spellings instead of failing on 'acc'.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    # Epochs are numbered from 1 on the x axis.
    x = range(1, len(acc) + 1)
    plt.figure(figsize=(12, 5))
    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

def train_to_test_split(train_x, train_y):
    """Min-max scale the features, persist the scaler, and split 70/30.

    Args:
        train_x: feature matrix accepted by ``MinMaxScaler.fit_transform``.
        train_y: targets aligned row-for-row with ``train_x``.

    Returns:
        The result of ``train_test_split`` on the scaled features and
        ``train_y`` (30% held out, seeded with the module-level ``SEED``).

    Side effects:
        Writes the fitted scaler to the module-level ``scaler_filename``.
    """
    sc = MinMaxScaler()
    train_x = sc.fit_transform(train_x)
    # Save the scaler exactly as fitted on the RAW features. Re-fitting it
    # here on the already-scaled array would make it learn a [0, 1] range,
    # and the persisted scaler would then be wrong for raw inputs.
    joblib.dump(sc, scaler_filename)
    # NOTE(review): the scaler is fitted on all rows before the split, so
    # the held-out rows influence the scaling - confirm this is intended.
    return train_test_split(train_x, train_y, test_size=0.3, random_state=SEED)

def grid_search(clf, parameters, X_train, y_train):
    """Return the best estimator found by a 5-fold, accuracy-scored grid search.

    Args:
        clf: estimator to tune.
        parameters: parameter grid handed to ``GridSearchCV``.
        X_train: training features.
        y_train: training targets; must expose ``.values`` (it is flattened
            with ``.values.ravel()`` before fitting).

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        clf,
        parameters,
        scoring=make_scorer(accuracy_score),
        n_jobs=-1,
        cv=5,
    )
    search.fit(X_train, y_train.values.ravel())
    return search.best_estimator_
    
def grid_classifier(clf):
    """Tune ``clf`` by grid search, refit it, save it, and predict the test set.

    Reads the module-level ``X_train``, ``y_train`` and ``X_test``.

    Args:
        clf: estimator whose class name selects the grid in ``parameter_set``.

    Returns:
        Tuple ``(best_model, predictions)``; the predictions are converted
        to a list when the estimator is an ``XGBClassifier``.

    Side effects:
        Prints the grid and the chosen parameters, and dumps the model to
        ``model/<ClassName>Grid_00001.pkl``.
    """
    clf_name = type(clf).__name__
    param_grid = parameter_set(clf_name)
    print(param_grid)

    # Best model selected by the grid search.
    best = grid_search(clf, param_grid, X_train, y_train)
    print('-' * 5)
    print('Get Grid Parameters')
    print(best.get_params())

    # Fit the selected model on the training data and persist it.
    best.fit(X_train, y_train.values.ravel())
    joblib.dump(best, 'model/' + clf_name + 'Grid' + '_00001.pkl')

    predictions = best.predict(X_test)
    if clf_name == 'XGBClassifier':
        predictions = list(predictions)
    return best, predictions

def get_oof(clf, X_train, y_train, X_test):
    """Compute out-of-fold train predictions and fold-averaged test predictions.

    Reads the module-level ``kf``, ``ntrain``, ``ntest`` and ``NFOLDS``.

    Args:
        clf: estimator exposing ``fit`` and ``predict``; refitted per fold.
        X_train: training features, indexed with the integer index arrays
            produced by ``kf.split`` (assumes a positionally indexable
            array such as a NumPy array - TODO confirm against callers).
        y_train: training targets exposing ``.values``.
        X_test: test features predicted once per fold.

    Returns:
        Tuple of two column vectors (shape ``(-1, 1)``): the out-of-fold
        predictions for the training rows, and the mean over folds of the
        test predictions.
    """
    targets = y_train.values.ravel()
    train_preds = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(kf.split(X_train)):
        clf.fit(X_train[fit_idx], targets[fit_idx])
        # Each training row is predicted by the model that did not see it.
        train_preds[holdout_idx] = clf.predict(X_train[holdout_idx])
        fold_test_preds[fold, :] = clf.predict(X_test)

    test_preds = np.zeros((ntest,))
    test_preds[:] = fold_test_preds.mean(axis=0)
    return train_preds.reshape(-1, 1), test_preds.reshape(-1, 1)

def parameter_set(clf_name):
    """Return the grid-search parameter grid for a classifier class name.

    Args:
        clf_name: class name of the estimator, e.g. ``'XGBClassifier'``.

    Returns:
        A freshly built dict mapping parameter names to candidate values.

    Raises:
        ValueError: if no grid is defined for ``clf_name`` (previously this
            surfaced as an ``UnboundLocalError`` on the return statement).
    """
    # Built on every call so callers can mutate the returned grid safely.
    # Commented-out entries are options that are currently disabled.
    grids = {
        'RandomForestClassifier': {
            'n_estimators': [100, 150, 200],
            # 'max_features': ['log2', 'sqrt', 'auto'],
            'criterion': ['entropy', 'gini'],
            # 'max_depth': list(range(2, 10)),
            # 'min_samples_split': list(range(2, 5)),
            # 'min_samples_leaf': list(range(1, 5)),
            'n_jobs': [-1],
            'verbose': [1],
        },
        'DecisionTreeClassifier': {
            'max_depth': list(range(2, 10)),
            'min_samples_split': list(range(2, 10)),
        },
        'AdaBoostClassifier': {
            'n_estimators': [150, 200],
            'algorithm': ['SAMME', 'SAMME.R'],
            'learning_rate': [0.2, 0.5, 0.9],
        },
        'GradientBoostingClassifier': {
            'loss': ['deviance'],
            'learning_rate': [0.2, 0.5, 0.9],
            # 'min_samples_split': list(range(2, 5)),
            # 'min_samples_leaf': list(range(1, 5)),
            'max_depth': [3, 8],
            # 'max_features': ['log2', 'sqrt'],
            # 'criterion': ['friedman_mse', 'mae'],
            # 'subsample': [0.5, 0.8, 0.9, 1.0],
            # 'n_estimators': [100, 150],
        },
        'XGBClassifier': {
            'learning_rate': [0.2, 0.5, 0.9],
            # 'max_depth': list(range(5, 10)),
            # 'min_child_weight': list(range(3, 10)),
            'gamma': np.linspace(0, 0.9, 3),
            # 'subsample': [0.8, 0.9],
            # 'colsample_bytree': [0.3, 0.4, 0.5, 0.7, 0.8, 0.9],
            'objective': ['binary:logistic'],
        },
        'ExtraTreesClassifier': {
            'n_estimators': [100, 150, 200],
            'criterion': ['entropy', 'gini'],
        },
    }
    try:
        return grids[clf_name]
    except KeyError:
        raise ValueError(
            'No parameter grid defined for classifier %r; known: %s'
            % (clf_name, ', '.join(sorted(grids)))
        ) from None

In [17]:
# Dirs -------------------------------------------
# Where train_to_test_split() persists the fitted scaler.
scaler_filename = "model/scaler.save"

# Config -----------------------------------------
SEED = 111    # random_state used for the shuffle and the train/test split
NFOLDS = 5    # number of folds used by get_oof()

# Load Data --------------------------------------
# Shuffle every row (frac=1). The shuffle is seeded so that the row order,
# and everything derived from it, is reproducible across runs.
dt = pd.read_csv("data/adult.data.csv").sample(frac=1, random_state=SEED)

In [18]:
# Preview the first rows of the shuffled dataset.
dt.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationshiprace,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
24564,41,Private,88368,HS-grad,9,Never-married,Sales,Not-in-family,White,Female,0,0,50,United-States,<=50K
29642,45,Self-emp-inc,204196,Bachelors,13,Divorced,Exec-managerial,Unmarried,White,Male,0,0,50,United-States,>50K
4925,28,?,290267,Bachelors,13,Never-married,?,Not-in-family,White,Male,0,0,18,United-States,<=50K
15568,31,Private,115488,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,40,United-States,<=50K
15873,58,Private,314153,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K


In [20]:
# Partition the columns by dtype and normalise the text columns.
str_column = []
num_column = []
for x in dt.columns:
    if dt[x].dtypes == 'object':
        # Trim surrounding whitespace and recode the '?' placeholder as 'NA'.
        # The result is assigned back to the frame: replace(inplace=True) on
        # the dt[x] selection is chained assignment, which does not update
        # dt when pandas Copy-on-Write is active.
        dt[x] = dt[x].str.strip().replace('?', 'NA')
        str_column.append(x)
    else:
        num_column.append(x)

# Encode the target: '>50K' -> 1, '<=50K' -> 0 (again assigned back rather
# than mutated in place through a column selection).
dt['salary'] = dt['salary'].replace('>50K', 1).replace('<=50K', 0)

# Class balance: counts per label and the share of the positive class.
salary_counts = dt['salary'].value_counts()
print(salary_counts)
print('>50K比例：', salary_counts[1] / (salary_counts[0] + salary_counts[1]))


0    24720
1     7841
Name: salary, dtype: int64
>50K比例： 0.2408095574460244


In [21]:
# Summary statistics of the numeric columns (salary is included because it
# was recoded to 0/1 above).
dt.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [22]:
# Pairwise correlations between the numeric columns, shown as a heat map.
cont_col = dt.describe().columns.tolist()
corr = dt.loc[:, cont_col].corr()
heatmap = corr.style.background_gradient(cmap='coolwarm')
heatmap.set_precision(2)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
age,1.0,-0.077,0.037,0.078,0.058,0.069,0.23
fnlwgt,-0.077,1.0,-0.043,0.00043,-0.01,-0.019,-0.0095
education-num,0.037,-0.043,1.0,0.12,0.08,0.15,0.34
capital-gain,0.078,0.00043,0.12,1.0,-0.032,0.078,0.22
capital-loss,0.058,-0.01,0.08,-0.032,1.0,0.054,0.15
hours-per-week,0.069,-0.019,0.15,0.078,0.054,1.0,0.23
salary,0.23,-0.0095,0.34,0.22,0.15,0.23,1.0


In [23]:
# Remove the fnlwgt column from the working frame.
dt = dt.drop(columns=['fnlwgt'])

In [24]:
# Standardise the numeric predictors for the logistic-regression fit.
# Every described column except the last one is used as a predictor; the
# target is taken from the 'salary' column.
cont_col = dt.describe().columns[:-1].tolist()
y_logit = dt['salary']
sc = StandardScaler()
X_logit = pd.DataFrame(sc.fit_transform(dt[cont_col]))
X_logit.describe()

Unnamed: 0,0,1,2,3,4
count,32561.0,32561.0,32561.0,32561.0,32561.0
mean,-2.684093e-17,1.479525e-16,1.1565600000000001e-17,-4.5389540000000005e-17,-2.793203e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.582206,-3.529656,-0.1459205,-0.2166595,-3.19403
25%,-0.7757679,-0.4200596,-0.1459205,-0.2166595,-0.03542945
50%,-0.1159546,-0.03136003,-0.1459205,-0.2166595,-0.03542945
75%,0.6904838,0.7460392,-0.1459205,-0.2166595,0.3695194
max,3.769612,2.300838,13.39458,10.59351,4.742967


In [25]:
# Fit a logistic regression of salary on the standardised numeric features.
clf = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
clf = clf.fit(X_logit, y_logit)

In [26]:
# Show the feature names, then the fitted coefficients on the next line.
print(cont_col, clf.coef_, sep='\n')

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
[[0.29334828 0.41506075 1.1737482  0.14079209 0.25136639]]
