In [135]:
!pip install xgboost



In [287]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_auc_score 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [288]:
# Remy's "print-out-every-metric-you-could-ever-possibly-need" method

def clf_metrics(model,X,y,model_name):
    preds = model.predict(X)
    accuracy = accuracy_score(y,preds)
    roc_auc = roc_auc_score(y,preds)
    c_matrix = confusion_matrix(y, preds)
    tn, fp, fn, tp = c_matrix.ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    f1_score = 2*tp/(2*tp + fp + fn)
    
    metric_dict = {
        'Confusion Matrix':c_matrix,
        'Accuracy':accuracy,
        'ROC AUC':roc_auc,
        'Sensitivity':sensitivity,
        'Specificity':specificity,
        'Precision':precision,
        'F1 Score':f1_score
    }
    c_matrix_df = pd.DataFrame([[tn,fn],[fp,tp]],
                               columns = ['Actual Negative','Actual Positive'],
                               index=['Predicted Negative','Predicted Positive'])
    
    # Print Block
    print('-'*75)
    print(f'Classification Metrics for {model_name}')
    print('')
    print(f'Accuracy:\t\t{round(accuracy,4)}')
    print(f'Sensitivity:\t\t{round(sensitivity,4)}')
    print(f'Specificity:\t\t{round(specificity,4)}')
    print(f'Precision:\t\t{round(precision,4)}')
    print(f'F1 Score:\t\t{round(f1_score,4)}')
    print(f'ROC AUC:\t\t{round(roc_auc,4)}')
    print(f'Confusion Matrix:\n{c_matrix_df}')
    print('-'*75)
    # print('')
    
    return metric_dict

In [164]:
cheap = pd.read_csv("../data/cheap_clean.csv")

In [165]:
weights = (cheap["wage"] == 1).map({True: 3, False: 1})

In [166]:
features = [col for col in cheap.columns if col not in ["fnlwgt", "wage"] and cheap[col].dtype != "object"]

features

['age',
 'education-num',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'is_investor',
 'sum_investments',
 'full_time',
 'workclass_Federal-gov',
 'workclass_Local-gov',
 'workclass_Never-worked',
 'workclass_Private',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'occupation_Adm-clerical',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Handlers-cleaners',
 'occupation_Machine-op-inspct',
 'occupation_Other-service',
 'occupation_Priv-house-serv',
 'occupation_Prof-specialty',
 'occupation_Protective-serv',
 'occupation_Sales',
 'occupation_Tech-support',
 'occupation_Transport-moving',
 'marital-status_Divorced',
 'marital-status_Married-AF-spouse',
 'marital-status_Married-civ-spouse',
 'marital-status_Married-spouse-absent',
 'marital-status_Never-married',
 'marital-status_Separated',
 'marital-status_Widowed',
 'relatio

In [167]:
X = cheap[features]
y = cheap["wage"]

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 35,
                                                    stratify = y)

In [169]:
xgbc = XGBClassifier()

# Pulling out the weights based on "wage" column
weights = (cheap["wage"] == 1).map({True: 3, False: 1})

# Using "fnlwgt" gives a bit better results
metrics = clf_metrics(xgbc.fit(X_train, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_train, y_train, "XGBoost")

---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8702
Sensitivity:		0.631
Specificity:		0.9461
Precision:		0.7877
F1 Score:		0.7007
ROC AUC:		0.7885
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             3508              434
Predicted Positive              200              742
---------------------------------------------------------------------------


In [170]:
metrics = clf_metrics(xgbc.fit(X_train, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_test, y_test, "XGBoost")

---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8699
Sensitivity:		0.6148
Specificity:		0.9507
Precision:		0.798
F1 Score:		0.6945
ROC AUC:		0.7827
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             1176              151
Predicted Positive               61              241
---------------------------------------------------------------------------


In [264]:
# Scaling
ss = StandardScaler()

ss.fit(X_train)

X_train_sc = ss.transform(X_train)
X_test_sc  = ss.transform(X_test)



# Principle component analysis
pca = PCA()

pca.fit(X_train)

X_train_pca = pca.transform(X_train_sc)
X_test_pca  = pca.transform(X_test_sc)

  return self.partial_fit(X, y)
  
  import sys


In [265]:
pca.explained_variance_ratio_.cumsum()

array([0.99732159, 0.99999655, 0.99999845, 0.99999991, 0.99999997,
       0.99999998, 0.99999998, 0.99999998, 0.99999998, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        ])

In [266]:
X_train_pca_8 = pd.DataFrame(X_train_pca)[range(8)]
X_test_pca_8  = pd.DataFrame(X_test_pca)[range(8)]

In [272]:
metrics = clf_metrics(xgbc.fit(X_train_pca_8, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_train_pca_8, y_train, "XGBoost")

---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8735
Sensitivity:		0.648
Specificity:		0.945
Precision:		0.7888
F1 Score:		0.7115
ROC AUC:		0.7965
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             3504              414
Predicted Positive              204              762
---------------------------------------------------------------------------


In [273]:
metrics = clf_metrics(xgbc.fit(X_train_pca, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_test_pca, y_test, "XGBoost")

---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8735
Sensitivity:		0.6224
Specificity:		0.9531
Precision:		0.8079
F1 Score:		0.7032
ROC AUC:		0.7878
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             1179              148
Predicted Positive               58              244
---------------------------------------------------------------------------


In [279]:
# Final XGBoost model

features = [col for col in cheap.columns if col not in ["fnlwgt", "wage"] and cheap[col].dtype != "object"]

X = cheap[features]
y = cheap["wage"]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 35,
                                                    stratify = y)

# Scaling
ss = StandardScaler()

ss.fit(X_train)

X_train_sc = ss.transform(X_train)
X_test_sc  = ss.transform(X_test)



# Principle component analysis
pca = PCA()

pca.fit(X_train_sc)

X_train_pca = pca.transform(X_train_sc)
X_test_pca  = pca.transform(X_test_sc)

X_train_pca_8 = pd.DataFrame(X_train_pca)[range(8)]
X_test_pca_8  = pd.DataFrame(X_test_pca)[range(8)]

metrics = clf_metrics(xgbc.fit(X_train_pca_8, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_train_pca_8, y_train, "XGBoost")

metrics = clf_metrics(xgbc.fit(X_train_pca, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_test_pca, y_test, "XGBoost")

  return self.partial_fit(X, y)


---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8655
Sensitivity:		0.6241
Specificity:		0.942
Precision:		0.7734
F1 Score:		0.6908
ROC AUC:		0.7831
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             3493              442
Predicted Positive              215              734
---------------------------------------------------------------------------
---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8527
Sensitivity:		0.5918
Specificity:		0.9353
Precision:		0.7436
F1 Score:		0.6591
ROC AUC:		0.7636
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             1157              160
Predicted Positive               80              232
---------------------------------------------------------------------------


In [282]:
test = pd.read_csv("../data/test_clean.csv")

In [298]:
# Final XGBoost model

features = [col for col in cheap.columns if col not in ["fnlwgt", "wage"] and cheap[col].dtype != "object"]

X = cheap[features]
y = cheap["wage"]
X_prog = test[features]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 35,
                                                    stratify = y)

# Scaling
ss = StandardScaler()

ss.fit(X_train)

X_train_sc = ss.transform(X_train)
X_test_sc  = ss.transform(X_test)
X_prog_sc  = ss.transform(X_prog)



# Principle component analysis
pca = PCA()

pca.fit(X_train_sc)

X_train_pca = pca.transform(X_train_sc)
X_test_pca  = pca.transform(X_test_sc)
X_prog_pca  = pca.transform(X_prog_sc)

X_train_pca_8 = pd.DataFrame(X_train_pca)[range(8)]
X_test_pca_8  = pd.DataFrame(X_test_pca)[range(8)]
X_prog_pca_8  = pd.DataFrame(X_prog_pca)[range(8)]

xgbc = XGBClassifier()
xgbc.fit(X_train_pca_8, y_train, sample_weight = cheap["fnlwgt"].to_list())

metrics = clf_metrics(xgbc.fit(X_train_pca_8, y_train, sample_weight = cheap["fnlwgt"].to_list()), 
                      X_train_pca_8, y_train, "XGBoost")

predictions = xgbc.predict(X_prog_pca_8)

  return self.partial_fit(X, y)


---------------------------------------------------------------------------
Classification Metrics for XGBoost

Accuracy:		0.8655
Sensitivity:		0.6241
Specificity:		0.942
Precision:		0.7734
F1 Score:		0.6908
ROC AUC:		0.7831
Confusion Matrix:
                    Actual Negative  Actual Positive
Predicted Negative             3493              442
Predicted Positive              215              734
---------------------------------------------------------------------------


In [300]:
pd.DataFrame(predictions).to_csv("../data/predictions.csv", index = False)

In [301]:
# Stacking methods
# Our group decided to just use PCA XGBoost for simplicity.

# #### Predictions for LogReg
# lr = LogisticRegression()
# lr.fit(X_train_sc, y_train)
# test['logreg_pred'] = lr.predict(X_prog_sc)

# #### Predictions for Decision Tree
# dt = DecisionTreeClassifier()
# dt.fit(X_train_sc, y_train)
# test['dec_tree_pred'] = dt.predict(X_prog_sc)

# #### Predictions for XGBoost
# xgbc = XGBClassifier()
# xgbc.fit(X_train_pca, y_train, sample_weight = cheap["fnlwgt"].to_list())
# test['xgbc_pca_pred'] = xgbc.predict(X_prog_pca_8)



# ### 1) Have 3 models make class predictions and append them to DataFrame
# ### 2) Function takes DataFrame and 3 predictions and outputs a aggregate prediction
# def stack_models(df, pred1, pred2, pred3):
#     df['combo'] = pred1 + pred2 + pred3

#     for i, row in enumerate(df.iterrows()):
#         if row[1]['combo'] >= 2:
#             df.loc[row[0],'combo_pred'] = 1
#         else:
#             df.loc[row[0],'combo_pred'] = 0
#     return df


# stack_models(test, test['xgbc_pca_pred'], test['logreg_pred'], test['dec_tree_pred'])