In [148]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
pd.options.mode.chained_assignment = None

from sklearn.feature_selection import SelectKBest
from scipy.stats import pointbiserialr, spearmanr

In [133]:
df_train = pd.read_csv('train_final.csv')
df_test = pd.read_csv('test_final.csv')

In [134]:
df_train

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income>50K
0,53,Self-emp-not-inc,93449,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
1,33,Self-emp-not-inc,123424,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,47,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,40,Private,114580,HS-grad,9,Divorced,Craft-repair,Other-relative,White,Female,0,0,40,Vietnam,0
4,39,Private,115618,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,18,Private,83451,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,25,United-States,0
24996,64,Local-gov,202738,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,35,United-States,0
24997,39,Private,225544,Masters,14,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,Poland,0
24998,53,Private,346871,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Male,4787,0,46,United-States,1


In [159]:
df_train['income>50K'].value_counts()

0    18984
1     6016
Name: income>50K, dtype: int64

In [136]:
df_test = df_test.drop('ID',axis = 1)

In [137]:
df_train['native.country'][df_train['native.country'] != 'United-States'] = 'Non-US'
df_test['native.country'][df_test['native.country'] != 'United-States'] = 'Non-US'

In [138]:
df_train['marital.status'][df_train['marital.status'] == 'Married-AF-spouse'] = 'Married-spouse-absent'
df_test['marital.status'][df_test['marital.status'] == 'Married-AF-spouse'] = 'Married-spouse-absent'

In [139]:
df_train['education'][df_train['education'] == '11th'] = 'No-HS'
df_test['education'][df_test['education'] == '11th'] = 'No-HS'

df_train['education'][df_train['education'] == '10th'] = 'No-HS'
df_test['education'][df_test['education'] == '10th'] = 'No-HS'

df_train['education'][df_train['education'] == '7th-8th'] = 'No-HS'
df_test['education'][df_test['education'] == '7th-8th'] = 'No-HS'

df_train['education'][df_train['education'] == '9th'] = 'No-HS'
df_test['education'][df_test['education'] == '9th'] = 'No-HS'

df_train['education'][df_train['education'] == '12th'] = 'No-HS'
df_test['education'][df_test['education'] == '12th'] = 'No-HS'

df_train['education'][df_train['education'] == '5th-6th'] = 'No-HS'
df_test['education'][df_test['education'] == '5th-6th'] = 'No-HS'

df_train['education'][df_train['education'] == '1st-4th'] = 'No-HS'
df_test['education'][df_test['education'] == '1st-4th'] = 'No-HS'

df_train['education'][df_train['education'] == 'Preschool'] = 'No-HS'
df_test['education'][df_test['education'] == 'Preschool'] = 'No-HS'

In [140]:
df_train.education.value_counts()

HS-grad         8119
Some-college    5571
Bachelors       4044
No-HS           3273
Masters         1369
Assoc-voc       1055
Assoc-acdm       812
Prof-school      450
Doctorate        307
Name: education, dtype: int64

In [141]:
df_test['marital.status'].value_counts()

Married-civ-spouse       10936
Never-married             7891
Divorced                  3221
Separated                  801
Widowed                    681
Married-spouse-absent      312
Name: marital.status, dtype: int64

In [142]:
df_train.workclass.value_counts()

Private             17336
Self-emp-not-inc     1978
Local-gov            1617
?                    1437
State-gov            1037
Self-emp-inc          845
Federal-gov           730
Without-pay            15
Never-worked            5
Name: workclass, dtype: int64

In [143]:
category_cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'sex', 'race', 'native.country']

In [144]:
def one_hot_encode(df):
    for col in category_cols:
        one_hot_train = pd.get_dummies(df[col], prefix=col)

        df = df.drop(col,axis = 1)
        df = pd.concat([df,one_hot_train],axis=1)

In [145]:
def discretize(df):
    for col in category_cols:
        b, c = np.unique(df[col], return_inverse=True) 
        df[col] = c
    return df

In [146]:
# df_test = one_hot_encode(df_test)
# df_train = one_hot_encode(df_train)

df_test = discretize(df_test)
df_train = discretize(df_train)

In [149]:
col_names = df_train.columns

param=[]
correlation=[]
abs_corr=[]

for c in col_names:
    #Check if binary or continuous
    if c != "income>50K":
        if len(df_train[c].unique()) <= 2:
            corr = spearmanr(df_train['income>50K'],df_train[c])[0]
        else:
            corr = pointbiserialr(df_train['income>50K'],df_train[c])[0]
        param.append(c)
        correlation.append(corr)
        abs_corr.append(abs(corr))

#Create dataframe for visualization
param_df=pd.DataFrame({'correlation':correlation,'parameter':param, 'abs_corr':abs_corr})

#Sort by absolute correlation
param_df=param_df.sort_values(by=['abs_corr'], ascending=False)

#Set parameter name as index
param_df=param_df.set_index('parameter')

param_df

Unnamed: 0_level_0,correlation,abs_corr
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1
education.num,0.335894,0.335894
marital.status,-0.255391,0.255391
relationship,-0.25113,0.25113
hours.per.week,0.231433,0.231433
capital.gain,0.226695,0.226695
age,0.225395,0.225395
sex,0.21625,0.21625
capital.loss,0.151825,0.151825
education,-0.109948,0.109948
occupation,0.078128,0.078128


In [152]:
low_corr_cols = param_df.index[0:8]
for col in low_corr_cols:
    df_train = df_train.drop(col,axis = 1)
    df_test = df_test.drop(col,axis = 1)
    

In [93]:
# missing_col = None
# for col in df_train.columns:
#     if col not in df_test.columns:
#         # df_train[col] = 0
#         print(col)

income>50K


In [153]:
X = df_train.loc[:, df_train.columns != 'income>50K']
y = df_train['income>50K']


X_test_final = df_test.loc[:, df_test.columns != 'income>50K']

In [10]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05],
    "subsample":[0.5, 0.75, 1],
    "min_child_weight":[1,5,15]
}

In [11]:
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 1,
    verbose=True
)

In [None]:
grid_search.fit(X,y)

In [None]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_search.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_search.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_search.best_params_)

In [154]:
model = XGBClassifier(eta='0.1', subsample=0.8)
model.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta='0.1',
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, ...)

In [155]:
y_pred = model.predict(X_test_final)

In [156]:
y_pred.shape

(23842,)

In [157]:
y_pred

array([0, 0, 0, ..., 1, 0, 0])

In [158]:
import csv
ctr = 1
data = [['ID', 'Prediction']]
for pred in y_pred:
    row = [ctr, pred]
    data.append(row)
    ctr += 1
# open the file in the write mode
with open('results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(data)

In [132]:
w = np.asarray([1,1,0])
x=np.asarray([0,1,1])
w@x

1

In [133]:
1 / np.linalg.norm(w)

0.7071067811865475