<a href="https://colab.research.google.com/github/dileep999999/ML/blob/main/7_XgBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost

In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df = pd.read_csv("/content/income_evaluation.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [5]:
df[' income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24720
>50K,7841


In [6]:
df[' marital-status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [7]:
col_names = df.columns
col_names = [v.strip() for v in col_names]
col_names

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [8]:
df.columns = col_names
df.drop(columns="fnlwgt",inplace=True)

In [9]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0
capital-gain,0


In [11]:
bins = [16,24,64,90]
labels=['young','adult','old']
df['age_types'] = pd.cut(df['age'], bins=bins,labels=labels)
df['income_num'] = np.where(df['income'] == " >50K",1,0).astype('int16')

In [12]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_types,income_num
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,adult,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,adult,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,adult,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,adult,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,adult,0


In [13]:
df.loc[df['workclass']=='?', 'workclass']= np.NaN
df.loc[df['occupation']=='?', 'occupation']= np.NaN
df.loc[df['native-country']=='?', 'native_country']= np.NaN

In [14]:
df = df.dropna(axis=1)

In [15]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(a):
    le = LabelEncoder()
    df[a] = le.fit_transform(df[a])
label_list = ['workclass', 'education','marital-status',
       'occupation', 'relationship', 'race', 'sex','native-country', 'income']
for i in label_list:
    label_encoder(i)

In [37]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_types,income_num
0,39,7,9,13,4,1,1,4,1,2174,0,40,39,0,adult,0
1,50,6,9,13,2,4,0,4,1,0,0,13,39,0,adult,0
2,38,4,11,9,0,6,1,4,1,0,0,40,39,0,adult,0
3,53,4,1,7,2,6,0,2,1,0,0,40,39,0,adult,0
4,28,4,9,13,2,10,5,2,0,0,0,40,5,0,adult,0


In [16]:
from sklearn.preprocessing import MinMaxScaler

In [17]:
scaler = MinMaxScaler()

In [18]:
scaler.fit(df.drop(['income','age_types','income_num'],axis=1))

In [19]:
scaled_features = scaler.transform(df.drop(['income','age_types','income_num'],axis=1))

In [20]:
columns=['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country']

In [21]:
df_scaled = pd.DataFrame(scaled_features,columns=columns)
df_scaled.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,0.30137,0.875,0.6,0.8,0.666667,0.071429,0.2,1.0,1.0,0.02174,0.0,0.397959,0.95122
1,0.452055,0.75,0.6,0.8,0.333333,0.285714,0.0,1.0,1.0,0.0,0.0,0.122449,0.95122
2,0.287671,0.5,0.733333,0.533333,0.0,0.428571,0.2,1.0,1.0,0.0,0.0,0.397959,0.95122
3,0.493151,0.5,0.066667,0.4,0.333333,0.428571,0.0,0.5,1.0,0.0,0.0,0.397959,0.95122
4,0.150685,0.5,0.6,0.8,0.333333,0.714286,1.0,0.5,0.0,0.0,0.0,0.397959,0.121951


In [26]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

X = df_scaled
y= df.income

# Implementing Oversampling for Handling Imbalanced
smk = SMOTETomek(random_state=42)
X_res,y_res=smk.fit_resample(X,y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res,y_res,test_size=0.20,random_state=101,shuffle=True)
X_train.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')

In [23]:
import xgboost as xgb

In [38]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=columns)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=columns)

In [28]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    'silent': 1
}

In [29]:
model = xgb.train(xgb_params, dtrain, num_boost_round=10)

Parameters: { "silent" } are not used.



In [30]:
y_pred = model.predict(dtest)

In [31]:
y_pred[:5]

array([0.05364632, 0.02460734, 0.66863984, 0.8856433 , 0.81878173],
      dtype=float32)

In [32]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.940288664281243

In [33]:
watchlist = [(dtrain, 'train'), (dtest, 'test')]

In [34]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    'silent': 1
}

In [35]:
import warnings
warnings.filterwarnings('ignore')

In [36]:
model = xgb.train(xgb_params, dtrain,
                  num_boost_round=100,
                  evals=watchlist, verbose_eval=10)

[0]	train-auc:0.91287	test-auc:0.90708
[10]	train-auc:0.94699	test-auc:0.94289
[20]	train-auc:0.95802	test-auc:0.95238
[30]	train-auc:0.96500	test-auc:0.95823
[40]	train-auc:0.96945	test-auc:0.96236
[50]	train-auc:0.97200	test-auc:0.96397
[60]	train-auc:0.97421	test-auc:0.96552
[70]	train-auc:0.97671	test-auc:0.96771
[80]	train-auc:0.97831	test-auc:0.96892
[90]	train-auc:0.97954	test-auc:0.96958
[99]	train-auc:0.98026	test-auc:0.96995
