In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
df_data = pd.read_csv("adult.csv")

In [3]:
df_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df_data.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

In [5]:
df_data["native-country"].value_counts().index[0]

' United-States'

In [6]:
df_data["income"].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [7]:
df_data = df_data.drop(['fnlwgt', 'educational-num'], axis = 1) 

In [8]:
col_names = df_data.columns

In [9]:
for c in col_names: 
    df_data = df_data.replace("?", np.NaN) 
df_data = df_data.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [10]:
category_col =['workclass', 'education', 'marital-status', 'occupation', 'relationship', 
               'race', 'gender', 'native-country', 'income']  
labelEncoder = preprocessing.LabelEncoder() 
  
mapping_dict ={} 
for col in category_col: 
    df_data[col] = labelEncoder.fit_transform(df_data[col]) 
  
    le_name_mapping = dict(zip(labelEncoder.classes_, 
                        labelEncoder.transform(labelEncoder.classes_))) 
  
    mapping_dict[col]= le_name_mapping 
print(mapping_dict) 

{'workclass': {' ?': 0, ' Federal-gov': 1, ' Local-gov': 2, ' Never-worked': 3, ' Private': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' State-gov': 7, ' Without-pay': 8}, 'education': {' 10th': 0, ' 11th': 1, ' 12th': 2, ' 1st-4th': 3, ' 5th-6th': 4, ' 7th-8th': 5, ' 9th': 6, ' Assoc-acdm': 7, ' Assoc-voc': 8, ' Bachelors': 9, ' Doctorate': 10, ' HS-grad': 11, ' Masters': 12, ' Preschool': 13, ' Prof-school': 14, ' Some-college': 15}, 'marital-status': {' Divorced': 0, ' Married-AF-spouse': 1, ' Married-civ-spouse': 2, ' Married-spouse-absent': 3, ' Never-married': 4, ' Separated': 5, ' Widowed': 6}, 'occupation': {' ?': 0, ' Adm-clerical': 1, ' Armed-Forces': 2, ' Craft-repair': 3, ' Exec-managerial': 4, ' Farming-fishing': 5, ' Handlers-cleaners': 6, ' Machine-op-inspct': 7, ' Other-service': 8, ' Priv-house-serv': 9, ' Prof-specialty': 10, ' Protective-serv': 11, ' Sales': 12, ' Tech-support': 13, ' Transport-moving': 14}, 'relationship': {' Husband': 0, ' Not-in-family': 1, ' 

In [11]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
  
X = df_data.drop('income', axis = 1) 
Y = df_data['income']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100) 

In [12]:
import warnings
warnings.filterwarnings('ignore')

## Bagging

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

lr_clf = LogisticRegression() 
dt_clf = DecisionTreeClassifier() 
rf_clf = RandomForestClassifier() 
et_clf = ExtraTreesClassifier() 
knn_clf = KNeighborsClassifier() 

classifiers =[lr_clf, dt_clf, rf_clf, et_clf, knn_clf]
for clf in classifiers:
    clf_scores = cross_val_score(clf, X_train, y_train, cv = 10)
    bagging_clf = BaggingClassifier(clf, max_samples=0.4, max_features=10, random_state=0)
    bagging_clf_scores = cross_val_score(bagging_clf, X_train, y_train, cv = 10)
    print(clf.__class__.__name__, ":::: Mean:", clf_scores.mean(), ", Std Dev:", clf_scores.std())
    print("Bagging", clf.__class__.__name__, ":::: Mean:", bagging_clf_scores.mean(), "Std Dev:", bagging_clf_scores.std(), "\n")

ensembler = VotingClassifier(estimators=[('LogisticRegression', lr_clf), ('DecisionTreeClassifier', dt_clf),
                                        ('RandomForestClassifier', rf_clf), ('ExtraTreesClassifier', et_clf),
                                        ('KNeighborsClassifier', knn_clf)], voting = 'hard')


ensembler_scores = cross_val_score(ensembler, X_train, y_train, cv = 10)
print(ensembler.__class__.__name__, ":::: Mean:", ensembler_scores.mean(), "Std Dev:", clf_scores.std())

LogisticRegression :::: Mean: 0.7995780505454071 , Std Dev: 0.006888373667690784
Bagging LogisticRegression :::: Mean: 0.8023420359806932 Std Dev: 0.00669463780099821 

DecisionTreeClassifier :::: Mean: 0.8119073077604059 , Std Dev: 0.005729415273647502
Bagging DecisionTreeClassifier :::: Mean: 0.849639923635328 Std Dev: 0.0046034229502244905 

RandomForestClassifier :::: Mean: 0.8489381115139759 , Std Dev: 0.005116577814042257
Bagging RandomForestClassifier :::: Mean: 0.8567037712754901 Std Dev: 0.004468761007278419 

ExtraTreesClassifier :::: Mean: 0.8414792383547726 , Std Dev: 0.0064275238258043816
Bagging ExtraTreesClassifier :::: Mean: 0.8511317483045042 Std Dev: 0.004708539080690846 

KNeighborsClassifier :::: Mean: 0.8238853221249702 , Std Dev: 0.006423083088668752
Bagging KNeighborsClassifier :::: Mean: 0.8396364017767104 Std Dev: 0.00599320955270458 

VotingClassifier :::: Mean: 0.8462174468641986 Std Dev: 0.006423083088668752


## Boosting

In [14]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

ab_clf = AdaBoostClassifier() 
gb_clf = GradientBoostingClassifier() 
xgb_clf = XGBClassifier() 

ensembler = VotingClassifier(estimators=[('AdaBoostClassifier', ab_clf), ('GradientBoostingClassifier', gb_clf),
                                        ('XGBClassifier', xgb_clf)], voting = 'hard')

classifiers =[ab_clf, gb_clf, xgb_clf, ensembler]
for clf in classifiers:
    clf_scores = cross_val_score(clf, X_train, y_train, cv = 10)
    print(clf.__class__.__name__, ":::: Mean:", clf_scores.mean(), "Std Dev:", clf_scores.std())

AdaBoostClassifier :::: Mean: 0.8604337082284473 Std Dev: 0.0032409094349287403
GradientBoostingClassifier :::: Mean: 0.8644262257222698 Std Dev: 0.0032315430892614675
XGBClassifier :::: Mean: 0.8641189579917322 Std Dev: 0.004561102596800773
VotingClassifier :::: Mean: 0.864645581703271 Std Dev: 0.0032985215353102735


## Stacking

In [15]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

def get_stacking():
    level0 = [('lr_classifier', LogisticRegression()), ('dt_classifier', DecisionTreeClassifier()),
              ('knn_classifier', KNeighborsClassifier()), ('svm_classifier', SVC()), ('gnb_classifier', GaussianNB())]
    level1 = LogisticRegression()
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model


lr_classifier = LogisticRegression()
dt_classifier = DecisionTreeClassifier()
knn_classifier = KNeighborsClassifier()
svm_classifier = SVC()
gnb_classifier = GaussianNB()
stacking_classifier = get_stacking()

classifiers = [lr_classifier, dt_classifier, knn_classifier, svm_classifier, gnb_classifier, stacking_classifier]
for clf in classifiers:
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
    clf_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    print(clf.__class__.__name__, ":::: Mean:", clf_scores.mean(), "Std Dev:", clf_scores.std())

LogisticRegression :::: Mean: 0.799198344149096 Std Dev: 0.004958323931953346
DecisionTreeClassifier :::: Mean: 0.8130779055654346 Std Dev: 0.008467878845801694
KNeighborsClassifier :::: Mean: 0.8251287819886122 Std Dev: 0.00634438876282278
SVC :::: Mean: 0.8004562250294449 Std Dev: 0.005221775246052317
GaussianNB :::: Mean: 0.7964780515718138 Std Dev: 0.004996489474526567
StackingClassifier :::: Mean: 0.8376917712960184 Std Dev: 0.005593816155570199
