In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("adult.data",
                 header=None,
                 encoding='utf-8',
                na_values=[' ?', '? '])

df.columns = ['age', 'workclass', 'fnlwgt', 'education',
                   'education-num', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex',
                   'capital-gain', 'capital-loss',
                   'hours-per-week', 'native-country', 'Listing of attributes']

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Listing of attributes
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
df['Listing of attributes'] = df['Listing of attributes'].replace(' <=50K', 0)
df['Listing of attributes'] = df['Listing of attributes'].replace(' >50K', 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Listing of attributes
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
df = pd.get_dummies(df)

In [4]:
df = df.dropna(axis=0,how='any')

In [5]:
from sklearn.model_selection import train_test_split

y = df['Listing of attributes']
del df['Listing of attributes']
X = df
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(22792, 105)
(9769, 105)
(22792,)


In [6]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler ,MaxAbsScaler ,RobustScaler

std_sc = StandardScaler()
std_sc.fit(x_train)

X_train_std = std_sc.transform(x_train)
std_sc.fit(x_test)
X_test_std = std_sc.transform(x_test)

mm_sc = MinMaxScaler()
mm_sc.fit(x_train)

X_train_mm = mm_sc.transform(x_train)
mm_sc.fit(x_test)
X_test_mm = mm_sc.transform(x_test)

ma_sc = MaxAbsScaler()
ma_sc.fit(x_train)

X_train_ma = ma_sc.transform(x_train)
ma_sc.fit(x_test)
X_test_ma = ma_sc.transform(x_test)

rb_sc = RobustScaler()
rb_sc.fit(x_train)

X_train_rb = rb_sc.transform(x_train)
rb_sc.fit(x_test)
X_test_rb = rb_sc.transform(x_test)

X_train_arr = [X_train_std, X_train_mm, X_train_ma, X_train_rb]
X_test_arr = [X_test_std, X_test_mm, X_test_ma, X_test_rb]

In [7]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.decomposition import PCA

X_name = ['Standard','MinMax','MaxAbs','Robust']

In [77]:
from sklearn.linear_model import Perceptron

for idx, X_train_ in enumerate(X_train_arr):
    ppn = Perceptron(eta0=0.1, random_state=1)
    ppn.fit(X_train_, y_train)

    y_pred = ppn.predict(X_test_arr[idx])
    print(X_name[idx])
    print('Misclassified examples: %d' % (y_test != y_pred).sum())

    print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Standard
Misclassified examples: 2016
Accuracy: 0.7936
AUC: 0.7173
MinMax
Misclassified examples: 1774
Accuracy: 0.8184
AUC: 0.6538
MaxAbs
Misclassified examples: 1746
Accuracy: 0.8213
AUC: 0.7643
Robust
Misclassified examples: 1848
Accuracy: 0.8108
AUC: 0.7017


In [78]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

for idx, X_train_ in enumerate(X_train_arr):
    
    mlp = MLPClassifier(random_state=1, max_iter=500).fit(X_train_, y_train)
    y_pred = mlp.predict(X_test_arr[idx])

    print('Misclassified examples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.3f'%accuracy_score(y_test,y_pred))
    #print('Accuracy: %.3f'%mlp.score(X_train_,y_test))

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1685
Accuracy: 0.828
AUC: 0.7600
Misclassified examples: 1573
Accuracy: 0.839
AUC: 0.7610
Misclassified examples: 1610
Accuracy: 0.835
AUC: 0.7875
Misclassified examples: 1539
Accuracy: 0.842
AUC: 0.7459


In [79]:
from sklearn.tree import DecisionTreeClassifier

for idx, X_train_ in enumerate(X_train_arr):
    
    tree_model = DecisionTreeClassifier(criterion='gini', 
                                        max_depth=15, 
                                        random_state=1)
    tree_model.fit(X_train_, y_train)

    y_pred = tree_model.predict(X_test_arr[idx])
    print('Misclassified examples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
    #print('Accuracy: %.3f'%tree_model.score(X_train_,y_test))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1633
Accuracy: 0.833
AUC: 0.7581
Misclassified examples: 1561
Accuracy: 0.840
AUC: 0.7766
Misclassified examples: 1548
Accuracy: 0.842
AUC: 0.7801
Misclassified examples: 1511
Accuracy: 0.845
AUC: 0.7775


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

for idx, X_train_ in enumerate(X_train_arr):
    svm = SVC(kernel='linear', C=1.0, random_state=1)

    svm.fit(X_train_, y_train)
    y_pred = svm.predict(X_test_arr[idx])
    print('Misclassified examples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1483
Accuracy: 0.848
AUC: 0.7537
Misclassified examples: 1492
Accuracy: 0.847
AUC: 0.7538
Misclassified examples: 1496
Accuracy: 0.847
AUC: 0.7540


In [80]:
from sklearn import svm

for idx, X_train_ in enumerate(X_train_arr):
    
    clf = svm.SVC(kernel='rbf', C=1.0, random_state=1,gamma="auto")
    #clf = svm(kernel='rbf', C=1.0, random_state=1)

    clf.fit(X_train_, y_train)
    y_pred = clf.predict(X_test_arr[idx])
    print('Misclassified examples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1505
Accuracy: 0.846
AUC: 0.7473
Misclassified examples: 1649
Accuracy: 0.831
AUC: 0.7123
Misclassified examples: 1649
Accuracy: 0.831
AUC: 0.7126
Misclassified examples: 1336
Accuracy: 0.863
AUC: 0.7619


In [81]:
from sklearn.neighbors import KNeighborsClassifier

for idx, X_train_ in enumerate(X_train_arr):
    
    knn = KNeighborsClassifier(n_neighbors=5, 
                               p=2, 
                               metric='minkowski')
    knn.fit(X_train_, y_train)

    y_pred = knn.predict(X_test_arr[idx])
    print('Misclassified examples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1728
Accuracy: 0.823
AUC: 0.7385
Misclassified examples: 1719
Accuracy: 0.824
AUC: 0.7413
Misclassified examples: 1710
Accuracy: 0.825
AUC: 0.7432
Misclassified examples: 1429
Accuracy: 0.854
AUC: 0.7842


In [10]:
from sklearn.linear_model import LogisticRegression

for idx, X_train_ in enumerate(X_train_arr):
    pca = PCA(n_components=105)
    X_train_pca = pca.fit_transform(X_train_)
    X_test_pca = pca.transform(X_test_arr[idx])

    lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs',max_iter=1500)
    lr = lr.fit(X_train_pca, y_train)

    y_pred = lr.predict(X_test_pca)
    print('Misclassified examples: %d' % (y_test != y_pred).sum())

    print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1468
Accuracy: 0.8497
AUC: 0.7615
Misclassified examples: 1482
Accuracy: 0.8483
AUC: 0.7619
Misclassified examples: 1482
Accuracy: 0.8483
AUC: 0.7623
Misclassified examples: 1459
Accuracy: 0.8507
AUC: 0.7604


In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
for idx, X_train_ in enumerate(X_train_arr):
    pca = PCA(n_components=60)
    
    mlp = MLPClassifier(random_state=1, max_iter=500).fit(X_train_, y_train)
    
    X_test_pca = pca.fit_transform(X_test_arr[idx])
    y_pred = mlp.predict(X_test_pca)

    print('Misclassified examples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.3f'%accuracy_score(y_test,y_pred))
    #print('Accuracy: %.3f'%mlp.score(X_train_,y_test))

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    print('AUC: %.4f' % metrics.auc(fpr, tpr))

ValueError: X has 60 features, but MLPClassifier is expecting 105 features as input.