In [2]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

df_adult = pd.read_csv("adult.data",
                 header=None,
                 encoding='utf-8',
                na_values=[' ?', '? '])

df_adult.columns = ['age', 'workclass', 'fnlwgt', 'education',
                   'education-num', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex',
                   'capital-gain', 'capital-loss',
                   'hours-per-week', 'native-country', 'Listing of attributes']

df_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Listing of attributes
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df_adult.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Listing of attributes
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [4]:
df_adult['Listing of attributes'] = df_adult['Listing of attributes'].replace(' <=50K', 0)
df_adult['Listing of attributes'] = df_adult['Listing of attributes'].replace(' >50K', 1)
df_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Listing of attributes
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [5]:
df = df_adult.dropna(axis=0,how='any')

In [6]:
df = pd.get_dummies(df)

In [7]:
from sklearn.model_selection import train_test_split
n = df.shape[1]
target_part = df['Listing of attributes']
del df['Listing of attributes']
feture_part = df
x_train, x_test, y_train, y_test = train_test_split(feture_part, target_part, test_size=0.3, random_state=0)

In [8]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(x_train)
X_train_std = sc.transform(x_train)
sc.fit(x_test)
X_test_std = sc.transform(x_test)

In [9]:
y_train.shape

(21113,)

In [10]:
print(X_train_std.shape)
print(X_test_std.shape)

(21113, 104)
(9049, 104)


In [11]:
y_train = np.squeeze(y_train)
y_test = np.squeeze(y_test)

In [12]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn import metrics

ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)

y_pred = ppn.predict(X_test_std)
print('Misclassified examples: %d' % (y_test != y_pred).sum())

print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1696
Accuracy: 0.8126
AUC: 0.7276


In [13]:
y_test.shape

(9049,)

In [14]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

pca = PCA(n_components=60)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_pca, y_train)

y_pred = lr.predict(X_test_pca)
print('Misclassified examples: %d' % (y_test != y_pred).sum())

print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
print('AUC: %.4f' % metrics.auc(fpr, tpr))

Misclassified examples: 1491
Accuracy: 0.8352
AUC: 0.7463
