# import packages 

In [1]:
from sklearn.metrics import accuracy_score
import sklearn as sk
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

read the files 

In [2]:
names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income'
]
# read the training data and trim the space in each field
train_data = pd.read_csv('adult.data', sep="\s*,\s*",
                         names=names, engine='python')
# read the testing data and trim the space in each field
test_data = pd.read_csv('adult.test', sep="\s*,\s*",
                        names=names, engine='python')

# in the test dataset, the colnum of the 'income' has a surplus period.
test_data['income'].replace(
    regex=True, inplace=True, to_replace=r'\.', value=r'')
print(train_data.info())
print(train_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad             

# shows the missing values in each colnums 

In [3]:
print('Missing values in training data')
for i, j in zip(train_data.columns, (train_data.values.astype(str) == '?').sum(axis=0)):
    if j > 0:
        print(str(i) + ': ' + str(j) + ' out of ' +
              str(len(train_data)) + ' records')
              
print('Missing values in test data')
for i, j in zip(test_data.columns, (test_data.values.astype(str) == '?').sum(axis=0)):
    if j > 0:
        print(str(i) + ': ' + str(j) + ' out of ' +
              str(len(test_data))+' records')

Missing values in training data
workclass: 1836 out of 32561 records
occupation: 1843 out of 32561 records
native-country: 583 out of 32561 records
Missing values in test data
workclass: 963 out of 16281 records
occupation: 966 out of 16281 records
native-country: 274 out of 16281 records


# removing the rows have the missing values from the traning dataset

In [4]:
clean_train_data = train_data.copy()
befor_train = len(clean_train_data)
for colnum in names:
    be = len(clean_train_data)
    clean_train_data = clean_train_data[(clean_train_data[colnum] != '?')]
    af = len(clean_train_data)
    if be != af:
        print("{}: before delete: {}     after delete:{}".format(colnum, be, af))
after_train = len(clean_train_data)
print("size of training dataset: before removing: {}    after delete:{}".format(
    befor_train, after_train))

workclass: before delete: 32561     after delete:30725
occupation: before delete: 30725     after delete:30718
native-country: before delete: 30718     after delete:30162
size of training dataset: before removing: 32561    after delete:30162
  result = method(y)


# removing the rows have the missing values from the testing dataset

In [5]:
clean_test_data = test_data.copy()
befor_test = len(clean_test_data)
for colnum in names:
    be = len(clean_test_data)
    clean_test_data = clean_test_data[(clean_test_data[colnum] != '?')]
    af = len(clean_test_data)
    if be != af:
        print("{}: before delete: {}     after delete:{}".format(colnum, be, af))
after_test = len(clean_test_data)
print("size of test dataset: before removing: {}     after delete:{}".format(
    befor_test, after_test))

workclass: before delete: 16281     after delete:15318
occupation: before delete: 15318     after delete:15315
native-country: before delete: 15315     after delete:15060
size of test dataset: before removing: 16281     after delete:15060


#  convert the categorical value to one-hot vector

In [6]:
# clean_train_data_one_hot = pd.get_dummies(clean_train_data, columns=["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"])
# clean_test_data_one_hot = pd.get_dummies(clean_test_data, columns=["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"])

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
full = pd.concat((clean_train_data, clean_test_data))
col_one_hot_idx = []
clean_train_data_label = clean_train_data.copy()
clean_test_data_label = clean_test_data.copy()

for col in full.columns:
    if clean_train_data[col].dtype == object and col != "income":
        # set of labels of the categorical columns
        print("Encoding {}".format(col))
        new_le = LabelEncoder()
        full[col] = new_le.fit_transform(full[col])
        clean_train_data_label[col] = new_le.transform(clean_train_data_label[col])
        clean_test_data_label[col] = new_le.transform(clean_test_data_label[col])
        col_one_hot_idx.append(full.columns.get_loc(col))
    elif clean_train_data[col].dtype == object and col == "income":
        print("Encoding {}".format(col))
        new_le = LabelEncoder()
        full[col] = new_le.fit_transform(full[col])
        clean_train_data_label[col] = new_le.transform(clean_train_data_label[col])
        clean_test_data_label[col] = new_le.transform(clean_test_data_label[col])

#[1, 3, 5, 6, 7, 8, 9, 13]
print(col_one_hot_idx)
ohe = OneHotEncoder(categorical_features=col_one_hot_idx, sparse=False, handle_unknown="ignore",n_values="auto")
ohe.fit(full)
clean_train_data_one_hot = ohe.transform(clean_train_data_label)
clean_test_data_one_hot = ohe.transform(clean_test_data_label)

Encoding workclass
Encoding education
Encoding marital-status
Encoding occupation
Encoding relationship
Encoding race
Encoding sex
Encoding native-country
Encoding income
[1, 3, 5, 6, 7, 8, 9, 13]


# Normalization

In [7]:
clean_train_data_one_hot = (clean_train_data_one_hot-clean_train_data_one_hot.min(axis=0))/(clean_train_data_one_hot.max(axis=0)-clean_train_data_one_hot.min(axis=0))
clean_test_data_one_hot = (clean_test_data_one_hot-clean_test_data_one_hot.min(axis=0))/(clean_test_data_one_hot.max(axis=0)-clean_test_data_one_hot.min(axis=0))
clean_train_data_one_hot=np.nan_to_num(clean_train_data_one_hot)
clean_test_data_one_hot=np.nan_to_num(clean_test_data_one_hot)

  


# linear Regression Model (with normalization)


In [8]:
clean_train_data_X = clean_train_data_one_hot[:, :-1]
clean_train_data_Y = clean_train_data_one_hot[:, -1:]

clean_test_data_X = clean_test_data_one_hot[:, :-1]
clean_test_data_Y = clean_test_data_one_hot[:, -1:]

print(clean_train_data_X.shape, clean_train_data_Y.shape,
      clean_test_data_X.shape, clean_test_data_Y.shape)

## us vs non-us
test_data_us_idx=[ i for i in range(len(clean_test_data.to_numpy())) if clean_test_data.iloc[i]['native-country'] == "United-States"]
test_data_non_us_idx=[ i for i in range(len(clean_test_data.to_numpy())) if clean_test_data.iloc[i]['native-country'] != "United-States"]
test_data_non_us = clean_test_data_one_hot[test_data_non_us_idx]
test_data_us = clean_test_data_one_hot[test_data_us_idx]
clean_test_data_X_non_us = test_data_non_us[:, :-1]
clean_test_data_Y_non_us = test_data_non_us[:, -1:]
clean_test_data_X_us = test_data_us[:, :-1]
clean_test_data_Y_us = test_data_us[:, -1:]

# ## male vs famale
test_data_male_idx = [ i for i in range(len(clean_test_data.to_numpy())) if clean_test_data.iloc[i]['sex'] == "Male"]
test_data_female_idx = [ i for i in range(len(clean_test_data.to_numpy())) if clean_test_data.iloc[i]['sex'] == "Female"]
test_data_male = clean_test_data_one_hot[test_data_male_idx]
test_data_female = clean_test_data_one_hot[test_data_female_idx]
clean_test_data_X_male = test_data_male[:, :-1]
clean_test_data_Y_male = test_data_male[:, -1:]
clean_test_data_X_female = test_data_female[:, :-1]
clean_test_data_Y_female = test_data_female[:, -1:]

# ## white vs non-white
test_data_white_idx = [ i for i in range(len(clean_test_data.to_numpy())) if clean_test_data.iloc[i]['race'] == "White"]
test_data_non_white_idx = [ i for i in range(len(clean_test_data.to_numpy())) if clean_test_data.iloc[i]['race'] != "White"]
test_data_white = clean_test_data_one_hot[test_data_white_idx]
test_data_non_white = clean_test_data_one_hot[test_data_non_white_idx]
clean_test_data_X_white = test_data_white[:, :-1]
clean_test_data_Y_white = test_data_white[:, -1:]
clean_test_data_X_non_white = test_data_non_white[:, :-1]
clean_test_data_Y_non_white = test_data_non_white[:, -1:]


clf = LogisticRegressionCV(cv=5, max_iter=2000).fit(clean_train_data_X, clean_train_data_Y.ravel())
y_pred = clf.predict(clean_test_data_X)
y_pred_non_us = clf.predict(clean_test_data_X_non_us)
y_pred_us = clf.predict(clean_test_data_X_us)
y_pred_male = clf.predict(clean_test_data_X_male)
y_pred_female = clf.predict(clean_test_data_X_female)
y_pred_white = clf.predict(clean_test_data_X_white)
y_pred_non_white = clf.predict(clean_test_data_X_non_white)

print('Accuracy: {:0.2%}' .format(accuracy_score(clean_test_data_Y, y_pred)))
print('Accuracy (non-us): {:0.2%}' .format(accuracy_score(clean_test_data_Y_non_us, y_pred_non_us)))
print('Accuracy (us): {:0.2%}' .format(accuracy_score(clean_test_data_Y_us, y_pred_us)))
print('Accuracy (male): {:0.2%}' .format(accuracy_score(clean_test_data_Y_male, y_pred_male)))
print('Accuracy (female): {:0.2%}' .format(accuracy_score(clean_test_data_Y_female, y_pred_female)))
print('Accuracy (white): {:0.2%}' .format(accuracy_score(clean_test_data_Y_white, y_pred_white)))
print('Accuracy (non-white): {:0.2%}' .format(accuracy_score(clean_test_data_Y_non_white, y_pred_non_white)))

(30162, 104) (30162, 1) (15060, 104) (15060, 1)
Accuracy: 84.78%
Accuracy (non-us): 86.56%
Accuracy (us): 84.62%
Accuracy (male): 81.01%
Accuracy (female): 92.57%
Accuracy (white): 84.09%
Accuracy (non-white): 89.09%


# PCA first, Then linear regrisson 

In [9]:
# len(clean_train_data_one_hot)*0.1
pca = PCA(n_components=int(len(clean_train_data_X[0])*0.3))
pca.fit(clean_train_data_X)  

clean_train_data_one_hot_pca = pca.transform(clean_train_data_X)
clean_test_data_one_hot_pca = pca.transform(clean_test_data_X)

clean_train_data_X_pca = clean_train_data_one_hot_pca
clean_test_data_X_pca = clean_test_data_one_hot_pca
print(clean_train_data_X.shape, clean_train_data_Y.shape, clean_test_data_X.shape, clean_test_data_Y.shape)

## us vs non-us
clean_test_data_X_non_us = clean_test_data_X_pca[test_data_non_us_idx]
clean_test_data_X_us = clean_test_data_X_pca[test_data_us_idx]
print(clean_test_data_X_non_us.shape, clean_test_data_Y_non_us.shape, clean_test_data_X_us.shape, clean_test_data_Y_us.shape)

## male vs female
clean_test_data_X_male = clean_test_data_X_pca[test_data_male_idx]
clean_test_data_X_female = clean_test_data_X_pca[test_data_female_idx]
print(clean_test_data_X_male.shape, clean_test_data_Y_male.shape, clean_test_data_X_female.shape, clean_test_data_Y_female.shape)

## male vs female
clean_test_data_X_white = clean_test_data_X_pca[test_data_white_idx]
clean_test_data_X_non_white = clean_test_data_X_pca[test_data_non_white_idx]
print(clean_test_data_X_white.shape, clean_test_data_Y_white.shape, clean_test_data_X_non_white.shape, clean_test_data_Y_non_white.shape)


clf = LogisticRegressionCV(cv=5).fit(clean_train_data_X_pca, clean_train_data_Y.ravel())

y_pred = clf.predict(clean_test_data_X_pca)
y_pred_non_us = clf.predict(clean_test_data_X_non_us)
y_pred_us = clf.predict(clean_test_data_X_us)
y_pred_male = clf.predict(clean_test_data_X_male)
y_pred_female = clf.predict(clean_test_data_X_female)
y_pred_white = clf.predict(clean_test_data_X_white)
y_pred_non_white = clf.predict(clean_test_data_X_non_white)

print('Accuracy: {:0.2%}' .format(accuracy_score(clean_test_data_Y, y_pred)))
print('Accuracy (non-us): {:0.2%}' .format(accuracy_score(clean_test_data_Y_non_us, y_pred_non_us)))
print('Accuracy (us): {:0.2%}' .format(accuracy_score(clean_test_data_Y_us, y_pred_us)))
print('Accuracy (male): {:0.2%}' .format(accuracy_score(clean_test_data_Y_male, y_pred_male)))
print('Accuracy (female): {:0.2%}' .format(accuracy_score(clean_test_data_Y_female, y_pred_female)))
print('Accuracy (white): {:0.2%}' .format(accuracy_score(clean_test_data_Y_white, y_pred_white)))
print('Accuracy (non-white): {:0.2%}' .format(accuracy_score(clean_test_data_Y_non_white, y_pred_non_white)))

(30162, 104) (30162, 1) (15060, 104) (15060, 1)
(1272, 31) (1272, 1) (13788, 31) (13788, 1)
(10147, 31) (10147, 1) (4913, 31) (4913, 1)
(12970, 31) (12970, 1) (2090, 31) (2090, 1)
Accuracy: 82.75%
Accuracy (non-us): 84.91%
Accuracy (us): 82.55%
Accuracy (male): 78.69%
Accuracy (female): 91.13%
Accuracy (white): 81.96%
Accuracy (non-white): 87.66%
