## Load the needed libs and dataset
url of the adult dataset:  https://archive.ics.uci.edu/ml/datasets/Adult

In [1]:
from sklearn.metrics import accuracy_score
import sklearn as sk
import numpy as np
import pandas as pd
from sklearn import tree

# names of colnum
names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
# read the training data and trim the space in each field
train_data = pd.read_csv('data/adult.data', sep="\s*,\s*",
                         names=names, engine='python')
# read the testing data and trim the space in each field
test_data = pd.read_csv('data/adult.test', sep="\s*,\s*",
                        names=names, engine='python')

# in the test dataset, the colnum of the 'income' has a surplus period.
test_data['income'].replace(
    regex=True, inplace=True, to_replace=r'\.', value=r'')
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


## Check the missing values of the traning data

In [2]:
print('Missing values in training data')
for i, j in zip(train_data.columns, (train_data.values.astype(str) == '?').sum(axis=0)):
    if j > 0:
        print(str(i) + ': ' + str(j) + ' out of ' +
              str(len(train_data))+' records')


Missing values in training data
workclass: 1836 out of 32561 records
occupation: 1843 out of 32561 records
native-country: 583 out of 32561 records


## Check the missing values of the test data

In [3]:
print('Missing values in test data')
for i, j in zip(test_data.columns, (test_data.values.astype(str) == '?').sum(axis=0)):
    if j > 0:
        print(str(i) + ': ' + str(j) + ' out of ' +
              str(len(test_data))+' records')

Missing values in test data
workclass: 963 out of 16281 records
occupation: 966 out of 16281 records
native-country: 274 out of 16281 records


## print the labels of the colnum "workclass"

In [4]:
print(set(train_data['workclass']))

{'Never-worked', 'Self-emp-inc', 'Private', 'Without-pay', '?', 'State-gov', 'Federal-gov', 'Self-emp-not-inc', 'Local-gov'}


## removing the rows have the missing values from the traning dataset

In [5]:
clean_train_data = train_data.copy()
befor_train = len(clean_train_data)
for colnum in names:
    be = len(clean_train_data)
    clean_train_data = clean_train_data[(clean_train_data[colnum] != '?')]
    af = len(clean_train_data)
    if be != af:
        print("{}: before removing: {}     after removing:{}".format(colnum, be, af))
after_train = len(clean_train_data)
print("size of training dataset: before removing: {}    after removing:{}".format(
    befor_train, after_train))

workclass: before removing: 32561     after removing:30725
occupation: before removing: 30725     after removing:30718
native-country: before removing: 30718     after removing:30162
size of training dataset: before removing: 32561    after removing:30162


  result = method(y)


## removing the rows have the missing values from the test dataset

In [6]:
clean_test_data = test_data.copy()
befor_test = len(clean_test_data)
for colnum in names:
    be = len(clean_test_data)
    clean_test_data = clean_test_data[(clean_test_data[colnum] != '?')]
    af = len(clean_test_data)
    if be != af:
        print("{}: before removing: {}     after removing:{}".format(colnum, be, af))
after_test = len(clean_test_data)
print("size of test dataset: before removing: {}     after removing:{}".format(
    befor_test, after_test))

workclass: before removing: 16281     after removing:15318
occupation: before removing: 15318     after removing:15315
native-country: before removing: 15315     after removing:15060
size of test dataset: before removing: 16281     after removing:15060


## convert the categorical value to int
* build the mapping from categorical string to nominal value
* call the df.replace function to Encoding Categorical Values (https://pbpython.com/categorical-encoding.html)

In [7]:
cat_to_nom_mapping = {}
for col in clean_test_data.columns:
    if clean_train_data[col].dtype == object:
        # set of labels of the categorical columns
        labels = set(clean_train_data[col])
        mapping_labels = {key: value for value, key in enumerate(labels)}
        print(col, mapping_labels)
        cat_to_nom_mapping.update({col: mapping_labels})

# print(cat_to_nom_mapping)
clean_train_data.replace(cat_to_nom_mapping, inplace=True)
clean_test_data.replace(cat_to_nom_mapping, inplace=True)
# print(clean_train_data.info())
# print(clean_test_data.info())

workclass {'Self-emp-inc': 0, 'Private': 1, 'Without-pay': 2, 'State-gov': 3, 'Federal-gov': 4, 'Self-emp-not-inc': 5, 'Local-gov': 6}
education {'Masters': 0, 'Preschool': 1, '1st-4th': 2, 'Prof-school': 3, '10th': 4, '7th-8th': 5, 'HS-grad': 6, 'Assoc-voc': 7, 'Bachelors': 8, '9th': 9, '11th': 10, 'Doctorate': 11, 'Assoc-acdm': 12, 'Some-college': 13, '12th': 14, '5th-6th': 15}
marital-status {'Never-married': 0, 'Separated': 1, 'Divorced': 2, 'Married-civ-spouse': 3, 'Married-AF-spouse': 4, 'Widowed': 5, 'Married-spouse-absent': 6}
occupation {'Adm-clerical': 0, 'Armed-Forces': 1, 'Craft-repair': 2, 'Priv-house-serv': 3, 'Handlers-cleaners': 4, 'Transport-moving': 5, 'Other-service': 6, 'Machine-op-inspct': 7, 'Protective-serv': 8, 'Tech-support': 9, 'Sales': 10, 'Exec-managerial': 11, 'Prof-specialty': 12, 'Farming-fishing': 13}
relationship {'Other-relative': 0, 'Unmarried': 1, 'Wife': 2, 'Own-child': 3, 'Not-in-family': 4, 'Husband': 5}
race {'White': 0, 'Other': 1, 'Black': 2, '

 ## Decision Tree Model
 link: https://scikit-learn.org/stable/modules/tree.html

In [8]:
# all features
# Split the data frame
clean_train_data_X = clean_train_data.iloc[:, :-1]
clean_train_data_Y = clean_train_data.iloc[:, -1:]

clean_test_data_X = clean_test_data.iloc[:, :-1]
clean_test_data_Y = clean_test_data.iloc[:, -1:]
print(clean_train_data_X.shape, clean_train_data_Y.shape,
      clean_test_data_X.shape, clean_test_data_Y.shape)


# Train and predict the model
clf = tree.DecisionTreeClassifier()
clf.fit(clean_train_data_X, clean_train_data_Y)
y_pred = clf.predict(clean_test_data_X)
print('[all features] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

# single features
for col in clean_train_data_X.columns:
    clean_train_data_single_X = clean_train_data_X[col].values.reshape(-1,1)
    clean_test_data_single_X = clean_test_data[col].values.reshape(-1,1)
    clf = tree.DecisionTreeClassifier()
    clf.fit(clean_train_data_single_X, clean_train_data_Y)
    y_pred = clf.predict(clean_test_data_single_X)
    print('[{}] Accuracy: {:0.2%}' .format(
        col, accuracy_score(clean_test_data_Y, y_pred)))

# except race and sex
clean_train_data_except_X = clean_train_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_train_data_X.columns))].copy()
clean_test_data_except_X = clean_test_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_test_data_X.columns))].copy()
print(clean_train_data_except_X.shape, clean_train_data_Y.shape,
      clean_test_data_except_X.shape, clean_test_data_Y.shape)
clf = tree.DecisionTreeClassifier()
clf.fit(clean_train_data_except_X, clean_train_data_Y)
y_pred = clf.predict(clean_test_data_except_X)
print('[except race and sex] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

(30162, 14) (30162, 1) (15060, 14) (15060, 1)
[all features] Accuracy: 80.88%
[age] Accuracy: 75.43%
[workclass] Accuracy: 75.78%
[fnlwgt] Accuracy: 66.00%
[education] Accuracy: 77.16%
[education-num] Accuracy: 77.16%
[marital-status] Accuracy: 75.43%
[occupation] Accuracy: 75.43%
[relationship] Accuracy: 75.43%
[race] Accuracy: 75.43%
[sex] Accuracy: 75.43%
[capital-gain] Accuracy: 80.56%
[capital-loss] Accuracy: 77.65%
[hours-per-week] Accuracy: 75.44%
[native-country] Accuracy: 75.43%
(30162, 12) (30162, 1) (15060, 12) (15060, 1)
[except race and sex] Accuracy: 80.80%


## Random Forest Model
link: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [9]:
from sklearn.ensemble import RandomForestClassifier

# all features
# Split the data frame
clean_train_data_X = clean_train_data.iloc[:, :-1]
clean_train_data_Y = clean_train_data.iloc[:, -1:]

clean_test_data_X = clean_test_data.iloc[:, :-1]
clean_test_data_Y = clean_test_data.iloc[:, -1:]
print(clean_train_data_X.shape, clean_train_data_Y.shape,
      clean_test_data_X.shape, clean_test_data_Y.shape)


# Train and predict the model
# Add the .values.ravel() to Y to avoid the warning
clf = RandomForestClassifier(n_estimators=100)
clf.fit(clean_train_data_X, clean_train_data_Y.values.ravel())
y_pred = clf.predict(clean_test_data_X)
print('[all features] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

# single features
for col in clean_train_data_X.columns:
    clean_train_data_single_X = clean_train_data_X[col].values.reshape(-1,1)
    clean_test_data_single_X = clean_test_data[col].values.reshape(-1,1)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(clean_train_data_single_X, clean_train_data_Y.values.ravel())
    y_pred = clf.predict(clean_test_data_single_X)
    print('[{}] Accuracy: {:0.2%}' .format(
        col, accuracy_score(clean_test_data_Y, y_pred)))

# except race and sex
clean_train_data_except_X = clean_train_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_train_data_X.columns))].copy()
clean_test_data_except_X = clean_test_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_test_data_X.columns))].copy()
print(clean_train_data_except_X.shape, clean_train_data_Y.shape,
      clean_test_data_except_X.shape, clean_test_data_Y.shape)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(clean_train_data_except_X, clean_train_data_Y.values.ravel())
y_pred = clf.predict(clean_test_data_except_X)
print('[except race and sex] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

(30162, 14) (30162, 1) (15060, 14) (15060, 1)
[all features] Accuracy: 85.24%
[age] Accuracy: 75.43%
[workclass] Accuracy: 75.78%
[fnlwgt] Accuracy: 68.22%
[education] Accuracy: 77.16%
[education-num] Accuracy: 77.16%
[marital-status] Accuracy: 75.43%
[occupation] Accuracy: 75.43%
[relationship] Accuracy: 75.43%
[race] Accuracy: 75.43%
[sex] Accuracy: 75.43%
[capital-gain] Accuracy: 80.56%
[capital-loss] Accuracy: 77.65%
[hours-per-week] Accuracy: 75.44%
[native-country] Accuracy: 75.43%
(30162, 12) (30162, 1) (15060, 12) (15060, 1)
[except race and sex] Accuracy: 84.95%


## SVM
link: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [10]:
from sklearn.svm import SVC

# all features
# Split the data frame
clean_train_data_X = clean_train_data.iloc[:, :-1]
clean_train_data_Y = clean_train_data.iloc[:, -1:]

clean_test_data_X = clean_test_data.iloc[:, :-1]
clean_test_data_Y = clean_test_data.iloc[:, -1:]
print(clean_train_data_X.shape, clean_train_data_Y.shape,
      clean_test_data_X.shape, clean_test_data_Y.shape)


# Train and predict the model
# Add the .values.ravel() to Y to avoid the warning
clf = SVC(gamma='scale')
clf.fit(clean_train_data_X, clean_train_data_Y.values.ravel())
y_pred = clf.predict(clean_test_data_X)
print('[all features] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

# single features
for col in clean_train_data_X.columns:
    clean_train_data_single_X = clean_train_data_X[col].values.reshape(-1,1)
    clean_test_data_single_X = clean_test_data[col].values.reshape(-1,1)
    clf = SVC(gamma='scale')
    clf.fit(clean_train_data_single_X, clean_train_data_Y.values.ravel())
    y_pred = clf.predict(clean_test_data_single_X)
    print('[{}] Accuracy: {:0.2%}' .format(
        col, accuracy_score(clean_test_data_Y, y_pred)))

# except race and sex
clean_train_data_except_X = clean_train_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_train_data_X.columns))].copy()
clean_test_data_except_X = clean_test_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_test_data_X.columns))].copy()
print(clean_train_data_except_X.shape, clean_train_data_Y.shape,
      clean_test_data_except_X.shape, clean_test_data_Y.shape)
clf = SVC(gamma='scale')
clf.fit(clean_train_data_except_X, clean_train_data_Y.values.ravel())
y_pred = clf.predict(clean_test_data_except_X)
print('[except race and sex] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

(30162, 14) (30162, 1) (15060, 14) (15060, 1)
[all features] Accuracy: 79.08%
[age] Accuracy: 75.43%
[workclass] Accuracy: 75.78%
[fnlwgt] Accuracy: 75.43%
[education] Accuracy: 75.85%
[education-num] Accuracy: 77.16%
[marital-status] Accuracy: 75.43%
[occupation] Accuracy: 75.43%
[relationship] Accuracy: 75.43%
[race] Accuracy: 75.43%
[sex] Accuracy: 75.43%
[capital-gain] Accuracy: 79.47%
[capital-loss] Accuracy: 76.87%
[hours-per-week] Accuracy: 75.43%
[native-country] Accuracy: 75.43%
(30162, 12) (30162, 1) (15060, 12) (15060, 1)
[except race and sex] Accuracy: 79.08%


 ## KNN
 link:https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# all features
# Split the data frame
clean_train_data_X = clean_train_data.iloc[:, :-1]
clean_train_data_Y = clean_train_data.iloc[:, -1:]

clean_test_data_X = clean_test_data.iloc[:, :-1]
clean_test_data_Y = clean_test_data.iloc[:, -1:]
print(clean_train_data_X.shape, clean_train_data_Y.shape,
      clean_test_data_X.shape, clean_test_data_Y.shape)


# Train and predict the model
# Add the .values.ravel() to Y to avoid the warning
neigh = KNeighborsClassifier()
neigh.fit(clean_train_data_X, clean_train_data_Y.values.ravel())
y_pred = neigh.predict(clean_test_data_X)
print('[all features] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

# single features
for col in clean_train_data_X.columns:
    clean_train_data_single_X = clean_train_data_X[col].values.reshape(-1,1)
    clean_test_data_single_X = clean_test_data[col].values.reshape(-1,1)
    neigh = KNeighborsClassifier()
    neigh.fit(clean_train_data_single_X, clean_train_data_Y.values.ravel())
    y_pred = neigh.predict(clean_test_data_single_X)
    print('[{}] Accuracy: {:0.2%}' .format(
        col, accuracy_score(clean_test_data_Y, y_pred)))

# except race and sex
clean_train_data_except_X = clean_train_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_train_data_X.columns))].copy()
clean_test_data_except_X = clean_test_data_X[map(
    lambda x:x not in ['race', 'sex'], list(clean_test_data_X.columns))].copy()
print(clean_train_data_except_X.shape, clean_train_data_Y.shape,
      clean_test_data_except_X.shape, clean_test_data_Y.shape)
neigh = KNeighborsClassifier()
neigh.fit(clean_train_data_except_X, clean_train_data_Y.values.ravel())
y_pred = clf.predict(clean_test_data_except_X)
print('[except race and sex] Accuracy: {:0.2%}' .format(
    accuracy_score(clean_test_data_Y, y_pred)))

#%%


(30162, 14) (30162, 1) (15060, 14) (15060, 1)
[all features] Accuracy: 77.05%
[age] Accuracy: 70.23%
[workclass] Accuracy: 75.78%
[fnlwgt] Accuracy: 71.18%
[education] Accuracy: 76.71%
[education-num] Accuracy: 74.36%
[marital-status] Accuracy: 75.41%
[occupation] Accuracy: 67.66%
[relationship] Accuracy: 71.53%
[race] Accuracy: 75.43%
[sex] Accuracy: 75.43%
[capital-gain] Accuracy: 27.62%
[capital-loss] Accuracy: 77.62%
[hours-per-week] Accuracy: 46.37%
[native-country] Accuracy: 75.17%
(30162, 12) (30162, 1) (15060, 12) (15060, 1)
[except race and sex] Accuracy: 79.08%
