In [63]:
# import data
import pandas as pd
df_train = pd.read_csv('adult.data') 
df_test = pd.read_csv('adult.test')

# please use the files I provided;
# i edited them manually because editing this in Python is a pain
# and I frankly don't have the time to write that code.
# Sorry.

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   martial-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   gender          32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hour-per-week   32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [64]:
# Preprocess Data
# information based on https://archive.ics.uci.edu/ml/datasets/Adult
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# drop id - won't be needed
# df_train = df_train.drop(['id'])
# df_test = df_train.drop(['id'])

# one hot encode workclass
# categories: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked, ? (not known)
df_train['workclass'] = df_train['workclass'].replace("?", "workclass_?")
df_test['workclass'] = df_test['workclass'].replace("?", "workclass_?")

workclassEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(workclassEncoder.fit_transform(df_train[['workclass']]).toarray(), 
                       columns=workclassEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(workclassEncoder.transform(df_test[['workclass']]).toarray(), 
                       columns=workclassEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['workclass'], axis=1)
df_test = df_test.drop(['workclass'], axis=1)

# fnlwgt  = nr of people this entry represents
# is continuous, so scale
fnlwgtScaler = MinMaxScaler()
df_train['fnlwgt'] = fnlwgtScaler.fit_transform(df_train[['fnlwgt']])
df_test['fnlwgt'] = fnlwgtScaler.transform(df_test[['fnlwgt']])

# education
educationEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(educationEncoder.fit_transform(df_train[['education']]).toarray(), 
                       columns=educationEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(educationEncoder.transform(df_test[['education']]).toarray(), 
                       columns=educationEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['education'], axis=1)
df_test = df_test.drop(['education'], axis=1)

# education-num
# education-num: continuous.
educationNumScaler = MinMaxScaler()
df_train['education-num'] = educationNumScaler.fit_transform(df_train[['education-num']])
df_test['education-num'] = educationNumScaler.transform(df_test[['education-num']])

# martial-status
# possible entries: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
martialStatusEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(martialStatusEncoder.fit_transform(df_train[['martial-status']]).toarray(), 
                       columns=martialStatusEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(martialStatusEncoder.transform(df_test[['martial-status']]).toarray(), 
                       columns=martialStatusEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['martial-status'], axis=1)
df_test = df_test.drop(['martial-status'], axis=1)

# occupation
# possibnle entries: Tech-support, Craft-repair, Other-service, Sales, 
# Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, 
# Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces, ? (unknown)
df_train['occupation'] = df_train['occupation'].replace("?", "occupation_?")
df_test['occupation'] = df_test['occupation'].replace("?", "occupation_?")
occupationEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(occupationEncoder.fit_transform(df_train[['occupation']]).toarray(), 
                       columns=occupationEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(occupationEncoder.transform(df_test[['occupation']]).toarray(), 
                       columns=occupationEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['occupation'], axis=1)
df_test = df_test.drop(['occupation'], axis=1)

# relationship
# possible values: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
relationshipEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(relationshipEncoder.fit_transform(df_train[['relationship']]).toarray(), 
                       columns=relationshipEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(relationshipEncoder.transform(df_test[['relationship']]).toarray(), 
                       columns=relationshipEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['relationship'], axis=1)
df_test = df_test.drop(['relationship'], axis=1)

# race: 
# poss. values: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
raceEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(raceEncoder.fit_transform(df_train[['race']]).toarray(), 
                       columns=raceEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(raceEncoder.transform(df_test[['race']]).toarray(), 
                       columns=raceEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['race'], axis=1)
df_test = df_test.drop(['race'], axis=1)

# sex: values: Female, Male. 
sexEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(sexEncoder.fit_transform(df_train[['gender']]).toarray(), 
                       columns=sexEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(sexEncoder.transform(df_test[['gender']]).toarray(), 
                       columns=sexEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['gender'], axis=1)
df_test = df_test.drop(['gender'], axis=1)

# capital-gain: continuous.
capitalGainScaler = MinMaxScaler()
df_train['capital-gain'] = capitalGainScaler.fit_transform(df_train[['capital-gain']])
df_test['capital-gain'] = capitalGainScaler.transform(df_test[['capital-gain']])

# capital-loss: continuous.
capitalLossScaler = MinMaxScaler()
df_train['capital-loss'] = capitalLossScaler.fit_transform(df_train[['capital-loss']])
df_test['capital-loss'] = capitalLossScaler.transform(df_test[['capital-loss']])

# hours-per-week: continuous.
hoursPerWeekScaler = MinMaxScaler()
df_train['hour-per-week'] = hoursPerWeekScaler.fit_transform(df_train[['hour-per-week']])
df_test['hour-per-week'] = hoursPerWeekScaler.transform(df_test[['hour-per-week']])

# native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
countryEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = pd.DataFrame(countryEncoder.fit_transform(df_train[['native-country']]).toarray(), 
                       columns=countryEncoder.categories_)
df_train = df_train.join(encoded)
encoded = pd.DataFrame(countryEncoder.transform(df_test[['native-country']]).toarray(), 
                       columns=countryEncoder.categories_)
df_test = df_test.join(encoded)

df_train = df_train.drop(['native-country'], axis=1)
df_test = df_test.drop(['native-country'], axis=1)

# salary
df_train['salary'] = df_train['salary'].replace('<=50K', 0)
df_train['salary'] = df_train['salary'].replace('>50K', 1)
df_test['salary'] = df_test['salary'].replace('<=50K.', 0)
df_test['salary'] = df_test['salary'].replace('>50K.', 1)

df_train.rename(
    columns={
        df_train.columns[7] : "Federal-Gov",
        df_train.columns[8] : "Local-Gov",
        df_train.columns[9] : "Never-worked",
        df_train.columns[10] : "Private",
        df_train.columns[11] : "Self-emp-inc",
        df_train.columns[12] : "Self-emp-not-inc",
        df_train.columns[13] : "State-gov",  
        df_train.columns[14] : "Without-pay",
        df_train.columns[15] : "unknown_workclass",
        df_train.columns[16] : "10th",
        df_train.columns[17] : "11th",
        df_train.columns[18] : "12th",
        df_train.columns[19] : "1st-4th",
        df_train.columns[20] : "5th-6th",
        df_train.columns[21] : "7th-8th",
        df_train.columns[22] : "9th",
        df_train.columns[23] : "Assoc-acdm",
        df_train.columns[24] : "Assoc-voc",
        df_train.columns[25] : "Bachelors",
        df_train.columns[26] : "Doctorate",
        df_train.columns[27] : "HS-grad",
        df_train.columns[28] : "Masters",
        df_train.columns[29] : "Preschool",
        
        df_train.columns[30] : "Prof-school",
        df_train.columns[31] : "Some-college",
        df_train.columns[32] : "Divorced",
        df_train.columns[33] : "Married-AF-spouse",
        df_train.columns[34] : "Married-civ-spouse",
        df_train.columns[35] : "Married-spouse-absent",
        df_train.columns[36] : "Never-married",
        df_train.columns[37] : "Separated",
        df_train.columns[38] : "Widowed",
        df_train.columns[39] : "Adm-clerical",

        df_train.columns[40] : "Armed-Forces",
        df_train.columns[41] : "Craft-repair",
        df_train.columns[42] : "Exec-managerial",
        df_train.columns[43] : "Farming-fishing",
        df_train.columns[44] : "Handlers-cleaners",
        df_train.columns[45] : "Machine-op-inspct",
        df_train.columns[46] : "Other-service",
        df_train.columns[47] : "Priv-house-serv",
        df_train.columns[48] : "Prof-specialty",
        df_train.columns[49] : "Protective-serv",

        df_train.columns[50] : "Sales",
        df_train.columns[51] : "Tech-support",
        df_train.columns[52] : "Transport-moving",
        df_train.columns[53] : "unknown_occupation",
        df_train.columns[54] : "Husband",
        df_train.columns[55] : "Not-in-family",
        df_train.columns[56] : "Other-relative",
        df_train.columns[57] : "Own-child",
        df_train.columns[58] : "Unmarried",
        df_train.columns[59] : "Wife",

        df_train.columns[60] : "Amer-Indian-Eskimo",
        df_train.columns[61] : "Asian-Pac-Islander",
        df_train.columns[62] : "Black",
        df_train.columns[63] : "Other",
        df_train.columns[64] : "White",
        df_train.columns[65] : "Female",
        df_train.columns[66] : "Male",
        df_train.columns[67] : "unknown",
        df_train.columns[68] : "Cambodia",
        df_train.columns[69] : "Canada",

        df_train.columns[70] : "China",
        df_train.columns[71] : "Columbia",
        df_train.columns[72] : "Cuba",
        df_train.columns[73] : "Dominican-Republic",
        df_train.columns[74] : "Ecuador",
        df_train.columns[75] : "El-Salvador",
        df_train.columns[76] : "England",
        df_train.columns[77] : "France",
        df_train.columns[78] : "Germany",
        df_train.columns[79] : "Greece",

        df_train.columns[80] : "Guatemala",
        df_train.columns[81] : "Haiti",
        df_train.columns[82] : "Holand-Netherlands",
        df_train.columns[83] : "Honduras",
        df_train.columns[84] : "Hong",
        df_train.columns[85] : "Hungary",
        df_train.columns[86] : "India",
        df_train.columns[87] : "Iran",
        df_train.columns[88] : "Ireland",
        df_train.columns[89] : "Italy",

        df_train.columns[90] : "Jamaica",
        df_train.columns[91] : "Japan",
        df_train.columns[92] : "Laos",
        df_train.columns[93] : "Mexico",
        df_train.columns[94] : "Nicaragua",
        df_train.columns[95] : "Outlying-US(Guam-USVI-etc)",
        df_train.columns[96] : "Peru",
        df_train.columns[97] : "Philippines",
        df_train.columns[98] : "Poland",
        df_train.columns[99] : "Portugal",
        
        df_train.columns[100] : "Puerto-Rico",
        df_train.columns[101] : "Scotland",
        df_train.columns[102] : "South",
        df_train.columns[103] : "Taiwan",
        df_train.columns[104] : "Thailand",
        df_train.columns[105] : "Trinadad&Tobago",
        df_train.columns[106] : "United-States",
        df_train.columns[107] : "Vietnam",
        df_train.columns[108] : "Yugoslavia"        
    }
)

Unnamed: 0,id,fnlwgt,education-num,capital-gain,capital-loss,hour-per-week,salary,Federal-Gov,Local-Gov,Never-worked,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,39,0.044302,0.800000,0.021740,0.0,0.397959,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50,0.048238,0.800000,0.000000,0.0,0.122449,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,38,0.138113,0.533333,0.000000,0.0,0.397959,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,53,0.151068,0.400000,0.000000,0.0,0.397959,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,28,0.221488,0.800000,0.000000,0.0,0.397959,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,0.166404,0.733333,0.000000,0.0,0.377551,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32557,40,0.096500,0.533333,0.000000,0.0,0.397959,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32558,58,0.094827,0.533333,0.000000,0.0,0.397959,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32559,22,0.128499,0.533333,0.000000,0.0,0.193878,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [65]:
for i in range (109):
    print(i,  df_train.columns[i] )

0 id
1 fnlwgt
2 education-num
3 capital-gain
4 capital-loss
5 hour-per-week
6 salary
7 ('Federal-gov',)
8 ('Local-gov',)
9 ('Never-worked',)
10 ('Private',)
11 ('Self-emp-inc',)
12 ('Self-emp-not-inc',)
13 ('State-gov',)
14 ('Without-pay',)
15 ('workclass_?',)
16 ('10th',)
17 ('11th',)
18 ('12th',)
19 ('1st-4th',)
20 ('5th-6th',)
21 ('7th-8th',)
22 ('9th',)
23 ('Assoc-acdm',)
24 ('Assoc-voc',)
25 ('Bachelors',)
26 ('Doctorate',)
27 ('HS-grad',)
28 ('Masters',)
29 ('Preschool',)
30 ('Prof-school',)
31 ('Some-college',)
32 ('Divorced',)
33 ('Married-AF-spouse',)
34 ('Married-civ-spouse',)
35 ('Married-spouse-absent',)
36 ('Never-married',)
37 ('Separated',)
38 ('Widowed',)
39 ('Adm-clerical',)
40 ('Armed-Forces',)
41 ('Craft-repair',)
42 ('Exec-managerial',)
43 ('Farming-fishing',)
44 ('Handlers-cleaners',)
45 ('Machine-op-inspct',)
46 ('Other-service',)
47 ('Priv-house-serv',)
48 ('Prof-specialty',)
49 ('Protective-serv',)
50 ('Sales',)
51 ('Tech-support',)
52 ('Transport-moving',)
53 (

# Explanation on Preprocessing
*Based on exploratory data analysis, discuss what preprocessing that you need to do before classification, and provide evidence and justifications.*


In [66]:
# (1) kNN, (2) naive Bayes, (3) SVM, (4) decision tree, 
# (5) random forest, (6) AdaBoost, (7) gradient Boosting, 
# (8) linear discriminant analysis, (9) multi-layer perceptron, and
# (10) logistic regression.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # this is the first result? hope it's right? sorry if not?
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

models = ["Models", KNeighborsClassifier(), GaussianNB(), SVC( probability=True),
         DecisionTreeClassifier(), RandomForestClassifier(), AdaBoostClassifier(),
         GradientBoostingClassifier(), LinearDiscriminantAnalysis(), 
         MLPClassifier(), LogisticRegression()]

names = ["Models", "KNN", "Naive Bayes", "SVC",
         "Decision Tree", "Random Forest", "AdaBoost",
         "Gradient Boosting", "linear discriminant analysis", 
         "multi-layer perceptron", "logistic regression"
        ]

accuracy = ["Accuracy"]
recall = ["Recall Positive"]
recallNegative = ["Recall Negative"]
precision = ["Precision"]
f1 = ["F1"]
aucList = ["AUC"]

y_train = df_train['salary']
x_train = df_train.drop('salary', axis=1)

y_test = df_test['salary']
x_test = df_test.drop('salary', axis=1)

for i in range (1, 11): 
    model = models[i]
    name = names[i]
    
    print(name)
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    y_pred_prob = model.predict_proba(x_test)
    
    accuracy.append(accuracy_score(y_test, y_pred))
    recall.append( recall_score(y_test, y_pred))
    recallNegative.append( recall_score(y_test, y_pred, pos_label=0))
    precision.append( precision_score(y_test, y_pred))
    f1.append( f1_score(y_test, y_pred))
    
    
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1], pos_label=1)
    aucList.append(auc(fpr, tpr) )
    
    print(y_pred)

KNN




[0 0 0 ... 1 0 1]
Naive Bayes
[0 1 1 ... 1 0 1]
SVC




[0 0 0 ... 1 0 1]
Decision Tree




[0 0 1 ... 1 0 0]
Random Forest




[0 0 1 ... 1 0 1]
AdaBoost




[0 0 0 ... 1 1 1]
Gradient Boosting




[0 0 0 ... 1 0 1]
linear discriminant analysis




[0 0 0 ... 1 0 1]
multi-layer perceptron




[0 0 0 ... 1 0 1]
logistic regression
[0 0 0 ... 1 0 1]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


*Report the results (keep 2 decimals) of all the 10 classification algorithms on the given test data in terms of classification accuracy, precision, recall, F1-score, and AUC. You should report them in a table.* 

In [70]:
import tabulate


header = names
data = [header, accuracy, recall, recallNegative, precision, f1, aucList]

for i in range (1, len (data)):
    for j in range (1, len (data[i])):
        data[i][j] = round(data[i][j], 2)
table = tabulate.tabulate(data, tablefmt='html')
table

0,1,2,3,4,5,6,7,8,9,10
Names,KNN,Naive Bayes,SVC,Decision Tree,Random Forest,AdaBoost,Gradient Boosting,linear discriminant analysis,multi-layer perceptron,logistic regression
Accuracy,0.82,0.6,0.83,0.81,0.85,0.86,0.87,0.84,0.85,0.84
Recall Positive,0.55,0.93,0.46,0.62,0.61,0.61,0.61,0.56,0.54,0.57
Recall Negative,0.9,0.5,0.95,0.87,0.93,0.94,0.95,0.93,0.95,0.93
Precision,0.63,0.36,0.75,0.6,0.72,0.75,0.8,0.72,0.77,0.7
F1,0.59,0.52,0.57,0.61,0.66,0.67,0.69,0.63,0.63,0.63
AUC,0.83,0.81,0.88,0.75,0.9,0.91,0.92,0.89,0.91,0.89


*Find the two best algorithms according to each of the four performance metrics, Are they the same? Explain why.*

Accuracy: AdaBoost, Gradient Boosting
Recall (Positive): Naive Bayed, Decision Tree
Recall (Negative): SVC, Gradient Boosting, Multi-Layer Perceptron
Precision: Multi-Layer Perceptron, SVC
F1: Gradient Boosting, AdaBoost
AUC: Gradient Boosting, AdaBoost, Multi-Layer Perceptron

The best performing algorithms across those are AdaBoost, Gradient Boosting and Multi-Layer Perceptrons. This may just be because those algorithms perform better on this data set with the given parameters (the default ones), but it also just shows a general strength for them. Logistic Regression certainly doesn't do well with the default parameters, because it never manages to converge in the first place, explaining its lower performance.