In [82]:
# import data
import pandas as pd
# NOTE: use the data files shipped with this notebook. They were edited by
# hand instead of being cleaned up in Python, so no cleaning code exists here.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('adult.data', 'adult.test')
)

# Quick overview of the training frame: column names, non-null counts, dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   martial-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   gender          32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hour-per-week   32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [83]:
# Preprocess Data
# information based on https://archive.ics.uci.edu/ml/datasets/Adult
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): 'id' is intentionally left in the frames, exactly as before
# (the drop below was commented out in the original cell). The UCI Adult data
# starts with an 'age' column, so 'id' may really be age -- confirm what the
# column holds before dropping it (use `drop(columns=['id'])` on *each* frame)
# or scaling it like the other continuous features.


def one_hot_encode_column(train, test, column):
    """One-hot encode ``column`` in both frames and remove the original column.

    The encoder is fitted on ``train`` only and then applied to ``test``;
    categories that only occur in ``test`` are encoded as all zeros
    (``handle_unknown='ignore'``).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    (train, test, encoder)
        New frames with one 0/1 column per category appended (named after the
        category value) and ``column`` dropped, plus the fitted encoder.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_codes = encoder.fit_transform(train[[column]]).toarray()
    test_codes = encoder.transform(test[[column]]).toarray()

    # `categories_` is a list with one array per fitted column. Passing the
    # list itself as `columns=` makes pandas build a MultiIndex, which leaves
    # non-string column labels in the joined frame; take the single array so
    # the new columns get plain string names.
    labels = encoder.categories_[0]

    train = train.join(pd.DataFrame(train_codes, columns=labels, index=train.index))
    test = test.join(pd.DataFrame(test_codes, columns=labels, index=test.index))
    return train.drop(columns=[column]), test.drop(columns=[column]), encoder


def min_max_scale_column(train, test, column):
    """Rescale ``column`` with a MinMaxScaler fitted on ``train`` only.

    Returns
    -------
    (train, test, scaler)
        New frames with ``column`` replaced by its scaled values (column
        position unchanged), plus the fitted scaler. Test values outside the
        training range fall outside [0, 1].
    """
    scaler = MinMaxScaler()
    train = train.assign(**{column: scaler.fit_transform(train[[column]]).ravel()})
    test = test.assign(**{column: scaler.transform(test[[column]]).ravel()})
    return train, test, scaler


def encode_salary(labels):
    """Map salary labels to 0 (``<=50k``) and 1 (``>50k``).

    Matching ignores case, surrounding whitespace and a trailing '.', so the
    hand-edited labels and the raw UCI spellings are both accepted.

    Raises
    ------
    ValueError
        If a label is neither of the two classes; otherwise the target would
        silently stay a text column and break the metrics later on.
    """
    normalised = labels.astype(str).str.strip().str.rstrip('.').str.lower()
    codes = normalised.map({"<=50k": 0, ">50k": 1})
    if codes.isna().any():
        unknown = sorted(normalised[codes.isna()].unique())
        raise ValueError("unexpected salary labels: {}".format(unknown))
    return codes.astype(int)


# workclass
# categories: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov,
# State-gov, Without-pay, Never-worked, ? (not known)
# "?" is renamed first so the resulting column name cannot clash with the "?"
# category of another feature.
df_train['workclass'] = df_train['workclass'].replace("?", "workclass_?")
df_test['workclass'] = df_test['workclass'].replace("?", "workclass_?")
df_train, df_test, workclassEncoder = one_hot_encode_column(df_train, df_test, 'workclass')

# fnlwgt = nr of people this entry represents; continuous, so scale
df_train, df_test, fnlwgtScaler = min_max_scale_column(df_train, df_test, 'fnlwgt')

# education
df_train, df_test, educationEncoder = one_hot_encode_column(df_train, df_test, 'education')

# education-num: continuous.
df_train, df_test, educationNumScaler = min_max_scale_column(df_train, df_test, 'education-num')

# martial-status (column name as spelled in the data files)
# possible entries: Married-civ-spouse, Divorced, Never-married, Separated,
# Widowed, Married-spouse-absent, Married-AF-spouse.
df_train, df_test, martialStatusEncoder = one_hot_encode_column(df_train, df_test, 'martial-status')

# occupation
# possible entries: Tech-support, Craft-repair, Other-service, Sales,
# Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical,
# Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces, ? (unknown)
df_train['occupation'] = df_train['occupation'].replace("?", "occupation_?")
df_test['occupation'] = df_test['occupation'].replace("?", "occupation_?")
df_train, df_test, occupationEncoder = one_hot_encode_column(df_train, df_test, 'occupation')

# relationship
# possible values: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
df_train, df_test, relationshipEncoder = one_hot_encode_column(df_train, df_test, 'relationship')

# race
# poss. values: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
df_train, df_test, raceEncoder = one_hot_encode_column(df_train, df_test, 'race')

# sex (stored in the 'gender' column): values: Female, Male.
df_train, df_test, sexEncoder = one_hot_encode_column(df_train, df_test, 'gender')

# capital-gain: continuous.
df_train, df_test, capitalGainScaler = min_max_scale_column(df_train, df_test, 'capital-gain')

# capital-loss: continuous.
df_train, df_test, capitalLossScaler = min_max_scale_column(df_train, df_test, 'capital-loss')

# hours-per-week (column is named 'hour-per-week' in the data files): continuous.
df_train, df_test, hoursPerWeekScaler = min_max_scale_column(df_train, df_test, 'hour-per-week')

# native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
df_train, df_test, countryEncoder = one_hot_encode_column(df_train, df_test, 'native-country')

# salary: binary target, 0 = "<=50k", 1 = ">50k"
df_train['salary'] = encode_salary(df_train['salary'])
df_test['salary'] = encode_salary(df_test['salary'])


# Explanation on Preprocessing
*Based on exploratory data analysis, discuss what preprocessing you need to do before classification, and provide evidence and justification.*


In [86]:
# (1) kNN, (2) naive Bayes, (3) SVM, (4) decision tree, 
# (5) random forest, (6) AdaBoost, (7) gradient Boosting, 
# (8) linear discriminant analysis, (9) multi-layer perceptron, and
# (10) logistic regression.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # this is the first result? hope it's right? sorry if not?
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Fixed seed so the stochastic estimators give the same scores on every run.
RANDOM_STATE = 42

# Element 0 of every list below is a header label, not data -- presumably so
# the lists can be used directly as the rows/columns of the results table.
models = ["Models", KNeighborsClassifier(), GaussianNB(), SVC(),
          DecisionTreeClassifier(random_state=RANDOM_STATE),
          RandomForestClassifier(random_state=RANDOM_STATE),
          AdaBoostClassifier(random_state=RANDOM_STATE),
          GradientBoostingClassifier(random_state=RANDOM_STATE),
          LinearDiscriminantAnalysis(),
          MLPClassifier(random_state=RANDOM_STATE), LogisticRegression()]

names = ["Names", "KNN", "Naive Bayed", "SVC",
         "Decision Tree", "Random Forest", "AdaBoost",
         "Gradient Boosting", "linear discriminant analysis",
         "multi-layer perceptron", "logistic regression"
        ]

accuracy = ["Accuracy"]
recall = ["Recall Positive"]
recallNegative = ["Recall Negative"]
precision = ["Precision"]
f1 = ["F1"]
aucList = ["AUC"]

# Split features and target. `columns=` is required here: a bare
# drop(['salary']) looks for a *row* labelled 'salary' and raises KeyError.
y_train = df_train['salary']
x_train = df_train.drop(columns=['salary'])

y_test = df_test['salary']
x_test = df_test.drop(columns=['salary'])

# Fit every classifier on the training data and score it on the test data,
# skipping the header entry at index 0.
for model, name in zip(models[1:], names[1:]):
    print(name)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # Continuous score for the positive class, needed for the ROC curve.
    # SVC() does not expose predict_proba unless built with probability=True,
    # so fall back to the signed distance from the decision boundary.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(x_test)[:, 1]
    else:
        y_score = model.decision_function(x_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    recallNegative.append(recall_score(y_test, y_pred, pos_label=0))
    precision.append(precision_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))

    fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=1)
    aucList.append(auc(fpr, tpr))

10 10


*Report the results (keep 2 decimals) of all the 10 classification algorithms on the given test data in terms of classification accuracy, precision, recall, F1-score, and AUC. You should report them in a table.*