In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('census_income_dataset_preprocessed.csv')
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,capital-gains,capital-loss,hours-per-week,native-country,target
0,39,6,13,4,0,2174,0,40,38,0
1,50,5,13,2,3,0,0,13,38,0
2,38,3,9,0,5,0,0,40,38,0
3,53,3,7,2,5,0,0,40,38,0
4,28,3,13,2,9,0,0,40,4,0


In [6]:
data, target = df.drop('target', axis=1), df.target

data.shape, target.shape

((32561, 9), (32561,))

In [7]:
from sklearn.model_selection import train_test_split


X, X_test, y, y_test = train_test_split(
    data, target,
    test_size=.1,
    random_state=42,
    stratify=target,
)

X_train, X_dev, y_train, y_dev = train_test_split(
    X, y,
    test_size=X_test.shape[0]/X.shape[0],
    random_state=42,
    stratify=y,
)

(
    X_train.shape, y_train.shape,
    X_dev.shape, y_dev.shape,
    X_test.shape, y_test.shape
)

((26047, 9), (26047,), (3257, 9), (3257,), (3257, 9), (3257,))

In [8]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42)
model.fit(X_train, y_train)

MLPClassifier(random_state=42)

In [9]:
from sklearn.metrics import accuracy_score

X_sets = [X_train, X_dev, X_test]
y_sets = [y_train, y_dev, y_test]
sets = ['Training', 'Validation', 'Testing']
accuracy = {}

for i in range(len(X_sets)):
  y_pred = model.predict(X_sets[i])
  score = accuracy_score(y_sets[i], y_pred)
  accuracy[sets[i]] = score


for k, v in accuracy.items():
  print(k, ': ', v)

Training :  0.8355280838484278
Validation :  0.8259134172551428
Testing :  0.8452563708934603


In [10]:
res_1 = pd.DataFrame({'Sets': ['Bayes Error'], 'Error Rate': 0.01},index={'1'})
results = res_1[['Sets','Error Rate']]

res_2 = pd.DataFrame(
    {'Sets': ['Training Set'], 
     'Error Rate': 1-accuracy['Training']},
     index={'2'}
)
results = pd.concat([results, res_2])
results = results[['Sets','Error Rate']]

res_4 = pd.DataFrame(
    {'Sets': ['Validation Set'], 
     'Error Rate': 1-accuracy['Validation']},
     index={'3'}
)
results = pd.concat([results, res_4])
results = results[['Sets','Error Rate']]

res_5 = pd.DataFrame(
    {'Sets': ['Testing Set'], 
     'Error Rate': 1-accuracy['Testing']},
     index={'4'}
)
results = pd.concat([results, res_5])
results = results[['Sets','Error Rate']]
results

Unnamed: 0,Sets,Error Rate
1,Bayes Error,0.01
2,Training Set,0.164472
3,Validation Set,0.174087
4,Testing Set,0.154744


In [11]:
# overfitting
0.154744 - 0.174087

-0.019343

In [12]:
# mismatch
0.174087 - 0.164472

0.009614999999999985

In [13]:
# High bias / underfitting
0.164472 - 0.010000

0.154472

# Tuning Hyperparams

## First round of experiments
**Experiment 1**

In [18]:
model = MLPClassifier(random_state=42, max_iter=500)
model.fit(X_train, y_train)

MLPClassifier(max_iter=500, random_state=42)

In [19]:
accuracy = {}
for i in range(0,len(X_sets)):  
    pred = model.predict(X_sets[i])
    score = accuracy_score(y_sets[i], pred)
    accuracy[sets[i]] = score

print(accuracy)

{'Training': 0.8355280838484278, 'Validation': 0.8259134172551428, 'Testing': 0.8452563708934603}


**Experiment 2**

In [20]:
model = MLPClassifier(random_state=42, max_iter=500,
                      hidden_layer_sizes=(100,100)
      )

model.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500, random_state=42)

In [21]:
sets = ["Training", "Validation", "Testing"]
X_sets = [X_train, X_dev, X_test]
Y_sets = [y_train, y_dev, y_test]

accuracy = {}
for i in range(0,len(X_sets)):  
    pred = model.predict(X_sets[i])
    score = accuracy_score(y_sets[i], pred)
    accuracy[sets[i]] = score

print(accuracy)


{'Training': 0.8348370253772027, 'Validation': 0.8262204482652747, 'Testing': 0.8474055879643844}


**Experiment 3**

In [22]:
model = MLPClassifier(random_state=42, max_iter = 500,
                      hidden_layer_sizes=(100,100,100))

model = model.fit(X_train, y_train)

In [23]:
sets = ["Training", "Validation", "Testing"]
X_sets = [X_train, X_dev, X_test]
Y_sets = [y_train, y_dev, y_test]

accuracy = {}
for i in range(0,len(X_sets)):  
    pred = model.predict(X_sets[i])
    score = accuracy_score(y_sets[i], pred)
    accuracy[sets[i]] = score

print(accuracy)

{'Training': 0.8441279226014512, 'Validation': 0.8375805956401596, 'Testing': 0.8523180841264968}


# Second round of experiments
**Experiment 1**

In [24]:
model = MLPClassifier(random_state=42, max_iter=500, 
                      hidden_layer_sizes=(50,50)
)

model.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)

In [25]:
accuracy = {}

for i in range(0,len(X_sets)):  
    pred = model.predict(X_sets[i])
    score = accuracy_score(y_sets[i], pred)
    accuracy[sets[i]] = score

print(accuracy)

{'Training': 0.8308826352363036, 'Validation': 0.8274485723058029, 'Testing': 0.8409579367516119}


**Experiment 2**

In [28]:
model = MLPClassifier(random_state=42, max_iter=500, 
                      hidden_layer_sizes=(150,150)
    )

model.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(150, 150), max_iter=500, random_state=42)

In [29]:
accuracy = {}
for i in range(0,len(X_sets)):  
    pred = model.predict(X_sets[i])
    score = accuracy_score(y_sets[i], pred)
    accuracy[sets[i]] = score

print(accuracy)

{'Training': 0.8472376857219641, 'Validation': 0.8366595026097636, 'Testing': 0.8510899600859687}
