# Current Population Survey Feed-Forward Neural Network Classification


Predict whether an individuals family income is greater or less than 50,000 using a basic Feed-Forward Neural Network.

In [1]:
import pandas as pd  
import numpy as np  
import torch    

from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("../datasets/CPS2016_UPDATE.csv",
                  usecols=['age', 'weekly_hrs', 'educ','num_in_house', 
                           'num_child', 'ismarried', 'faminc_50'])

data = data.astype(np.float64)
data.faminc_50 = data.faminc_50.astype(np.int64)
data.head()

Unnamed: 0,age,weekly_hrs,educ,num_in_house,num_child,ismarried,faminc_50
0,28.0,40.0,11.0,6.0,4.0,0.0,0
1,44.0,50.0,13.0,5.0,2.0,1.0,1
2,23.0,20.0,9.0,3.0,0.0,0.0,1
3,20.0,20.0,7.0,5.0,0.0,0.0,0
4,20.0,20.0,9.0,4.0,0.0,0.0,1


## Deciding on Predictor and Response variables

We are only going to use the following numerical variables.
I have tested this network using the categorical columns, however they are noise and reduce the score.

In [3]:
predictors = ['age', 'educ', 'weekly_hrs', 'ismarried',
              'num_child', 'num_in_house']

class_column = 'faminc_50'

In [4]:
n_predictors = len(predictors)
print(f"{n_predictors} input variables:  {predictors}")
categories = np.sort(data[class_column].unique())
n_categories = len(categories)

print(categories)
print(n_categories)

6 input variables:  ['age', 'educ', 'weekly_hrs', 'ismarried', 'num_child', 'num_in_house']
[0 1]
2


In [5]:
X = data[predictors]
Y = data[class_column]

print(X.shape)
print(Y.shape)

(55253, 6)
(55253,)


## Train/Test Split

We will randomly select 80% of the datapoints to be training, and use the complementary 20% for testing.

In [6]:
N = len(data)
train_bool = np.zeros(shape=(N,), dtype='bool')
train_bool[np.random.choice(N, int(0.80*N), replace=False)] = True

X_train = X.iloc[ train_bool]
X_test  = X.iloc[~train_bool]
Y_train = Y.iloc[ train_bool]
Y_test  = Y.iloc[~train_bool]
print(f"{N} total = {len(X_train)} training + {len(X_test)} testing")

55253 total = 44202 training + 11051 testing


## Wrap data in  PyTorch tensors

In [7]:
X_train = torch.tensor(X_train.values.astype(np.float32))
Y_train = torch.tensor(Y_train.values)
X_test = torch.tensor(X_test.values.astype(np.float32))
Y_test = torch.tensor(Y_test.values)

In [8]:
from torch.utils.data import TensorDataset, DataLoader

train_batch_size = 20
test_batch_size = 50

train_ds = TensorDataset(X_train, Y_train)
train_dl = DataLoader(dataset=train_ds, batch_size=train_batch_size, shuffle=True)

test_ds = TensorDataset(X_test, Y_test)
test_dl =  DataLoader(dataset=test_ds, batch_size=test_batch_size)  # No need to shuffle the testing data

## Setting up a Feed-Forward Neural Network

In [9]:
class FFNN(torch.nn.Module):
    r"""Abe's really simple Feed-Forward Neural Network. """
    def __init__(self, input_size, hidden_size, output_size):
        super(FFNN, self).__init__()
        self.input_to_hidden =  torch.nn.Linear(input_size, hidden_size)
        self.hidden_to_output = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        self.sig = torch.nn.Sigmoid()

    def forward(self, input):
        hidden = self.sig(self.input_to_hidden(input))
        output = self.hidden_to_output(hidden)
        output = self.softmax(output)
        return output

In [10]:
n_hidden = 30 # Let's use 30 variables in the hidden layer
ffnn = FFNN(n_predictors, n_hidden, n_categories)

## Preparing for Gradient Descent and Back-Propagation

In [11]:
optimizer = torch.optim.SGD(ffnn.parameters(), lr=0.1, momentum=1.0)
loss_fn = torch.nn.NLLLoss() 

## Testing

In [12]:
def test_score(model, loss_func, test_dataloader):
    r""" A simple test-batch scoring method. """
    
    scores = []
    with torch.no_grad():  # compute no gradients
        model.eval()  # evaluation mode -- don't do any internal learning.
        for (x_batch, y_batch) in test_dataloader:
            output_batch = model(x_batch)
            loss = loss_func(output_batch, y_batch)
            scores.append(loss.item())
    return np.array(scores).mean()  # average score across batches

## Initial (random) model
The model has not been trained yet, so it just has its original random parameters.

In [13]:
- test_score(ffnn, loss_fn, test_dl)

0.4955596420410517

## Train once

We now loop over the training set batches once, to improve the model

In [14]:
ffnn.train()  # put the model in training mode (not needed here, but good to remember to include)
for i, (x_batch, y_batch) in enumerate(train_dl):
    #print(f"batch {i} of size {x_batch.size()[0]}", end=" ")
    output_batch = ffnn(x_batch)
    loss = loss_fn(output_batch, y_batch)
    #print(f"in-batch loss {-loss}")
    
    optimizer.zero_grad()  # clear previous gradients
    loss.backward()        # compute gradients of all variables wrt loss
    optimizer.step()       # perform updates using calculated gradients

## After one "epoch" of training, the score has improved.

In [15]:
- test_score(ffnn, loss_fn, test_dl)

0.6663963953116993

## Let's train a lot more, and also print out the testing score as we go.

In [16]:
num_epochs = 10

for epoch in range(num_epochs):
    ffnn.train()  # put the model in training mode (not needed here, but good to remember to include)
    print(f"{epoch}", end=" ")
    for i, (x_batch, y_batch) in enumerate(train_dl):
        #print(".", end="")
        output_batch = ffnn(x_batch)
        loss = loss_fn(output_batch, y_batch)

        optimizer.zero_grad()  # clear previous gradients
        loss.backward()        # compute gradients of all variables wrt loss
        optimizer.step()       # perform updates using calculated gradients
        
    testloss = - test_score(ffnn, loss_fn, test_dl)
    print(f" test score {testloss}")

0  test score 0.6663963953116993
1  test score 0.6663963953116993
2  test score 0.6663963953116993
3  test score 0.6663963953116993
4  test score 0.6663963953116993
5  test score 0.6663963953116993
6  test score 0.6663963953116993
7  test score 0.6663963953116993
8  test score 0.6663963953116993
9  test score 0.6663963953116993


In [17]:
predict_out = ffnn(X_test)
_, predict_y = torch.max(predict_out, 1)

print('prediction accuracy', accuracy_score(Y_test.data, predict_y.data))

prediction accuracy 0.6649172020631617


Using a FFNN for classifying family income groups is not as effective as other methods such as Random Forest Decision Trees and SVM. When predicting if a families income is greater than 50,000, the maximum score achieved is .67, which is lower than SVM and Random Forest classification which the scores are greater than .7

When using the FFNN for predicting if an individual is married, we get scores around .56 which is significantly lower than using other methods.