## Import necessary functions 


In [0]:
# For data pre-processing
import pandas as pd
import numpy as np
# For splitting data into training and test
from sklearn.model_selection import train_test_split
# For preparing data for model
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
# For building layers in neural network and for embeddings
from keras.layers import Input, Dense, Embedding, Flatten, concatenate
# For creating model
from keras.models import Model
from keras import optimizers

## Read in data


In [2]:
# Upload adult_final.csv from the local machine through the Colab file picker
# NOTE(review): google.colab is presumably only importable inside Google Colab —
# outside Colab, place adult_final.csv in the working directory and skip this cell
from google.colab import files
uploaded = files.upload()

Saving adult_final.csv to adult_final.csv


In [0]:
# Read adult_final.csv (path relative to the working directory) into the DataFrame df1
df1 = pd.read_csv('adult_final.csv')

## Examine data

In [62]:
# Preview the first 5 rows of data (head() is called with its default row count)
df1.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
2,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
3,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
4,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K


In [63]:
# Number of rows and columns, as a (rows, columns) tuple
df1.shape

(617867, 14)

In [64]:
# Check the data type of each column
df1.dtypes

age                float64
workclass         category
education         category
education_num      float64
marital_status    category
occupation        category
relationship      category
race              category
sex               category
capital_gain       float64
capital_loss       float64
hours_per_week     float64
native_country    category
income            category
dtype: object

In [65]:
# Count the missing values in each column
df1.isnull().sum()

age               0
workclass         0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [0]:
# Frequency of every category within each categorical variable.
# The frame is reshaped to long form (one row per variable/value pair), using
# hours_per_week only as a column to count, and the count is then named "freq".
df1_melt = (
    df1[["workclass", "education", "marital_status", "occupation", "relationship",
         "race", "sex", "native_country", "income", "hours_per_week"]]
    .melt(id_vars="hours_per_week")
    .groupby(["variable", "value"], as_index=False)["hours_per_week"]
    .count()
    .rename(columns={"hours_per_week": "freq"})
)

In [67]:
# Preview the per-category frequency table
df1_melt.head()

Unnamed: 0,variable,value,freq
0,education,10th,18356
1,education,11th,22903
2,education,12th,8620
3,education,1st-4th,4024
4,education,5th-6th,7741


In [0]:
# Share of all observations that fall in each category
# (stored as a fraction of the row count, i.e. not multiplied by 100)
df1_melt["percentage"] = df1_melt["freq"] / len(df1)

In [69]:
# Find the share of the rarest category within each variable
df1_melt.groupby(["variable"],as_index=False)["percentage"].min()

Unnamed: 0,variable,percentage
0,education,0.001947
1,income,0.238593
2,marital_status,0.000704
3,native_country,0.000329
4,occupation,0.000312
5,race,0.006084
6,relationship,0.032552
7,sex,0.323814
8,workclass,0.000257


## Preprocessing



In [0]:
# Column names grouped by type: continuous predictors first, then the
# categorical columns (the response, income, is included in the latter)
numerical = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
categorical = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
    "income",
]

In [0]:
# Cast every listed column to its modelling dtype in a single pass:
# numerical columns become floats (for scaling later) and categorical columns
# become pandas categories
df1 = df1.astype({
    **{column: 'float' for column in numerical},
    **{column: 'category' for column in categorical},
})

In [0]:
# Derive the column lists from the dtypes (useful when there are many variables)

# income is the response, so it is kept out of the predictor lists
target_attr = 'income'
# Every 'category' column except the response
categorical_attr = df1.select_dtypes('category').columns.drop(target_attr)
# Every 'float' column
numerical_attr = df1.select_dtypes('float').columns

In [0]:
# Split predictors and response into training (67%) and test (33%) sets.
# The split is stratified on the response and uses a fixed random_state.
(data_categorical_train, data_categorical_test,
 data_numerical_train, data_numerical_test,
 Y_train, Y_test) = train_test_split(
    df1[categorical_attr],
    df1[numerical_attr],
    df1[target_attr],
    test_size=0.33,
    random_state=123,
    stratify=df1[target_attr],
)

In [0]:
# One-hot encode the categorical predictors.
# handle_unknown='ignore': a category that appears in test but not in train is
# encoded as all zeros for that variable instead of raising an error.
onehotencoder = OneHotEncoder(handle_unknown='ignore')
# Learn the category-to-column mapping from the training data and encode it
OneHotEncoder_train = onehotencoder.fit_transform(data_categorical_train).toarray()
# fit()/fit_transform() store the fitted state on the encoder itself
OneHotEncoder_fit = onehotencoder
# Apply the mapping learned on train to the test data
OneHotEncoder_test = OneHotEncoder_fit.transform(data_categorical_test).toarray()

In [79]:
# Rows and one-hot columns in the encoded training predictors
OneHotEncoder_train.shape

(413970, 101)

In [80]:
# Rows and one-hot columns in the encoded test predictors
OneHotEncoder_test.shape

(203897, 101)

In [82]:
# One hot encode the response variable.
# A dedicated encoder is used so that the encoder fitted on the categorical
# predictors is not silently refit on the target.
target_encoder = OneHotEncoder(handle_unknown='ignore')
# Series.to_numpy() replaces the deprecated Series.values.get_values(); the
# reshape turns the 1-D response into the single-column 2-D array the encoder expects.
target_train_column = Y_train.to_numpy().reshape(-1, 1)
target_test_column = Y_test.to_numpy().reshape(-1, 1)
OneHotEncoder_target_train = target_encoder.fit_transform(target_train_column).toarray()
OneHotEncoder_target_test = target_encoder.transform(target_test_column).toarray()
# The response is binary, so one indicator column is enough: keep column 0.
# NOTE(review): column 0 is the first entry of target_encoder.categories_[0]
# (presumably '<=50K') — confirm which class the model treats as positive.
OneHotEncoder_target_train = OneHotEncoder_target_train[:, 0]
OneHotEncoder_target_test = OneHotEncoder_target_test[:, 0]

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# Standardize the continuous data with StandardScaler: the scaler is fitted on
# the training split only and the same transform is then applied to the test split.
# Alternative: use Scaler = MinMaxScaler() below to min-max scale instead.
Scaler = StandardScaler()
scaled_attr_train = Scaler.fit_transform(data_numerical_train)
# fit()/fit_transform() store the fitted state on the scaler itself
scaled_attr = Scaler
scaled_attr_test = scaled_attr.transform(data_numerical_test)


In [84]:
# Training design matrix with all input variables:
# scaled numerical columns first, followed by the one-hot columns
X_train = np.concatenate([scaled_attr_train, OneHotEncoder_train], axis=1)
X_train.shape

(413970, 106)

In [85]:
# Test design matrix with all input variables, in the same column order as X_train
X_test = np.concatenate([scaled_attr_test, OneHotEncoder_test], axis=1)
X_test.shape

(203897, 106)

## Build a perceptron

In [0]:
# Input layer: its shape is the number of input variables (columns of X_train)
inputs = Input(shape=X_train.shape[1:], name='inputs')
# Output layer: a single node with "sigmoid" activation,
# connected directly to the input layer
outputs = Dense(units=1, activation='sigmoid')(inputs)

In [0]:
# Create the model from the input layer and output layer defined for the perceptron
model01 = Model(inputs=inputs, outputs=outputs)

In [88]:
# Print each layer with its output shape and parameter count
model01.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 106)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 107       
Total params: 107
Trainable params: 107
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Specify the Adam optimizer with a 0.01 learning rate; amsgrad = False so that it
# does not use the AMSGrad variant of the optimizer
# NOTE(review): newer Keras releases name this argument `learning_rate` and deprecate
# `lr` — confirm against the installed Keras version before changing
adam = optimizers.Adam(lr = 0.01, amsgrad = False)

In [0]:
# Compile the model with "binary_crossentropy" (logistic) loss, the adam optimizer
# created for this model, and "accuracy" as the reported metric
model01.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ["accuracy"])

In [91]:
# Train for 5 epochs, holding out 20% of the training rows for validation
model01_fit = model01.fit(
    x=X_train,
    y=OneHotEncoder_target_train,
    epochs=5,
    validation_split=0.20,
)

Train on 331176 samples, validate on 82794 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [92]:
# Loss and accuracy of the perceptron on the training set
model01.evaluate(x=[X_train], y=OneHotEncoder_target_train)



[0.313392620929604, 0.8556972727480812]

## Multilayer perceptron

In [0]:
# Input layer: one entry per column of X_train
inputs = Input(shape=X_train.shape[1:], name='inputs')
# Hidden dense layer with 32 nodes and "relu" activation
dense1 = Dense(units=32, activation="relu")(inputs)
# Output layer: a single node with "sigmoid" activation
outputs = Dense(units=1, activation='sigmoid')(dense1)

In [0]:
# Build the multilayer model from its input layer and output layer
model02 = Model(inputs=inputs, outputs=outputs)

In [95]:
# Print each layer with its output shape and parameter count
model02.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 106)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                3424      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 3,457
Trainable params: 3,457
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Specify the Adam optimizer with a 0.1 learning rate (amsgrad = False: no AMSGrad variant)
# NOTE(review): newer Keras releases name this argument `learning_rate` and deprecate
# `lr` — confirm against the installed Keras version before changing
adam = optimizers.Adam(lr = 0.1, amsgrad = False)

In [0]:
# Compile the model with "binary_crossentropy" loss, the adam optimizer created for
# this model, and "accuracy" as the reported metric
model02.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ["accuracy"])

In [98]:
# Train for 5 epochs, holding out 20% of the training rows for validation
model02_fit = model02.fit(
    x=X_train,
    y=OneHotEncoder_target_train,
    epochs=5,
    validation_split=0.20,
)

Train on 331176 samples, validate on 82794 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [99]:
# Loss and accuracy of the multilayer perceptron on the training set
model02.evaluate(x=X_train, y=OneHotEncoder_target_train)



[0.34381661177988887, 0.8385583496391507]

## Embedding Preprocessing

In [100]:
## Number of distinct values in native_country
## (dropna=False so a missing value, if present, is counted just as unique() would list it)
native_country_levels = df1["native_country"].nunique(dropna=False)
native_country_levels

41

In [101]:
## Make sure native_country is stored with the categorical dtype
df1["native_country"] = df1["native_country"].astype("category")
## Count the missing values in native_country
df1.native_country.isnull().sum()

0

In [102]:
# Show the categorical predictor columns currently selected for one-hot encoding
categorical_attr

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country'],
      dtype='object')

In [0]:
# Drop native_country from the list of attributes that was created earlier,
# so it is no longer part of the one-hot encoded predictors.
# errors="ignore" keeps this cell safe to re-run: without it a second execution
# raises KeyError because native_country has already been removed from the Index.
categorical_attr = categorical_attr.drop(["native_country"], errors="ignore")

In [104]:
# Confirm which categorical predictor columns remain in the list
categorical_attr

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex'],
      dtype='object')

In [0]:
# Split into training (67%) and test (33%) sets, this time carrying native_country
# as its own array. The split is stratified on the response with a fixed random_state.
(data_categorical_train, data_categorical_test,
 data_numerical_train, data_numerical_test,
 data_country_train, data_country_test,
 Y_train, Y_test) = train_test_split(
    df1[categorical_attr],
    df1[numerical_attr],
    df1["native_country"],
    df1[target_attr],
    test_size=0.33,
    random_state=123,
    stratify=df1[target_attr],
)


In [0]:
# One-hot encode the categorical predictors.
# handle_unknown='ignore': a category that appears in test but not in train is
# encoded as all zeros for that variable instead of raising an error.
onehotencoder = OneHotEncoder(handle_unknown='ignore')
# Learn the category-to-column mapping from the training data and encode it
OneHotEncoder_train = onehotencoder.fit_transform(data_categorical_train).toarray()
# fit()/fit_transform() store the fitted state on the encoder itself
OneHotEncoder_fit = onehotencoder
# Apply the mapping learned on train to the test data
OneHotEncoder_test = OneHotEncoder_fit.transform(data_categorical_test).toarray()

In [107]:
# One hot encode the response variable.
# The fitted encoder is stored as target_encoder: binding it to the name
# OneHotEncoder would shadow the imported class and break any later (or re-run)
# OneHotEncoder(...) call. A dedicated instance also avoids refitting the
# encoder used for the categorical predictors.
target_encoder = OneHotEncoder(handle_unknown='ignore')
# Series.to_numpy() replaces the deprecated Series.values.get_values(); the
# reshape turns the 1-D response into the single-column 2-D array the encoder expects.
target_train_column = Y_train.to_numpy().reshape(-1, 1)
target_test_column = Y_test.to_numpy().reshape(-1, 1)
OneHotEncoder_target_train = target_encoder.fit_transform(target_train_column).toarray()
OneHotEncoder_target_test = target_encoder.transform(target_test_column).toarray()
# The response is binary, so one indicator column is enough: keep column 0.
# NOTE(review): column 0 is the first entry of target_encoder.categories_[0]
# (presumably '<=50K') — confirm which class the model treats as positive.
OneHotEncoder_target_train = OneHotEncoder_target_train[:, 0]
OneHotEncoder_target_test = OneHotEncoder_target_test[:, 0]

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# Standardize the continuous data with StandardScaler: the scaler is fitted on
# the training split only and the same transform is then applied to the test split.
# Alternative: use Scaler = MinMaxScaler() below to min-max scale instead.
Scaler = StandardScaler()
scaled_attr_train = Scaler.fit_transform(data_numerical_train)
# fit()/fit_transform() store the fitted state on the scaler itself
scaled_attr = Scaler
scaled_attr_test = scaled_attr.transform(data_numerical_test)



In [0]:
# Training design matrix with all input variables EXCEPT the variable to embed
X_train = np.concatenate([scaled_attr_train, OneHotEncoder_train], axis=1)

# Test design matrix with all input variables EXCEPT the variable to embed
X_test = np.concatenate([scaled_attr_test, OneHotEncoder_test], axis=1)

In [0]:
# Give each level of native_country an integer code between 0 and native_country_levels-1
# The encoder is fitted on the full column of df1, not only on the training split
native_country_levels_encoded = LabelEncoder().fit(df1["native_country"])

In [0]:
# Replace the native_country value of every row in the training and test sets with its integer code
native_country_levels_encoded_train=native_country_levels_encoded.transform(data_country_train)
native_country_levels_encoded_test=native_country_levels_encoded.transform(data_country_test)

In [112]:
# List the distinct integer codes that occur in the training split
np.unique(native_country_levels_encoded_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40])

## Build Multilayer Perceptron


### Embedding layer in neural net

In [0]:
# INPUT layer for the variable to embed
# Input shape is 1 because each row carries a single integer code for native_country
# The layer is given a name ("country")
country_input = Input(shape=(1, ), name="country")
# input_dim is the number of distinct codes (native_country_levels); output_dim is the
# length of the vector chosen to represent each country (10 here)
country_embed = Embedding(input_dim=native_country_levels, output_dim=10)(country_input)
# Flatten layer is used to ensure the final embedding is a 1-D vector for each row
country_embed_flat = Flatten()(country_embed)

### Create a dense layer for other variables


In [0]:
# Input for every variable other than the embedded one: one entry per X_train column
num_cat_inputs = Input(shape=X_train.shape[1:], name='num_cat_inputs')
# Hidden layer with 8 nodes and "relu" activation on those variables
out_num_cat = Dense(units=8, activation='relu')(num_cat_inputs)

### Connect embedding layer

In [0]:
# Concatenate the flattened country embedding with the dense output of the other variables
concatenated = concatenate([country_embed_flat,out_num_cat],axis=-1)
# Hidden layer with 8 nodes and "relu" activation on the combined vector
X = Dense(8, activation='relu')(concatenated)
# Output layer: a single node with "sigmoid" activation
final_out = Dense(1, activation='sigmoid')(X)

In [0]:
# Create the model from the embedding input layer and the numerical/categorical input layer
# The order of `inputs` here is the order in which arrays must be passed to fit/evaluate
model03 = Model(inputs=[country_input,num_cat_inputs], outputs=final_out)

In [117]:
# Print each layer with its output shape, parameter count and incoming connections
model03.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
country (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 10)        410         country[0][0]                    
__________________________________________________________________________________________________
num_cat_inputs (InputLayer)     (None, 65)           0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 10)           0           embedding_1[0][0]                
____________________________________________________________________________________________

In [0]:
# Specify the Adam optimizer with a 0.1 learning rate (amsgrad = False: no AMSGrad variant)
# NOTE(review): newer Keras releases name this argument `learning_rate` and deprecate
# `lr` — confirm against the installed Keras version before changing
adam = optimizers.Adam(lr = 0.1, amsgrad = False)

In [0]:
# Compile the embedding model with "binary_crossentropy" loss, the adam optimizer
# created for this model, and "accuracy" as the reported metric
model03.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ["accuracy"])

In [120]:
# Train the embedding model for 5 epochs, holding out 20% of the training rows for
# validation. Inputs are passed in the order the model declares them:
# country codes first, then the remaining variables.
model03.fit(
    x=[native_country_levels_encoded_train, X_train],
    y=OneHotEncoder_target_train,
    epochs=5,
    validation_split=0.20,
)

Train on 331176 samples, validate on 82794 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff6147f0ef0>

In [122]:
# Loss and accuracy of the embedding model on the training set
model03.evaluate(
    x=[native_country_levels_encoded_train, X_train],
    y=OneHotEncoder_target_train,
)



[0.506952140624245, 0.7879121675475091]

### Final test

In [0]:
# Evaluate loss and accuracy on the test set (left commented out)
# NOTE(review): X_test was rebuilt without native_country in the embedding section
# (65 columns), so it no longer matches the 106-column input model02 was built on —
# rebuild the full test matrix before enabling the model02 call
# model02.evaluate(X_test, 
#                y=OneHotEncoder_target_test, )

# ## OR 

# # Evaluate the embedding model on the test set
# model03.evaluate([native_country_levels_encoded_test,X_test], 
#                y=OneHotEncoder_target_test, )