# From 0 to machine learning - Let's try an example with real data

Start importing the required libraries.

In [46]:
import numpy as np
import pandas as pd
from tensorflow.keras import models, activations, layers, metrics, optimizers, regularizers, callbacks
import re
import datetime

## Import dataset and show the first rows
So you can have an idea of what the dataset we are using contains.

In [2]:
dataset = pd.read_csv("/workspace/data/income.csv", header=0, skipinitialspace=True)
del dataset['Unnamed: 0']
dataset.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education num,marital status,occupation,relationship,race,m_f,capital gain,capital loss,hours/week,native country,income bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Aim of the project

We want to predict the income of a person (the records of the dataset are labeled by the last column "income bracket").

## Analize the dataset

We see that we have some column names that doesn't describe what they contains.

We find that the label for the row is the last column, the one that classify the income in two classes (more or less than 50k/year).

In [3]:
dataset.rename(columns={"m_f": 'gender', "income bracket": "output"}, inplace=True)
dataset.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education num,marital status,occupation,relationship,race,gender,capital gain,capital loss,hours/week,native country,output
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


### Look for bad data
Aim here is to find bad data such as typos and correct them.

In [4]:
dataset.groupby(["race"])["race"].count()

race
Amer-Indian-Eskimo      311
Asian-Pac-Islander     1039
Black                  3124
Other                   271
White                 27527
Wite                    289
Name: race, dtype: int64

In [5]:
condition = dataset.loc[:, "race"] == "Wite"
dataset.loc[condition, "race"] = "White"
dataset.groupby(["race"])["race"].count()

race
Amer-Indian-Eskimo      311
Asian-Pac-Islander     1039
Black                  3124
Other                   271
White                 27816
Name: race, dtype: int64

Aggregate similar data to reduce feature dimension

In [6]:
def replace_on_condition(dataset, column_name, condition_value, replace_value):
    condition = dataset.loc[:, column_name].str.contains(condition_value)
    dataset.loc[condition, column_name] = replace_value
    
    return dataset
    
dataset = replace_on_condition(dataset, "workclass", "Private", "private")
dataset = replace_on_condition(dataset, "workclass", "Self", "self")
dataset = replace_on_condition(dataset, "workclass", "gov", "gov")
dataset = replace_on_condition(dataset, "workclass", "Without", "other")
dataset = replace_on_condition(dataset, "workclass", "Never", "other")
dataset = replace_on_condition(dataset, "workclass", "\?", "other")

dataset["workclass"].unique()

array(['gov', 'self', 'private', 'other'], dtype=object)

In [7]:
dataset["fnlwgt"] = np.log1p(dataset["fnlwgt"])
dataset["fnlwgt"].describe()

count    32561.000000
mean        11.983778
std          0.630738
min          9.416216
25%         11.676981
50%         12.091542
75%         12.376035
max         14.210727
Name: fnlwgt, dtype: float64

In [8]:
def standardize_number(dataframe, column_name):
    return (dataframe[column_name] - dataframe[column_name].mean()) / dataframe[column_name].std()

for name in ["age", "fnlwgt", "capital gain", "capital loss", "education num", "hours/week"]:
    dataset[name] = standardize_number(dataset, name)

In [9]:
north_america = ['United-States', 'Cuba', 'Jamaica', 'Mexico', 'Honduras', 'Canada', 'Puerto-Rico']
south_america = ['South', 'Philippines', 'Columbia', 'Cambodia', 'Ecuador', 'Haiti', 'Dominican-Republic', 'El-Salvador', 'Guatemala', 'Peru', 'Outlying-US(Guam-USVI-etc)', 'Trinadad&Tobago', 'Nicaragua']
asia = ['India', 'Iran', 'Thailand', 'Laos', 'Taiwan', 'China', 'Japan', 'Vietnam', 'Hong']
europe = ['England', 'Germany', 'Italy', 'Poland', 'Portugal', 'France', 'Yugoslavia', 'Scotland', 'Greece', 'Ireland', 'Hungary', 'Holand-Netherlands']
other = ['\?']

def replace_native_country(dataset, values, replacement):
    for value in values:
        dataset = replace_on_condition(dataset, "native country", value, replacement)
    
    return dataset

dataset = replace_native_country(dataset, north_america, "north_america")
dataset = replace_native_country(dataset, south_america, "south_america")
dataset = replace_native_country(dataset, asia, "asia")
dataset = replace_native_country(dataset, europe, "europe")
dataset = replace_native_country(dataset, other, "other")

  return func(self, *args, **kwargs)


### Expand categorical data
Aim here is to expand a categorical feature with finite values into more features.

In [10]:
def expand_category(dataframe, column_name):
    expanded_dataframe = pd.concat([dataframe, pd.get_dummies(dataframe[column_name], prefix=column_name)], axis=1)

    expanded_dataframe.drop([column_name], axis=1, inplace=True)

    return expanded_dataframe

for name in ["workclass", "education", "marital status", "native country", "gender", "race", "occupation", "relationship"]:
    dataset = expand_category(dataset, name)

In [11]:
dataset.columns

Index(['age', 'fnlwgt', 'education num', 'capital gain', 'capital loss',
       'hours/week', 'output', 'workclass_gov', 'workclass_other',
       'workclass_private', 'workclass_self', 'education_10th',
       'education_11th', 'education_12th', 'education_1st-4th',
       'education_5th-6th', 'education_7th-8th', 'education_9th',
       'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'marital status_Divorced',
       'marital status_Married-AF-spouse', 'marital status_Married-civ-spouse',
       'marital status_Married-spouse-absent', 'marital status_Never-married',
       'marital status_Separated', 'marital status_Widowed',
       'native country_Outlying-US(Guam-USVI-etc)', 'native country_asia',
       'native country_europe', 'native country_north_america',
       'native country_other', 'native coun

### Output class as 0 and 1

In [12]:
condition = dataset.loc[:, "output"] == "<=50K"
dataset.loc[condition, "output"] = 0.0
dataset.loc[np.logical_not(condition), "output"] = 1.0

In [13]:
inputs = dataset.drop(["output"], axis=1)

outputs = dataset["output"]

dataset.head(5)

Unnamed: 0,age,fnlwgt,education num,capital gain,capital loss,hours/week,output,workclass_gov,workclass_other,workclass_private,...,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
0,0.03067,-1.15028,1.134721,0.148451,-0.216656,-0.035429,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.837096,-1.035977,1.134721,-0.145918,-0.216656,-2.222119,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,-0.042641,0.47186,-0.420053,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,1.057031,0.606241,-1.19744,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,-0.775756,1.186286,1.134721,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [51]:
def build(output_size):
    model = models.Sequential()
    model.add(layers.Dense(units=50, activation=activations.tanh))
    model.add(layers.Dense(units=output_size, activation=activations.sigmoid))

    model.compile(loss='binary_crossentropy',
                  metrics=[metrics.AUC()],
                  optimizer=optimizers.Adam(learning_rate=0.01, beta_1=0.9))

    return model

def fit(epochs, batch_size, inputs, outputs, validation_inputs, validation_outputs):
    model.fit(inputs, outputs,
              callbacks=[callbacks.EarlyStopping(mode="min", patience=10, restore_best_weights=True, monitor="val_loss")],
              validation_data=(validation_inputs, validation_outputs),
              batch_size=batch_size,
              epochs=epochs)

epochs = 50
batch_size = 500

inputs = dataset.drop(["output"], axis=1).to_numpy(dtype=np.float32)
outputs = dataset["output"].to_numpy(dtype=np.float32)
outputs = np.expand_dims(outputs, 1)


over_mask = np.reshape(outputs == 1, (-1))

inputs_over = inputs[over_mask]
outputs_over = outputs[over_mask]
inputs_notover = inputs[np.logical_not(over_mask)]
outputs_notover = outputs[np.logical_not(over_mask)]

mask_over = np.random.random(inputs_over.shape[0]) > 0.1
mask_notover = np.random.random(inputs_notover.shape[0]) > 0.1

train_inputs_over = inputs_over[mask_over]
train_inputs_notover = inputs_notover[mask_notover]
train_outputs_over = outputs_over[mask_over]
train_outputs_notover = outputs_notover[mask_notover]
validation_inputs_over = inputs_over[np.logical_not(mask_over)]
validation_inputs_notover = inputs_notover[np.logical_not(mask_notover)]
validation_outputs_over = outputs_over[np.logical_not(mask_over)]
validation_outputs_notover = outputs_notover[np.logical_not(mask_notover)]

train_inputs = np.vstack((train_inputs_over, train_inputs_notover))
train_outputs = np.vstack((train_outputs_over, train_outputs_notover))
validation_inputs = np.vstack((validation_inputs_over, validation_inputs_notover))
validation_outputs = np.vstack((validation_outputs_over, validation_outputs_notover))

np.save("./inputs", inputs)
np.save("./outputs", outputs)

model = build(outputs.shape[1])

fit(epochs, batch_size, train_inputs, train_outputs, validation_inputs, validation_outputs)

print(model.predict(validation_inputs))
print(validation_outputs)

Train on 29260 samples, validate on 3301 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
[[0.5672699 ]
 [0.83466876]
 [0.92695385]
 ...
 [0.07971731]
 [0.07304411]
 [0.01557716]]
[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [0.]
 [0.]]
