#### Source: https://towardsdatascience.com/pytorch-widedeep-deep-learning-for-tabular-data-9cd1c48eb40d

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the Adult census data; the first CSV column is used as the index.
adult = pd.read_csv("data/adult.csv", index_col=0)
# Use underscores instead of hyphens in the column names.
adult.columns = [c.replace("-", "_") for c in adult.columns]
# Binary target: 1 when the raw income string contains ">50K", otherwise 0.
adult["income_label"] = (adult["income"].apply(lambda x: ">50K" in x)).astype(int)
# Drop the raw target and the features this first model does not use.
# Rebinding (instead of inplace=True) keeps the statement side-effect free.
adult = adult.drop(
    columns=["income", "gender", "capital_gain", "capital_loss", "native_country"]
)

# Clean every string (object-dtype) column: "?" becomes "unknown", then the
# values are lower-cased.
for c in adult.columns:
    if adult[c].dtype == 'O':
        adult[c] = adult[c].apply(lambda x: "unknown" if x == "?" else x).str.lower()

# Stratified 80/20 split. A fixed random_state makes the split, and therefore
# every metric computed from it, reproducible across kernel restarts.
adult_train, adult_test = train_test_split(
    adult, test_size=0.2, stratify=adult.income_label, random_state=42
)

In [77]:
adult_test.shape

  and should_run_async(code)


(6513, 10)

In [47]:
adult_train.shape

(26048, 10)

In [80]:
adult_test.columns==adult_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [40]:
adult.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_label
0,39,state-gov,bachelors,13,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states,0
1,50,self-emp-not-inc,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states,0
2,38,private,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states,0
3,53,private,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states,0
4,28,private,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba,0


In [44]:
len(set(adult.columns))

  and should_run_async(code)


14

In [81]:
from pytorch_widedeep.preprocessing import TabPreprocessor

# Categorical columns to embed, given as (column name, embedding dimension) pairs.
embed_cols = [
    ('workclass', 6),
    ('education', 8),
    ('marital_status', 6),
    ('occupation', 8),
    ('relationship', 6),
    ('race', 6),
]
# Numeric columns passed to the model as continuous features.
cont_cols = ["age", "hours_per_week", "education_num"]
# Training labels as a NumPy array.
target = adult_train["income_label"].values

# prepare deeptabular component
# The keyword is `cat_embed_cols`: it belongs to the same pytorch-widedeep API
# as the `cat_embed_input` attribute read when the model is built, and it
# matches the TabPreprocessor call in the wide-and-deep section of this notebook.
tab_preprocessor = TabPreprocessor(cat_embed_cols=embed_cols, continuous_cols=cont_cols)
X_tab = tab_preprocessor.fit_transform(adult_train)

In [101]:
# At this stage the data is prepared and we are ready to build the model

from pytorch_widedeep.models import TabMlp, WideDeep

# Deep tabular component: an MLP configured from the fitted preprocessor
# (column positions, categorical embedding spec, continuous column names).
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=tab_preprocessor.continuous_cols, 
)
# This first model uses only the deeptabular component (no wide component).
model = WideDeep(deeptabular=tab_mlp)

In [73]:
model

WideDeep(
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
        (cat_embed): DiffSizeCatEmbeddings(
          (embed_layers): ModuleDict(
            (emb_layer_workclass): Embedding(10, 6, padding_idx=0)
            (emb_layer_education): Embedding(17, 8, padding_idx=0)
            (emb_layer_marital_status): Embedding(8, 6, padding_idx=0)
            (emb_layer_occupation): Embedding(16, 8, padding_idx=0)
            (emb_layer_relationship): Embedding(7, 6, padding_idx=0)
            (emb_layer_race): Embedding(6, 6, padding_idx=0)
          )
          (embedding_dropout): Dropout(p=0.1, inplace=False)
        )
        (cont_norm): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (tab_mlp): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
            (0): Dropout(p=0.1, inplace=False)
            (1): Linear(in_features=43, out_features=200, bias=True)
      

In [83]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.metrics import Accuracy

# Trainer for a binary objective that reports accuracy while fitting.
trainer = Trainer(model, objective="binary", metrics=[Accuracy])

# Five epochs with batches of 256 rows; val_split=0.2 reserves part of the
# training data for the per-epoch validation pass.
trainer.fit(
    X_tab=X_tab,
    target=target,
    n_epochs=5,
    batch_size=256,
    val_split=0.2,
)

epoch 1: 100%|██████████| 82/82 [00:00<00:00, 107.98it/s, loss=0.432, metrics={'acc': 0.7858}]
valid: 100%|██████████| 21/21 [00:00<00:00, 102.63it/s, loss=0.366, metrics={'acc': 0.8219}]
epoch 2: 100%|██████████| 82/82 [00:00<00:00, 101.73it/s, loss=0.367, metrics={'acc': 0.8265}]
valid: 100%|██████████| 21/21 [00:00<00:00, 102.43it/s, loss=0.356, metrics={'acc': 0.8299}]
epoch 3: 100%|██████████| 82/82 [00:00<00:00, 109.31it/s, loss=0.36, metrics={'acc': 0.8302}]
valid: 100%|██████████| 21/21 [00:00<00:00, 108.20it/s, loss=0.353, metrics={'acc': 0.8328}]
epoch 4: 100%|██████████| 82/82 [00:00<00:00, 101.15it/s, loss=0.358, metrics={'acc': 0.8317}]
valid: 100%|██████████| 21/21 [00:00<00:00, 99.21it/s, loss=0.351, metrics={'acc': 0.8332}]
epoch 5: 100%|██████████| 82/82 [00:00<00:00, 106.73it/s, loss=0.355, metrics={'acc': 0.8362}]
valid: 100%|██████████| 21/21 [00:00<00:00, 101.87it/s, loss=0.351, metrics={'acc': 0.8324}]


In [None]:
# NOTE(review): this cell has no execution count and uses WidePreprocessor,
# wide_cols, crossed_cols and df_train, which (as far as this notebook shows)
# are only imported/defined in the later "Slightly different model" section.
# On a fresh top-to-bottom run it would presumably fail with NameError here.
# The same two statements appear again in that later section, so this cell
# looks like a leftover; confirm and remove.
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = wide_preprocessor.fit_transform(df_train)

In [84]:
# predict on test
# The wide transform stays disabled: this first model has no wide component.
#X_wide_te = wide_preprocessor.transform(adult_test)
# Reuse the preprocessor fitted on the training split (transform, not fit_transform).
X_tab_te = tab_preprocessor.transform(adult_test)
preds = trainer.predict(X_tab=X_tab_te)

  and should_run_async(code)
predict: 100%|██████████| 26/26 [00:00<00:00, 124.37it/s]


In [85]:
preds

  and should_run_async(code)


array([0, 0, 0, ..., 0, 0, 0])

In [88]:
adult_test.income_label.values

array([0, 0, 0, ..., 0, 0, 0])

In [86]:
def compute_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Args:
        y_true: Sized iterable of ground-truth labels.
        y_pred: Sized iterable of predicted labels, same length as ``y_true``.

    Returns:
        float: number of matching pairs divided by ``len(y_true)``.

    Raises:
        ValueError: if the inputs differ in length (``zip`` would otherwise
            silently ignore the surplus items and skew the score), or if they
            are empty (accuracy is undefined and the division would fail).
    """
    n_true, n_pred = len(y_true), len(y_pred)
    if n_true != n_pred:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_true} and {n_pred}"
        )
    if n_true == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # Count the positions where the label and the prediction are equal.
    correct_predictions = sum(
        1 for true, predicted in zip(y_true, y_pred) if true == predicted
    )
    return correct_predictions / n_true

In [89]:
compute_accuracy(adult_test['income_label'].values, preds)

0.841701212958698

# Slightly different model

In [92]:
# https://githubhelp.com/jrzaurin/pytorch-widedeep


import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep.datasets import load_adult

  and should_run_async(code)


In [94]:
# Load data

# The packaged loader is left disabled; the local CSV is read instead, with
# its first column used as the index.
#df = load_adult(as_frame=True)
df = pd.read_csv("data/adult.csv", index_col=0)
# Preview the first rows.
df.head()

  and should_run_async(code)


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [93]:


# Binary target: 1 when the raw income string contains ">50K", otherwise 0.
df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
# Remove the raw target column (rebinding instead of inplace=True).
df = df.drop(columns="income")
# Stratified 80/20 split. A fixed random_state makes the split, and therefore
# the reported metrics, reproducible across kernel restarts.
df_train, df_test = train_test_split(
    df, test_size=0.2, stratify=df.income_label, random_state=42
)

# Define the 'column set up'
# Columns fed to the wide (linear) component.
wide_cols = [
    "education",
    "relationship",
    "workclass",
    "occupation",
    "native-country",
    "gender",
]
# Column pairs handed to the wide preprocessor as crossed features.
crossed_cols = [("education", "occupation"), ("native-country", "occupation")]

# Columns the deep component embeds as categoricals. Note that capital-gain
# and capital-loss are listed here, i.e. treated as categories.
cat_embed_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital-gain",
    "capital-loss",
    "native-country",
]
# Columns the deep component treats as continuous.
continuous_cols = ["age", "hours-per-week"]
# Keep the column name and the label array under separate names so `target`
# is never briefly a string.
target_col = "income_label"
target = df_train[target_col].values

In [96]:
# prepare the data

# Wide component input: built from the wide columns and the crossed pairs.
wide_preprocessor = WidePreprocessor(
    wide_cols=wide_cols,
    crossed_cols=crossed_cols,
)
X_wide = wide_preprocessor.fit_transform(df_train)

# Deep component input: categorical columns to embed plus continuous columns.
tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols,  # type: ignore[arg-type]
    continuous_cols=continuous_cols,
)
X_tab = tab_preprocessor.fit_transform(df_train)

In [97]:
# build the model

# The wide component is sized by the number of distinct values in X_wide.
n_wide_values = len(np.unique(X_wide))
wide = Wide(input_dim=n_wide_values, pred_dim=1)

# Deep component: MLP over the embedded categorical and continuous columns.
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
)

# Combine both components and display the resulting architecture.
model = WideDeep(wide=wide, deeptabular=tab_mlp)
model

  and should_run_async(code)


WideDeep(
  (wide): Wide(
    (wide_linear): Embedding(722, 1, padding_idx=0)
  )
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
        (cat_embed): DiffSizeCatEmbeddings(
          (embed_layers): ModuleDict(
            (emb_layer_workclass): Embedding(10, 5, padding_idx=0)
            (emb_layer_education): Embedding(17, 8, padding_idx=0)
            (emb_layer_marital-status): Embedding(8, 5, padding_idx=0)
            (emb_layer_occupation): Embedding(16, 7, padding_idx=0)
            (emb_layer_relationship): Embedding(7, 4, padding_idx=0)
            (emb_layer_race): Embedding(6, 4, padding_idx=0)
            (emb_layer_gender): Embedding(3, 2, padding_idx=0)
            (emb_layer_capital-gain): Embedding(118, 23, padding_idx=0)
            (emb_layer_capital-loss): Embedding(90, 20, padding_idx=0)
            (emb_layer_native-country): Embedding(43, 13, padding_idx=0)
          )
          (embedding_dropout): Dropout

In [98]:
# Train the wide + deep model with accuracy as the reported metric.
# No val_split or validation data is passed to fit() in this cell.
trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=5,
    batch_size=256,
)

epoch 1: 100%|██████████| 102/102 [00:01<00:00, 85.36it/s, loss=0.467, metrics={'acc': 0.7776}]
epoch 2: 100%|██████████| 102/102 [00:01<00:00, 89.30it/s, loss=0.37, metrics={'acc': 0.827}] 
epoch 3: 100%|██████████| 102/102 [00:01<00:00, 90.97it/s, loss=0.341, metrics={'acc': 0.8398}]
epoch 4: 100%|██████████| 102/102 [00:01<00:00, 89.85it/s, loss=0.326, metrics={'acc': 0.8477}]
epoch 5: 100%|██████████| 102/102 [00:01<00:00, 91.16it/s, loss=0.314, metrics={'acc': 0.8563}]


In [99]:
# predict on test
# Both preprocessors were fitted on df_train; the test split is only transformed.
X_wide_te = wide_preprocessor.transform(df_test)
X_tab_te = tab_preprocessor.transform(df_test)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

predict: 100%|██████████| 26/26 [00:00<00:00, 115.55it/s]


In [100]:
def compute_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Args:
        y_true: Sized iterable of ground-truth labels.
        y_pred: Sized iterable of predicted labels, same length as ``y_true``.

    Returns:
        float: number of matching pairs divided by ``len(y_true)``.

    Raises:
        ValueError: if the inputs differ in length (``zip`` would otherwise
            silently ignore the surplus items and skew the score), or if they
            are empty (accuracy is undefined and the division would fail).
    """
    n_true, n_pred = len(y_true), len(y_pred)
    if n_true != n_pred:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_true} and {n_pred}"
        )
    if n_true == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # Count the positions where the label and the prediction are equal.
    correct_predictions = sum(
        1 for true, predicted in zip(y_true, y_pred) if true == predicted
    )
    return correct_predictions / n_true


compute_accuracy(df_test['income_label'].values, preds)

  and should_run_async(code)


0.8604329801934593

In [61]:

# Save and load

# Option 1: this will also save training history and lr history if the
# LRHistory callback is used
trainer.save(path="model_weights", save_state_dict=True)

# Option 2: save as any other torch model
torch.save(model.state_dict(), "model_weights/wd_model.pt")

# From here onwards, Options 1 and 2 are the same. It is assumed the user has
# prepared the data and defined the new model components:
# 1. Build the model
# NOTE(review): `wide` and `tab_mlp` are the same component objects that were
# passed to the trained `model` above, not freshly constructed ones; confirm
# whether new instances were intended for a real reload.
model_new = WideDeep(wide=wide, deeptabular=tab_mlp)
model_new.load_state_dict(torch.load("model_weights/wd_model.pt"))

# 2. Instantiate the trainer
trainer_new = Trainer(model_new, objective="binary")

# 3. Either start the fit or directly predict
# NOTE(review): X_wide / X_tab are the matrices fitted on df_train, so this
# rebinds `preds` (previously the test-set predictions) to training-set
# predictions.
preds = trainer_new.predict(X_wide=X_wide, X_tab=X_tab)

  and should_run_async(code)
epoch 1: 100%|██████████| 153/153 [00:01<00:00, 90.25it/s, loss=0.43, metrics={'acc': 0.7988}] 
epoch 2: 100%|██████████| 153/153 [00:01<00:00, 96.62it/s, loss=0.339, metrics={'acc': 0.8453}]
epoch 3: 100%|██████████| 153/153 [00:01<00:00, 96.38it/s, loss=0.318, metrics={'acc': 0.8535}]
epoch 4: 100%|██████████| 153/153 [00:01<00:00, 96.69it/s, loss=0.308, metrics={'acc': 0.8596}]
epoch 5: 100%|██████████| 153/153 [00:01<00:00, 95.64it/s, loss=0.298, metrics={'acc': 0.8627}]
predict: 100%|██████████| 39/39 [00:00<00:00, 140.62it/s]
predict: 100%|██████████| 153/153 [00:00<00:00, 200.71it/s]


# Another way to download the data

In [57]:
## This takes longer to run and contains many categorical features

from pytorch_widedeep.models import Wide, TabMlp, TabResnet, TabTransformer, WideDeep, TabFastFormer, TabPerceiver, SAINT


!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
                 'marital_status', 'occupation', 'relationship', 'race', 'sex', 
                 'capital_gain', 'capital_loss', 'hours_per_week', 
                 'native_country', 'income']

df = pd.read_csv('adult.data', header=None, names=names, na_values=['?', ' ?'])
#create a binary target
df['income_label'] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
df.drop('income', axis=1, inplace=True)


cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16), ('occupation',16),('native_country',16)]
cat_embed_cols = ['education', 'relationship' , 'workclass' , 'occupation' , 'native_country']
continuous_cols = ['age','hours_per_week']
target_col = 'income_label'
target = df[target_col].values
original_df = df
original_df.head()

--2022-05-28 16:03:30--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3,8M) [application/x-httpd-php]
Saving to: ‘adult.data’


2022-05-28 16:03:44 (301 KB/s) - ‘adult.data’ saved [3974305/3974305]



Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
