# Creation of BlackBox Models for the Adult dataset

In [2]:
import sys
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [3]:
# ADD OTHER FOLDERS TO THIS LIST TO ADD THEM TO THE sys.path
modules_to_add = [""]

this_file = os.path.abspath('')

for module in modules_to_add:
    p = Path(this_file).parent / module 
    if p.exists():
        sys.path.append(str(p))
        print(f"ADDED: {p}")
    else:
        print(f"ERROR: {p} doesn't exist")

ADDED: /home/gerardozinno/Desktop/Tesi/Code/mlem


In [4]:
print(sys.path)

['/home/gerardozinno/Desktop/Tesi/Code/mlem/notebooks', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python39.zip', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python3.9', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python3.9/lib-dynload', '', '/home/gerardozinno/.pyenv/versions/3.9.9/envs/ml-environment/lib/python3.9/site-packages', '/home/gerardozinno/.pyenv/versions/3.9.9/envs/ml-environment/lib/python3.9/site-packages/IPython/extensions', '/home/gerardozinno/.ipython', '/home/gerardozinno/Desktop/Tesi/Code/mlem']


In [5]:
DATASET_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
WHOLE_DATASET_PATH = Path("../data/adult/adult.csv")
TRAIN_PATH = WHOLE_DATASET_PATH.parent / "train" / "train.csv"
TEST_PATH  = WHOLE_DATASET_PATH.parent / "test" / "test.csv"

In [6]:
TRAIN_SPLIT = .8
RAND_SEED   = 1234

## Dataset creation and cleaning

In [6]:
if not WHOLE_DATASET_PATH.exists():
    print(f"downloading dataset from {DATASET_URL}")
    columns = ['Age', 'Workclass', 'Fnlwgt', 'Education', 'Education-num', 'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country', 'Target']
    df = pd.read_csv(DATASET_URL, names=columns)
    print("dataset downloaded")
    print("Cleaning and preprocessing dataset:")
    print("\tdropping duplicates")
    df.drop_duplicates(inplace=True)
    print("\ttrimming strings")
    df_str = df.select_dtypes(['object'])
    df[df_str.columns] = df_str.apply(lambda x: x.str.strip())
    print("\tremoving rows with missing values (?)")
    for col in df_str.columns:
        df = df[df[col] != '?']
    print("Target Encoding the dataset")
    feat = df.iloc[:, :-1]
    targ = df.iloc[:, -1]
    map_targ = {
        '<=50K': 0,
        '>50K': 1
    }
    targ = targ.map(map_targ)

    targenc = ce.TargetEncoder(verbose=1,return_df=True)
    df = targenc.fit_transform(feat, targ)
    df['Target'] = targ
    
    df.reset_index(inplace=True)
    WHOLE_DATASET_PATH.parent.mkdir(exist_ok=True)
    df.to_csv(WHOLE_DATASET_PATH, index=False)

    
if not (TRAIN_PATH.exists() and TEST_PATH.exists()):
    print(f"Couldn't find the train and/or test dataset(s) in:\n\t{TRAIN_PATH}\n\t{TEST_PATH}\n")
    if not WHOLE_DATASET_PATH.exists():
        print(f"ERROR: Couldn't even find {WHOLE_DATASET_PATH}")
        raise Exception("Can't find dataset")
    else:
        
        print(f"Creating train and test sets with a split of {TRAIN_SPLIT}% - {1-TRAIN_SPLIT:.2f}% and {RAND_SEED} as random seed")
        print('The dataset is split "as is", without preprocessing. The selection of the right columns is made by the respective Dataloader')
        df = pd.read_csv(WHOLE_DATASET_PATH, index_col=0)
        train, test = train_test_split(df, train_size=TRAIN_SPLIT, shuffle=True, random_state=RAND_SEED)
        TRAIN_PATH.parent.mkdir(exist_ok=True)
        TEST_PATH.parent.mkdir(exist_ok=True)
        train.to_csv(TRAIN_PATH, index=False)
        test.to_csv(TEST_PATH, index=False)
        print("train and test datasets created")

# Loading the datasets

In [26]:
df = pd.read_csv(WHOLE_DATASET_PATH, index_col=0)
test = pd.read_csv(TEST_PATH)
train = pd.read_csv(TRAIN_PATH)
assert(len(train) + len(test) == len(df))

In [8]:
df['Target'].value_counts()

0    22633
1     7506
Name: Target, dtype: int64

In [9]:
test['Target'].value_counts()

0    4528
1    1500
Name: Target, dtype: int64

In [10]:
train_np = train.to_numpy()
test_np = test.to_numpy()

X_train, y_train = train_np[:, :-1], train_np[:,-1]

X_test, y_test = test_np[:, :-1], test_np[:,-1]

In [14]:
EPOCHS = 20
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [15]:
import torch
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import sigmoid
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [16]:
from datasets import GenericDataset
train_dataset = GenericDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_dataset = GenericDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

In [17]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

# Creating and fitting a model

In [None]:
class SimpleBinaryClassifier(nn.Module):
    def __init__(self,input_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape,64)
        self.fc2 = nn.Linear(64,32)
        self.dropout = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(32,1)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [None]:
model = SimpleBinaryClassifier(input_shape=14)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion   = nn.BCEWithLogitsLoss()

In [None]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [None]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, y_pred_list)

In [None]:
print(classification_report(y_test, y_pred_list))

In [None]:
ROOT = Path("../pretrained/")
MODEL_NAME      = "adult_1.tar"
SAVE_PATH = ROOT / MODEL_NAME

In [None]:
dic = {
    'model_state_dict' : model.state_dict(),
    'X_train' : X_train,
    'y_train' : y_train,
    'X_test': X_test,
    'y_test': y_test,
    'activation': 'sigmoid',
    'criterion': 'BCEWithLogitsLoss',
    'input_shape': 14,
    'epochs': EPOCHS,
    'optimizer': 'ADAM'
}

In [None]:
torch.save(dic, SAVE_PATH)
print(f"SAVED: {SAVE_PATH}")

# Model with 2 values when predicting

In [19]:
class SimpleBinaryClassifierP(nn.Module):
    def __init__(self,input_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape,64)
        self.fc2 = nn.Linear(64,32)
        self.dropout = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(32,1)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        if not self.training:
            x = torch.unsqueeze(x, -1)
            x = torch.cat((x, -1*x), -1)
        return x


In [20]:
model = SimpleBinaryClassifierP(input_shape=14)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion   = nn.BCEWithLogitsLoss()

SimpleBinaryClassifierP(
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)


In [21]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [22]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 35.12712 | Acc: 63.050
Epoch 002: | Loss: 0.64625 | Acc: 76.249
Epoch 003: | Loss: 0.57984 | Acc: 77.472
Epoch 004: | Loss: 0.55661 | Acc: 78.279
Epoch 005: | Loss: 0.52969 | Acc: 78.629
Epoch 006: | Loss: 0.83899 | Acc: 78.297
Epoch 007: | Loss: 0.51924 | Acc: 78.594
Epoch 008: | Loss: 0.51669 | Acc: 78.411
Epoch 009: | Loss: 0.52002 | Acc: 78.533
Epoch 010: | Loss: 0.51896 | Acc: 78.313
Epoch 011: | Loss: 0.51711 | Acc: 78.342
Epoch 012: | Loss: 0.51860 | Acc: 78.358
Epoch 013: | Loss: 0.52005 | Acc: 78.151
Epoch 014: | Loss: 0.52472 | Acc: 78.183
Epoch 015: | Loss: 0.51848 | Acc: 78.379
Epoch 016: | Loss: 0.53303 | Acc: 77.411
Epoch 017: | Loss: 0.53343 | Acc: 77.281
Epoch 018: | Loss: 0.56505 | Acc: 75.401
Epoch 019: | Loss: 0.56166 | Acc: 75.273
Epoch 020: | Loss: 0.56084 | Acc: 75.114


In [23]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [24]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, y_pred_list)

ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets

In [None]:
print(classification_report(y_test, y_pred_list))

In [None]:
ROOT = Path("../pretrained/")
MODEL_NAME      = "adult_1.tar"
SAVE_PATH = ROOT / MODEL_NAME

In [None]:
dic = {
    'model_state_dict' : model.state_dict(),
    'X_train' : X_train,
    'y_train' : y_train,
    'X_test': X_test,
    'y_test': y_test,
    'activation': 'sigmoid',
    'criterion': 'BCEWithLogitsLoss',
    'input_shape': 14,
    'epochs': EPOCHS,
    'optimizer': 'ADAM'
}

In [None]:
torch.save(dic, SAVE_PATH)
print(f"SAVED: {SAVE_PATH}")

# Model With 2 outputs

In [59]:
class SimpleBinaryClassifier2(nn.Module):
    def __init__(self,input_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape,64)
        self.fc2 = nn.Linear(64,32)
        self.dropout = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(32,2)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [63]:
model = SimpleBinaryClassifier2(input_shape=14)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion   = nn.MSELoss()

SimpleBinaryClassifier2(
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)


In [61]:
def binary_acc2(y_pred, y_test):
    y_pred_tag = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    return acc

In [38]:
b, y = next(iter(train_loader))
b = b.to(device)
y = y.to(device)

In [39]:
preds = model(b)

In [53]:
a = torch.argmax(torch.softmax(preds, dim=1), dim=1)

In [56]:
a.type()

'torch.cuda.LongTensor'

In [70]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        y_pred_ = torch.softmax(y_pred, dim=1)
        y_pred_ = torch.argmax(y_pred_, dim=1).float()
        loss = criterion(y_pred_, y_batch)
        acc = binary_acc2(y_pred, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

# Saving the model and its data

### TEST
load the model just saved and compare its results against the original one.

In [None]:
RUN_TEST = True # Set to true to run the tests

In [None]:
if RUN_TEST:
    from mlem.black_box import PyTorchBlackBox

    dic = torch.load(SAVE_PATH)

    loaded_model = SimpleBinaryClassifier(14)
    loaded_model.load_state_dict(dic['model_state_dict'])

    model.cpu()
    M = PyTorchBlackBox(model, activation=sigmoid)
    LM = PyTorchBlackBox(loaded_model, activation=sigmoid)

    loaded_preds = LM.predict(X_test[:10])
    model_preds = M.predict(X_test[:10])

    if not all(loaded_preds == model_preds):
        print("The loaded model is not the same")
    else:
        print("All's good")

In [None]:
LM.predict(X_test[100:200])

In [None]:
LM.predict_proba(X_test[100:200])

In [None]:
import numpy as np
def sigmoid_to_prob(x):
    
    
LM2 = PyTorchBlackBox(model, activation=torch.softmax)

In [None]:
LM.predict(X_test[:10])

In [None]:
X_tensor = torch.tensor(X_test[:10], dtype=torch.float32, device="cpu")

In [None]:
X_tensor

In [None]:
X_tensor.to("cpu")
y_out = loaded_model(X_tensor)

In [None]:
y_out

# Using a Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [14]:
clf = RandomForestClassifier(n_estimators=100)

In [15]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
preds = clf.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, preds)

array([[4217,  311],
       [ 556,  944]])

In [18]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         0.0       0.88      0.93      0.91      4528
         1.0       0.75      0.63      0.69      1500

    accuracy                           0.86      6028
   macro avg       0.82      0.78      0.80      6028
weighted avg       0.85      0.86      0.85      6028



In [8]:
from mlem.utilities import save_pickle_bz2, load_pickle_bz2

In [20]:
ROOT = Path("../pretrained/")
MODEL_NAME      = "adult_rf.bz2"
SAVE_PATH = ROOT / MODEL_NAME

In [21]:
save_pickle_bz2(SAVE_PATH, clf)

In [22]:
np.savez_compressed(
                SAVE_PATH.parent / "adultdata",
                x_train=X_train,
                x_test=X_test,
                y_train=y_train,
                y_test=y_test,
            )

In [23]:
loaded = np.load(SAVE_PATH.parent / "adultdata.npz", allow_pickle=True)

In [24]:
loaded['x_train']

array([[3.20000000e+01, 2.18918433e-01, 2.29732000e+05, ...,
        0.00000000e+00, 4.50000000e+01, 2.54411176e-01],
       [4.20000000e+01, 2.18918433e-01, 1.36986000e+05, ...,
        0.00000000e+00, 4.00000000e+01, 2.54411176e-01],
       [2.40000000e+01, 2.18918433e-01, 2.04935000e+05, ...,
        0.00000000e+00, 5.60000000e+01, 2.54411176e-01],
       ...,
       [2.80000000e+01, 2.18918433e-01, 4.41620000e+05, ...,
        0.00000000e+00, 4.30000000e+01, 5.44554455e-02],
       [5.10000000e+01, 2.18918433e-01, 1.71914000e+05, ...,
        0.00000000e+00, 5.00000000e+01, 2.54411176e-01],
       [3.10000000e+01, 2.18918433e-01, 3.25500000e+04, ...,
        0.00000000e+00, 4.00000000e+01, 2.54411176e-01]])

In [33]:
from pandas import read_pickle


In [34]:
(SAVE_PATH.parent / "adult_rf.tar")

FileNotFoundError: [Errno 2] No such file or directory: '../pretrained/adult_rf.tar'

In [81]:
import pickle

In [25]:
rf_loaded = load_pickle_bz2(SAVE_PATH.parent / "adult_rf.bz2")

In [26]:
preds = rf_loaded.predict(X_test)

In [32]:
uni = [i for i,j in enumerate(preds) if j == 1]

In [None]:
uni

In [35]:
y_test[15]

1.0

In [41]:
preds = clf.predict(X_test)
preds_loaded = rf_loaded.predict(X_test)

In [42]:
assert(all(preds == preds_loaded))

In [43]:
clf.predict_proba(X_test)

array([[0.71, 0.29],
       [0.76, 0.24],
       [0.93, 0.07],
       ...,
       [0.57, 0.43],
       [0.2 , 0.8 ],
       [1.  , 0.  ]])

In [44]:
from mlem.black_box import SklearnBlackBox

In [45]:
bb = SklearnBlackBox(clf)

In [46]:
bb.predict(X_test)

array([0., 0., 0., ..., 0., 1., 0.])

In [47]:
bb.predict_proba(X_test)

array([[0.71, 0.29],
       [0.76, 0.24],
       [0.93, 0.07],
       ...,
       [0.57, 0.43],
       [0.2 , 0.8 ],
       [1.  , 0.  ]])

In [93]:
y_test

array([0, 0, 1, ..., 0, 1, 0])

# Extract Balanced Subset from Adult

In [23]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [60]:
y_train
y_test = y_test.astype(int)

In [76]:
ones = [i for (i,j) in enumerate(y_train) if j == 1]
zeros = [i for (i,j) in enumerate(y_train) if j == 0]

In [77]:
X_ones = X_train[ones]
X_zeros = X_train[zeros]

In [80]:
assert len(X_ones)+len(X_zeros) == len(X_train)

In [83]:
X_zeros = X_zeros[:len(X_ones)]

In [97]:
X_train_balanced = np.concatenate([X_ones, X_zeros])

In [102]:
y_train_balanced = np.concatenate([np.ones(len(X_ones)), np.zeros(len(X_zeros))]).astype(int)

In [107]:
def unisonShuffleDataset(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [108]:
X_train_balanced, y_train_balanced = unisonShuffleDataset(X_train_balanced, y_train_balanced)

In [120]:
assert sum(y_train_balanced) == len(y_train_balanced) / 2

True

In [114]:
clf_bal = RandomForestClassifier(n_estimators=150)

In [115]:
clf.fit(X_train_balanced, y_train_balanced)

RandomForestClassifier()

In [116]:
preds = clf.predict(X_test)

In [117]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, preds)

array([[3685,  843],
       [ 254, 1246]])

In [118]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.94      0.81      0.87      4528
           1       0.60      0.83      0.69      1500

    accuracy                           0.82      6028
   macro avg       0.77      0.82      0.78      6028
weighted avg       0.85      0.82      0.83      6028



In [90]:
z = np.zeros(10)

In [91]:
z

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [94]:
np.concatenate([o,z]).astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# LOAD THE MODEL TRAINED ON THE SERVER

In [9]:
rand_for = load_pickle_bz2("../pretrained/adult_randfor.bz2")

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [24]:
rand_for.max_features

'sqrt'

In [25]:
data = np.load("../pretrained/adult_randfor.data.npz")

In [26]:
for k in data.keys():
    print(k)

x_train
x_test
y_train
y_test


In [27]:
X_test, y_test = data['x_test'], data['y_test']

In [28]:
preds = rand_for.predict(X_test)

In [29]:
sum(preds == y_test) / len(y_test)

0.8651293961512939