# Creation of BlackBox Models for the Geotarget30 dataset

In [1]:
import sys
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Folders to make importable, given relative to the parent of the working
# directory. ADD OTHER FOLDERS TO THIS LIST TO ADD THEM TO THE sys.path.
# The empty string stands for that parent directory itself.
modules_to_add = [""]

# NOTE: despite its name this is the current working directory
# (os.path.abspath('') resolves the empty path against the cwd), not a file.
this_file = os.path.abspath('')

for module in modules_to_add:
    p = Path(this_file).parent / module
    if not p.exists():
        print(f"ERROR: {p} doesn't exist")
    elif str(p) in sys.path:
        # Re-running this cell must not pile up duplicate sys.path entries.
        print(f"ALREADY IN sys.path: {p}")
    else:
        sys.path.append(str(p))
        print(f"ADDED: {p}")

ADDED: /home/gerardozinno/Desktop/Tesi/Code/mlem


In [3]:
# Show the interpreter's module search path, to check which folders are importable.
print(sys.path)

['/home/gerardozinno/Desktop/Tesi/Code/mlem/notebooks', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python39.zip', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python3.9', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python3.9/lib-dynload', '', '/home/gerardozinno/.pyenv/versions/3.9.9/envs/ml-environment/lib/python3.9/site-packages', '/home/gerardozinno/.pyenv/versions/3.9.9/envs/ml-environment/lib/python3.9/site-packages/IPython/extensions', '/home/gerardozinno/.ipython', '/home/gerardozinno/Desktop/Tesi/Code/mlem']


# CREATING THE TRAIN AND TEST DATASETS (if they aren't in the respective folders)

In [4]:
# Location of the full dataset and of the train/test CSVs derived from it.
# The splits live in "train" and "test" subfolders next to the full CSV.
WHOLE_DATASET_PATH = Path("../data") / "geotarget" / "geotarget_30.csv"
TRAIN_PATH = WHOLE_DATASET_PATH.with_name("train") / "train.csv"
TEST_PATH = WHOLE_DATASET_PATH.with_name("test") / "test.csv"

In [5]:
# Fraction of the dataset used for the training set (a value in [0, 1], not a percentage).
TRAIN_SPLIT = 0.8
# Random seed used when splitting the dataset.
RAND_SEED = 1234

In [6]:
# Build the train/test CSVs from the whole dataset, unless both already exist.
if not (TRAIN_PATH.exists() and TEST_PATH.exists()):
    print(f"Couldn't find the train and/or test dataset(s) in:\n\t{TRAIN_PATH}\n\t{TEST_PATH}\n")
    if not WHOLE_DATASET_PATH.exists():
        print(f"ERROR: Couldn't even find {WHOLE_DATASET_PATH}")
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"Can't find dataset: {WHOLE_DATASET_PATH}")

    # TRAIN_SPLIT is a fraction, so format it as a percentage (0.8 -> "80%").
    print(f"Creating train and test sets with a split of {TRAIN_SPLIT:.0%} - {1 - TRAIN_SPLIT:.0%} and {RAND_SEED} as random seed")
    print('The dataset is split "as is", without preprocessing. The selection of the right columns is made by the respective Dataloader')
    df = pd.read_csv(WHOLE_DATASET_PATH)
    # Named *_df so they cannot be confused with the `train` function that a
    # later cell imports from blackboxes.pytorch.utilities.
    train_df, test_df = train_test_split(df, train_size=TRAIN_SPLIT, shuffle=True, random_state=RAND_SEED)
    # parents=True: also create intermediate folders if they are missing.
    for split_path in (TRAIN_PATH, TEST_PATH):
        split_path.parent.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(TRAIN_PATH, index=False)
    test_df.to_csv(TEST_PATH, index=False)
    print("train and test datasets created")

# CREATING THE MODELS

In [7]:
import torch
import torch.optim as optim
import torch.nn as nn
from blackboxes.pytorch.linear import LinearDropLinear

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# Instantiate the black-box network and place it on the GPU when one was selected.
model = LinearDropLinear()
if device.type == "cuda":
    print("moving model to GPU")
    model = model.to(device)
print(model)

# Cross-entropy loss, optimised with Adam over all the model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

moving model to GPU
LinearDropLinear(
  (fc1): Linear(in_features=236, out_features=128, bias=True)
  (drop): Dropout(p=0.3, inplace=False)
  (fc4): Linear(in_features=128, out_features=30, bias=True)
)


# LOADING THE DATA

In [9]:
from datasets.geotarget import Geotarget30
from torch.utils.data import DataLoader

In [10]:
# Build one Geotarget30 dataset per split, from the train and test CSV paths.
train_set, test_set = (Geotarget30(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [11]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 64

# Reshuffle the training samples at every epoch instead of always feeding them
# in file order; the generator is seeded with RAND_SEED so the shuffling order
# is reproducible across runs.
train_dataloader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(RAND_SEED),
)
# The test loader keeps the sequential (unshuffled) order.
test_dataloader = DataLoader(test_set, batch_size=BATCH_SIZE)

# TRAINING LOOP

In [12]:
import torch

In [19]:
from blackboxes.pytorch.utilities import train

In [20]:
# Select the compute device (GPU if CUDA is available, CPU otherwise) and report it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [21]:
# Run the training routine on `model` with the optimizer, loss and data loaders
# created in the previous cells, on the selected device.
# NOTE(review): the output saved with this cell is a RuntimeError about mismatched
# tensor sizes (30 vs 64 at dimension 1). Presumably this is the model's 30 outputs
# being compared against a batch of 64 inside blackboxes.pytorch.utilities.train,
# which is not visible here — TODO investigate before relying on the saved model.
train(model, optimizer, loss_fn, train_dataloader, test_dataloader, device=device)

RuntimeError: The size of tensor a (30) must match the size of tensor b (64) at non-singleton dimension 1

### Save the model and other info

In [16]:
# Destination of the saved checkpoint: the MODEL_NAME file inside the ROOT folder.
MODEL_NAME = "linear_geo30.tar"
ROOT = Path("../pretrained/")
SAVE_PATH = ROOT.joinpath(MODEL_NAME)

In [17]:
# Take a full slice of each dataset and unpack it into two objects.
# NOTE(review): assumes Geotarget30 supports slice indexing and returns a
# (features, labels) pair for the selected rows — confirm against the dataset class.
x_train, y_train = train_set[:]
x_test, y_test = test_set[:]

In [18]:
# Content of the checkpoint: the model weights together with the data splits.
dic = dict(
    model_state_dict=model.state_dict(),
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [19]:
# Create the destination folder first, so that saving does not fail when it is missing.
SAVE_PATH.parent.mkdir(parents=True, exist_ok=True)
torch.save(dic, SAVE_PATH)
print(f"SAVED: {SAVE_PATH}")

SAVED: ../pretrained/linear_geo30.tar


### TEST
Load the model that was just saved and compare its predictions against those of the original one.

In [25]:
RUN_TEST = False # Set to True to run the save/load consistency check in the next cell

In [26]:
# Sanity check: the model reloaded from SAVE_PATH must give the same
# predictions as the in-memory model on the first 10 test samples.
if RUN_TEST:
    # Imported here so the dependency is only needed when the check is enabled.
    from mlem.black_box import PyTorchBlackBox

    # map_location="cpu" lets a checkpoint saved from the GPU be loaded on any machine.
    dic = torch.load(SAVE_PATH, map_location="cpu")

    loaded_model = LinearDropLinear()
    loaded_model.load_state_dict(dic['model_state_dict'])

    # Compare both models on the CPU.
    model.cpu()
    # Put both models in evaluation mode: with dropout active the outputs are
    # random, and identical weights could still yield different predictions.
    model.eval()
    loaded_model.eval()

    M = PyTorchBlackBox(model)
    LM = PyTorchBlackBox(loaded_model)

    loaded_preds = LM.predict(x_test[:10])
    model_preds = M.predict(x_test[:10])

    if not all(loaded_preds == model_preds):
        print("The loaded model is not the same")
    else:
        print("All's good")

All's good
