## Model Training

### Model 1 - Ordinal Transformer

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import optuna

torch.manual_seed(0)
np.random.seed(0)


def ordinal_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    total_count = len(y_true)
    accurate_count = sum(
        1
        for true_label, pred_label in zip(y_true, y_pred)
        if pred_label in [true_label, true_label - 1, true_label + 1]
    )
    return accurate_count / total_count


# Load and preprocess data
datafile_path = "../data/fine_food_reviews_fine_tuned_e5_small_v2_1k.parquet"
df = pd.read_parquet(datafile_path)

embedding_dim = np.array(list(df.embedding.values)).shape[1]
print(f"Shape of embeddings in the dataframe: {embedding_dim}")

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values),
    df.Score - 1,
    test_size=0.3,
    random_state=42,
)

# Convert the problem to three classes (very bad, bad, neutral-good)
# y_train = y_train.apply(lambda x: 0 if x < 3 else 1 if x == 3 else 2)
# y_test = y_test.apply(lambda x: 0 if x < 3 else 1 if x == 3 else 2)

n_classes = len(y_train.unique())


class ReviewsDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx].clone().detach().float()
        label = self.labels[idx].clone().detach().long()
        return embedding, label


# Convert train and test splits to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).clone().detach()
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long).clone().detach()
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).clone().detach()
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long).clone().detach()

# Create Dataset objects
train_dataset = ReviewsDataset(X_train_tensor, y_train_tensor)
test_dataset = ReviewsDataset(X_test_tensor, y_test_tensor)

params = {
    "input_dim": embedding_dim,
    "n_classes": n_classes,
    "num_hidden_1": 256,
    "num_hidden_2": 128,
    "num_hidden_3": 64,
    "dropout_rate": 0.5,
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.001,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}


# Define the Transformer model
class TransformerModel(nn.Module):
    def __init__(
        self,
        input_size,
        d_model,
        nhead,
        num_encoder_layers,
        dim_feedforward,
        output_size,
    ):
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout=0.1, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)
        self.fc = nn.Linear(d_model, output_size)

    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding
        x = self.transformer_encoder(x.unsqueeze(1))  # Add sequence dimension
        x = self.fc(x.squeeze(1))
        return x


# Define the loss function for CORN
def loss_corn(logits, y_train, num_classes):
    sets = []
    for i in range(num_classes - 1):
        label_mask = y_train > i - 1
        label_tensor = (y_train[label_mask] > i).to(torch.int64)
        sets.append((label_mask, label_tensor))

    num_examples = 0
    losses = 0.0
    for task_index, s in enumerate(sets):
        train_examples = s[0]
        train_labels = s[1]

        if len(train_labels) < 1:
            continue

        num_examples += len(train_labels)
        pred = logits[train_examples, task_index]

        loss = -torch.sum(
            F.logsigmoid(pred) * train_labels + (F.logsigmoid(pred) - pred) * (1 - train_labels)
        )
        losses += loss
    return losses / num_examples


def label_from_logits(logits):
    """Converts logits to class labels. This function is specific to CORN."""
    probas = torch.sigmoid(logits)
    probas = torch.cumprod(probas, dim=1)
    predict_levels = probas > 0.5
    return torch.sum(predict_levels, dim=1)


# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    nhead = trial.suggest_int("nhead", 1, 8)
    d_model = trial.suggest_int("d_model", nhead * 8, nhead * 64, step=nhead * 8)
    num_encoder_layers = trial.suggest_int("num_encoder_layers", 1, 6)
    dim_feedforward = trial.suggest_int("dim_feedforward", 128, 1024)
    # dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [10, 32, 64, 128])
    num_epochs = trial.suggest_int("num_epochs", 10, 50)

    # Create DataLoader objects with the new batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create model
    model = TransformerModel(
        input_size=embedding_dim,
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        dim_feedforward=dim_feedforward,
        output_size=n_classes - 1,
    ).to(params["device"])

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for _ in range(num_epochs):
        model.train()
        for features, targets in train_loader:
            features = features.to(params["device"])
            targets = targets.to(params["device"])

            # Forward pass
            logits = model(features)

            # CORN loss
            loss = loss_corn(logits, targets, n_classes)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds = []
    all_targets = []
    all_logits = []
    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(params["device"])
            targets = targets.to(params["device"])

            logits = model(features)
            all_logits.extend(logits.cpu().numpy())
            preds = label_from_logits(logits)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    return f1_score(all_targets, all_preds, average="weighted")


# Run the Optuna optimization
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
study.optimize(objective, n_trials=20)

# Get the best hyperparameters
best_params = study.best_trial.params
best_score = study.best_trial.value
print(f"Best hyperparameters: {best_params}")
print(f"Best score: {best_score:.4f}")

# Update the params dictionary with the best hyperparameters
params.update(best_params)

# Create DataLoader objects with the best batch size
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)

# Create and train the final model with the best hyperparameters
model = TransformerModel(
    input_size=params["input_dim"],
    d_model=params["d_model"],
    nhead=params["nhead"],
    num_encoder_layers=params["num_encoder_layers"],
    dim_feedforward=params["dim_feedforward"],
    output_size=params["n_classes"] - 1,
).to(params["device"])

optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])

# Training loop
for epoch in range(params["num_epochs"]):
    model.train()
    for features, targets in train_loader:
        features = features.to(params["device"])
        targets = targets.to(params["device"])

        # Forward pass
        logits = model(features)

        # CORN loss
        loss = loss_corn(logits, targets, params["n_classes"])

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{params['num_epochs']}, Loss: {loss.item():.4f}")

# Evaluation
model.eval()
all_preds = []
all_targets = []
all_logits = []
with torch.no_grad():
    for features, targets in test_loader:
        features = features.to(params["device"])
        targets = targets.to(params["device"])

        logits = model(features)
        preds = label_from_logits(logits)
        all_logits.extend(logits.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Print classification report
print(
    classification_report(
        all_targets, all_preds, target_names=[f"Class {i}" for i in range(n_classes)]
    )
)

ordinal_accuracy_score = ordinal_accuracy(np.array(all_targets), np.array(all_preds))
print(f"Ordinal accuracy of the network on the test data: {ordinal_accuracy_score:.0%}")

[I 2024-07-23 23:02:08,426] A new study created in memory with name: no-name-4b193d95-463d-448c-94b6-be2f1be70073


Shape of embeddings in the dataframe: 384


[I 2024-07-23 23:02:15,586] Trial 0 finished with value: 0.7964154715341156 and parameters: {'nhead': 3, 'd_model': 192, 'num_encoder_layers': 5, 'dim_feedforward': 664, 'learning_rate': 2.9380279387035334e-05, 'batch_size': 64, 'num_epochs': 39}. Best is trial 0 with value: 0.7964154715341156.
[I 2024-07-23 23:02:17,438] Trial 1 finished with value: 0.8048618921745949 and parameters: {'nhead': 1, 'd_model': 64, 'num_encoder_layers': 5, 'dim_feedforward': 318, 'learning_rate': 3.511356313970405e-05, 'batch_size': 64, 'num_epochs': 21}. Best is trial 1 with value: 0.8048618921745949.
[I 2024-07-23 23:02:20,574] Trial 2 finished with value: 0.8016993269508568 and parameters: {'nhead': 5, 'd_model': 80, 'num_encoder_layers': 2, 'dim_feedforward': 456, 'learning_rate': 0.00023345864076016249, 'batch_size': 10, 'num_epochs': 11}. Best is trial 1 with value: 0.8048618921745949.
[I 2024-07-23 23:02:25,637] Trial 3 finished with value: 0.8238473411995871 and parameters: {'nhead': 5, 'd_model':

Best hyperparameters: {'nhead': 5, 'd_model': 80, 'num_encoder_layers': 1, 'dim_feedforward': 979, 'learning_rate': 0.00788671412999049, 'batch_size': 10, 'num_epochs': 28}
Best score: 0.8238
Epoch 1/28, Loss: 0.0243
Epoch 2/28, Loss: 0.0129
Epoch 3/28, Loss: 0.1351
Epoch 4/28, Loss: 0.0232
Epoch 5/28, Loss: 0.0108
Epoch 6/28, Loss: 0.3519
Epoch 7/28, Loss: 0.0088
Epoch 8/28, Loss: 0.0066
Epoch 9/28, Loss: 0.0590
Epoch 10/28, Loss: 0.0032
Epoch 11/28, Loss: 0.0456
Epoch 12/28, Loss: 0.1001
Epoch 13/28, Loss: 0.0065
Epoch 14/28, Loss: 0.1043
Epoch 15/28, Loss: 0.0309
Epoch 16/28, Loss: 0.0360
Epoch 17/28, Loss: 0.0063
Epoch 18/28, Loss: 0.1100
Epoch 19/28, Loss: 0.0068
Epoch 20/28, Loss: 0.0075
Epoch 21/28, Loss: 0.0116
Epoch 22/28, Loss: 0.0193
Epoch 23/28, Loss: 0.2012
Epoch 24/28, Loss: 0.0055
Epoch 25/28, Loss: 0.1198
Epoch 26/28, Loss: 0.1229
Epoch 27/28, Loss: 0.0048
Epoch 28/28, Loss: 0.0228
              precision    recall  f1-score   support

     Class 0       0.73      0.61 

In [64]:
# Model analysis
from scipy.special import expit as sigmoid

def label_from_logits(logits):
    """Converts logits to class labels."""
    probas = sigmoid(logits)
    probas = np.cumprod(probas, axis=1)
    predict_levels = probas > 0.5
    return np.sum(predict_levels, axis=1)

def probas_from_logits(logits):
    """Converts logits to class labels."""
    probas = sigmoid(logits)
    probas_cum = np.cumprod(probas, axis=1)
    return (probas_cum > 0.5).astype(int)

logits_df = pd.DataFrame(all_logits)
preds_df = pd.DataFrame(label_from_logits(all_logits))
probas_df = pd.DataFrame(probas_from_logits(all_logits))
labels_df = pd.DataFrame({'True Label': all_targets, 'Predicted Label': all_preds})

slice_window = slice(220, 230)
df_to_show = pd.concat(
    [
        pd.DataFrame(probas_df[slice_window]), 
        labels_df[slice_window]['True Label'], 
        labels_df[slice_window]['Predicted Label']
    ], axis=1
)

In [65]:
df_to_show

Unnamed: 0,0,1,2,3,True Label,Predicted Label
220,1,1,1,1,4,4
221,1,1,1,1,4,4
222,0,0,0,0,0,0
223,1,1,1,1,4,4
224,1,1,1,1,4,4
225,0,0,0,0,0,0
226,1,1,0,0,2,2
227,1,1,1,1,4,4
228,1,1,1,0,3,3
229,1,0,0,0,1,1


### Model 2 - Multi-Class Transformer

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import optuna

torch.manual_seed(0)
np.random.seed(0)


def ordinal_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    total_count = len(y_true)
    accurate_count = sum(
        1
        for true_label, pred_label in zip(y_true, y_pred)
        if pred_label in [true_label, true_label - 1, true_label + 1]
    )
    return accurate_count / total_count


# Load and preprocess data
datafile_path = "../data/fine_food_reviews_fine_tuned_e5_small_v2_1k.parquet"
df = pd.read_parquet(datafile_path)

embedding_dim = np.array(list(df.embedding.values)).shape[1]
print(f"Shape of embeddings in the dataframe: {embedding_dim}")

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values),
    df.Score - 1,
    test_size=0.3,
    random_state=42,
)

# Convert the problem to three classes (very bad, bad, neutral-good)
# y_train = y_train.apply(lambda x: 0 if x < 3 else 1 if x == 3 else 2)
# y_test = y_test.apply(lambda x: 0 if x < 3 else 1 if x == 3 else 2)

n_classes = len(y_train.unique())


class ReviewsDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx].clone().detach().float()
        label = self.labels[idx].clone().detach().long()
        return embedding, label


# Convert train and test splits to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).clone().detach()
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long).clone().detach()
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).clone().detach()
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long).clone().detach()

# Create Dataset objects
train_dataset = ReviewsDataset(X_train_tensor, y_train_tensor)
test_dataset = ReviewsDataset(X_test_tensor, y_test_tensor)

params = {
    "input_dim": embedding_dim,
    "n_classes": n_classes,
    "num_hidden_1": 256,
    "num_hidden_2": 128,
    "num_hidden_3": 64,
    "dropout_rate": 0.5,
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.001,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}


# Define the Transformer model
class TransformerModel(nn.Module):
    def __init__(
        self,
        input_size,
        d_model,
        nhead,
        num_encoder_layers,
        dim_feedforward,
        output_size,
    ):
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout=0.1, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)
        self.fc = nn.Linear(d_model, output_size)

    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding
        x = self.transformer_encoder(x.unsqueeze(1))  # Add sequence dimension
        x = self.fc(x.squeeze(1))
        return x


# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    nhead = trial.suggest_int("nhead", 1, 8)
    d_model = trial.suggest_int("d_model", nhead * 8, nhead * 64, step=nhead * 8)
    num_encoder_layers = trial.suggest_int("num_encoder_layers", 1, 6)
    dim_feedforward = trial.suggest_int("dim_feedforward", 128, 1024)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [10, 32, 64, 128])
    num_epochs = trial.suggest_int("num_epochs", 10, 50)

    # Create DataLoader objects with the new batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create model
    model = TransformerModel(
        input_size=embedding_dim,
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        dim_feedforward=dim_feedforward,
        output_size=n_classes,
    ).to(params["device"])

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for _ in range(num_epochs):
        model.train()
        for features, targets in train_loader:
            features = features.to(params["device"])
            targets = targets.to(params["device"])

            # Forward pass
            logits = model(features)
            loss = criterion(logits, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(params["device"])
            targets = targets.to(params["device"])

            logits = model(features)
            preds = torch.argmax(F.softmax(logits, dim=1), dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    return f1_score(all_targets, all_preds, average="weighted")


# Run the Optuna optimization
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
study.optimize(objective, n_trials=20)

# Get the best hyperparameters
best_params = study.best_trial.params
best_score = study.best_trial.value
print(f"Best hyperparameters: {best_params}")
print(f"Best score: {best_score:.4f}")

# Update the params dictionary with the best hyperparameters
params.update(best_params)

# Create DataLoader objects with the best batch size
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)

# Create and train the final model with the best hyperparameters
model = TransformerModel(
    input_size=params["input_dim"],
    d_model=params["d_model"],
    nhead=params["nhead"],
    num_encoder_layers=params["num_encoder_layers"],
    dim_feedforward=params["dim_feedforward"],
    output_size=params["n_classes"],
).to(params["device"])

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])

# Training loop
for epoch in range(params["num_epochs"]):
    model.train()
    for features, targets in train_loader:
        features = features.to(params["device"])
        targets = targets.to(params["device"])

        # Forward pass
        logits = model(features)
        loss = criterion(logits, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{params['num_epochs']}, Loss: {loss.item():.4f}")

# Evaluation
model.eval()
all_logits = []
all_preds = []
all_targets = []
with torch.no_grad():
    for features, targets in test_loader:
        features = features.to(params["device"])
        targets = targets.to(params["device"])

        logits = model(features)
        preds = torch.argmax(F.softmax(logits, dim=1), dim=1)
        all_logits.extend(logits.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Print classification report
print(
    classification_report(
        all_targets, all_preds, target_names=[f"Class {i}" for i in range(n_classes)]
    )
)

ordinal_accuracy_score = ordinal_accuracy(np.array(all_targets), np.array(all_preds))
print(f"Ordinal accuracy of the network on the test data: {ordinal_accuracy_score:.0%}")

[I 2024-07-23 23:46:51,827] A new study created in memory with name: no-name-f9b1e370-900e-4bab-b6b0-dd45e7b6d21a


Shape of embeddings in the dataframe: 384


[I 2024-07-23 23:46:58,757] Trial 0 finished with value: 0.7872796764253056 and parameters: {'nhead': 3, 'd_model': 192, 'num_encoder_layers': 5, 'dim_feedforward': 664, 'learning_rate': 2.9380279387035334e-05, 'batch_size': 64, 'num_epochs': 39}. Best is trial 0 with value: 0.7872796764253056.
[I 2024-07-23 23:47:00,326] Trial 1 finished with value: 0.7959241760428201 and parameters: {'nhead': 1, 'd_model': 64, 'num_encoder_layers': 5, 'dim_feedforward': 318, 'learning_rate': 3.511356313970405e-05, 'batch_size': 64, 'num_epochs': 21}. Best is trial 1 with value: 0.7959241760428201.
[I 2024-07-23 23:47:02,655] Trial 2 finished with value: 0.7986780616120333 and parameters: {'nhead': 5, 'd_model': 80, 'num_encoder_layers': 2, 'dim_feedforward': 456, 'learning_rate': 0.00023345864076016249, 'batch_size': 10, 'num_epochs': 11}. Best is trial 2 with value: 0.7986780616120333.
[I 2024-07-23 23:47:06,647] Trial 3 finished with value: 0.8077103184614994 and parameters: {'nhead': 5, 'd_model':

Best hyperparameters: {'nhead': 7, 'd_model': 112, 'num_encoder_layers': 1, 'dim_feedforward': 859, 'learning_rate': 0.001319994226153501, 'batch_size': 32, 'num_epochs': 14}
Best score: 0.8202
Epoch 1/14, Loss: 0.0788
Epoch 2/14, Loss: 0.3042
Epoch 3/14, Loss: 0.5500
Epoch 4/14, Loss: 0.1436
Epoch 5/14, Loss: 0.2163
Epoch 6/14, Loss: 0.0419
Epoch 7/14, Loss: 0.0436
Epoch 8/14, Loss: 0.0427
Epoch 9/14, Loss: 0.0407
Epoch 10/14, Loss: 0.0434
Epoch 11/14, Loss: 0.0928
Epoch 12/14, Loss: 0.0404
Epoch 13/14, Loss: 0.2447
Epoch 14/14, Loss: 0.3591
              precision    recall  f1-score   support

     Class 0       1.00      0.55      0.71        31
     Class 1       0.68      0.76      0.72        17
     Class 2       0.42      0.62      0.50        16
     Class 3       0.69      0.58      0.63        43
     Class 4       0.91      0.96      0.93       193

    accuracy                           0.83       300
   macro avg       0.74      0.70      0.70       300
weighted avg     

In [116]:
# Model analysis
from scipy.special import expit as sigmoid, softmax

def label_from_logits(logits):
    """Converts logits to class labels."""
    z = softmax(logits, axis=1)
    return z / z.sum(axis=1, keepdims=1)

def probas_from_logits(logits):
    """Converts logits to class labels."""
    z = softmax(logits, axis=1)
    softm_ = z / z.sum(axis=1, keepdims=1)
    # probas_cum = np.cumprod(probas, axis=1)
    return (softm_ > 0.5).astype(int)

logits_df = pd.DataFrame(all_logits)
preds_df = pd.DataFrame(label_from_logits(all_logits))
probas_df = pd.DataFrame(probas_from_logits(all_logits))
labels_df = pd.DataFrame({'True Label': all_targets, 'Predicted Label': all_preds})

slice_window = slice(160, 170)
df_to_show = pd.concat(
    [
        pd.DataFrame(probas_df[slice_window]), 
        labels_df[slice_window]['True Label'], 
        labels_df[slice_window]['Predicted Label']
    ], axis=1
)

In [118]:
df_to_show

Unnamed: 0,0,1,2,3,4,True Label,Predicted Label
160,0,0,0,0,1,4,4
161,0,0,1,0,0,1,2
162,0,0,0,0,1,4,4
163,0,0,0,0,1,4,4
164,0,0,0,0,1,4,4
165,1,0,0,0,0,0,0
166,0,1,0,0,0,0,1
167,0,0,0,0,1,3,4
168,0,0,0,1,0,3,3
169,0,0,0,0,1,4,4
