A hybrid gated network architecture used to train the model with text and numerical features based in pytorch.

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/multitude_split/dataset_all.csv')
data.head()

Mounted at /content/drive


Unnamed: 0,text,label,multi_label,split,language,length,source,word_count,unique_word_count,char_count,...,question_mark_count,exclamation_mark_count,flesch_reading_ease,gunning_fog_index,first_person_pronoun_count,person_entity_count,date_entity_count,uniqueness_bigram,uniqueness_trigram,syntax_variety
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel,199.0,118.0,1067.0,...,0.0,0.0,-272.02217,11.15603,0.0,0.0,0.0,0.90404,0.979695,12.0
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews,70.0,54.0,311.0,...,0.0,1.0,-186.793214,8.714286,0.0,5.0,2.0,1.0,1.0,11.0
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax,130.0,82.0,691.0,...,0.0,0.0,-269.236538,11.015385,0.0,0.0,0.0,0.860465,0.929688,14.0
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews,292.0,149.0,1419.0,...,0.0,0.0,-231.229869,11.4401,1.0,1.0,1.0,0.876289,0.965517,13.0
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews,476.0,242.0,2259.0,...,0.0,0.0,-224.855788,13.160504,1.0,2.0,2.0,0.871579,0.974684,15.0


In [2]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

statistical_features = ['word_count', 'unique_word_count', 'char_count', 'avg_word_length',
       'ttr', 'hapax_legomenon', 'sentence_count', 'avg_sentence_length',
       'avg_sentence_complexity', 'punctuation_count', 'noun_count',
       'stopword_count', 'verb_count', 'adj_count', 'adv_count',
       'complex_sentence_count', 'question_mark_count',
       'exclamation_mark_count', 'flesch_reading_ease', 'gunning_fog_index',
       'first_person_pronoun_count', 'person_entity_count',
       'date_entity_count', 'uniqueness_bigram', 'uniqueness_trigram',
       'syntax_variety']

data[statistical_features] = scaler.fit_transform(data[statistical_features])

In [3]:
data['numerical'] = data[statistical_features].apply(lambda row: row.tolist(), axis=1)
data = data.drop(columns=statistical_features)
data.head()

Unnamed: 0,text,label,multi_label,split,language,length,source,numerical
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel,"[0.3060278207109737, 0.3556231003039514, 0.361..."
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews,"[0.10664605873261206, 0.16109422492401215, 0.1..."
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax,"[0.19938176197836166, 0.24620060790273557, 0.2..."
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews,"[0.4497681607418856, 0.44984802431610943, 0.48..."
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews,"[0.7341576506955177, 0.7325227963525837, 0.767..."


In [4]:
data_test = data[data["split"] == "test"]
data_val = data[data["split"] == "test"]
# select 100 samples from each language following same label distribution
data_val = data_val.groupby("language").apply(lambda x: x.sample(100)).reset_index(drop=True)
data_train = data[data["split"] == "train"]

In [5]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
model = AutoModel.from_pretrained('/content/drive/MyDrive/multitude_split/mdeberta-reduced-2').to(device)

# Freeze all the parameters in the transformer model
for param in model.parameters():
    param.requires_grad = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [7]:
class CustomModel(nn.Module):
    def __init__(self, transformer_model, num_numerical_features):
        super(CustomModel, self).__init__()
        self.transformer = transformer_model
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.numerical_processor = nn.Sequential(
            nn.Linear(num_numerical_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.classifier = nn.Sequential(
            nn.Linear(64 + transformer_model.config.hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask, numerical_data):
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        text_features = transformer_output.last_hidden_state[:, 0, :]  # Use the CLS token
        text_features = text_features.unsqueeze(-1)
        text_features = self.pooling(text_features).squeeze(-1)

        numerical_features = self.numerical_processor(numerical_data)

        features = torch.cat((text_features, numerical_features), dim=1)
        output = self.classifier(features)
        return output

# Instantiate the model
custom_model = CustomModel(model, num_numerical_features=26)
custom_model.to(device)

CustomModel(
  (transformer): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(250102, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            

In [8]:
def process_data(df):
    tokenized = tokenizer(df['text'].tolist(), max_length=300, padding='max_length', truncation=True, return_tensors="pt")
    numerical_data = torch.tensor(df['numerical'].tolist(), dtype=torch.float32)
    labels = torch.tensor(df['label'].tolist(), dtype=torch.float32)
    return tokenized['input_ids'], tokenized['attention_mask'], numerical_data, labels

def create_dataloader(df, batch_size=32):
    input_ids, attention_mask, numerical_data, labels = process_data(df)
    dataset = TensorDataset(input_ids, attention_mask, numerical_data, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

train_loader = create_dataloader(data_train)
val_loader = create_dataloader(data_val)
test_loader = create_dataloader(data_test)


In [9]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import time

def train(model, dataloader, criterion, optimizer, epochs=3, print_every=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        steps = 0
        start_time = time.time()  # Record the start time of the epoch

        for batch_idx, (input_ids, attention_mask, numerical_data, labels) in enumerate(dataloader):
            input_ids, attention_mask, numerical_data, labels = input_ids.to(device), attention_mask.to(device), numerical_data.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, numerical_data).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            steps += 1

            # Calculate the current average time per batch
            current_time = time.time()
            elapsed_time = current_time - start_time
            average_time_per_batch = elapsed_time / (batch_idx + 1)

            # Estimate remaining time for the epoch
            remaining_batches = len(dataloader) - (batch_idx + 1)
            eta = remaining_batches * average_time_per_batch

            if (batch_idx + 1) % print_every == 0:
                print(f"Epoch {epoch+1}/{epochs}, Step {batch_idx+1}/{len(dataloader)}, Loss: {loss.item():.4f}")
                print(f"ETA: {eta:.2f} seconds remaining for this epoch.")

        average_loss = total_loss / steps
        print(f"Epoch {epoch+1} completed. Average Loss: {average_loss:.4f}")
        print(f"Total time for this epoch: {elapsed_time:.2f} seconds")

# Optimizer and loss function setup
optimizer = Adam(filter(lambda p: p.requires_grad, custom_model.parameters()), lr=0.001)
criterion = nn.BCELoss()


In [14]:
# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    all_predictions = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for input_ids, attention_mask, numerical_data, labels in dataloader:
            input_ids, attention_mask, numerical_data, labels = input_ids.to(device), attention_mask.to(device), numerical_data.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask, numerical_data).squeeze()
            predictions = torch.sigmoid(outputs).cpu().numpy()
            predicted_labels = (predictions > 0.5).astype(int)

            all_predictions.extend(predicted_labels)
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(predictions)

    accuracy = accuracy_score(all_labels, all_predictions)
    macro_f1 = f1_score(all_labels, all_predictions, average='macro')
    weighted_f1 = f1_score(all_labels, all_predictions, average='weighted')
    weighted_precision = precision_score(all_labels, all_predictions, average='weighted')
    weighted_recall = recall_score(all_labels, all_predictions, average='weighted')
    roc_auc = roc_auc_score(all_labels, all_probs)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_predictions).ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)

    metrics = {
        "Accuracy": accuracy,
        "Macro F1 Score": macro_f1,
        "Weighted F1 Score": weighted_f1,
        "Weighted Precision": weighted_precision,
        "Weighted Recall": weighted_recall,
        "ROC AUC": roc_auc,
        "False Positive Rate (FPR)": fpr,
        "False Negative Rate (FNR)": fnr
    }
    return metrics

In [11]:
# Train the model
train(custom_model, train_loader, criterion, optimizer, epochs=5, print_every=50)

Epoch 1/5, Step 50/1400, Loss: 0.3881
ETA: 1123.12 seconds remaining for this epoch.
Epoch 1/5, Step 100/1400, Loss: 0.3914
ETA: 1087.02 seconds remaining for this epoch.
Epoch 1/5, Step 150/1400, Loss: 0.4282
ETA: 1049.97 seconds remaining for this epoch.
Epoch 1/5, Step 200/1400, Loss: 0.2473
ETA: 1007.72 seconds remaining for this epoch.
Epoch 1/5, Step 250/1400, Loss: 0.3795
ETA: 967.48 seconds remaining for this epoch.
Epoch 1/5, Step 300/1400, Loss: 0.5088
ETA: 925.45 seconds remaining for this epoch.
Epoch 1/5, Step 350/1400, Loss: 0.2487
ETA: 884.07 seconds remaining for this epoch.
Epoch 1/5, Step 400/1400, Loss: 0.4438
ETA: 842.86 seconds remaining for this epoch.
Epoch 1/5, Step 450/1400, Loss: 0.4328
ETA: 801.35 seconds remaining for this epoch.
Epoch 1/5, Step 500/1400, Loss: 0.3722
ETA: 759.65 seconds remaining for this epoch.
Epoch 1/5, Step 550/1400, Loss: 0.1846
ETA: 717.79 seconds remaining for this epoch.
Epoch 1/5, Step 600/1400, Loss: 0.3723
ETA: 675.86 seconds rem

In [15]:
# Evaluate the model
metrics = evaluate(custom_model, test_loader)
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

Accuracy: 0.8895
Macro F1 Score: 0.4708
Weighted F1 Score: 0.8375
Weighted Precision: 0.7913
Weighted Recall: 0.8895
ROC AUC: 0.7780
False Positive Rate (FPR): 1.0000
False Negative Rate (FNR): 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
#save the model
torch.save(custom_model.state_dict(), '/content/drive/MyDrive/multitude_split/mdeberta-reduced-2-custom')