# Neural Network Models

In [113]:
import random
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import WeightedRandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

## Data Preprocessing

In [114]:
domain1 = pd.read_csv('processed_data/domain1_processed.csv')
domain2 = pd.read_csv('processed_data/domain2_processed.csv')
test = pd.read_csv('processed_data/test_processed.csv')

# Split the data for domain1
train_d1, val_d1 = train_test_split(domain1, train_size=0.8, random_state=88, stratify=domain1['label'])

# Split the data for domain2
train_d2, val_d2 = train_test_split(domain2, train_size=0.8, random_state=88, stratify=domain2['label'])

# Combine d1 and d2 for training
train_combined = pd.concat([train_d1, train_d2], ignore_index=True)
val_combined = pd.concat([val_d1, val_d2], ignore_index=True)

train_combined.reset_index(drop=True, inplace=True)
val_d1.reset_index(drop=True, inplace=True)
val_d2.reset_index(drop=True, inplace=True)

In [115]:
list_d1 = domain1['text'].apply(lambda x:len(x.split()))

list_d2 = domain2['text'].apply(lambda x:len(x.split()))

list_combined = train_combined['text'].apply(lambda x:len(x.split()))

quantile_d1 = list_d1.quantile(0.95)

quantile_d2 = list_d2.quantile(0.95)

quantile_combined = list_combined.quantile(0.95)

print(f"95% of the sequences in domain1 have length <= {quantile_d1}")

print(f"95% of the sequences in domain2 have length <= {quantile_d2}")

print(f"95% of the sequences in domain2 have length <= {quantile_combined}")


95% of the sequences in domain1 have length <= 111.0
95% of the sequences in domain2 have length <= 516.0
95% of the sequences in domain2 have length <= 294.0


In [138]:
d1_class_0_count = len(train_combined[(train_combined['label'] == 0) & (train_combined['domain'] == 1)])
d1_class_1_count = len(train_combined[(train_combined['label'] == 1) & (train_combined['domain'] == 1)])
d2_class_0_count = len(train_combined[(train_combined['label'] == 0) & (train_combined['domain'] == 2)])
d2_class_1_count = len(train_combined[(train_combined['label'] == 1) & (train_combined['domain'] == 2)])

total_samples = len(train_combined)

desired_samples_per_group = total_samples / 4

weight_d1_class_0 = desired_samples_per_group / d1_class_0_count
weight_d1_class_1 = desired_samples_per_group / d1_class_1_count
weight_d2_class_0 = desired_samples_per_group / d2_class_0_count
weight_d2_class_1 = desired_samples_per_group / d2_class_1_count

def compute_weight(row):
    if row['domain'] == 1 and row['label'] == 0:
        return weight_d1_class_0
    elif row['domain'] == 1 and row['label'] == 1:
        return weight_d1_class_1
    elif row['domain'] == 2 and row['label'] == 0:
        return weight_d2_class_0
    elif row['domain'] == 2 and row['label'] == 1:
        return weight_d2_class_1
    
train_combined['weights'] = train_combined.apply(compute_weight, axis=1)

# Simple Feedfoward networks & LSTM NN

In [116]:
MAX_LENGTH = 300

class Padded_Dataset(Dataset):
    def __init__(self, data, labels_column='label'):
        self.data = data
        self.max_length = MAX_LENGTH
        self.labels_column = labels_column
        self.has_labels = labels_column in data.columns

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        txt_data = self.data.iloc[idx]['text']
        
        # Split the space-separated string of token IDs and convert to integers
        token_ids = [int(token) for token in txt_data.split()]
        
        
        
        # Truncate or pad to MAX_LENGTH
        if len(token_ids) > self.max_length:
            token_ids = token_ids[:self.max_length]  # Truncate if longer
        elif len(token_ids) < self.max_length:
            token_ids.extend([0] * (self.max_length - len(token_ids)))  # Pad with zeros
    
        # Convert the list of token IDs to a PyTorch tensor
        txt = torch.tensor(token_ids, dtype=torch.long)
        
        if self.has_labels:
            label = torch.tensor(self.data.iloc[idx][self.labels_column])
            return txt, label
        else:
            return txt




In [117]:
import torch.nn as nn

class Label_Classifier(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, dropout_rate=0.5):
        # Initializing the parent class, nn.Module.
        super(Label_Classifier, self).__init__()  # Call the initialization of the superclass nn.Module.

        # Embedding layer that converts token IDs into dense vectors of fixed size.
        self.embedding = nn.Embedding(vocab_size, emb_size)  # Initialize an embedding layer with given vocabulary size and embedding size.
        
         # Dropout layer after embedding
        self.dropout_after_embedding = nn.Dropout(dropout_rate) 

        # A sequential container in which input is processed by each module (layer) in the defined order.
        self.classifier = nn.Sequential(
            nn.Linear(emb_size, hidden_size),      
            nn.ReLU(),                            
            nn.Dropout(dropout_rate),              
            nn.Linear(hidden_size, hidden_size),   
            nn.ReLU(),                             
            nn.Dropout(dropout_rate),              
            nn.Linear(hidden_size, 1)              
        )

    def forward(self, text):
        # Convert token IDs in 'text' into dense vectors.
        text_emb = self.embedding(text)          
        # Apply dropout after embedding
        text_emb = self.dropout_after_embedding(text_emb)
        
        # Average the embeddings along the sequence dimension (common method to handle variable-length sequences).
        text_emb = text_emb.mean(dim=1)          
        
        # Pass the averaged embeddings through the classifier to produce an output.
        output = self.classifier(text_emb)       

        return output.squeeze()                  
    
    
class LSTM_Label_Classifier(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, num_layers=1, dropout_prob=0.5):
        super(LSTM_Label_Classifier, self).__init__()

        # Embedding layer with dropout
        self.embedding = nn.Sequential(
            nn.Embedding(vocab_size, emb_size),
            nn.Dropout(p=dropout_prob)  # Add dropout to the embedding layer
        )

        # LSTM layer without dropout when num_layers is 1
        if num_layers > 1:
            self.lstm = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob)
        else:
            self.lstm = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout_prob),  # Add dropout after ReLU
            nn.Linear(hidden_size, 1)
        )

    def forward(self, text):
        # Convert token IDs in 'text' into dense vectors with dropout.
        text_emb = self.embedding(text)
        

        # Pass embeddings through LSTM
        lstm_out, _ = self.lstm(text_emb)

        
        last_hidden = lstm_out[:, -1, :]

        # Pass the last hidden state through the classifier
        output = self.classifier(last_hidden)

        return output.squeeze()



In [118]:
## Training loop for one epoch
def train_one_epoch_nn(model, train_loader, criterion, device):
    model.train()
    total_loss = 0.0
    correct_predictions = 0   # Keep track of correct predictions
    total_examples = 0        # Keep track of total examples processed
    train_preds = []
    train_labels = []
    
    for batch in train_loader:
        txt, label = batch[0].to(device), batch[1].to(device).float()
        x_o = model.forward(txt)
        loss = criterion(x_o, label)
        optimizer.zero_grad()   # Typo corrected from 'oprimizer' to 'optimizer'
        loss.backward()
        optimizer.step()
        
        # Debugging and tracking
        total_loss += loss.item()
        predictions = torch.round(torch.sigmoid(x_o))
        train_preds += predictions.tolist()
        train_labels += label.tolist()
        
        # Update correct predictions and total examples
        correct_predictions += (predictions == label).sum().item()
        total_examples += label.size(0)  # Assuming label is of shape (batch_size,)
        
    average_loss = total_loss / len(train_loader)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    train_accuracy = correct_predictions / total_examples  # Compute accuracy
    
    return average_loss, train_f1, train_accuracy
    
        
#prediction function        
def predict(model, loader, device):
    model.eval()
    all_preds = []

    with torch.no_grad():
        for batch in loader:
            txt = batch[0].to(device)
            x_o = model(txt)
            preds = torch.round(torch.sigmoid(x_o))
            all_preds += preds.tolist()

    return all_preds
        
        
# Evaluation funtion        
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    eval_preds = []
    eval_labels = []

    with torch.no_grad():
        for batch in loader:
            txt, label = batch[0].to(device), batch[1].to(device).float()
            x_o = model(txt)
            loss = criterion(x_o, label)
            
            total_loss += loss.item()
            preds = torch.round(torch.sigmoid(x_o))
            correct += (preds == label).sum().item()

            eval_preds += preds.tolist()
            eval_labels += label.tolist()

    average_loss = total_loss / len(loader)
    accuracy = correct / len(loader.dataset)
    val_f1 = f1_score(eval_labels, eval_preds, average='macro')
    
    return average_loss, val_f1, accuracy   #, all_labels, all_preds    

In [119]:
# Prepare the dataset for training
batch_size = 64

# Convert the train_combined to the pytorch Dataset tensor format with padding
train_combined_db = Padded_Dataset(train_combined)
train_loader = DataLoader(train_combined_db, batch_size=batch_size, shuffle=True)

train_d1_db = Padded_Dataset(train_d1)
train_d1_loader = DataLoader(train_d1_db, batch_size=batch_size, shuffle=True)


# Convert the validation sets
val_d1_db = Padded_Dataset(val_d1)
val_d2_db = Padded_Dataset(val_d2)

val_d1_loader = DataLoader(val_d1_db, batch_size=batch_size, shuffle=False)
val_d2_loader = DataLoader(val_d2_db, batch_size=batch_size, shuffle=False)

# Convert the test set
test_db = Padded_Dataset(test)
test_loader = DataLoader(test_db, batch_size=batch_size, shuffle=False)


In [126]:
#Initialise the training process for the feed foward NN
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    


ff_model=Label_Classifier(vocab_size=5000,emb_size=128,hidden_size=64).to(device)

optimizer=optim.Adam(ff_model.parameters(),lr=0.001)

criterion = nn.BCEWithLogitsLoss()

#Training the Label_Classifier
num_epoch=20
for epoch in range(num_epoch):
    train_loss, train_f1, train_accuracy = train_one_epoch_nn(ff_model, train_loader, criterion, device)
    val_d1_loss, val_d1_f1 ,val_d1_accuracy = evaluate(ff_model, val_d1_loader, criterion, device)
    val_d2_loss, val_d2_f1, val_d2_accuracy = evaluate(ff_model, val_d2_loader, criterion, device)
    print(f"Epoch {epoch + 1}, val_d1 loss: {val_d1_loss:.4f}, val_d2 loss: {val_d2_loss:.4f}, val_d1_f1: {val_d1_f1:.4f}, val_d2_f1: {val_d2_f1:.4f}, val_d1_accuracy: {val_d1_accuracy:.4f}, val_d2_accuracy: {val_d2_accuracy:.4f}")


Epoch 1, val_d1 loss: 0.5136, val_d2 loss: 0.4635, val_d1_f1: 0.7842, val_d2_f1: 0.4619, val_d1_accuracy: 0.7844, val_d2_accuracy: 0.8507
Epoch 2, val_d1 loss: 0.3707, val_d2 loss: 0.4196, val_d1_f1: 0.8675, val_d2_f1: 0.4618, val_d1_accuracy: 0.8679, val_d2_accuracy: 0.8503
Epoch 3, val_d1 loss: 0.3271, val_d2 loss: 0.4084, val_d1_f1: 0.8933, val_d2_f1: 0.4616, val_d1_accuracy: 0.8938, val_d2_accuracy: 0.8497
Epoch 4, val_d1 loss: 0.2846, val_d2 loss: 0.4078, val_d1_f1: 0.8999, val_d2_f1: 0.4613, val_d1_accuracy: 0.9005, val_d2_accuracy: 0.8487
Epoch 5, val_d1 loss: 0.2752, val_d2 loss: 0.4015, val_d1_f1: 0.9053, val_d2_f1: 0.4617, val_d1_accuracy: 0.9054, val_d2_accuracy: 0.8500
Epoch 6, val_d1 loss: 0.2704, val_d2 loss: 0.4065, val_d1_f1: 0.9069, val_d2_f1: 0.4629, val_d1_accuracy: 0.9074, val_d2_accuracy: 0.8470
Epoch 7, val_d1 loss: 0.2442, val_d2 loss: 0.4093, val_d1_f1: 0.9108, val_d2_f1: 0.4630, val_d1_accuracy: 0.9113, val_d2_accuracy: 0.8473
Epoch 8, val_d1 loss: 0.2441, val_

In [122]:


if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

    
lstm_model = LSTM_Label_Classifier(vocab_size=5000, emb_size=128, hidden_size=64).to(device)

optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# Training the LSTM_Label_Classifier
num_epoch = 50
for epoch in range(num_epoch):
    train_loss, train_f1, train_accuracy = train_one_epoch_nn(lstm_model, train_loader, criterion, device)
    val_d1_loss, val_d1_f1, val_d1_accuracy = evaluate(lstm_model, val_d1_loader, criterion, device)
    val_d2_loss, val_d2_f1, val_d2_accuracy = evaluate(lstm_model, val_d2_loader, criterion, device)
    print(f"Epoch {epoch + 1}, val_d1 loss: {val_d1_loss:.4f}, val_d2 loss: {val_d2_loss:.4f}, val_d1_f1: {val_d1_f1:.4f}, val_d2_f1: {val_d2_f1:.4f}, val_d1_accuracy: {val_d1_accuracy:.4f}, val_d2_accuracy: {val_d2_accuracy:.4f}")


Epoch 1, val_d1 loss: 0.7499, val_d2 loss: 0.5070, val_d1_f1: 0.3333, val_d2_f1: 0.4611, val_d1_accuracy: 0.5000, val_d2_accuracy: 0.8557
Epoch 2, val_d1 loss: 0.7638, val_d2 loss: 0.4873, val_d1_f1: 0.3333, val_d2_f1: 0.4611, val_d1_accuracy: 0.5000, val_d2_accuracy: 0.8557
Epoch 3, val_d1 loss: 0.7360, val_d2 loss: 0.5321, val_d1_f1: 0.3333, val_d2_f1: 0.4611, val_d1_accuracy: 0.5000, val_d2_accuracy: 0.8557
Epoch 4, val_d1 loss: 0.7439, val_d2 loss: 0.5261, val_d1_f1: 0.3333, val_d2_f1: 0.4674, val_d1_accuracy: 0.5000, val_d2_accuracy: 0.8540
Epoch 5, val_d1 loss: 0.4129, val_d2 loss: 0.4133, val_d1_f1: 0.8281, val_d2_f1: 0.4613, val_d1_accuracy: 0.8290, val_d2_accuracy: 0.8487
Epoch 6, val_d1 loss: 0.3485, val_d2 loss: 0.4208, val_d1_f1: 0.8673, val_d2_f1: 0.4596, val_d1_accuracy: 0.8674, val_d2_accuracy: 0.8503
Epoch 7, val_d1 loss: 0.3607, val_d2 loss: 0.4203, val_d1_f1: 0.8593, val_d2_f1: 0.4597, val_d1_accuracy: 0.8597, val_d2_accuracy: 0.8507
Epoch 8, val_d1 loss: 0.3229, val_

In [124]:
# predictions = predict(lstm_model, test_loader, device)
class Predictor:
    def __init__(self, model, device):
        self.model = model
        self.device = device

    def predict(self, dataloader):
        self.model.eval()  # Ensure the model is in evaluation mode
        predictions = []  # Store the predictions

        with torch.no_grad():
            for batch in dataloader:
                txt = batch.to(self.device)  # Assuming each batch is just a tensor of token ids
                logits = self.model(txt)
                probs = torch.sigmoid(logits)
                preds = torch.round(probs).int()  # Convert probabilities to binary labels (0 or 1)
                predictions.extend(preds.tolist())

        return predictions


In [125]:
# Instantiate the Predictor
predictor = Predictor(lstm_model, device)

# Make predictions on the test set
test_predictions = predictor.predict(test_loader)

test_ids = test['id'].tolist()

assert len(test_ids) == len(test_predictions)

# Write the results to the CSV
with open("submission.csv", "w") as f:
    f.write("id,class\n")  # header line
    for test_id, prediction in zip(test_ids, test_predictions):
        f.write(f"{test_id},{prediction}\n")

## TF-IFD Representation

In [127]:
class TFIDF_Dataset(Dataset):
    def __init__(self, tfidf_matrix, labels=None, max_length=MAX_LENGTH, labels_column='label'):
        self.tfidf_matrix = tfidf_matrix
        self.labels = labels
        self.max_length = max_length
        self.labels_column = labels_column
        self.has_labels = labels is not None  # Check if labels are provided

    def __len__(self):
        return len(self.labels) if self.has_labels else self.tfidf_matrix.shape[0]

    def __getitem__(self, idx):
        tfidf_vector = self.tfidf_matrix[idx].toarray()  # Convert to dense array
        tfidf_vector = tfidf_vector.squeeze()  # Remove the extra dimension
        tfidf_vector = torch.tensor(tfidf_vector, dtype=torch.float32)

        if self.has_labels:
            label = torch.tensor(self.labels[idx])
            return tfidf_vector, label
        else:
            return tfidf_vector
        

In [128]:
import torch.nn.functional as F

class TFIDF_Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_prob=0.8):
        super(TFIDF_Classifier, self).__init__()

        # A sequential container for the classifier layers
        self.classifier = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(p=dropout_prob),  # Add dropout layer with the specified dropout probability
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout_prob),  # Add another dropout layer
            nn.Linear(hidden_size, 1)
        )

    def forward(self, tfidf_vector):
        output = self.classifier(tfidf_vector)
        return output.squeeze()


In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
train_combined.reset_index(drop=True, inplace=True)
val_d1.reset_index(drop=True, inplace=True)
val_d2.reset_index(drop=True, inplace=True)


all_text_data = pd.concat([train_combined['text'], val_d1['text'], val_d2['text']], axis=0)


# Convert the pandas Series to a list of strings
all_text_data_list = all_text_data.tolist()

tfidf_vectorizer = TfidfVectorizer()  # Initialize TF-IDF vectorizer with your desired MAX_FEATURES
tfidf_matrix_all = tfidf_vectorizer.fit_transform(all_text_data)

# Transform individual datasets
tfidf_matrix_train_combined = tfidf_vectorizer.transform(train_combined['text'])
tfidf_matrix_val_d1 = tfidf_vectorizer.transform(val_d1['text'])
tfidf_matrix_val_d2 = tfidf_vectorizer.transform(val_d2['text'])
tfidf_matrix_test = tfidf_vectorizer.transform(test['text'])

# Create instances of TFIDF_Dataset for each dataset
tfidf_train_combined_db = TFIDF_Dataset(tfidf_matrix_train_combined, train_combined['label'])
tfidf_val_d1_db = TFIDF_Dataset(tfidf_matrix_val_d1, val_d1['label'])
tfidf_val_d2_db = TFIDF_Dataset(tfidf_matrix_val_d2, val_d2['label'])
tfidf_test_db = TFIDF_Dataset(tfidf_matrix_test)



# Create DataLoader instances for each dataset
tfidf_train_loader = DataLoader(tfidf_train_combined_db, batch_size=batch_size, shuffle=True)
tfidf_val_d1_loader = DataLoader(tfidf_val_d1_db, batch_size=batch_size, shuffle=False)
tfidf_val_d2_loader = DataLoader(tfidf_val_d2_db, batch_size=batch_size, shuffle=False)
tfidf_test_loader = DataLoader(tfidf_test_db, batch_size=batch_size, shuffle=False)  # No need to shuffle the test dataset


In [137]:
#Initialise the training process
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
max_length = len(tfidf_vectorizer.vocabulary_)

model_tfidf=TFIDF_Classifier(input_size=max_length,hidden_size=300).to(device)

optimizer=optim.Adam(model_tfidf.parameters(),lr=0.00005)

criterion = nn.BCEWithLogitsLoss()

#Training the Label_Classifier
num_epoch=20
for epoch in range(num_epoch):
    train_loss, train_f1, train_accuracy = train_one_epoch_nn(model_tfidf, tfidf_train_loader, criterion, device)
    val_d1_loss, val_d1_f1 ,val_d1_accuracy = evaluate(model_tfidf, tfidf_val_d1_loader, criterion, device)
    val_d2_loss, val_d2_f1, val_d2_accuracy = evaluate(model_tfidf, tfidf_val_d2_loader, criterion, device)
    print(f"Epoch {epoch + 1}, val_d1 loss: {val_d1_loss:.4f}, val_d2 loss: {val_d2_loss:.4f}, val_d1_f1: {val_d1_f1:.4f}, val_d2_f1: {val_d2_f1:.4f}, val_d1_accuracy: {val_d1_accuracy:.4f}, val_d2_accuracy: {val_d2_accuracy:.4f}")


Epoch 1, val_d1 loss: 0.6910, val_d2 loss: 0.4994, val_d1_f1: 0.3333, val_d2_f1: 0.4611, val_d1_accuracy: 0.5000, val_d2_accuracy: 0.8557
Epoch 2, val_d1 loss: 0.5712, val_d2 loss: 0.4570, val_d1_f1: 0.3401, val_d2_f1: 0.4611, val_d1_accuracy: 0.5031, val_d2_accuracy: 0.8557
Epoch 3, val_d1 loss: 0.4224, val_d2 loss: 0.4575, val_d1_f1: 0.8296, val_d2_f1: 0.4743, val_d1_accuracy: 0.8305, val_d2_accuracy: 0.8302
Epoch 4, val_d1 loss: 0.3412, val_d2 loss: 0.4722, val_d1_f1: 0.8746, val_d2_f1: 0.4802, val_d1_accuracy: 0.8746, val_d2_accuracy: 0.8168
Epoch 5, val_d1 loss: 0.3040, val_d2 loss: 0.4828, val_d1_f1: 0.8843, val_d2_f1: 0.4862, val_d1_accuracy: 0.8844, val_d2_accuracy: 0.8101
Epoch 6, val_d1 loss: 0.2863, val_d2 loss: 0.4896, val_d1_f1: 0.8866, val_d2_f1: 0.4918, val_d1_accuracy: 0.8867, val_d2_accuracy: 0.8070
Epoch 7, val_d1 loss: 0.2728, val_d2 loss: 0.4975, val_d1_f1: 0.8935, val_d2_f1: 0.4987, val_d1_accuracy: 0.8936, val_d2_accuracy: 0.7973
Epoch 8, val_d1 loss: 0.2707, val_

In [66]:
def ensemble_predict_dataloader(label_models, tfidf_models, label_loader, tfidf_loader, device):
    total_probabilities = []

    # Assuming the dataloaders provide data in the same order
    for (label_batch, tfidf_batch) in zip(label_loader, tfidf_loader):
        X_label = label_batch[0].to(device)
        X_tfidf = tfidf_batch[0].to(device)

        batch_probabilities = torch.zeros(X_label.size(0), device=device)

        for model in label_models:
            model.eval()
            with torch.no_grad():
                probabilities = torch.sigmoid(model(X_label).squeeze())
                batch_probabilities += probabilities

        for model in tfidf_models:
            model.eval()
            with torch.no_grad():
                probabilities = torch.sigmoid(model(X_tfidf).squeeze()) 
                batch_probabilities += probabilities

        avg_batch_probabilities = batch_probabilities / (len(label_models) + len(tfidf_models))
        total_probabilities.extend(avg_batch_probabilities.tolist())

    return total_probabilities

def ensemble_predict_unlabeled_dataloader(label_models, tfidf_models=None, label_loader, tfidf_loader=None, device):
    total_probabilities = []

    
    for (X_label, X_tfidf) in zip(label_loader, tfidf_loader):
        X_label = X_label.to(device)  
        X_tfidf = X_tfidf.to(device)  

        batch_probabilities = torch.zeros(X_label.size(0), device=device)

        for model in label_models:
            model.eval()
            with torch.no_grad():
                probabilities = torch.sigmoid(model(X_label).squeeze())  
                batch_probabilities += probabilities

        for model in tfidf_models:
            model.eval()
            with torch.no_grad():
                probabilities = torch.sigmoid(model(X_tfidf).squeeze())  
                batch_probabilities += probabilities

        avg_batch_probabilities = batch_probabilities / (len(label_models) + len(tfidf_models))
        total_probabilities.extend(avg_batch_probabilities.tolist())

    # Convert probabilities to binary predictions
    predicted_labels = [1 if prob > 0.5 else 0 for prob in total_probabilities]
    return predicted_labels

In [40]:
predicted_labels_test = ensemble_predict_unlabeled_dataloader(label_models, tfidf_models, test_loader, tfidf_test_loader, device)


# Assuming the test set dataframe has a column named 'id' for IDs
test_ids = test['id'].tolist()

assert len(test_ids) == len(predicted_labels_test)

# Write the results to the CSV
with open("submission.csv", "w") as f:
    f.write("id,class\n")  # header line
    for test_id, prediction in zip(test_ids, predicted_labels_test):
        f.write(f"{test_id},{prediction}\n")