In [None]:
pip install transformers

In [None]:
pip install openpyxl

In [None]:
pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
from transformers import AutoModelForPreTraining, AutoTokenizer
from normalizer import normalize #buet_bert

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt 

# Load the train / validation / test splits from the Kaggle input directory
train_data = pd.read_excel("/kaggle/input/shuffled-and-new-threshold-dataset/train_shuffle_data.xlsx")
val_data = pd.read_excel("/kaggle/input/shuffled-and-new-threshold-dataset/val_shuffle_Data.xlsx")
test_data = pd.read_excel("/kaggle/input/test-data-suffled-and-removed/new Test Data_suffled.xlsx")

# Labels that are kept; rows carrying any other value in 'label' are dropped
allowed_labels = ['entailment', 'contradiction', 'neutral']
# The same class names in alphabetical order
test_labels = ['contradiction', 'entailment', 'neutral']

# Keep only rows with an allowed label. Each filtered frame is materialised
# with .copy() so that later column assignments (e.g. replacing 'label' with
# its encoded ids) write to an independent DataFrame instead of a derived
# slice, which is what triggers pandas' SettingWithCopyWarning.
train_data = train_data[train_data['label'].isin(allowed_labels)].copy()
val_data = val_data[val_data['label'].isin(allowed_labels)].copy()
test_data = test_data[test_data['label'].isin(allowed_labels)].copy()

In [None]:
# Collect the distinct label values present in each split
unique_train_labels = train_data['label'].unique()
unique_val_labels = val_data['label'].unique()
unique_test_labels = test_data['label'].unique()

# Report them split by split (sanity check that filtering left the expected classes)
for heading, values in (
    ("Unique Labels in Training Data:", unique_train_labels),
    ("Unique Labels in Validation Data:", unique_val_labels),
    ("Unique Labels in Test Data:", unique_test_labels),
):
    print(heading, values)

In [None]:
# Fit the encoder on the training labels only, then map the string labels of
# every split to integer class ids using that single fitted mapping.
label_encoder = LabelEncoder().fit(train_data['label'])
for frame in (train_data, val_data, test_data):
    frame['label'] = label_encoder.transform(frame['label'])

# Distinct encoded class ids seen in training (its length sizes the model output)
unique_labels = train_data['label'].unique()

# Tokenize and prepare the data
class TextEntailmentDataset(Dataset):
    """Dataset that tokenizes a (sentence, output) text pair on each access.

    Args:
        data: Table indexed positionally via ``.iloc``; each row must provide
            the ``'sentence'``, ``'output'`` and ``'label'`` fields.
        tokenizer: Callable invoked as
            ``tokenizer(premise, hypothesis, padding=..., truncation=...,
            max_length=...)`` that returns a mapping of field name to values.
        max_length: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``.

        The result holds one tensor per field produced by the tokenizer plus
        a ``'labels'`` tensor built from the row's ``'label'`` value.
        """
        # Fetch the row once and read the three fields from it
        row = self.data.iloc[idx]
        premise, hypothesis, label = row['sentence'], row['output'], row['label']

        encoded = self.tokenizer(
            premise,
            hypothesis,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

        sample = {}
        for field, values in encoded.items():
            sample[field] = torch.tensor(values)
        sample['labels'] = torch.tensor(label)
        return sample

# LSTM Model
class LSTMModel(nn.Module):
    """Two-layer LSTM classifier over token-id sequences.

    Token ids are embedded, passed through two stacked LSTMs, and the second
    LSTM's output at the final time step goes through dropout and two linear
    layers to yield one score per class.

    Args:
        input_size: Vocabulary size (number of rows in the embedding table).
        hidden_size: Embedding dimension and hidden size of the first LSTM.
        output_size: Number of classes, i.e. width of the returned logits.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm1 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size, 64, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        # NOTE(review): 64 -> 1 -> output_size squeezes every sample through a
        # single scalar before classification; confirm this is intentional.
        self.dense1 = nn.Linear(64, 1)  # Linear layer without activation
        self.relu = nn.ReLU()  # ReLU activation function
        self.dense2 = nn.Linear(1, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, output_size) holding unnormalised scores.
        """
        embedded = self.embedding(x)
        seq_out1, _ = self.lstm1(embedded)
        # Feed the *whole* sequence to the second LSTM and only then take the
        # last time step. Slicing first would hand lstm2 a 2-D
        # (batch, hidden) tensor, which nn.LSTM interprets as one unbatched
        # sequence of length `batch`, leaking state between the samples.
        seq_out2, _ = self.lstm2(seq_out1)
        last_step = seq_out2[:, -1, :]
        hidden = self.dropout(last_step)
        hidden = self.relu(self.dense1(hidden))
        return self.dense2(hidden)
# Tokenizer; the size of its vocabulary also sizes the embedding table
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Model dimensions and the LSTM classifier itself
hidden_size = 128
input_size = len(tokenizer.get_vocab())
output_size = len(unique_labels)
lstm_model = LSTMModel(input_size, hidden_size, output_size)

# One dataset per split, all sharing the same tokenizer
train_dataset, val_dataset, test_dataset = (
    TextEntailmentDataset(split, tokenizer)
    for split in (train_data, val_data, test_data)
)

# Only the training loader shuffles; evaluation loaders keep row order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

# Optimisation settings
num_epochs = 30
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=1e-3)

# Training loop
for epoch in range(num_epochs):
    # Optimisation pass: one gradient step per training batch
    lstm_model.train()
    for batch in train_loader:
        inputs = batch['input_ids']
        labels = batch['labels']
        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation pass: no gradients, dropout disabled via eval()
    lstm_model.eval()
    true_labels, predicted_labels = [], []
    correct = total = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['input_ids']
            labels = batch['labels']
            outputs = lstm_model(inputs)
            # Predicted class = index of the highest score per sample
            predictions = outputs.argmax(dim=1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())
            correct += int((predictions == labels).sum())
            total += labels.shape[0]

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy:.4f}")

# Test loop
# Evaluate the trained model on the test split
lstm_model.eval()
true_labels = []
predicted_labels = []
correct = 0
total = 0
with torch.no_grad():  # inference only, no gradient bookkeeping
    for batch in test_loader:
        inputs, labels = batch['input_ids'], batch['labels']
        outputs = lstm_model(inputs)
        # Predicted class = index of the highest score per sample
        predictions = torch.argmax(outputs, dim=1)
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predictions.cpu().numpy())
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

# Print classification report.
# Class ids and display names are taken from the fitted encoder so that each
# name is guaranteed to line up with the id it was encoded to. Passing
# `labels` explicitly keeps the report from failing when some class never
# occurs in either the true or the predicted labels.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=class_names,
))