In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [2]:
import re
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.nn as nn
from sklearn.metrics import accuracy_score
import ast
import torch.optim as optim
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare Train Data

In [5]:
df_train = pd.read_csv('/kaggle/input/nn-text-classfication/train.csv')

In [6]:
model_new = SentenceTransformer('all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
df_train.head()

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM
2,3,"No, the program you're using is made to be com...",STEM
3,4,Mike Woicik\n\nThe correct answer is: Mike Woi...,Sports
4,5,"No, but not because of why you might think. Wh...",Politics


In [8]:
df_train['Discussion'] = df_train['Discussion'].fillna('No Text')

In [9]:
def replace_dates(text):
    date_pattern = r'\b(\d{1,2}-[A-Za-z]{3}|\b[A-Za-z]+ \d{1,2}(\w{2})?)\b'
    return re.sub(date_pattern, '[DATE]', text)

df_train['Discussion'] = df_train['Discussion'].apply(replace_dates)

In [10]:
# Generate embeddings for the dataset
embeddings_train = model_new.encode(df_train['Discussion'].tolist(), convert_to_tensor=True).cpu().numpy()

# Save embeddings to a file
np.save('news_embeddings_train.npy', embeddings_train)

# Save corresponding labels
df_train['Category'].to_csv('news_labels.csv', index=False)

print("Embeddings and labels saved successfully!")

Batches:   0%|          | 0/781 [00:00<?, ?it/s]

Embeddings and labels saved successfully!


In [11]:
# Example of checking the value counts
category_distribution = df_train['Category'].value_counts()
print(category_distribution)

Category
STEM                5530
Market & Economy    5530
Sports              5529
Politics            4200
Media               4200
Name: count, dtype: int64


In [53]:
class_counts = [5530, 5530, 5529, 4200, 4200]
class_names = ['STEM', 'Market & Economy', 'Sports', 'Politics', 'Media']
num_classes = len(class_counts)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.arange(num_classes), y=np.repeat(np.arange(num_classes), class_counts))
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

print("Class weights:", class_weights_tensor)

Class weights: tensor([0.9038, 0.9038, 0.9039, 1.1900, 1.1900], device='cuda:0')


In [11]:
label_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df_train['Category']=df_train['Category'].map(label_mapping)

In [12]:
df_train['Category']

0        1
1        4
2        4
3        1
4        0
        ..
24984    1
24985    3
24986    3
24987    0
24988    2
Name: Category, Length: 24989, dtype: int64

In [13]:
labels = list(df_train['Category'])

In [14]:
class TextDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = torch.stack([torch.tensor(x, dtype=torch.float32) for x in sentences])  
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

In [15]:
X_train, X_val, y_train, y_val = train_test_split(embeddings_train, labels, test_size=0.2, random_state=42)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [16]:
for batch in train_dataloader:
    sentences, labels = batch
    print(f"Train batch size: {sentences.size(0)}")
    print(f"Sentence shape: {sentences.shape}")
    print(f"Labels shape: {labels.shape}")
    break

Train batch size: 32
Sentence shape: torch.Size([32, 768])
Labels shape: torch.Size([32])


# Archi BI LSTM

In [17]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # BiLSTM Layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True, dropout=dropout)
        
        # Dropout and BatchNorm
        self.dropout = nn.Dropout(dropout)
        self.batch_norm = nn.BatchNorm1d(hidden_size * 2)
        
        # Fully Connected Layer
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        device = x.device
        x = x.unsqueeze(1)

        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)

        out, _ = self.lstm(x, (h0, c0))  
        
        last_hidden = out[:, -1, :]  
        last_hidden = self.batch_norm(last_hidden)
        last_hidden = self.dropout(last_hidden)

        # Fully connected layer
        output = self.fc(last_hidden)
        return output

In [18]:
def train_model(model, train_dataloader, optimizer, criterion, scheduler, 
                epochs=5, save_path='model.pth'):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        all_preds = []
        all_labels = []

        for sentences, labels in train_dataloader:
            sentences = sentences.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # Gradient clipping
            optimizer.step()

            running_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        epoch_loss = running_loss / len(train_dataloader)
        epoch_acc = accuracy_score(all_labels, all_preds) * 100
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

        scheduler.step()

    print("Training complete.")
    # Save the entire model
    torch.save(model, save_path)

    print(f"Model saved to {save_path}")

In [19]:
def val_model(model, val_dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sentences, labels in val_dataloader:
            sentences = sentences.float().to(device) 
            labels = labels.long().to(device)
            outputs = model(sentences)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")
    return accuracy

In [21]:
# Hyperparameters
input_size = 768
hidden_size = 128
num_layers = 2
num_classes = 5
learning_rate = 0.001
dropout = 0.5
weight_decay = 1e-4
epochs = 20

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Model, criterion, optimizer, and scheduler
model = BiLSTM(input_size, hidden_size, num_layers, num_classes, dropout=dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [22]:
train_model(model, train_dataloader, optimizer, criterion,scheduler, 50, save_path='bilstm_model_entire.pth')
val_model(model, val_dataloader)

Epoch [1/50], Loss: 0.8003, Accuracy: 72.02%
Epoch [2/50], Loss: 0.6998, Accuracy: 76.13%
Epoch [3/50], Loss: 0.6827, Accuracy: 76.53%
Epoch [4/50], Loss: 0.6690, Accuracy: 77.04%
Epoch [5/50], Loss: 0.6635, Accuracy: 76.95%
Epoch [6/50], Loss: 0.6309, Accuracy: 78.10%
Epoch [7/50], Loss: 0.6234, Accuracy: 78.21%
Epoch [8/50], Loss: 0.6201, Accuracy: 78.35%
Epoch [9/50], Loss: 0.6210, Accuracy: 78.38%
Epoch [10/50], Loss: 0.6185, Accuracy: 78.46%
Epoch [11/50], Loss: 0.6094, Accuracy: 78.72%
Epoch [12/50], Loss: 0.6080, Accuracy: 78.60%
Epoch [13/50], Loss: 0.6080, Accuracy: 78.66%
Epoch [14/50], Loss: 0.6082, Accuracy: 78.66%
Epoch [15/50], Loss: 0.6104, Accuracy: 78.71%
Epoch [16/50], Loss: 0.6099, Accuracy: 78.84%
Epoch [17/50], Loss: 0.6071, Accuracy: 78.75%
Epoch [18/50], Loss: 0.6065, Accuracy: 78.71%
Epoch [19/50], Loss: 0.6086, Accuracy: 78.68%
Epoch [20/50], Loss: 0.6043, Accuracy: 78.80%
Epoch [21/50], Loss: 0.6085, Accuracy: 78.77%
Epoch [22/50], Loss: 0.6102, Accuracy: 78.6

0.7857142857142857

# Testing

In [23]:
df_test=pd.read_csv('/kaggle/input/nn-text-classfication/test.csv')

In [24]:
df_test['Discussion'] = df_test['Discussion'].fillna('No Text')

In [25]:
def replace_dates(text):
    date_pattern = r'\b(\d{1,2}-[A-Za-z]{3}|\b[A-Za-z]+ \d{1,2}(\w{2})?)\b'
    return re.sub(date_pattern, '[DATE]', text)

df_test['Discussion'] = df_test['Discussion'].apply(replace_dates)

In [26]:
# Generate embeddings for the dataset
embeddings_test = model_new.encode(df_test['Discussion'].tolist(), convert_to_tensor=True).cpu().numpy()

# Save embeddings to a file
np.save('news_embeddings_train.npy', embeddings_test)

print("Embeddings and labels saved successfully!")

Batches:   0%|          | 0/330 [00:00<?, ?it/s]

Embeddings and labels saved successfully!


In [27]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, sentences, sample_ids):
        # Store sentences as a tensor
        self.sentences = torch.stack([torch.tensor(x, dtype=torch.float32) for x in sentences])   # Converting string to list using eval
        self.sample_ids = sample_ids

    def __len__(self):
        return len(self.sentences)  # Use self.sentences here

    def __getitem__(self, idx):
        # Return sentence embeddings and the corresponding sample ID
        return self.sentences[idx], self.sample_ids[idx]

In [28]:
test_dataset = CustomDataset(embeddings_test, df_test['SampleID'])
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [29]:
save_csv_path = '/kaggle/working/predictions_BILSTM.csv'

In [30]:
def test_model(model, val_dataloader, save_csv_path='predictions.csv'):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    sample_ids = []  # To store sample IDs

    with torch.no_grad():
        for sentences, ids in val_dataloader:  # Extract sentences and IDs from DataLoader
            sentences = sentences.float().to(device)  # Move sentences to GPU (or CPU if needed)
            outputs = model(sentences)
            _, preds = torch.max(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())  # Move predictions back to CPU
            sample_ids.extend(ids.numpy())  # Collect the sample IDs

    # Save predictions to a CSV file
    predictions_df = pd.DataFrame({
        'SampleID': sample_ids,
        'Category': all_preds
    })
    predictions_df.to_csv(save_csv_path, index=False)
    print(f"Predictions saved to {save_csv_path}")
    
    return predictions_df

In [32]:
test_model(model, test_dataloader, save_csv_path)

Predictions saved to /kaggle/working/predictions_BILSTM.csv


Unnamed: 0,SampleID,Category
0,1,3
1,2,0
2,3,1
3,4,4
4,5,3
...,...,...
10552,10553,4
10553,10554,3
10554,10555,3
10555,10556,0
