In [2]:
import re
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.nn as nn
from sklearn.metrics import accuracy_score
import ast
import torch.optim as optim
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [3]:
# Load the labelled training split (columns shown below: SampleID, Discussion, Category).
df_train = pd.read_csv('/kaggle/input/nn-text-classfication/train.csv')

In [4]:
# Load the sentence encoder on the GPU when one is present, otherwise on the CPU,
# so the notebook also runs on CPU-only sessions.
model_new = SentenceTransformer(
    'all-mpnet-base-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM
2,3,"No, the program you're using is made to be com...",STEM
3,4,Mike Woicik\n\nThe correct answer is: Mike Woi...,Sports
4,5,"No, but not because of why you might think. Wh...",Politics


In [6]:
# Replace missing discussion text with a placeholder so the regex and encoder steps below always receive a string.
df_train['Discussion'] = df_train['Discussion'].fillna('No Text')

In [7]:
def replace_dates(text):
    """Return ``text`` with every date-like span replaced by the token ``[DATE]``.

    Two shapes are matched: a 1-2 digit number joined by a hyphen to three
    letters (e.g. ``12-Jan``), and any alphabetic word followed by a space and
    a 1-2 digit number with an optional two-character suffix (e.g.
    ``March 5th``). The second shape is not restricted to month names.
    """
    matcher = re.compile(
        r'\b(\d{1,2}-[A-Za-z]{3}|\b[A-Za-z]+ \d{1,2}(\w{2})?)\b'
    )
    return matcher.sub('[DATE]', text)

# Mask date-like spans in every training discussion before embedding.
df_train['Discussion'] = df_train['Discussion'].apply(replace_dates)

In [8]:
# Encode every training discussion into a sentence embedding and bring the result to the host as a NumPy array.
embeddings_train = (
    model_new
    .encode(df_train['Discussion'].tolist(), convert_to_tensor=True)
    .cpu()
    .numpy()
)

# Persist the embeddings, then the matching (still string-valued) label column.
np.save('news_embeddings_train.npy', embeddings_train)
df_train['Category'].to_csv('news_labels.csv', index=False)

print("Embeddings and labels saved successfully!")

Batches:   0%|          | 0/781 [00:00<?, ?it/s]

Embeddings and labels saved successfully!


In [9]:
# Integer id for each category name; the id is the name's position in this ordered list.
label_mapping = {
    name: index
    for index, name in enumerate(
        ['Politics', 'Sports', 'Media', 'Market & Economy', 'STEM']
    )
}
# Swap category strings for their integer ids (names absent from the mapping become NaN).
df_train['Category'] = df_train['Category'].map(label_mapping)

In [10]:
# Class ids as a plain list in row order; passed to train_test_split alongside the embeddings.
labels = list(df_train['Category'])

In [11]:
class TextDataset(Dataset):
    """In-memory dataset pairing one feature vector per sample with its integer label.

    Every vector is stored as float32 with a singleton axis in front, so an
    item built from a length-``d`` vector has shape ``(1, d)``. Labels are
    stored as a ``torch.long`` tensor.
    """

    def __init__(self, sentences, labels):
        # Stack the per-sample vectors first, then insert the singleton axis once.
        vectors = [torch.tensor(vec, dtype=torch.float32) for vec in sentences]
        self.sentences = torch.stack(vectors).unsqueeze(1)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return self.sentences.shape[0]

    def __getitem__(self, idx):
        features = self.sentences[idx]
        target = self.labels[idx]
        return features, target

In [12]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    embeddings_train,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset, then in a loader (only the training loader shuffles).
train_dataset, val_dataset = TextDataset(X_train, y_train), TextDataset(X_val, y_val)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32)

In [13]:
# Peek at a single batch to confirm tensor shapes.
# Batch-specific names are used on purpose: unpacking into `labels` would overwrite
# the module-level list of class ids, which a later cell reads to size the classifier
# (a set() over a batch tensor counts one entry per element, not one per class).
batch_sentences, batch_labels = next(iter(train_dataloader))
print(f"Train batch size: {batch_sentences.size(0)}")
print(f"Sentence shape: {batch_sentences.shape}")
print(f"Labels shape: {batch_labels.shape}")

Train batch size: 32
Sentence shape: torch.Size([32, 1, 768])
Labels shape: torch.Size([32])


In [14]:
class TextCNN(nn.Module):
    """
    TextCNN model for text classification.

    One convolution branch is built per entry in ``kernel_sizes``. Each branch
    spans the full feature width (``input_size``), so it slides only along the
    sequence axis. Branch outputs are max-pooled over that axis, concatenated,
    passed through dropout and projected to ``num_classes`` scores.

    Args:
        input_size: Width of each feature vector (last input dimension).
        num_classes: Number of output classes.
        kernel_sizes: Iterable of kernel heights, one convolution branch each.
        num_filters: Output channels per convolution branch.
        dropout: Dropout probability applied before the final linear layer.
    """
    def __init__(self, input_size, num_classes, kernel_sizes, num_filters, dropout=0.5):
        super(TextCNN, self).__init__()
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, num_filters, (k, input_size)),
                nn.ReLU(inplace=True),
                nn.BatchNorm2d(num_filters)
            )
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)
        self.init_weights()

    def init_weights(self):
        """Kaiming-initialise the convolution weights and Xavier-initialise the classifier weights."""
        for conv in self.convs:
            nn.init.kaiming_uniform_(conv[0].weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x):
        """Return unnormalised class scores (logits) of shape ``(batch, num_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``; a channel axis
               is inserted here before the 2-D convolutions.

        The scores are deliberately NOT passed through softmax:
        ``nn.CrossEntropyLoss`` applies log-softmax itself, so normalising here
        would squash the logits twice and weaken the gradients. ``argmax`` over
        the logits gives the same predicted class as over probabilities; call
        ``torch.softmax(logits, dim=1)`` when probabilities are needed.
        """
        x = x.unsqueeze(1)  # (batch, 1, seq_len, input_size)
        # Each branch yields (batch, num_filters, seq_len - k + 1) after dropping the width axis.
        conv_outputs = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over the sequence axis -> (batch, num_filters) per branch.
        pooled_outputs = [torch.max(output, dim=2)[0] for output in conv_outputs]
        out = torch.cat(pooled_outputs, dim=1)
        out = self.dropout(out)
        return self.fc(out)

In [18]:
def train_model(
    model, train_dataloader, optimizer, criterion, epochs=5, save_path='model.pth', scheduler=None
):
    """Train ``model`` and checkpoint it whenever the epoch's training accuracy improves.

    Args:
        model: Module to optimise; batches are moved to the device its parameters live on.
        train_dataloader: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_dataloader``.
        save_path: File that receives ``model.state_dict()`` on each improvement.
        scheduler: Optional LR scheduler, stepped once per epoch.

    Note:
        The "best" checkpoint is chosen by accuracy on the *training* batches
        (computed with dropout active), not on a held-out set.
    """
    # Take the device from the model instead of a notebook-level global, so the
    # function works regardless of which cells were executed before it.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    best_acc = 0.0

    for epoch in range(epochs):
        # Re-assert training mode every epoch in case the model was switched to eval elsewhere.
        model.train()
        running_loss = 0.0
        all_preds = []
        all_labels = []

        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        for sentences, labels in progress_bar:
            sentences = sentences.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # Gradient clipping
            optimizer.step()

            running_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        if scheduler is not None:
            scheduler.step()

        # Mean of the per-batch losses, and accuracy over every sample seen this epoch.
        epoch_loss = running_loss / len(train_dataloader)
        epoch_acc = accuracy_score(all_labels, all_preds) * 100

        print(
            f"Epoch [{epoch+1}/{epochs}] "
            f"Loss: {epoch_loss:.4f} | Accuracy: {epoch_acc:.2f}%"
        )

        # Save the best model
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(model.state_dict(), save_path)
            print(f"Model saved to {save_path} (Best Accuracy: {best_acc:.2f}%)")

    print("Training complete.")

In [16]:
# Validation function
def val_model(model, val_dataloader):
    """Evaluate ``model`` on ``val_dataloader`` and return the accuracy as a fraction.

    Args:
        model: Module to evaluate; it is switched to eval mode and left in it.
        val_dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Accuracy over all samples, as returned by ``accuracy_score`` (0.0-1.0).
        The percentage is also printed.
    """
    # Take the device from the model instead of a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sentences, labels in val_dataloader:
            sentences = sentences.float().to(device)
            labels = labels.long().to(device)
            outputs = model(sentences)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")
    return accuracy

In [17]:
input_size = 768  # width of each sentence embedding fed to the model
# Count classes from the label mapping rather than from `labels`: that name can be
# rebound to a batch tensor by an earlier inspection cell, and len(set(tensor))
# counts elements (hashed by identity), not distinct class ids.
num_classes = len(label_mapping)
kernel_sizes = [1]
num_filters = 128
dropout = 0.7
learning_rate = 0.001
epochs = 30

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


model = TextCNN(input_size, num_classes, kernel_sizes, num_filters, dropout).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [19]:
# Train and validate
# train_model writes 'textcnn.pth' whenever an epoch's training accuracy improves;
# val_model then scores the in-memory model as it stands after the final epoch.
train_model(model, train_dataloader, optimizer, criterion, epochs=epochs, save_path='textcnn.pth', scheduler=scheduler)
val_model(model, val_dataloader)

                                                              

Epoch [1/30] Loss: 1.1490 | Accuracy: 64.36%
Model saved to textcnn.pth (Best Accuracy: 64.36%)


                                                              

Epoch [2/30] Loss: 0.7514 | Accuracy: 74.03%
Model saved to textcnn.pth (Best Accuracy: 74.03%)


                                                              

Epoch [3/30] Loss: 0.7065 | Accuracy: 75.33%
Model saved to textcnn.pth (Best Accuracy: 75.33%)


                                                              

Epoch [4/30] Loss: 0.6819 | Accuracy: 76.15%
Model saved to textcnn.pth (Best Accuracy: 76.15%)


                                                              

Epoch [5/30] Loss: 0.6595 | Accuracy: 76.52%
Model saved to textcnn.pth (Best Accuracy: 76.52%)


                                                              

Epoch [6/30] Loss: 0.6478 | Accuracy: 76.78%
Model saved to textcnn.pth (Best Accuracy: 76.78%)


                                                              

Epoch [7/30] Loss: 0.6361 | Accuracy: 77.29%
Model saved to textcnn.pth (Best Accuracy: 77.29%)


                                                              

Epoch [8/30] Loss: 0.6238 | Accuracy: 77.55%
Model saved to textcnn.pth (Best Accuracy: 77.55%)


                                                              

Epoch [9/30] Loss: 0.6218 | Accuracy: 77.72%
Model saved to textcnn.pth (Best Accuracy: 77.72%)


                                                               

Epoch [10/30] Loss: 0.6061 | Accuracy: 78.25%
Model saved to textcnn.pth (Best Accuracy: 78.25%)


                                                               

Epoch [11/30] Loss: 0.5836 | Accuracy: 79.00%
Model saved to textcnn.pth (Best Accuracy: 79.00%)


                                                               

Epoch [12/30] Loss: 0.5714 | Accuracy: 79.36%
Model saved to textcnn.pth (Best Accuracy: 79.36%)


                                                               

Epoch [13/30] Loss: 0.5629 | Accuracy: 79.48%
Model saved to textcnn.pth (Best Accuracy: 79.48%)


                                                               

Epoch [14/30] Loss: 0.5523 | Accuracy: 79.79%
Model saved to textcnn.pth (Best Accuracy: 79.79%)


                                                               

Epoch [15/30] Loss: 0.5433 | Accuracy: 80.38%
Model saved to textcnn.pth (Best Accuracy: 80.38%)


                                                               

Epoch [16/30] Loss: 0.5416 | Accuracy: 80.12%


                                                               

Epoch [17/30] Loss: 0.5349 | Accuracy: 80.50%
Model saved to textcnn.pth (Best Accuracy: 80.50%)


                                                               

Epoch [18/30] Loss: 0.5286 | Accuracy: 80.63%
Model saved to textcnn.pth (Best Accuracy: 80.63%)


                                                               

Epoch [19/30] Loss: 0.5216 | Accuracy: 80.81%
Model saved to textcnn.pth (Best Accuracy: 80.81%)


                                                               

Epoch [20/30] Loss: 0.5187 | Accuracy: 81.12%
Model saved to textcnn.pth (Best Accuracy: 81.12%)


                                                               

Epoch [21/30] Loss: 0.5086 | Accuracy: 81.16%
Model saved to textcnn.pth (Best Accuracy: 81.16%)


                                                               

Epoch [22/30] Loss: 0.4906 | Accuracy: 81.68%
Model saved to textcnn.pth (Best Accuracy: 81.68%)


                                                               

Epoch [23/30] Loss: 0.4922 | Accuracy: 81.81%
Model saved to textcnn.pth (Best Accuracy: 81.81%)


                                                               

Epoch [24/30] Loss: 0.4829 | Accuracy: 82.29%
Model saved to textcnn.pth (Best Accuracy: 82.29%)


                                                               

Epoch [25/30] Loss: 0.4772 | Accuracy: 82.33%
Model saved to textcnn.pth (Best Accuracy: 82.33%)


                                                               

Epoch [26/30] Loss: 0.4741 | Accuracy: 82.44%
Model saved to textcnn.pth (Best Accuracy: 82.44%)


                                                               

Epoch [27/30] Loss: 0.4671 | Accuracy: 82.58%
Model saved to textcnn.pth (Best Accuracy: 82.58%)


                                                               

Epoch [28/30] Loss: 0.4672 | Accuracy: 82.60%
Model saved to textcnn.pth (Best Accuracy: 82.60%)


                                                               

Epoch [29/30] Loss: 0.4662 | Accuracy: 82.83%
Model saved to textcnn.pth (Best Accuracy: 82.83%)


                                                               

Epoch [30/30] Loss: 0.4599 | Accuracy: 82.88%
Model saved to textcnn.pth (Best Accuracy: 82.88%)
Training complete.
Validation Accuracy: 78.57%


0.7857142857142857

In [20]:
# Load the test split from the same Kaggle dataset directory as train.csv
# (the previous path contained a stray "S" in the directory name).
df_test = pd.read_csv('/kaggle/input/nn-text-classfication/test.csv')

In [21]:
# Same missing-text placeholder as used for the training frame.
df_test['Discussion'] = df_test['Discussion'].fillna('No Text')

In [22]:
# Mask date-like spans with the `replace_dates` helper defined once in the training
# preprocessing cell; reusing it (instead of redefining a copy here) guarantees the
# test text goes through exactly the same transformation as the training text.
df_test['Discussion'] = df_test['Discussion'].apply(replace_dates)

In [23]:
# Generate embeddings for the test set
embeddings_test = model_new.encode(df_test['Discussion'].tolist(), convert_to_tensor=True).cpu().numpy()

# Save to a dedicated file so the cached training embeddings
# ('news_embeddings_train.npy') are not overwritten with test data.
np.save('news_embeddings_test.npy', embeddings_test)

# Only embeddings are written here; the test split has no labels to save.
print("Embeddings saved successfully!")

Batches:   0%|          | 0/330 [00:00<?, ?it/s]

Embeddings and labels saved successfully!


In [24]:
class CustomDataset(Dataset):
    """In-memory inference dataset pairing one feature vector per sample with its sample id.

    Every vector is stored as float32 with a singleton axis in front, so an
    item built from a length-``d`` vector has shape ``(1, d)``. Sample ids are
    stored as a ``torch.long`` tensor.
    """

    def __init__(self, sentences, sample_ids):
        # Stack the per-sample vectors first, then insert the singleton axis once.
        vectors = [torch.tensor(vec, dtype=torch.float32) for vec in sentences]
        self.sentences = torch.stack(vectors).unsqueeze(1)
        self.sample_ids = torch.tensor(sample_ids, dtype=torch.long)

    def __len__(self):
        return self.sentences.shape[0]

    def __getitem__(self, idx):
        features = self.sentences[idx]
        identifier = self.sample_ids[idx]
        return features, identifier

In [25]:
# Pair each test embedding with its SampleID; shuffling stays off so batches follow the row order of df_test.
test_dataset = CustomDataset(embeddings_test, df_test['SampleID'])
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [26]:
# Destination CSV for the test-set predictions (passed to test_model below).
save_csv_path = '/kaggle/working/predictions_text_trial.csv'

In [27]:
def test_model(model, val_dataloader, save_csv_path='predictions.csv', device='cuda'):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    sample_ids = []  # To store sample IDs

    with torch.no_grad():
        for sentences, ids in val_dataloader:  # Extract sentences and IDs from DataLoader
            sentences = sentences.float().to(device)  # Move sentences to GPU (or CPU if needed)
            outputs = model(sentences)
            _, preds = torch.max(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())  # Move predictions back to CPU
            sample_ids.extend(ids.numpy())  # Collect the sample IDs

    # Save predictions to a CSV file
    predictions_df = pd.DataFrame({
        'SampleID': sample_ids,
        'Category': all_preds
    })
    predictions_df.to_csv(save_csv_path, index=False)
    print(f"Predictions saved to {save_csv_path}")
    
    return predictions_df

In [28]:
# Predict a class id for every test sample and write the SampleID/Category table to save_csv_path.
test_model(model, test_dataloader, save_csv_path)

Predictions saved to /kaggle/working/predictions_text_trial.csv


Unnamed: 0,SampleID,Category
0,1,3
1,2,0
2,3,1
3,4,4
4,5,3
...,...,...
10552,10553,4
10553,10554,3
10554,10555,3
10555,10556,3
