# 1st
- pre-trained model

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import random
import numpy as np

# Load dataset
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Preprocess data
X = train_data.drop(columns=['SUBCLASS', 'ID'], errors='ignore')
y = train_data['SUBCLASS']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# List of pre trained models
pre_trained_model1 = 'bert-base-uncased'
pre_trained_model2 = 'zhihan1996/DNA_bert_6'

# Define custom dataset
class CustomDataset(Dataset):
    def __init__(self, data, labels=None, tokenizer_name='bert-base-uncased', max_length=128):
        self.data = data
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx].astype(str).values
        text = ' '.join(text)
        inputs = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        if self.labels is not None:
            label = self.labels[idx]
            return input_ids, attention_mask, label
        return input_ids, attention_mask

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Data augmentation function
def augment_text(text, n_augments=3):
    words = text.split()
    augmented_texts = []
    for _ in range(n_augments):
        random.shuffle(words)
        augmented_texts.append(' '.join(words))
    return augmented_texts

# Apply data augmentation on training data
augmented_data = []
augmented_labels = []
for idx in range(len(X_train)):
    text = ' '.join(X_train.iloc[idx].astype(str).values)
    aug_texts = augment_text(text)
    for aug_text in aug_texts:
        augmented_data.append(aug_text)
        augmented_labels.append(y_train[idx])

augmented_df = pd.DataFrame({'text': augmented_data})
X_train_augmented = pd.concat([X_train, augmented_df], ignore_index=True)
y_train_augmented = np.concatenate([y_train, augmented_labels])

# Create datasets and dataloaders
train_dataset = CustomDataset(X_train_augmented, y_train_augmented)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(test_data.drop(columns=['ID'], errors='ignore'))

batch_size = 8 if torch.cuda.is_available() else 4
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Loss function
criterion = nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = tuple(b.to(device) for b in batch)
        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}')

# Validation loop
model.eval()
y_val_pred = []
with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = tuple(b.to(device) for b in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        y_val_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())

print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

Epoch 1/20, Training Loss: 3.0978
Epoch 2/20, Training Loss: 3.0709
Epoch 3/20, Training Loss: 3.0546
Epoch 4/20, Training Loss: 3.0489
Epoch 5/20, Training Loss: 3.0483
Epoch 6/20, Training Loss: 3.0487
Epoch 7/20, Training Loss: 3.0490
Epoch 8/20, Training Loss: 3.0479
Epoch 9/20, Training Loss: 3.0485
Epoch 10/20, Training Loss: 3.0495
Epoch 11/20, Training Loss: 3.0484
Epoch 12/20, Training Loss: 3.0494
Epoch 13/20, Training Loss: 3.0493
Epoch 14/20, Training Loss: 3.0495
Epoch 15/20, Training Loss: 3.0479
Epoch 16/20, Training Loss: 3.0477
Epoch 17/20, Training Loss: 3.0489
Epoch 18/20, Training Loss: 3.0485
Epoch 19/20, Training Loss: 3.0480
Epoch 20/20, Training Loss: 3.0473
              precision    recall  f1-score   support

         ACC       0.00      0.00      0.00        14
        BLCA       0.00      0.00      0.00        21
        BRCA       0.14      0.90      0.24       157
        CESC       0.00      0.00      0.00        31
        COAD       0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
# Test set prediction
model.eval()
y_test_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = tuple(b.to(device) for b in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        y_test_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())

# Prepare submission
submission = pd.DataFrame({'ID': test_data['ID'], 'SUBCLASS': label_encoder.inverse_transform(y_test_pred)})
submission.to_csv('data/submission/final_1st.csv', index=False)