In [42]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import functional as F
import numpy as np

## Define the device on which we load the model

In [28]:
# define the device on which we load the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [29]:
model_name = "bert-base-uncased"  # You can replace this with distilbert-base-uncased or roberta-base
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Use num_labels=2 for binary classification

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Example of tokenizing text

In [30]:
inputs = tokenizer("Example disaster tweet text here", return_tensors="pt", padding=True, truncation=True, max_length=512)
output = model(**inputs)
print(f"{type(output)}: {output}")

<class 'transformers.modeling_outputs.SequenceClassifierOutput'>: SequenceClassifierOutput(loss=None, logits=tensor([[0.1375, 0.1818]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Load and preprocess the data

In [31]:
# Load the dataset
df = pd.read_csv('data/train.csv')

# Basic preprocessing
# For simplicity, here we'll just fill missing values
df.fillna('', inplace=True)  # Replace NaN with empty strings

## Tokenize the text data

In [32]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
print(type(inputs))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


## Prepare the dataset and dataloader

In [33]:
class DisasterTweetsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # self.encodings is a dictionary containing tokenized input data
        # we are making tensors of the values in the dictionary
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['target'], test_size=0.2)

# Tokenize the training and validation data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Create datasets
train_dataset = DisasterTweetsDataset(train_encodings, train_labels.tolist())
val_dataset = DisasterTweetsDataset(val_encodings, val_labels.tolist())

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

## Define method to calculate the accuracy

In [34]:
def calculate_accuracy(preds, labels):
    # Convert predictions to binary (0 or 1) by looking at the predicted class with the highest score
    pred_flat = torch.argmax(preds, dim=1).flatten()
    labels_flat = labels.flatten()
    correct = torch.sum(pred_flat == labels_flat).item()  # Count how many predictions match the labels
    total = len(labels_flat)
    accuracy = correct / total
    return accuracy

## Define the model

In [35]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Train the model

In [36]:
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3  # Example epoch count
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epochs):
    # Training phase
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(model.device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.2f}")

    # Evaluation phase
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(model.device)
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()
            logits = outputs.logits
            total_eval_accuracy += calculate_accuracy(logits, labels)

    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    avg_val_loss = total_eval_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.2f}, Validation Accuracy: {avg_val_accuracy:.2f}")



Epoch 1, Average Training Loss: 0.44
Epoch 1, Validation Loss: 0.39, Validation Accuracy: 0.84
Epoch 2, Average Training Loss: 0.29
Epoch 2, Validation Loss: 0.41, Validation Accuracy: 0.83
Epoch 3, Average Training Loss: 0.17
Epoch 3, Validation Loss: 0.53, Validation Accuracy: 0.81


## Load and preprocess the test data

In [38]:
# Load the test data
test_df = pd.read_csv('data/test.csv')

# Basic preprocessing if necessary (similar to what was done with training data)
test_df.fillna('', inplace=True)

## Tokenize the test data

In [39]:
test_encodings = tokenizer(test_df['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

## Prepare the test dataset

In [40]:
class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_encodings)

## Make predictions

In [43]:
# Prepare DataLoader for the test data
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model.eval()  # Set the model to evaluation mode
predictions = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())

# Convert predictions list to numpy array if needed
predictions = np.array(predictions)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [44]:
submission_df = pd.DataFrame({'id': test_df['id'], 'target': predictions})
submission_df.to_csv('submission.csv', index=False)