In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import warnings
import emoji
warnings.filterwarnings('ignore')

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = emoji.demojize(text)
    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [6]:
train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)

In [7]:
class TDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self,idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item 

    def __len__(self):
        return len(self.labels)

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=0.4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.4, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
X = train['text'].values.tolist() 
y = train['target'].values.tolist() 

In [12]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X,y, train_size=0.3, random_state=1)

In [13]:
train_dataset = TDataset(train_texts, train_labels, tokenizer)
val_dataset = TDataset(val_texts, val_labels, tokenizer)

In [14]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [15]:
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = torch.FloatTensor(class_weights).to(device)

In [16]:
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

In [17]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [18]:
def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs):
    best_val_accuracy = 0
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        
        model.train()
        total_train_loss = 0
        train_predictions = []
        train_true_labels = []
        
        for batch in tqdm(train_loader, desc='Training'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            loss = criterion(logits, labels)
            total_train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            
            predictions = torch.argmax(logits, dim=1)
            train_predictions.extend(predictions.cpu().numpy())
            train_true_labels.extend(labels.cpu().numpy())
        
        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = accuracy_score(train_true_labels, train_predictions)
        
        print(f'Average training loss: {avg_train_loss:}')
        print(f'Training accuracy: {train_accuracy:}')
        print(classification_report(train_true_labels, train_predictions))
        
        model.eval()
        total_val_loss = 0
        val_predictions = []
        val_true_labels = []
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                
                loss = criterion(logits, labels)
                total_val_loss += loss.item()
                
                predictions = torch.argmax(logits, dim=1)
                val_predictions.extend(predictions.cpu().numpy())
                val_true_labels.extend(labels.cpu().numpy())
        
        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(val_true_labels, val_predictions)
        
        print(f'\nAverage validation loss: {avg_val_loss}')
        print(f'Validation accuracy: {val_accuracy}')
        print(classification_report(val_true_labels, val_predictions))

In [19]:
train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs=4)


Epoch 1/4


Training: 100%|████████████████████████████████████████████████████████████████████████| 36/36 [00:25<00:00,  1.44it/s]


Average training loss: 0.640159995191627
Training accuracy: 0.6202365308804205
              precision    recall  f1-score   support

           0       0.68      0.60      0.64      1277
           1       0.56      0.65      0.60      1006

    accuracy                           0.62      2283
   macro avg       0.62      0.62      0.62      2283
weighted avg       0.63      0.62      0.62      2283



Validation: 100%|██████████████████████████████████████████████████████████████████████| 84/84 [00:17<00:00,  4.67it/s]



Average validation loss: 0.5251189182911601
Validation accuracy: 0.7637898686679174
              precision    recall  f1-score   support

           0       0.77      0.85      0.81      3065
           1       0.76      0.65      0.70      2265

    accuracy                           0.76      5330
   macro avg       0.76      0.75      0.75      5330
weighted avg       0.76      0.76      0.76      5330


Epoch 2/4


Training: 100%|████████████████████████████████████████████████████████████████████████| 36/36 [00:23<00:00,  1.52it/s]


Average training loss: 0.5011729117896822
Training accuracy: 0.7744196233026719
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1277
           1       0.75      0.73      0.74      1006

    accuracy                           0.77      2283
   macro avg       0.77      0.77      0.77      2283
weighted avg       0.77      0.77      0.77      2283



Validation: 100%|██████████████████████████████████████████████████████████████████████| 84/84 [00:17<00:00,  4.68it/s]



Average validation loss: 0.46810848727112725
Validation accuracy: 0.799812382739212
              precision    recall  f1-score   support

           0       0.83      0.82      0.83      3065
           1       0.76      0.77      0.77      2265

    accuracy                           0.80      5330
   macro avg       0.80      0.80      0.80      5330
weighted avg       0.80      0.80      0.80      5330


Epoch 3/4


Training: 100%|████████████████████████████████████████████████████████████████████████| 36/36 [00:23<00:00,  1.53it/s]


Average training loss: 0.43758362614446217
Training accuracy: 0.8177836180464302
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1277
           1       0.82      0.75      0.78      1006

    accuracy                           0.82      2283
   macro avg       0.82      0.81      0.81      2283
weighted avg       0.82      0.82      0.82      2283



Validation: 100%|██████████████████████████████████████████████████████████████████████| 84/84 [00:17<00:00,  4.67it/s]



Average validation loss: 0.4539369028948602
Validation accuracy: 0.8120075046904315
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      3065
           1       0.80      0.75      0.77      2265

    accuracy                           0.81      5330
   macro avg       0.81      0.80      0.81      5330
weighted avg       0.81      0.81      0.81      5330


Epoch 4/4


Training: 100%|████████████████████████████████████████████████████████████████████████| 36/36 [00:23<00:00,  1.53it/s]


Average training loss: 0.40355272094408673
Training accuracy: 0.8287341217696014
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1277
           1       0.83      0.77      0.80      1006

    accuracy                           0.83      2283
   macro avg       0.83      0.82      0.82      2283
weighted avg       0.83      0.83      0.83      2283



Validation: 100%|██████████████████████████████████████████████████████████████████████| 84/84 [00:17<00:00,  4.69it/s]


Average validation loss: 0.46409895750028746
Validation accuracy: 0.8078799249530957
              precision    recall  f1-score   support

           0       0.84      0.83      0.83      3065
           1       0.77      0.78      0.78      2265

    accuracy                           0.81      5330
   macro avg       0.80      0.80      0.80      5330
weighted avg       0.81      0.81      0.81      5330






In [20]:
test_dataset = TDataset(test['text'].values.tolist(), [-1] * len(test), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [21]:
def predict(model, test_loader, device):
    model.eval()  
    predictions = []
    with torch.no_grad():  
        for batch in tqdm(test_loader, desc='Testing'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy()) 
    return predictions

In [22]:
test_predictions = predict(model, test_loader, device)

Testing: 100%|█████████████████████████████████████████████████████████████████████████| 51/51 [00:08<00:00,  6.18it/s]


In [23]:
submission = pd.DataFrame({'id': test['id'], 'target': test_predictions})
submission.to_csv('submission_bert.csv', index=False)