<div class="alert alert-info alertinfo" style="margin-top: 0px">
<h1> Natural Language Processing with Disaster Tweets </h1>
part 5 - BERT model
</div>

### Imports 

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
from collections import Counter

# Warning messages
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NLP
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score
from nltk.corpus import words, stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize

In [2]:
# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if cuda_available:
    device_count = torch.cuda.device_count()
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)

    print(f"CUDA device count: {device_count}")
    print(f"CUDA current device: {current_device}")
    print(f"CUDA device name: {device_name}")
else:
    print("CUDA is not available.")


CUDA available: False
CUDA is not available.


### Read data set

In [2]:
# Load data
df = pd.read_csv('train.csv')
eval_set = pd.read_csv('test.csv')
df.fillna('', inplace=True)
logger.info("Data loaded successfully")
df.head()

INFO:__main__:Data loaded successfully


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Data cleaning

In [3]:
### Data cleaning
STOPWORDS = set(stopwords.words('english'))

def clean_phase_1(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'([^\x00-\x7F])+', '', text)
    text = ''.join(' ' if not c.isalpha() else c for c in text)
    text = ' '.join(text.split())
    text = ' '.join([word for word in text.split() if word.lower() not in STOPWORDS])
    text = ' '.join([word for word in text.split() if len(word) > 2])
    return text

def prepare_for_ml(data):
    data.fillna('', inplace=True)
    data['processed text'] = data['text'].apply(clean_phase_1)
    return data

# Prepare data for training
df = prepare_for_ml(df)
eval_set = prepare_for_ml(eval_set)
logger.info("Data cleaned successfully")
df.head()

INFO:__main__:Data cleaned successfully


Unnamed: 0,id,keyword,location,text,target,processed text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


### fine-tuning BERT

In [4]:
# Set logging level to suppress unnecessary output
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Define custom dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Split data into train and validation sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['processed text'], df['target'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Convert labels to tensors
train_labels = torch.tensor(list(train_labels.values))
test_labels = torch.tensor(list(test_labels.values))

# Create datasets
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# Define BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Custom weight initialization function for classifier layer
def initialize_classifier_weights(model):
    torch.nn.init.xavier_uniform_(model.classifier.weight)
    torch.nn.init.zeros_(model.classifier.bias)

# Manually initialize the classifier weights
initialize_classifier_weights(model)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy}

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


In [5]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.501765,0.815496
2,0.454000,0.449694,0.824032
3,0.313700,0.722879,0.790545
4,0.195000,0.681207,0.795798
5,0.195000,1.057714,0.790545


Checkpoint destination directory ./results\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1905, training_loss=0.27611856135170604, metrics={'train_runtime': 9798.0679, 'train_samples_per_second': 3.108, 'train_steps_per_second': 0.194, 'total_flos': 657212360742000.0, 'train_loss': 0.27611856135170604, 'epoch': 5.0})

### accuracy and predictions

In [6]:
# accuracy
eval_results = trainer.evaluate()
print(f"Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")

# Get predictions
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Add predicted labels to the validation DataFrame
df_test = df.iloc[test_texts.index].copy()
df_test['predicted'] = predicted_labels

# Save or print the DataFrame with predictions
df_test.to_csv('test_with_predictions.csv', index=False)
df_test.head()

Accuracy: 79.05%


Unnamed: 0,id,keyword,location,text,target,processed text,predicted
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1,new weapon cause imaginable destruction,0
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0,things gishwhes got soaked deluge going pads t...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1,georgegalloway galloway mayor col police catch...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0,aftershock back school kick great want thank e...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0,response trauma children addicts develop defen...,0


### Submissions (actual predictions)

In [8]:
# prepare evaluation set the same way as training set
eval_set = pd.read_csv('test.csv')
eval_set.fillna('', inplace=True)
eval_set = prepare_for_ml(eval_set)
eval_texts = eval_set['processed text']
eval_encodings = tokenizer(list(eval_texts), truncation=True, padding=True)

# create data set without labels
class EvalDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])
    
eval_dataset = EvalDataset(eval_encodings)

In [9]:
# Get predictions
eval_predictions = trainer.predict(eval_dataset)
predicted_eval_labels = np.argmax(eval_predictions.predictions, axis=1)

# Create submissions
submissions = pd.DataFrame({
    'id': eval_set['id'],
    'target': predicted_eval_labels
})
submissions

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [10]:
submissions.to_csv('submissions.csv', index=False)

<div class="alert-danger" style="margin-top: 0px">
<h1> Actual evaluation score 81.8% </h1>
</div>
