In [None]:
# Colab-only setup: attach Google Drive to this runtime at /content/gdrive.
from google.colab import drive

drive.mount("/content/gdrive")

In [None]:
%cd gdrive/My Drive/SMM4H-2024

In [None]:
!pip install -U transformers datasets evaluate accelerate
!pip install scikit-learn
!pip install tensorboard
!pip3 install emoji==0.6.0

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, RobertaForSequenceClassification, RobertaTokenizerFast, DataCollatorWithPadding, pipeline, BioGptForTokenClassification, OpenAIGPTForSequenceClassification
from transformers.optimization import Adafactor, AdafactorSchedule
from sklearn.metrics import accuracy_score, f1_score
import evaluate
import glob
import numpy as np
import spacy
import html
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import set_seed

# ---- Hyperparameters --------------------------------------------------------
BATCH_SIZE = 8             # per-device train/eval batch size; also reused as the Dataset.map batch size
NUM_PROCS = 32             # number of worker processes passed to Dataset.map
LR = 7.21422e-06           # learning rate handed to TrainingArguments
WEIGHT_DECAY = 0.00694763  # weight decay handed to TrainingArguments
EPOCHS = 10                # number of training epochs

# ---- Data -------------------------------------------------------------------
# Both splits are tab-separated files read from the current working directory.
# The "tweet_id" column is dropped immediately after loading.
train_data = pd.read_csv('SMM4H-2024-Task5-Training.tsv', sep='\t').drop(columns=["tweet_id"])
val_data = pd.read_csv('SMM4H-2024-Task5-Validation.tsv', sep='\t').drop(columns=["tweet_id"])

# Preview the first rows of the training split, then of the validation split.
for split in (train_data, val_data):
    print(split.head())

In [None]:
# Define preprocessing function
def preprocess_tweet(tweet):
    """Normalise a raw tweet before tokenisation.

    Steps, in order: decode HTML entities, lowercase, drop URLs, drop
    ``@mentions``, strip the ``#`` sign from hashtags (the tag word itself is
    kept) and collapse every run of whitespace into a single space.

    Args:
        tweet: Raw tweet text. Any value that is not a ``str`` (for example
            the ``NaN`` pandas produces for an empty cell) is treated as an
            empty tweet instead of raising.

    Returns:
        The cleaned tweet as a ``str``; ``""`` for non-string input.
    """
    if not isinstance(tweet, str):
        return ""

    # Decode HTML entities first. If '#' were stripped beforehand, a numeric
    # reference such as "&#39;" would be mangled into "&39;" and never decoded.
    cleaned_tweet = html.unescape(tweet).lower()

    # Remove URLs ("http\S+" already covers "https...", so no separate branch is needed)
    cleaned_tweet = re.sub(r'http\S+|www\S+', '', cleaned_tweet)

    # Remove @mentions entirely; for hashtags only the '#' sign is removed
    cleaned_tweet = re.sub(r'@\w+', '', cleaned_tweet)
    cleaned_tweet = re.sub(r'#', '', cleaned_tweet)

    # Collapse spaces, tabs and newlines, and trim both ends
    cleaned_tweet = re.sub(r'\s+', ' ', cleaned_tweet).strip()

    return cleaned_tweet


# Clean the "text" column of both splits in place.
for split in (train_data, val_data):
    split["text"] = split["text"].apply(preprocess_tweet)

# Preview the cleaned training split, then the cleaned validation split.
for split in (train_data, val_data):
    print(split.head())

In [None]:
# Class-index <-> class-name maps; both are passed to the model when it is loaded.
id2label = dict(enumerate(("Negative", "Positive")))
label2id = {name: index for index, name in id2label.items()}


# Wrap the pandas DataFrames as Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(val_data)

In [None]:
# Uncomment to set the random seed manually.
# set_seed(1)

# Tokenizer selection. RoBERTa-large is the active choice and matches the
# "roberta-large" checkpoint the model cell loads. The alternatives below are
# kept for reference; to use one, replace the assignment at the bottom with it.
#
#   BERTweet-large .................. AutoTokenizer.from_pretrained("vinai/bertweet-large")
#   BERT-base ....................... AutoTokenizer.from_pretrained("bert-base-uncased")
#   PubMedBERT (abstract) ........... AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
#   PubMedBERT-large (abstract) ..... AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract")
#   PubMedBERT (abstract+fulltext) .. AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
#   BioBERT ......................... BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
#   BioLinkBERT-base ................ AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')
#   BioLinkBERT-large ............... AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-large')
#   RoBERTa-base .................... RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
#   ClinicalBERT .................... AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")

# Helper function for preprocessing.
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch for ``Dataset.map``.

    Sequences are truncated to ``max_length`` tokens but deliberately left
    unpadded. Padding is applied per batch at collate time (the notebook
    builds a ``DataCollatorWithPadding`` for that), so short tweets are not
    stored and fed to the model as full 512-token rows. No tensor type is
    requested from the tokenizer: ``Dataset.map`` stores the result in its own
    format, so plain lists are sufficient here.

    Args:
        examples: Batch passed in by ``Dataset.map(batched=True)``; must
            provide the texts under the ``"text"`` key.

    Returns:
        The tokenizer's encoding of the batch.
    """
    max_length = 512  # Upper bound on tokens per example; adjust as needed
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Settings shared by both Dataset.map calls.
map_options = dict(batched=True, batch_size=BATCH_SIZE, num_proc=NUM_PROCS)

# Tokenize the training split, then the validation split.
tokenized_train = train_dataset.map(preprocess_function, **map_options)
tokenized_valid = valid_dataset.map(preprocess_function, **map_options)

# Collator that pads the examples of a batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Metric objects loaded through the `evaluate` library.
# NOTE(review): `f1_score` rebinds the name imported from sklearn.metrics in the
# import cell; from here on `f1_score` is the `evaluate` metric object.
# `accuracy` is loaded but not currently reported by compute_metrics.
accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are
            per-class scores and are reduced to class indices with an argmax
            over axis 1.

    Returns:
        Whatever the ``f1`` metric's ``compute`` returns for the predicted
        class indices against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1_score.compute(predictions=predicted_classes, references=references)

In [None]:
# Load the sequence-classification model (two labels, with the label maps attached).
#
# Alternative kept for reference — BERTweet-large:
#   model = AutoModelForSequenceClassification.from_pretrained(
#       "vinai/bertweet-large",
#       num_labels=2,
#       id2label=id2label,
#       label2id=label2id
#   )
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large",
    id2label=id2label,
    label2id=label2id,
    num_labels=2,
)


In [None]:

# The install cell upgrades transformers to the latest release, and two keyword
# names used here changed across releases:
#   TrainingArguments: `evaluation_strategy` -> `eval_strategy`
#   Trainer:           `tokenizer`           -> `processing_class`
# Pick whichever spelling the installed version accepts so the cell runs on both.
import inspect

_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kw = "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kw = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Define the training arguments: log, evaluate and checkpoint once per epoch.
# `logging_steps` is intentionally not set — it only applies to step-based logging.
# NOTE(review): no `metric_for_best_model` is given, so "best" for
# `load_best_model_at_end` falls back to the library default (evaluation loss per
# the Transformers docs), not the F1 returned by compute_metrics — confirm this is intended.
training_args = TrainingArguments(
    output_dir="smm4h_classification_900",
    #seed=123,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_dir='./logs',
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to='tensorboard',
    fp16=True,
    **{_eval_strategy_kw: "epoch"},
)


# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    callbacks=[],
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kw: tokenizer},
)

# Run fine-tuning; `history` holds the object returned by trainer.train().
history = trainer.train()

# Get predictions for the evaluation dataset using the model held by the trainer
# after training (the best checkpoint, since load_best_model_at_end=True).
eval_predictions = trainer.predict(tokenized_valid)
logits = eval_predictions.predictions

# Predicted class index per example
predicted_labels = logits.argmax(axis=1)

# Raw class-1 scores. Despite the variable name these are logits, not
# probabilities; the normalized values are computed further down.
predicted_probabilities_class_1 = logits[:, 1]

print(f"Predicted labels for evaluation data: {predicted_labels}")
print(f"Predicted probabilities for class 1 (logit values): {predicted_probabilities_class_1}")


# Softmax over the class axis. Subtracting each row's maximum before
# exponentiating leaves the result mathematically unchanged but keeps np.exp
# from overflowing on large logits.
shifted_logits = logits - logits.max(axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probabilities = exp_logits / exp_logits.sum(axis=1, keepdims=True)

# Normalized probability of class "1" (index 1) for every evaluation example
predicted_probabilities_class_1_normalized = probabilities[:, 1]

print(f"Predicted probabilities for class 1 (normalized values): {predicted_probabilities_class_1_normalized}")

