In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# project folder stored on Drive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
%cd gdrive/My Drive/SMM4H-2024

In [None]:
!pip install -U transformers datasets evaluate accelerate
!pip install scikit-learn
!pip install tensorboard
!pip install ray[tune]
!pip install -U hyperopt
!pip3 install emoji==0.6.0

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import re
import string
import nltk
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, RobertaForSequenceClassification, RobertaTokenizerFast, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import evaluate
import glob
import numpy as np
import html
from ray import tune
import random
from ray.tune.stopper import TrialPlateauStopper

# Number of worker processes used later when tokenizing with `Dataset.map`.
NUM_PROCS = 32

# Read the training split and discard the "tweet_id" column, which is not a
# model input.
train_data = pd.read_csv('SMM4H-2024-Task5-Training.tsv', sep='\t').drop(columns=["tweet_id"])

# Same treatment for the validation split.
val_data = pd.read_csv('SMM4H-2024-Task5-Validation.tsv', sep='\t').drop(columns=["tweet_id"])

# Preview the first rows of each split (training first, then validation).
print(train_data.head())
print(val_data.head())

In [None]:
# Define preprocessing function
def preprocess_tweet(tweet):
    """Normalize a raw tweet before tokenization.

    Steps, in order:
      1. decode HTML entities (e.g. ``&amp;`` -> ``&``, ``&#64;`` -> ``@``),
      2. lowercase,
      3. remove URLs,
      4. remove @mentions and strip the ``#`` sign from hashtags (the tag
         word itself is kept),
      5. collapse runs of whitespace into single spaces and trim the ends.

    Args:
        tweet: The raw tweet text (``str``).

    Returns:
        The cleaned tweet text (``str``).
    """
    # Decode entities first so that escaped characters are visible to the
    # URL / mention / hashtag patterns below instead of slipping past them.
    cleaned_tweet = html.unescape(tweet).lower()

    # Remove URLs. The leading word boundary and the literal dot after "www"
    # keep elongated words such as "awwww" from being treated as links.
    cleaned_tweet = re.sub(r'\b(?:http\S+|www\.\S+)', '', cleaned_tweet)

    # Remove @mentions entirely; for hashtags only the "#" sign is dropped.
    cleaned_tweet = re.sub(r'@\w+', '', cleaned_tweet)
    cleaned_tweet = cleaned_tweet.replace('#', '')

    # Remove extra spaces, tabs, and newlines
    cleaned_tweet = re.sub(r'\s+', ' ', cleaned_tweet).strip()

    return cleaned_tweet


# Clean the "text" column of both splits (training first, then validation).
for _split in (train_data, val_data):
    _split["text"] = _split["text"].apply(preprocess_tweet)

# Preview the first rows of each split after cleaning.
for _split in (train_data, val_data):
    print(_split.head())

In [None]:
# Mapping between integer class ids and their human-readable names.
id2label = {0: "Negative", 1: "Positive"}
# Inverse mapping, derived from `id2label` so the two cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}


# Wrap the pandas DataFrames as Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(val_data)

In [None]:
# Load BERTweet large tokenizer
# Tokenizer for the checkpoint being fine-tuned (BERTweet-large).
# Keep this checkpoint in sync with the one loaded in `model_init`.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

# Alternative tokenizers that were tried. To switch models, comment out the
# line above and uncomment exactly one of the lines below.

# BERT (base, uncased)
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# PubMedBERT, abstracts only (base)
#tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")

# PubMedBERT, abstracts only (large)
#tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract")

# PubMedBERT, abstracts + full text (base)
#tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# BioBERT
#tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

# BioLinkBERT (base)
#tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')

# BioLinkBERT (large)
#tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-large')

# RoBERTa (base)
#tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")

# RoBERTa (large)
#tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

# Helper function for preprocessing.
def preprocess_function(examples):
    """Tokenize a batch of examples for `Dataset.map(..., batched=True)`.

    Sequences are truncated to `max_length` tokens but deliberately NOT padded
    here: the `DataCollatorWithPadding` passed to the Trainer pads each batch
    to the length of its longest member, which avoids running every short
    tweet through the model as a full 512-token sequence. Plain Python lists
    are returned (no `return_tensors`), since ragged, unpadded sequences
    cannot be stacked into a single tensor.

    Args:
        examples: A batch mapping column names to lists; the "text" column
            holds the tweet strings.

    Returns:
        The tokenizer output for the batch.
    """
    max_length = 512  # Upper bound on tokens per example; adjust as needed
    return tokenizer(examples["text"], truncation=True, max_length=max_length)


# Settings shared by both tokenization passes: process examples in batches,
# spread over NUM_PROCS worker processes.
map_options = dict(batched=True, num_proc=NUM_PROCS)

tokenized_train = train_dataset.map(preprocess_function, **map_options)
tokenized_valid = valid_dataset.map(preprocess_function, **map_options)

# Collator that pads the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the metric objects from the `evaluate` library.
# `accuracy` is loaded but not used by `compute_metrics` below (its call is
# commented out there).
accuracy = evaluate.load('accuracy')
# NOTE: this rebinds the name `f1_score`, shadowing the function of the same
# name imported from sklearn.metrics at the top of the notebook.
f1_score = evaluate.load('f1')

def compute_metrics(eval_pred):
    """Compute the F1 metric for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``f1_score.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return f1_score.compute(predictions=predicted_ids, references=references)

In [None]:
# Defining the hyperparameter space
def ray_hp_space(trial):
    """Build the Ray Tune search space for the hyperparameter search.

    Args:
        trial: Accepted as part of the callback signature; not used here.

    Returns:
        A dict mapping hyperparameter names to Ray Tune sampling domains:
        log-uniform ranges for the learning rate and weight decay, and a
        choice between 8 and 16 for the train and eval batch sizes.
    """
    batch_size_options = (8, 16)
    search_space = dict(
        learning_rate=tune.loguniform(0.5e-5, 1.5e-5),
        weight_decay=tune.loguniform(0.005, 0.05),
    )
    for batch_key in ("per_device_train_batch_size", "per_device_eval_batch_size"):
        # Give each entry its own list so the two domains share no state.
        search_space[batch_key] = tune.choice(list(batch_size_options))
    return search_space

# change the model name here as you want
def model_init(trial):
    """Create a fresh sequence-classification model for a search trial.

    The label maps defined earlier in the notebook are passed to the model so
    its config (and any saved checkpoint) carries the class names instead of
    the generic LABEL_0 / LABEL_1, and the size of the classification head is
    stated explicitly rather than left to the library default.

    Args:
        trial: Accepted as part of the callback signature; not used here.

    Returns:
        A newly loaded `AutoModelForSequenceClassification` instance.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        'vinai/bertweet-large',
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
        return_dict=True,
    )

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy` (the old name was deprecated and later dropped). This
# notebook installs the latest transformers, so pick whichever keyword the
# installed version actually declares.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Output goes to the "test" directory; evaluate once per epoch, log and save
# on a step schedule (a checkpoint every 1000 steps), progress bars disabled.
training_args = TrainingArguments(
    "test",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    disable_tqdm=True,
    **{_eval_strategy_arg: "epoch"},
)

# Defining an instance of Hugging Face Trainer Class
# Collect the Trainer configuration in one place. `model` stays None because
# a fresh model is built through `model_init`.
trainer_kwargs = {
    "model": None,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_valid,
    "compute_metrics": compute_metrics,
    "tokenizer": tokenizer,
    "model_init": model_init,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

# Performing hyperparameters search
# Search settings: maximize the objective over 10 trials, using Ray as the
# backend and sampling from the space defined by `ray_hp_space`.
search_settings = dict(
    direction="maximize",
    backend="ray",
    hp_space=ray_hp_space,
    n_trials=10,
)
best_trial = trainer.hyperparameter_search(**search_settings)

