In [None]:
# Mount Google Drive inside the Colab runtime so the project folder can be reached from the notebook.
from google.colab import drive

drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

In [None]:
%cd gdrive/My Drive/SMM4H-2024

In [None]:
!pip install -U transformers accelerate
!pip3 install emoji==0.6.0

In [None]:
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast, AutoTokenizer, AutoModelForSequenceClassification
import re
import html
import torch

In [None]:
# Read the unlabeled split (validation or test) that the saved fine-tuned model will classify,
# discarding the "tweet_id" column right after loading.
unlabeled_tsv_path = 'SMM4H-2024-Task5-Test-Unlabeled.tsv'
val_data = pd.read_csv(unlabeled_tsv_path, sep='\t').drop(columns=["tweet_id"])

# If your test/validation dataset is too large to be processed by existing Colab GPUs, you can split it as follows
# val_data = val_data.iloc[9750:10000]

# Define preprocessing function
def preprocess_tweet(tweet):
    """Normalize a raw tweet before it is handed to the tokenizer.

    Steps, in order: decode HTML entities, lowercase, strip URLs, strip
    @mentions, drop the '#' symbol (the hashtag word itself is kept), and
    collapse runs of whitespace into single spaces.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The cleaned, lowercased tweet with no leading/trailing spaces.

    NOTE(review): inference-time cleaning should match what was applied when
    the model was fine-tuned — confirm the training pipeline uses the same
    function.
    """
    # Decode HTML entities first. If '#' were stripped beforehand, numeric
    # references such as "&#39;" would lose their '#' and never be decoded.
    # Decoding also happens before lowercasing because entity names are
    # case-sensitive.
    cleaned_tweet = html.unescape(tweet).lower()

    # Remove URLs
    cleaned_tweet = re.sub(r'http\S+|www\S+|https\S+', '', cleaned_tweet)

    # Remove @mentions entirely; for hashtags only the '#' symbol is dropped
    cleaned_tweet = re.sub(r'@\w+', '', cleaned_tweet)
    cleaned_tweet = re.sub(r'#', '', cleaned_tweet)

    # Remove extra spaces, tabs, and newlines
    cleaned_tweet = re.sub(r'\s+', ' ', cleaned_tweet).strip()

    return cleaned_tweet

# Clean every tweet in the "text" column, then preview the first rows of the result.
val_data = val_data.assign(text=lambda frame: frame["text"].map(preprocess_tweet))
print(val_data.head())

In [None]:
# Load the fine-tuned classifier and its tokenizer from the saved checkpoint.
model_checkpoint = "/content/gdrive/My Drive/SMM4H-2024/smm4h_classification_700/checkpoint-5550"  # Specify the path to your checkpoint directory

# Choose the device once so the model and the inputs always live on the same one.
# (Hard-coding "cuda" for the model while letting the inputs fall back to CPU
# breaks on runtimes without a GPU.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of tweets per forward pass; lower it if the GPU runs out of memory.
batch_size = 32

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model = model.to(device)
model.eval()  # from_pretrained() already returns the model in eval mode; kept explicit for readers
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# Tokenize all tweets at once; every sequence is padded/truncated to 512 tokens.
encoded_data = tokenizer(
    val_data["text"].tolist(), padding="max_length", truncation=True, return_tensors="pt", max_length=512
)

# Move the encoded inputs onto the selected device.
input_ids = encoded_data["input_ids"].to(device)
attention_mask = encoded_data["attention_mask"].to(device)

# Sanity check: the model and both input tensors should report the same device.
print(next(model.parameters()).device)
print(input_ids.device, attention_mask.device)

# Perform inference in mini-batches so that activation memory stays bounded
# no matter how many tweets are being classified.
batch_logits = []
with torch.no_grad():  # Disable gradient calculation for efficiency during inference
    for start in range(0, input_ids.size(0), batch_size):
        stop = start + batch_size
        # After the loop, `outputs` holds the result of the last mini-batch only.
        outputs = model(input_ids[start:stop], attention_mask=attention_mask[start:stop])
        batch_logits.append(outputs.logits)

# Stitch the per-batch logits back into one (num_tweets, num_labels) tensor.
logits = torch.cat(batch_logits, dim=0)  # logits represent the unnormalized model outputs
predictions = torch.argmax(logits, dim=-1)  # Index of the highest-scoring class for each tweet
probabilities = torch.nn.functional.softmax(logits, dim=-1)  # Normalize logits into per-class probabilities

# Print the predictions
print(predictions)

print(probabilities[:, 1])  # Access probabilities for class 1