In [1]:
import pandas as pd

# Paths of the two input CSV files. Each one is read into its own DataFrame:
# `gai` is later turned into the fine-tuning dataset and `mt3` receives the
# model's predictions.
mt3_path, gai_path = './Q2_20230202_majority_top_30.csv', './GAItweets.csv'
mt3, gai = (pd.read_csv(csv_path) for csv_path in (mt3_path, gai_path))

In [2]:
import re
def clean_tweet(tweet):
    """Strip URLs, @mentions, #hashtags and punctuation from a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example ``None`` or the
        float ``NaN`` that pandas uses for an empty CSV cell) is treated as
        an empty tweet instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The tweet with URLs, mentions and hashtags removed, every character
        other than word characters, whitespace and commas dropped, runs of
        whitespace collapsed to a single space, and no leading or trailing
        whitespace.
    """
    if not isinstance(tweet, str):
        return ''
    tweet = re.sub(r'http\S+', '', tweet)   # URLs
    tweet = re.sub(r'@\w+', '', tweet)      # @mentions
    tweet = re.sub(r'#\w+', '', tweet)      # #hashtags
    tweet = re.sub(r'[^\w\s,]', '', tweet)  # punctuation except commas
    # Whitespace is normalised last: deleting punctuation can leave double
    # or trailing spaces behind (e.g. "a - b" -> "a  b"), so collapsing
    # before that step would not give a fully normalised result.
    return re.sub(r'\s+', ' ', tweet).strip()

# Store a cleaned copy of each raw 'tweet' in a new 'cl_tweet' column,
# for both the prediction frame (mt3) and the fine-tuning frame (gai).
mt3['cl_tweet'] = mt3['tweet'].map(clean_tweet)
gai['cl_tweet'] = gai['tweet'].map(clean_tweet)


In [5]:
#fine-tuning set-up

from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import re

# Tokenizer and model are both loaded from the same pretrained checkpoint.
base_checkpoint = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(base_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(base_checkpoint)

# Seq2seq batch collator, built from the tokenizer and given the model too.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def fine_tune(examples):
    """Tokenize examples into model inputs plus loss-ready labels.

    Intended to be passed to ``Dataset.map``. Uses the module-level
    ``tokenizer``.

    Parameters
    ----------
    examples : mapping
        Must provide ``'cl_tweet'`` (input text) and ``'label_true'``
        (target text). With ``batched=True`` both are lists of strings;
        a single string per key is also handled.

    Returns
    -------
    The tokenizer output for the inputs (padded/truncated to 512 tokens)
    with an added ``'labels'`` entry holding the target token ids, where
    every padding position is replaced by ``-100``.
    """
    inputs = examples['cl_tweet']
    targets = examples['label_true']
    # NOTE(review): training feeds the bare cleaned tweet, whereas
    # get_enhanced_stance_prediction wraps the tweet in an instruction
    # prompt at inference time - confirm this mismatch is intended.
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=512, padding='max_length', truncation=True)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the loss ignores; without this the padding
        # positions of each 512-long label row would count towards the loss.
        return [-100 if tok == pad_id else tok for tok in token_ids]

    label_ids = labels['input_ids']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        label_ids = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single example: a flat list of token ids.
        label_ids = _mask_padding(label_ids)

    model_inputs['labels'] = label_ids
    return model_inputs

# Fixed seed so the 90/10 train/validation split is the same on every
# Restart & Run All; without it the split is re-drawn on each run.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(gai)
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = dataset['train']
val_dataset = dataset['test']  # the held-out 10% is used as the validation set

# Tokenize both splits in batches with the fine_tune preprocessing function.
tokenized_train_dataset = train_dataset.map(fine_tune, batched=True)
tokenized_val_dataset = val_dataset.map(fine_tune, batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [9]:
# Hyper-parameters for the run, collected in one place so they are easy to
# scan and tweak. Evaluation happens once per epoch; metrics are logged
# every 10 steps.
# NOTE(review): no_cuda=True requests CPU-only training - confirm this is
# intended for a model of this size.
training_config = dict(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    no_cuda=True,
)
training_args = TrainingArguments(**training_config)

# Wire the model, the tokenized splits and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)
trainer.train()

# Write the model weights and the tokenizer files into the same directory.
fine_tuned_dir = './fine-tuned-flan-t5-large'
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)




  0%|          | 0/1014 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
# Reload tokenizer and model from the directory the training cell saves to;
# this rebinds the module-level `tokenizer` and `model` names.
fine_tuned_dir = './fine-tuned-flan-t5-large'
tokenizer = T5Tokenizer.from_pretrained(fine_tuned_dir)
model = T5ForConditionalGeneration.from_pretrained(fine_tuned_dir)

# Instruction prompt for stance classification. The '{tweet}' placeholder is
# filled in with str.format; the three labels the model is asked to choose
# from are "in-favor", "against" and "neutral-or-unclear".
prompt_template = ''.join((
    'Given the following tweet, determine the stance regarding COVID-19 vaccines. ',
    'Here is the tweet: "{tweet}" ',
    'Respond with one of the following labels: ',
    '"in-favor" if the tweet supports COVID-19 vaccination, ',
    '"against" if the tweet opposes COVID-19 vaccination, or ',
    '"neutral-or-unclear" if the tweet neither clearly supports nor opposes COVID-19 vaccination or if the stance is unclear.',
))

def get_enhanced_stance_prediction(tweet, tokenizer, model):
    """Generate a stance label for one tweet using the instruction prompt.

    The tweet is inserted into the module-level ``prompt_template``, the
    prompt is tokenized, and the model generates a short answer with beam
    search.

    Parameters
    ----------
    tweet : str
        Tweet text to classify (the caller passes the cleaned text).
    tokenizer
        Tokenizer used both to encode the prompt and to decode the output.
    model
        Generation model; must expose ``generate`` and ``device``.

    Returns
    -------
    str
        The decoded first generated sequence, without special tokens and
        with surrounding whitespace stripped.
    """
    prompt = prompt_template.format(tweet=tweet)
    # Put the encoded prompt on the same device as the model so the call
    # also works when the model is not on the CPU.
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        input_ids=encoded['input_ids'],
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=encoded['attention_mask'],
        max_length=50,
        num_beams=5,
        early_stopping=True,
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction.strip()

# Predict a stance label for every cleaned tweet, one tweet at a time.
mt3['label_pred'] = [
    get_enhanced_stance_prediction(text, tokenizer, model)
    for text in mt3['cl_tweet']
]
# NOTE(review): this writes to the same path mt3 was loaded from, so the
# input CSV is overwritten with the added columns - confirm that is intended.
mt3.to_csv('./Q2_20230202_majority_top_30.csv', index=False)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
