# Preparing Data

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
# Read the raw table from disk; the bare name on the last line makes the
# notebook render the resulting DataFrame.
vaccine = pd.read_csv(filepath_or_buffer='vaccine.csv')
vaccine

In [None]:
# Rename the columns to the 'label' / 'text' names used by the later cells,
# then wrap the DataFrame in a `datasets.Dataset` in a single step.
vaccine = Dataset.from_pandas(
    vaccine.rename(columns={'decision': 'label', 'response': 'text'})
)
vaccine

In [None]:
# Checkpoint name shared by the tokenizer and the models loaded further down
model_ckpt = 'distilbert-base-uncased'

# Tokenizing the dataset
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize the 'text' entries of a batch, padded/truncated to a fixed length."""
    return tokenizer(batch['text'], padding="max_length", truncation=True)


vaccine = vaccine.map(tokenize, batched=True)

# Feature Extraction

In [None]:
import torch
from transformers import AutoModel

In [None]:
# Loading the model and moving it to the GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = AutoModel.from_pretrained(model_ckpt).to(device)
model

In [None]:
def extract_features(batch):
    """Extract one feature vector per item from a batch of tokenized items.

    Args:
        batch: Mapping of column name to batched values, as passed by
            ``Dataset.map(..., batched=True)``. Only the columns listed in
            ``tokenizer.model_input_names`` are fed to the model; every other
            column is ignored.

    Returns:
        A dict with a single key ``"hidden_state"`` holding a NumPy array:
        the model's last hidden state at position 0 of each sequence
        (presumably the [CLS] token for this checkpoint).
    """
    # The dataset is never switched to torch format, so `map` hands us plain
    # Python lists here. `torch.as_tensor` accepts lists as well as tensors
    # and places the result on `device`; calling `.to(device)` directly on a
    # list would raise AttributeError. The rows are rectangular because the
    # tokenizer pads with padding="max_length".
    inputs = {
        name: torch.as_tensor(values, device=device)
        for name, values in batch.items()
        if name in tokenizer.model_input_names
    }
    # Inference only: skip gradient tracking to save memory.
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Bring the vectors back to the CPU as NumPy so `datasets` can store them.
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}


# Run the encoder over the whole dataset in batches of five rows, which adds a
# 'hidden_state' column, then collect that column into a DataFrame.
vaccine = vaccine.map(
    extract_features,
    batched=True,
    batch_size=5,
)
embeds = pd.DataFrame(data=vaccine['hidden_state'])
embeds

# Predicting vaccine decisions with embeddings

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [None]:
# Split the embeddings and their labels with scikit-learn's default
# train/test proportions, fit a cross-validated logistic regression on the
# training part and print its score on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(embeds, vaccine['label'])
clf = LogisticRegressionCV().fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Predicting vaccine decisions with LM fine-tuning

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Hold out 20% of the rows for evaluation. This rebinds `vaccine` to the
# object returned by `train_test_split`, which the training cell below indexes
# with 'train' and 'test'. No seed is passed here.
vaccine = vaccine.train_test_split(test_size=0.2)
vaccine

In [None]:
# Training the model
compute_metrics = lambda x: x
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=vaccine['train'],
    eval_dataset=vaccine['test'],
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_labels = len(vaccine['decision'].unique())
model = (
    AutoModelForSequenceClassification
    .from_pretrained(model_ckpt, num_labels=num_labels)
    .to_device(device)
)