# Preparing data

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [56]:
# Read the CSV with pandas and wrap the resulting frame directly in a Dataset
vaccine = Dataset.from_pandas(pd.read_csv('vaccine.csv'))
vaccine

Dataset({
    features: ['label', 'text'],
    num_rows: 1051
})

In [57]:
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize a batch's 'text' entries with truncation and "max_length" padding."""
    return tokenizer(batch['text'], truncation=True, padding="max_length")


# Tokenizing the dataset
vaccine = vaccine.map(tokenize, batched=True)

Map:   0%|          | 0/1051 [00:00<?, ? examples/s]

In [59]:
# Format the tokenizer output columns as torch tensors
vaccine.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask'],
)
vaccine

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 1051
})

# Feature Extraction

In [60]:
import torch
from transformers import AutoModel

In [63]:
# Pick the GPU when CUDA is available (CPU otherwise), then load the model onto it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)


def extract_features(batch):
    """Return the first-position hidden state for every item in a batch.

    Only the batch entries named in ``tokenizer.model_input_names`` are passed
    to the model; the result is a dict with a single ``"hidden_state"`` key
    holding a NumPy array.
    """
    # Keep just the model inputs and move them to the device the model lives on
    model_inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    # Gradients are not needed when only reading activations
    with torch.no_grad():
        outputs = model(**model_inputs)
    first_position_state = outputs.last_hidden_state[:, 0]
    return {"hidden_state": first_position_state.cpu().numpy()}


# Run the extractor over the dataset five rows at a time, then tabulate the result
vaccine = vaccine.map(function=extract_features, batched=True, batch_size=5)
embeds = pd.DataFrame(data=vaccine['hidden_state'])
embeds

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/1051 [00:00<?, ? examples/s]

KeyboardInterrupt: 

# Predicting vaccine decisions with embeddings

In [52]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

In [53]:
# Fit a cross-validated ridge regression on the embeddings and print its score
# on the held-out split.
# The split is seeded: without random_state every run draws a different
# train/test partition, so the printed score could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    embeds, vaccine['label'], random_state=42
)
regr = RidgeCV()
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))

0.09236943886640603


# Predicting vaccine decisions with LM fine-tuning

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across kernel restarts so results can be compared between runs.
vaccine = vaccine.train_test_split(test_size=0.2, seed=42)
vaccine

In [None]:
# Training the model
compute_metrics = lambda x: x
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=vaccine['train'],
    eval_dataset=vaccine['test'],
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_labels = len(vaccine['decision'].unique())
model = (
    AutoModelForSequenceClassification
    .from_pretrained(model_ckpt, num_labels=num_labels)
    .to_device(device)
)