In [None]:
# # Installing packages in Google Colab environment
# !pip install datasets transformers evaluate
# !pip install accelerate -U
#
# # Mounting google drive to enable access to data files
# from google.colab import drive
# drive.mount('/content/drive')
#
# # Changing working directory to ex1
# %cd /content/drive/MyDrive/LLM4JDM/ex1

# Preparing data

In [86]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [120]:
choices = pd.read_csv('choices.txt', delimiter='\t')

# Pivots choices such that 'vax' becomes the new columns, choice is the value and 'sub' is the index
labels = choices.pivot(index='sub', columns='vax', values='choice')
labels

vax,AstraZeneca,Bharat Biotech,BioNTech/Pfizer,CanSino Biologics,Johnson & Johnson,Moderna,Novavax,Sinovac
sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,1,0,1,1,0,0
2,1,0,1,0,1,1,1,0
3,1,1,1,0,1,1,1,0
4,1,0,1,0,1,1,1,1
5,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1196,0,0,1,0,0,0,0,0
1197,0,1,1,1,1,0,1,1
1198,0,1,0,0,0,0,0,0
1199,0,1,0,1,0,0,0,0


In [121]:
text = pd.read_csv('text_responses.txt', delimiter='\t', index_col='sub', usecols=['sub', 'oe'])

# Concatenates the text responses with the labels and drops rows with missing values
vaccine = pd.concat([text, labels], axis=1).rename(columns={'oe': 'text'}).dropna().reset_index(drop=True)
vaccine

Unnamed: 0,text,AstraZeneca,Bharat Biotech,BioNTech/Pfizer,CanSino Biologics,Johnson & Johnson,Moderna,Novavax,Sinovac
0,Looking at the probability of certain negative...,1,0,1,0,1,1,1,0
1,"Not really, I looked at all the numbers and ma...",1,1,1,0,1,1,1,0
2,I weighed the side effects with the benefits a...,1,0,1,0,1,1,1,1
3,Percentages were important. In all things (med...,0,1,1,0,0,1,0,0
4,I would look up the potential side effects and...,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1046,I looked primarily at the effectiveness of eac...,1,1,1,1,0,1,1,0
1047,Personal experience talking,0,0,1,0,0,0,0,0
1048,Biased against the vaccine,0,1,1,1,1,0,1,1
1049,I have had a lot of people whos gotten the vac...,0,1,0,1,0,0,0,0


In [122]:
vaccine_hf = Dataset.from_pandas(vaccine)
vaccine_hf

Dataset({
    features: ['text', 'AstraZeneca', 'Bharat Biotech', 'BioNTech/Pfizer', 'CanSino Biologics', 'Johnson & Johnson', 'Moderna', 'Novavax', 'Sinovac'],
    num_rows: 1051
})

In [123]:
model_ckpt = 'distilbert-base-uncased'

# Tokenizing the dataset
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenize = lambda batch: tokenizer(batch['text'], padding="max_length", truncation=True)
vaccine_hf = vaccine_hf.map(tokenize, batched=True)
vaccine_hf

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4290399b-3a5d-4b95-a96e-bac687d44c4d)')' thrown while requesting HEAD https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json


Map:   0%|          | 0/1051 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'AstraZeneca', 'Bharat Biotech', 'BioNTech/Pfizer', 'CanSino Biologics', 'Johnson & Johnson', 'Moderna', 'Novavax', 'Sinovac', 'input_ids', 'attention_mask'],
    num_rows: 1051
})

In [124]:
# Creating labels from the individual vaccine columns
def format_labels(batch):
    labs = [[float(batch[c][i]) for c in labels.columns] for i in range(len(batch['AstraZeneca']))]
    return {"labels": labs}

vaccine_hf = vaccine_hf.map(format_labels, batched=True)

# Selecting only 'text', 'input_ids', 'attention_mask' and 'labels'
vaccine_hf = vaccine_hf.remove_columns(labels.columns.tolist())
vaccine_hf

Map:   0%|          | 0/1051 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1051
})

In [125]:
vaccine_hf.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
vaccine_hf

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1051
})

# Feature Extraction

In [95]:
import torch
torch.manual_seed(42)
from transformers import AutoModel

In [96]:
# Loading the model and moving it to the GPU if available ('cuda' for nvidia GPUs and 'mps' for Apple's Metal Performance Shaders)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

In [97]:
model = AutoModel.from_pretrained(model_ckpt).to(device)

def extract_features(batch):
    """Extract features from a batch of items"""
    inputs = {k:v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
        return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

vaccine_hf = vaccine_hf.map(extract_features, batched=True, batch_size=8)
embeds = pd.DataFrame(vaccine_hf['hidden_state'])
embeds

Map:   0%|          | 0/1051 [00:00<?, ? examples/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.086551,-0.138291,-0.136351,-0.044146,-0.006536,-0.196219,0.129933,0.005525,0.046519,-0.363784,...,-0.094813,-0.216771,-0.098917,0.025280,0.003007,0.224140,-0.215962,-0.277577,0.211385,0.269926
1,-0.086077,-0.046284,0.170872,-0.090205,-0.148626,-0.077882,0.115379,0.122023,0.147362,-0.176775,...,0.094826,-0.011131,0.072211,-0.027861,-0.058841,0.019084,-0.260078,-0.105455,0.249027,0.535908
2,0.040985,-0.031094,0.085166,-0.015342,-0.150943,-0.062235,0.269550,-0.149736,0.259255,-0.130736,...,-0.165882,-0.097996,-0.081691,-0.126770,0.114904,0.140170,-0.236507,-0.087407,0.400360,0.436200
3,-0.088134,-0.037886,0.070339,0.060876,-0.184012,0.031210,0.169997,0.052630,0.159975,-0.296517,...,-0.023511,-0.199103,0.055980,-0.137853,0.082254,0.127104,-0.297239,-0.143943,0.213958,0.313107
4,0.069829,0.019595,-0.001682,-0.246711,-0.061300,-0.168547,0.322209,0.217674,0.237314,-0.358912,...,0.042846,0.005196,0.082999,-0.093391,0.086103,-0.116669,-0.277758,-0.230386,0.221281,0.411202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,0.060096,-0.027563,-0.197975,-0.167290,-0.256647,-0.193959,0.278221,-0.008111,0.037555,-0.141178,...,-0.165374,-0.132563,-0.002246,0.049726,0.072532,-0.039805,-0.274175,0.038117,0.321610,0.417477
1047,-0.157278,-0.037632,-0.135468,-0.104194,-0.073145,-0.114484,0.210148,0.364075,-0.133996,-0.354392,...,0.080500,-0.176303,0.098889,-0.213833,0.114860,0.073110,-0.062321,-0.282838,0.039596,0.202290
1048,-0.264745,-0.118108,-0.243033,-0.065164,-0.172901,-0.085691,0.123498,0.065335,-0.042342,-0.166539,...,-0.049490,0.014468,-0.110201,-0.100587,0.256933,-0.017435,-0.034038,-0.193906,0.303171,0.435974
1049,0.214027,0.031197,-0.053656,-0.248237,-0.046856,-0.366515,0.371055,0.387127,-0.086699,-0.266235,...,-0.101303,-0.187162,0.108514,-0.177368,0.260718,0.150828,-0.205036,-0.006340,0.436779,0.446601


# Predicting vaccine decisions with embeddings

In [98]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score

In [99]:
labels = vaccine.drop(columns='text').astype(int)
clf = MultiOutputClassifier(LogisticRegressionCV())
cross_val_score(clf, embeds, labels, cv=5, scoring='f1_macro').mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.350832307123636

# Pedicting vaccine decisions the LM fine-tuning

In [126]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertConfig
from sklearn.metrics import f1_score

In [127]:
vaccine_hf = vaccine_hf.train_test_split(test_size=0.2)
vaccine_hf

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 840
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 211
    })
})

In [128]:
config = BertConfig.from_pretrained(model_ckpt, num_labels=8)
config.problem_type = "multi_label_classification" # Informing that this is a multi-label classification problem

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.5.attention.self.query.bias', 'encoder.layer.4.attention.self.value.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.5.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.8.output.LayerNorm.bias', 'encoder.layer.6.intermediate.dense.weight', 'encoder.layer.4.attention.self.query.bias', 'encoder.layer.7.attention.self.key.weight', 'encoder.layer.2.attention.output.LayerNorm.weight', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer.7.attention.self.query.weight', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.11.attention.self.key.weight', 'encoder.layer.4.intermediate.dense.weight', 'encoder.l

In [129]:
# Training the model
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=1,
)

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    preds = (preds > 0.5).astype(int)
    return {"f1": f1_score(labels, preds, average='macro')}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=vaccine_hf['train'],
    eval_dataset=vaccine_hf['test'],
    compute_metrics=compute_metrics,
)

In [130]:
trainer.train()



Epoch,Training Loss,Validation Loss


TrainOutput(global_step=105, training_loss=0.628652082170759, metrics={'train_runtime': 136.5203, 'train_samples_per_second': 6.153, 'train_steps_per_second': 0.769, 'total_flos': 221025192837120.0, 'train_loss': 0.628652082170759, 'epoch': 1.0})

In [131]:
trainer.evaluate()

{'eval_loss': 0.6252403855323792,
 'eval_f1': 0.0,
 'eval_runtime': 9.3868,
 'eval_samples_per_second': 22.478,
 'eval_steps_per_second': 2.876,
 'epoch': 1.0}