## Installation of the required libraries


In [1]:
!pip install -U adapter-transformers
!pip install -U datasets
!pip install -U sentencepiece

Collecting adapter-transformers
  Downloading adapter_transformers-3.2.1-py3-none-any.whl (6.4 MB)
     ---------------------------------------- 6.4/6.4 MB 7.6 MB/s eta 0:00:00
Installing collected packages: adapter-transformers
  Attempting uninstall: adapter-transformers
    Found existing installation: adapter-transformers 3.2.0
    Uninstalling adapter-transformers-3.2.0:
      Successfully uninstalled adapter-transformers-3.2.0
Successfully installed adapter-transformers-3.2.1
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
     -------------------------------------- 468.7/468.7 kB 5.8 MB/s eta 0:00:00
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.10.1
    Uninstalling datasets-2.10.1:
      Successfully uninstalled datasets-2.10.1
Successfully installed datasets-2.11.0
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-win_amd64.whl (977 kB)
     -----------------

In [2]:
import os
import shutil
import glob
import re
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import XLMRobertaTokenizer, XLMRobertaModelWithHeads, XLMRobertaConfig
from transformers import TrainingArguments, AdapterTrainer, TextClassificationPipeline
from transformers import PfeifferInvConfig

## Dataset Preprocessing


In [3]:
# set directory
os.chdir("C:\\Users\\c.loschke\\Desktop\\Coding\\twitter_corpus_analysis\\transformer")

In [5]:
text_all = pd.read_csv("AAC-saving-energy-Tweets2.csv", sep=';', encoding='latin-1')

  text_all = pd.read_csv("AAC-saving-energy-Tweets2.csv", sep=';', encoding='latin-1')


In [6]:
text_all = text_all[text_all["content"].notna()]

In [7]:
text_all["content"] = [re.sub(r'http\S+', 'Internetseite', x) for x in text_all["content"]]
text_all["content"] = [re.sub(r'@\S+', 'Reaktion.', x) for x in text_all["content"]]
text_all["content"] = [re.sub(r'\xa0\s+', ' ', x) for x in text_all["content"]]
text_all["content"] = [re.sub('\x84', '"', x) for x in text_all["content"]]
text_all["content"] = [re.sub('\x93', '"', x) for x in text_all["content"]]

In [8]:
frame_to_predict = text_all[text_all.frame.isna()]
frame_labeled = text_all[text_all.frame.isna() == 0]

In [9]:
frame_labeled['labels'] = [int(x) for x in frame_labeled['frame']]
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["content"], max_length=400, truncation=True, padding="max_length" )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_labeled['labels'] = [int(x) for x in frame_labeled['frame']]


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [12]:
output_path = "C:\\Users\\c.loschke\\Desktop\\Coding\\twitter_corpus_analysis\\transformer" 
#+ "energy_final/"

## Training

In [13]:
vers = "frame"
arch = "pfeifferinv"
c_rate = 8 # reduction factor: represent different trade-off between performance and adapter size; a larger reduction factor can make the model more efficient, by reducing the number of parameters that need to be trained, but it can also reduce the model's performance
adapter_name = "energy"+"_"+vers+"_"+arch+"_"+str(c_rate)
training_args = TrainingArguments(
    seed=2023, # random seed for training
    full_determinism=True, # set to True to ensure reproducibility across different runs
    learning_rate=5e-5, # determines how much the model parameters are updated during training
    num_train_epochs=30, # one epoch = one pass through the entire training set
    logging_strategy="epoch", # log metrics every epoch
    evaluation_strategy="no", # evaluation metrics are not calculated
    output_dir=output_path+adapter_name,
    overwrite_output_dir=True,
    remove_unused_columns=False,
)
dataset = DatasetDict({'train': Dataset.from_pandas(frame_labeled)}) # datasetdict called 'train' with labeled data
dataset = dataset.map(encode_batch, batched=True) # encode the data and pad it to the same length
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) # return pytorch format (torch); attention_mask tells RoBERTa which tokens to pay attention to; labels are the target labels for each sequence
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base", num_labels=12)
model = XLMRobertaModelWithHeads.from_pretrained("xlm-roberta-base", config=config)
config_adapter = PfeifferInvConfig(reduction_factor=c_rate) # adapters are small trainable subnetworks that can be added to a pretrained model to adapt it to a new task without interfering with the pre-trained weights
model.add_adapter(adapter_name, config=config_adapter)
model.add_classification_head(adapter_name, num_labels=12)
model.train_adapter(adapter_name)
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"]
)
trainer.train()

Map:   0%|          | 0/723 [00:00<?, ? examples/s]



Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModelWithHeads were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for pr

  0%|          | 0/2730 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
model.eval() # sets the model in evaluation mode, which disables dropout and batch normalization layers, and prepares the model to run on input data
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=training_args.device.index)

dataset_predict = DatasetDict({'predict': Dataset.from_pandas(frame_to_predict)})
dataset_predict = dataset_predict.map(encode_batch, batched=True)
dataset_predict.set_format(type="torch", columns=["input_ids", "attention_mask"])

pred = [p['label'] for p in classifier(dataset_predict['predict']['content'])] # classifier (pipeline) predicts the labels for the unlabeled data and returns a list of dictionaries with the predicted labels
pred_nr = [[int(s) for s in txt.split("_") if s.isdigit()][0] for txt in pred]

The model 'XLMRobertaModelWithHeads' is not supported for . Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GPT2ForSequenceClassification', 'GPTNeoForSequenceClassification', 'GPTJForSequenceClassification', 'IBertForSequenceClassification', 'LayoutLMForSequenceClassification', 'LayoutLMv2ForSequenceClassification', 'LayoutLMv3ForSequenceClassification', 'LEDForSequenceC

  0%|          | 0/22 [00:00<?, ?ba/s]

Disabling tokenizer parallelism, we're using DataLoader multithreading already


In [8]:
dataset_predict = DatasetDict({'predict': Dataset.from_pandas(frame_to_predict)})
dataset_predict = dataset_predict.map(encode_batch, batched=True)
dataset_predict.set_format(type="torch", columns=["input_ids", "attention_mask"])

  0%|          | 0/22 [00:00<?, ?ba/s]

In [28]:
frame_to_predict["frame"] = pred_nr
frame_to_predict.to_csv(output_path+"/frame_predictions.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_to_predict["frame"] = pred_nr
