# XLM-RoBERTa (SIMPLETRANSFORMERS)

In [None]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting wandb>=0.10.32 (from simpletransformers)
  Downloading wandb-0.17.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecti

In [None]:
from google.colab import drive
import os
import shutil

# Step 1: Mount Google Drive at /content/drive; the export cell at the end of the
# notebook writes the selected checkpoint under '/content/drive/My Drive'.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset
import pandas as pd

# Language codes available in the PAWS-X dataset
languages = ["en", "de", "es", "fr", "ja", "ko", "zh"]

# Per-language frames are collected here and concatenated once the loop is done
train_dfs = []
valid_dfs = []
one_way = False

# Only the English configuration is loaded in this run
for lang in ['en']:
    dataset = load_dataset("paws-x", lang)

    # Column names the rest of the notebook works with (text_a / text_b / labels)
    pair_columns = {"sentence1": "text_a", "sentence2": "text_b", "label": "labels"}
    train_df = pd.DataFrame(dataset['train']).rename(columns=pair_columns)
    valid_df = pd.DataFrame(dataset['validation']).rename(columns=pair_columns)

    if one_way:
        # Replace the second sentence with a constant '.' in both splits
        for frame in (train_df, valid_df):
            frame['text_b'] = '.'

    train_dfs.append(train_df)
    valid_dfs.append(valid_df)

# Stack the collected frames into one training and one validation frame
train_df_all_languages = pd.concat(train_dfs, ignore_index=True)
valid_df_all_languages = pd.concat(valid_dfs, ignore_index=True)

# Alternative training file:
# train_df_cleaned_en = pd.read_csv('/content/paws-x-en-easynegatives-100-percent.csv', sep=',')
train_df_cleaned_en = pd.read_csv('/content/relabeled_train_paws_x_en.tsv', sep='\t')

# Column of the relabeled file that becomes the training target.
# Original note: x3 Original x3 Paraph x3 LLama3 zero-shot (Ex. Same Content) x3 "Zero_Shot_ALL_THREE"
label_column = 'LLama3 zero-shot (Sem Equiv)'
train_df_cleaned_en = train_df_cleaned_en.rename(
    columns={"sentence1": "text_a", "sentence2": "text_b", label_column: "labels"}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Preview the first rows of the relabeled training frame after the column renames above
train_df_cleaned_en.head()

Unnamed: 0.1,Unnamed: 0,mapping1,mapping2,id,text_a,text_b,label,language,labels,LLama3 zero-shot (Ex. Same Content),LLama3 zero-shot (Paraph),Zero_Shot_Majority_vote,Zero_Shot_Paraph_AND_Same_Content,Zero_Shot_Paraph_AND_Sem_Equiv,Zero_Shot_ALL_THREE
0,0,swap_42142_2,backtransl_set3_02388_2,1,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0,en,1,1,1,1,1,1,1
1,1,backtransl_set1_06985_2,backtransl_set1_11833_2,2,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1,en,1,1,1,1,1,1,1
2,2,backtransl_set1_10174_2,swap_13022_1,3,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0,en,0,1,1,1,1,0,0
3,3,swap_25254_2,backtransl_set3_00804_2,4,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1,en,1,1,1,1,1,1,1
4,4,swap_49138_1,backtransl_set2_00346_2,5,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1,en,1,1,1,1,1,1,1


In [None]:
from simpletransformers.classification import ClassificationModel

# Options passed to simpletransformers, grouped by purpose
model_args = {
    # Output / cache locations and overwrite policy
    "output_dir": "outputs/",
    "cache_dir": "cache_dir/",
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "no_cache": True,

    # Sequence length, batch sizes and number of epochs
    "max_seq_length": 256,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "num_train_epochs": 6,

    # Evaluate on the eval frame every 386 steps, with verbose evaluation logging
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 386,
    "evaluate_during_training_verbose": True,

    # Checkpointing: save at the end of every epoch, no step-based saves,
    # and leave optimizer/scheduler state out of the saved checkpoints
    "save_model_every_epoch": True,
    "save_steps": -1,
    "save_optimizer_and_scheduler": False,

    # Runtime behaviour
    "use_multiprocessing": False,
    "use_multiprocessing_for_evaluation" : False,
    "use_cuda": True,  # Make sure CUDA is available
    "silent": False,
}

# Build the classifier from the pretrained xlm-roberta-base checkpoint
model = ClassificationModel(
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    args=model_args,
)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



In [None]:
import logging
from transformers import logging as transformers_logging

# Configure logging to a file and to the console.
# force=True removes any handlers already attached to the root logger first.
# Without it, basicConfig() does nothing when the root logger already has handlers
# (e.g. ones installed by the notebook runtime, or by an earlier run of this cell),
# and training_logs.log would never be written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training_logs.log"),
        logging.StreamHandler()  # This will also print to console
    ],
    force=True,
)
# Raise the transformers library's own log verbosity to INFO
transformers_logging.set_verbosity_info()

# Train on the relabeled English frame and evaluate on the PAWS-X validation frame.
# The call is kept as the cell's final expression so its return value is displayed.
model.train_model(
    train_df=train_df_cleaned_en,
    show_running_loss=True,
    eval_df=valid_df_all_languages,
    args=model_args,
)

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 6:   0%|          | 0/1544 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-386/config.json
Model weights saved in outputs/checkpoint-386/model.safetensors
tokenizer config file saved in outputs/checkpoint-386/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-386/special_tokens_map.json
Configuration saved in outputs/best_model/config.json
Model weights saved in outputs/best_model/model.safetensors
tokenizer config file saved in outputs/best_model/tokenizer_config.json
Special tokens file saved in outputs/best_model/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-772/config.json
Model weights saved in outputs/checkpoint-772/model.safetensors
tokenizer config file saved in outputs/checkpoint-772/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-772/special_tokens_map.json
Configuration saved in outputs/best_model/config.json
Model weights saved in outputs/best_model/model.

Running Epoch 2 of 6:   0%|          | 0/1544 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-1930/config.json
Model weights saved in outputs/checkpoint-1930/model.safetensors
tokenizer config file saved in outputs/checkpoint-1930/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1930/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-2316/config.json
Model weights saved in outputs/checkpoint-2316/model.safetensors
tokenizer config file saved in outputs/checkpoint-2316/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2316/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-2702/config.json
Model weights saved in outputs/checkpoint-2702/model.safetensors
tokenizer config file saved in outputs/checkpoint-2702/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2702/special_tokens_map.json
Configuration saved in outpu

Running Epoch 3 of 6:   0%|          | 0/1544 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-3474/config.json
Model weights saved in outputs/checkpoint-3474/model.safetensors
tokenizer config file saved in outputs/checkpoint-3474/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-3474/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-3860/config.json
Model weights saved in outputs/checkpoint-3860/model.safetensors
tokenizer config file saved in outputs/checkpoint-3860/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-3860/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-4246/config.json
Model weights saved in outputs/checkpoint-4246/model.safetensors
tokenizer config file saved in outputs/checkpoint-4246/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-4246/special_tokens_map.json
  with amp.autocast():
  wit

Running Epoch 4 of 6:   0%|          | 0/1544 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-5018/config.json
Model weights saved in outputs/checkpoint-5018/model.safetensors
tokenizer config file saved in outputs/checkpoint-5018/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-5018/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-5404/config.json
Model weights saved in outputs/checkpoint-5404/model.safetensors
tokenizer config file saved in outputs/checkpoint-5404/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-5404/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-5790/config.json
Model weights saved in outputs/checkpoint-5790/model.safetensors
tokenizer config file saved in outputs/checkpoint-5790/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-5790/special_tokens_map.json
  with amp.autocast():
  wit

Running Epoch 5 of 6:   0%|          | 0/1544 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-6562/config.json
Model weights saved in outputs/checkpoint-6562/model.safetensors
tokenizer config file saved in outputs/checkpoint-6562/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-6562/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-6948/config.json
Model weights saved in outputs/checkpoint-6948/model.safetensors
tokenizer config file saved in outputs/checkpoint-6948/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-6948/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-7334/config.json
Model weights saved in outputs/checkpoint-7334/model.safetensors
tokenizer config file saved in outputs/checkpoint-7334/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-7334/special_tokens_map.json
  with amp.autocast():
  wit

Running Epoch 6 of 6:   0%|          | 0/1544 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-8106/config.json
Model weights saved in outputs/checkpoint-8106/model.safetensors
tokenizer config file saved in outputs/checkpoint-8106/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-8106/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-8492/config.json
Model weights saved in outputs/checkpoint-8492/model.safetensors
tokenizer config file saved in outputs/checkpoint-8492/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-8492/special_tokens_map.json
  with amp.autocast():
  with amp.autocast():
Configuration saved in outputs/checkpoint-8878/config.json
Model weights saved in outputs/checkpoint-8878/model.safetensors
tokenizer config file saved in outputs/checkpoint-8878/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-8878/special_tokens_map.json
  with amp.autocast():
  wit

(9264,
 defaultdict(list,
             {'global_step': [386,
               772,
               1158,
               1544,
               1544,
               1930,
               2316,
               2702,
               3088,
               3088,
               3474,
               3860,
               4246,
               4632,
               4632,
               5018,
               5404,
               5790,
               6176,
               6176,
               6562,
               6948,
               7334,
               7720,
               7720,
               8106,
               8492,
               8878,
               9264,
               9264],
              'train_loss': [0.6920318603515625,
               0.7294158935546875,
               0.5680465698242188,
               0.6097265481948853,
               0.6097265481948853,
               0.6311569213867188,
               0.619415283203125,
               0.6474266052246094,
               0.40244629979133606,
 

In [None]:
from datasets import load_dataset
# When True, the second sentence of every test pair is replaced by a constant '.'
one_way = False
# List of available languages in PAWS-X
languages = ['en', 'fr', 'es', 'de', 'zh', 'ja', 'ko']

# Load datasets for all languages
datasets = {lang: load_dataset("paws-x", lang) for lang in languages}

# Replace each language's 'test' split with a DataFrame whose columns are renamed
# to text_a / text_b / labels; the other splits are left as loaded.
for lang in languages:
    current_df = pd.DataFrame(datasets[lang]['test']).rename(
        columns={"sentence1": "text_a", "sentence2": "text_b", "label": "labels"}
    )
    if one_way:
        # Assign the column only. The previous chained form
        # `current_df = current_df['text_b'] = '.'` first rebound current_df to the
        # string '.' and then raised TypeError on the item assignment.
        current_df['text_b'] = '.'
    datasets[lang]['test'] = current_df

In [None]:
# from simpletransformers.classification import ClassificationModel

# # Configure the model
# model_args = {
#     "output_dir": "outputs/",
#     "cache_dir": "cache_dir/",
#     "max_seq_length": 256,
#     "train_batch_size": 32,
#     "eval_batch_size": 32,
#     "num_train_epochs": 6,
#     "evaluate_during_training": True,
#     "evaluate_during_training_steps": 2701,
#     "evaluate_during_training_verbose": True,  # This enables verbose logging of evaluation
#     "use_multiprocessing": False,
#     "use_multiprocessing_for_evaluation" : False,
#     "use_cuda": True,  # Make sure CUDA is available
#     "overwrite_output_dir": True,
#     "reprocess_input_data": True,
#     "save_model_every_epoch": True,
#     "save_steps": -1,
#     "no_cache": True,
#     "save_optimizer_and_scheduler": True,
#     "silent": False,
#     "tensorboard_dir": "tb_logs/",  # TensorBoard logs
# }

# # Create a ClassificationModel
# model = ClassificationModel(
#     "xlmroberta", "/content/drive/My Drive/XLM-EN-ONLY_V3",
#     args=model_args
# )

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from math import ceil

def batchify_dataframe(dataframe, batch_size):
    """Yield consecutive positional row slices of ``dataframe``.

    Each yielded slice holds at most ``batch_size`` rows; the final slice may be
    shorter. Nothing is yielded for an empty frame.
    """
    starts = range(0, len(dataframe), batch_size)
    yield from (dataframe.iloc[lo:lo + batch_size] for lo in starts)

# Number of test rows sent to model.predict() per call
batch_size = 16
# Maps language code -> dict of accuracy / precision / recall / f1
evaluation_results = {}

for lang in languages:
    # The 'test' entry was replaced by a DataFrame with text_a / text_b / labels
    # columns in the test-preparation cell.
    test_df = datasets[lang]['test']
    true_labels = test_df['labels'].tolist()

    # Predict chunk by chunk; chunks are consecutive row slices, so the collected
    # predictions line up with true_labels.
    all_predictions = []
    for batch_df in batchify_dataframe(test_df, batch_size):
        # NOTE(review): pairs are passed as tuples; simpletransformers' sentence-pair
        # examples appear to use lists of [text_a, text_b] - TODO confirm equivalence.
        batch = list(zip(batch_df['text_a'], batch_df['text_b']))
        batch_predictions, _raw_outputs = model.predict(batch)
        all_predictions.extend(batch_predictions)

    accuracy = accuracy_score(true_labels, all_predictions)

    # Macro-averaged precision / recall / F1 (support is discarded)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, all_predictions, average='macro')

    # Store the results
    evaluation_results[lang] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Print evaluation results for each language
for lang, metrics in evaluation_results.items():
    print(f"Results for {lang}: {metrics}")


In [None]:
# Show the stored metrics dict for every evaluated language, in insertion order
for lang in evaluation_results:
    metrics = evaluation_results[lang]
    print(f"Results for {lang}: {metrics}")

In [None]:
import shutil
import os
# Step 2: Define source and destination paths
checkpoint = '5404'
source_folder = '/content/outputs/checkpoint-' + checkpoint  # Update this with the actual path
destination_folder = '/content/drive/My Drive/XLM-R-EN-SEMEQUIV-V4-' + checkpoint  # Update this with the desired path on Google Drive

# Check the source first so a wrong checkpoint id does not leave an empty folder on Drive
if not os.path.isdir(source_folder):
    raise FileNotFoundError(f"Checkpoint folder not found: {source_folder}")

# Step 3: Create destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Step 4: Move files from source to destination, skipping subfolders.
# The full destination file path is passed (not just the folder): moving into an
# existing folder raises shutil.Error when a file of the same name is already there,
# which made re-exporting the same checkpoint fail partway through.
for item in os.listdir(source_folder):
    item_path = os.path.join(source_folder, item)
    if os.path.isfile(item_path):
        shutil.move(item_path, os.path.join(destination_folder, item))

# Step 5: Verify the move
print(f"Contents of {destination_folder}:")
print(os.listdir(destination_folder))

Contents of /content/drive/My Drive/XLM-R-EN-SEMEQUIV-V4-5404:
['tokenizer.json', 'tokenizer_config.json', 'eval_results.txt', 'model.safetensors', 'sentencepiece.bpe.model', 'model_args.json', 'special_tokens_map.json', 'training_args.bin', 'config.json']
