# XLM-RoBERTa (SIMPLETRANSFORMERS)

In [None]:
!pip install simpletransformers

In [None]:
from google.colab import drive
import os
import shutil

# Step 1: Mount Google Drive
# The Drive contents become reachable under the '/content/drive' mount point
# for the remainder of the runtime session.
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset
import pandas as pd

# PAWS-X language codes. Kept for reference: the loop below currently loads 'en' only.
languages = ["en", "de", "es", "fr", "ja", "ko", "zh"]

# When True, every text_b value is replaced with the placeholder '.'.
one_way = False

# Per-language frames are accumulated here and concatenated afterwards.
train_dfs, valid_dfs = [], []

for lang in ['en']:
    dataset = load_dataset("paws-x", lang)

    # Build each split's frame and rename its columns to text_a / text_b / labels in one step.
    train_df = pd.DataFrame(dataset['train']).rename(
        columns={"sentence1": "text_a", "sentence2": "text_b", "label": "labels"}
    )
    valid_df = pd.DataFrame(dataset['validation']).rename(
        columns={"sentence1": "text_a", "sentence2": "text_b", "label": "labels"}
    )

    if one_way:
        train_df['text_b'] = '.'
        valid_df['text_b'] = '.'

    train_dfs.append(train_df)
    valid_dfs.append(valid_df)

# Stack the collected frames into one training frame and one validation frame,
# renumbering the index from zero.
train_df_all_languages = pd.concat(train_dfs, ignore_index=True)
valid_df_all_languages = pd.concat(valid_dfs, ignore_index=True)

# Alternative training file, kept for reference:
# train_df_cleaned_en = pd.read_csv('/content/paws-x-en-easynegatives-100-percent.csv', sep=',')

# Column of the relabeled TSV that is used as the training label.
# Original note (presumably a list of alternative label columns — confirm against the TSV header):
#   x3 Original x3 Paraph x3 LLama3 zero-shot (Ex. Same Content) x3 "Zero_Shot_ALL_THREE"
label_column = 'LLama3 zero-shot (Sem Equiv)'

# Read the tab-separated file and rename its columns to text_a / text_b / labels.
# NOTE(review): if the TSV already has a column literally named "labels", this rename
# would leave two columns with that name — verify.
train_df_cleaned_en = pd.read_csv(
    '/content/relabeled_train_paws_x_en.tsv', sep='\t'
).rename(columns={"sentence1": "text_a", "sentence2": "text_b", label_column: "labels"})

In [None]:
# Preview the first rows of the relabeled training frame to check the renamed columns.
train_df_cleaned_en.head()

In [None]:
from simpletransformers.classification import ClassificationModel

# Configure the model
# These options are handed to simpletransformers through `args=`. The same dict is
# reused when training is started, so edits here affect both places.
model_args = {
    "output_dir": "outputs/",
    "cache_dir": "cache_dir/",
    "max_seq_length": 256,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "num_train_epochs": 6,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 386,
    "evaluate_during_training_verbose": True,  # This enables verbose logging of evaluation
    "use_multiprocessing": False,
    "use_multiprocessing_for_evaluation": False,
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "save_model_every_epoch": True,
    "save_steps": -1,
    "no_cache": True,
    "save_optimizer_and_scheduler": False,
    "silent": False,
}

# Create a ClassificationModel
# `use_cuda` is a parameter of the ClassificationModel constructor, not a model-args
# option: inside `model_args` it was stored but never consulted. Passing it here makes
# the switch effective. With use_cuda=True a CUDA device must be available.
model = ClassificationModel(
    "xlmroberta", "xlm-roberta-base",
    args=model_args,
    use_cuda=True,
)

In [None]:
import logging
from transformers import logging as transformers_logging

# Configure logging to a file
# force=True removes any handlers already attached to the root logger before installing
# the ones below. Without it, basicConfig() is a no-op whenever the root logger has
# already been configured (commonly the case in hosted notebook runtimes), and nothing
# would be written to training_logs.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training_logs.log"),
        logging.StreamHandler()  # This will also print to console
    ],
    force=True,
)
# Raise the transformers library's own log verbosity to INFO.
transformers_logging.set_verbosity_info()

# Fine-tune on the relabeled English training frame and evaluate against the
# PAWS-X validation frame built earlier. `model_args` is the same dict that was
# given to the ClassificationModel constructor.
model.train_model(
    train_df=train_df_cleaned_en,
    eval_df=valid_df_all_languages,
    args=model_args,
    show_running_loss=True,
)

In [None]:
from datasets import load_dataset
# When True, the second sentence of every test pair is replaced with the placeholder '.'.
one_way = False
# List of available languages in PAWS-X
languages = ['en', 'fr', 'es', 'de', 'zh', 'ja', 'ko']

# Load datasets for all languages
datasets = {lang: load_dataset("paws-x", lang) for lang in languages}

# Prepare datasets by renaming columns
for lang in languages:
    current_df = pd.DataFrame(datasets[lang]['test']).rename(
        columns={"sentence1": "text_a", "sentence2": "text_b", "label": "labels"}
    )
    if one_way:
        # Overwrite the column on the DataFrame itself. The previous chained form
        # `current_df = current_df['text_b'] = '.'` rebound current_df to the string '.'
        # first and then raised TypeError on the item assignment.
        current_df['text_b'] = '.'
    # Replace the 'test' entry with the prepared DataFrame; other splits stay as loaded.
    datasets[lang]['test'] = current_df

In [None]:
# from simpletransformers.classification import ClassificationModel

# # Configure the model
# model_args = {
#     "output_dir": "outputs/",
#     "cache_dir": "cache_dir/",
#     "max_seq_length": 256,
#     "train_batch_size": 32,
#     "eval_batch_size": 32,
#     "num_train_epochs": 6,
#     "evaluate_during_training": True,
#     "evaluate_during_training_steps": 2701,
#     "evaluate_during_training_verbose": True,  # This enables verbose logging of evaluation
#     "use_multiprocessing": False,
#     "use_multiprocessing_for_evaluation" : False,
#     "use_cuda": True,  # Make sure CUDA is available
#     "overwrite_output_dir": True,
#     "reprocess_input_data": True,
#     "save_model_every_epoch": True,
#     "save_steps": -1,
#     "no_cache": True,
#     "save_optimizer_and_scheduler": True,
#     "silent": False,
#     "tensorboard_dir": "tb_logs/",  # TensorBoard logs
# }

# # Create a ClassificationModel
# model = ClassificationModel(
#     "xlmroberta", "/content/drive/My Drive/XLM-EN-ONLY_V3",
#     args=model_args
# )

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from math import ceil

def batchify_dataframe(dataframe, batch_size):
    """Yield consecutive positional row slices of ``dataframe``.

    Each yielded slice holds at most ``batch_size`` rows; the final slice may be
    shorter. An empty frame yields nothing.
    """
    yield from (
        dataframe.iloc[offset:offset + batch_size]
        for offset in range(0, len(dataframe), batch_size)
    )

# Number of sentence pairs sent to model.predict() per call.
batch_size = 16
# Maps language code -> dict with accuracy / precision / recall / f1.
evaluation_results = {}

for lang in languages:
    # The 'test' entry was replaced earlier with a DataFrame that has
    # text_a / text_b / labels columns.
    test_df = datasets[lang]['test']

    # Gold labels, in the same row order the predictions are collected in.
    true_labels = test_df['labels'].tolist()

    # Predict in explicit chunks of `batch_size` rows and collect the labels in order.
    all_predictions = []
    for batch_df in batchify_dataframe(test_df, batch_size):
        batch = list(zip(batch_df['text_a'], batch_df['text_b']))
        # predict() returns (predictions, raw outputs); only the predictions are used.
        batch_predictions, _raw_outputs = model.predict(batch)
        all_predictions.extend(batch_predictions)

    accuracy = accuracy_score(true_labels, all_predictions)

    # Macro-averaged precision / recall / F1 (unweighted mean over classes).
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, all_predictions, average='macro')

    # Store the results
    evaluation_results[lang] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Print evaluation results for each language
for lang, metrics in evaluation_results.items():
    print(f"Results for {lang}: {metrics}")


In [None]:
# Re-print the stored per-language metrics, one line per language, in insertion order.
for lang in evaluation_results:
    metrics = evaluation_results[lang]
    print(f"Results for {lang}: {metrics}")

In [None]:
import shutil
import os
# Step 2: Define source and destination paths
checkpoint = '5404'
# Local directory to export (presumably a checkpoint written during training); update with the actual path.
source_folder = '/content/outputs/checkpoint-' + checkpoint
# Target directory on Google Drive; update with the desired path.
destination_folder = '/content/drive/My Drive/XLM-R-EN-SEMEQUIV-V4-' + checkpoint

# Step 3: Create destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Step 4: Move files from source to destination, skipping subfolders.
# The directory listing is taken once up front, then only regular files are moved.
candidate_paths = (os.path.join(source_folder, name) for name in os.listdir(source_folder))
for file_path in [path for path in candidate_paths if os.path.isfile(path)]:
    shutil.move(file_path, destination_folder)

# Step 5: Verify the move
print(f"Contents of {destination_folder}:")
destination_listing = os.listdir(destination_folder)
print(destination_listing)