In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch

In [10]:
p_dataset_path = "./power/power-tr-train.tsv"
p_data = pd.read_csv(p_dataset_path, sep="\t")

# Display basic information
print(p_data.head())
print(p_data.info())

        id                           speaker sex  \
0  tr18146  ca2031caa4032c51980160359953d507   M   
1  tr18147  4cee0addb3c69f6866869b180f90d45f   M   
2  tr18148  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr18149  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr18150  fcc61122f3553c57ae207adeb1a1af84   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  Usul tartışmasında 2 kişi lehte 2 kişi aleyhte...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      0  
1  Mr. President, members of lawmakers, as I spea...      0  
2  Mr. President, I'm here to share with you the ...      0  
3  Mr. President, under the principles determined...      0  
4  Two in favour of two in the legal deb

In [11]:
# Drop rows with missing translations or text fields
p_data = p_data.dropna(subset=['text', 'label'])

# Display class distribution
print(p_data['label'].value_counts())

label
1    8932
0    8452
Name: count, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split
p_train_data, p_test_data = train_test_split(
    p_data, test_size=0.1, stratify=p_data['label'], random_state=42
)
print(f"Training size: {len(p_train_data)}, Test size: {len(p_test_data)}")

Training size: 15645, Test size: 1739


In [13]:
# Path to the saved model directory
model_dir = "./xlm_roberta_model_power_tr"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Set the model to evaluation mode
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [14]:
# Convert to Hugging Face Dataset
test_dataset = Dataset.from_pandas(p_test_data)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1739/1739 [00:01<00:00, 1191.96 examples/s]


In [15]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,  # Adjust batch size as per your hardware
    logging_dir="./logs",
    do_train=False,
    do_eval=True
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [16]:
from sklearn.metrics import classification_report
# Get predictions
predictions = trainer.predict(tokenized_test_dataset)

# Extract predicted labels
preds = np.argmax(predictions.predictions, axis=-1)

# True labels
true_labels = p_test_data["label"].values

# Print Classification Report
print(classification_report(true_labels, preds))

100%|██████████| 109/109 [00:32<00:00,  3.34it/s]

              precision    recall  f1-score   support

           0       0.89      0.74      0.81       845
           1       0.79      0.91      0.84       894

    accuracy                           0.83      1739
   macro avg       0.84      0.82      0.83      1739
weighted avg       0.84      0.83      0.83      1739




