In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch

In [10]:
dataset_path = "./orientation/orientation-tr-train.tsv"
data = pd.read_csv(dataset_path, sep="\t")

# Preview the first rows to confirm the tab-separated columns were parsed
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" to the output.
data.info()

        id                           speaker sex  \
0  tr00000  ca2031caa4032c51980160359953d507   M   
1  tr00001  4cee0addb3c69f6866869b180f90d45f   M   
2  tr00002  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr00003  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr00004  be82a4ade406ec6774a0a2e38f6957e3   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  24’üncü Yasama Dönemimizin tüm milletvekilleri...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      1  
1  Mr. President, members of lawmakers, as I spea...      1  
2  Mr. President, I'm here to share with you the ...      1  
3  Mr. President, under the principles determined...      1  
4  Mr. President, dear lawmakers, I ask 

In [11]:
# Keep only the rows where both the source text and the label are present.
# (Only 'text' and 'label' are checked; the 'text_en' column is not.)
data = data.dropna(
    axis=0,
    how='any',
    subset=['text', 'label'],
)

# Show how many examples each class contains
print(data.loc[:, 'label'].value_counts())

label
1    9390
0    6748
Name: count, dtype: int64


In [12]:
# Stratified 90/10 split so both partitions keep the class ratio of `data`.
# `train_test_split` is already imported in the notebook's import cell, so it
# is not re-imported here.
# NOTE(review): the model evaluated later is loaded from disk; this split is
# only a fair held-out set if training used the same test_size/random_state —
# confirm against the training notebook.
train_data, test_data = train_test_split(
    data, test_size=0.1, stratify=data['label'], random_state=42
)
print(f"Training size: {len(train_data)}, Test size: {len(test_data)}")

Training size: 14524, Test size: 1614


In [13]:
# Directory that holds the saved model and tokenizer files
model_dir = "./xlm_roberta_model"

# Restore the classifier and its matching tokenizer from that directory
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Put the model into evaluation mode; eval() returns the module, so leaving it
# as the last expression also displays the architecture below the cell.
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [14]:
# Wrap the held-out pandas split in a Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_data)


def tokenize_function(examples):
    """Tokenize one batch of dataset rows.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # Padding is deliberately not applied here. With padding=True every example
    # would be padded to the longest text in its whole map batch, inflating the
    # stored dataset and the work done per evaluation batch. The Trainer is
    # constructed with this tokenizer and, per the Transformers documentation,
    # then pads each evaluation batch dynamically to that batch's own longest
    # sequence.
    return tokenizer(examples["text"], truncation=True, max_length=512)


tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1614/1614 [00:01<00:00, 1103.15 examples/s]


In [15]:
# Evaluation-only configuration: training is disabled and the Trainer is used
# here just to run batched prediction.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,  # Adjust batch size as per your hardware
    logging_dir="./logs",
    do_train=False,
    do_eval=True,
)


# Initialize Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a FutureWarning when
# the Trainer is constructed.
# NOTE(review): `processing_class` needs a transformers version that already
# knows the keyword (presumably >= 4.46) — confirm the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [16]:
from sklearn.metrics import classification_report

# Run the model over the tokenized held-out split
predictions = trainer.predict(tokenized_test_dataset)

# For every example, pick the class whose score is largest along the last axis
preds = np.argmax(predictions.predictions, axis=-1)

# Reference labels are read from the pandas split; this relies on
# tokenized_test_dataset keeping the same row order as test_data.
true_labels = test_data["label"].values

# Build the per-class precision / recall / F1 summary, then print it
report_text = classification_report(true_labels, preds)
print(report_text)

100%|██████████| 101/101 [00:30<00:00,  3.29it/s]

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       675
           1       0.91      0.88      0.89       939

    accuracy                           0.88      1614
   macro avg       0.87      0.88      0.88      1614
weighted avg       0.88      0.88      0.88      1614




