In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import classification_report
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import accelerate

In [7]:
orientation_file_path = 'trainingset-ideology-power/orientation/orientation-tr-train.tsv'
power_file_path = 'trainingset-ideology-power/power/power-tr-train.tsv'

orientation_data = pd.read_csv(orientation_file_path, sep='\t')
power_data = pd.read_csv(power_file_path, sep='\t')

# Split the dataset as: 90% training, 10% testing
orientation_train_data, orientation_test_data = train_test_split(orientation_data, test_size=0.1, stratify=orientation_data['label'], random_state=42)
power_train_data, power_test_data = train_test_split(power_data, test_size=0.1, stratify=power_data['label'], random_state=42)

orientation_train_data.to_csv('turkey_train.tsv', sep='\t', index=False)
orientation_test_data.to_csv('turkey_test.tsv', sep='\t', index=False)
power_train_data.to_csv('power_train.tsv', sep='\t', index=False)
power_test_data.to_csv('power_test.tsv', sep='\t', index=False)



Orientation:
Training set size: 14524 , testing set size: 1614
Power:
Training set size: 15645 , testing set size: 1739


In [10]:
# Statistics About Dataset
print("Orientation:")
print("Training set size:", len(orientation_train_data), ", orientation testing set size:", len(orientation_test_data))
print("Power:")
print("Training set size:", len(power_train_data), ", testing set size:", len(power_test_data))

print("\nOrientation training label distribution:")
print(orientation_train_data['label'].value_counts(normalize=True))

print("\nOrientation testing label distribution:")
print(orientation_test_data['label'].value_counts(normalize=True))

print("\nPower training label distribution:")
print(power_train_data['label'].value_counts(normalize=True))

print("\nPower testing label distribution:")
print(power_test_data['label'].value_counts(normalize=True))

Orientation:
Training set size: 14524 , rientation testing set size: 1614
Power:
Training set size: 15645 , testing set size: 1739

Orientation training label distribution:
label
1    0.581865
0    0.418135
Name: proportion, dtype: float64

Orientation testing label distribution:
label
1    0.581784
0    0.418216
Name: proportion, dtype: float64

Power training label distribution:
label
1    0.513774
0    0.486226
Name: proportion, dtype: float64

Power testing label distribution:
label
1    0.514089
0    0.485911
Name: proportion, dtype: float64


In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_data(data, text_column):
    return tokenizer(
        data[text_column].tolist(),
        padding=True, 
        truncation=True, 
        max_length=128, 
        return_tensors="pt" 
    )

orientation_train_encodings = tokenize_data(orientation_train_data, "text")
orientation_test_encodings = tokenize_data(orientation_test_data, "text")

power_train_encodings = tokenize_data(power_train_data, "text_en")
power_test_encodings = tokenize_data(power_test_data, "text_en")

print("Tokenization complete")

Tokenization complete


In [13]:
orientation_train_labels = torch.tensor(orientation_train_data['label'].tolist())
orientation_test_labels = torch.tensor(orientation_test_data['label'].tolist())

power_train_labels = torch.tensor(power_train_data['label'].tolist())
power_test_labels = torch.tensor(power_test_data['label'].tolist())

class ParliamentaryDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Create datasets
orientation_train_dataset = ParliamentaryDataset(orientation_train_encodings, orientation_train_labels)
orientation_test_dataset = ParliamentaryDataset(orientation_test_encodings, orientation_test_labels)

power_train_dataset = ParliamentaryDataset(power_train_encodings, power_train_labels)
power_test_dataset = ParliamentaryDataset(power_test_encodings, power_test_labels)
print("Datasets are ready")

Datasets are ready


In [11]:
orientation_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2  # Binary classification (left vs right)
)

power_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2
)

print("Model loaded!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded!


In [14]:
training_args = TrainingArguments(
    output_dir="./results", 
    eval_strategy="epoch", 
    save_strategy="epoch", 
    learning_rate=5e-5, 
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16, 
    num_train_epochs=3, 
    weight_decay=0.01, 
    logging_dir="./logs", 
    logging_steps=10, 
    save_total_limit=2, 
    load_best_model_at_end=True, 
    fp16=True, 
    gradient_accumulation_steps=4
)
print("Training arguments defined")

Training arguments defined


In [14]:
orientation_trainer = Trainer(
    model=orientation_model,
    args=training_args,
    train_dataset=orientation_train_dataset,
    eval_dataset=orientation_test_dataset,
    processing_class=tokenizer 
)

power_trainer = Trainer(
    model=power_model,
    args=training_args,
    train_dataset=power_train_dataset,
    eval_dataset=power_test_dataset,
    processing_class=tokenizer
)

print("Starting training...")
orientation_trainer.train()
power_trainer.train()
print("Training complete!")

Starting training...


Epoch,Training Loss,Validation Loss
1,2.274,0.494472
2,1.7017,0.461051
3,1.2109,0.510811


Epoch,Training Loss,Validation Loss
0,1.8957,0.427344
1,1.4514,0.447493
2,0.867,0.500938


Training complete!


In [15]:
# Evaluate the model
results = orientation_trainer.evaluate()
print("Orientation evaluation results:", results)
results = power_trainer.evaluate()
print("Power evaluation results:", results)

Orientation evaluation results: {'eval_loss': 0.4610511064529419, 'eval_runtime': 1.6013, 'eval_samples_per_second': 1007.921, 'eval_steps_per_second': 63.073, 'epoch': 3.0}


Power evaluation results: {'eval_loss': 0.42734360694885254, 'eval_runtime': 1.4806, 'eval_samples_per_second': 1174.512, 'eval_steps_per_second': 73.618, 'epoch': 2.997955010224949}


In [16]:
# Get model predictions on the test set
orientation_predictions = orientation_trainer.predict(orientation_test_dataset)
power_predictions = power_trainer.predict(power_test_dataset)

# Extract predicted labels
orientation_preds = np.argmax(orientation_predictions.predictions, axis=1)
power_preds = np.argmax(power_predictions.predictions, axis=1)

print("Orientation Classification Report:")
print(classification_report(orientation_test_labels, orientation_preds, target_names=["Left", "Right"]))
print("Power Classification Report:")
print(classification_report(power_test_labels, power_preds, target_names=["Governing", "Opposition"]))

orientation_model.save_pretrained("./fine_tuned_model")
power_model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Models and tokenizer saved!")

Orientation Classification Report:
              precision    recall  f1-score   support

        Left       0.73      0.79      0.76       675
       Right       0.84      0.79      0.81       939

    accuracy                           0.79      1614
   macro avg       0.78      0.79      0.78      1614
weighted avg       0.79      0.79      0.79      1614

Power Classification Report:
              precision    recall  f1-score   support

   Governing       0.76      0.84      0.80       845
  Opposition       0.84      0.75      0.79       894

    accuracy                           0.79      1739
   macro avg       0.80      0.80      0.79      1739
weighted avg       0.80      0.79      0.79      1739

Models and tokenizer saved!


In [3]:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
causal_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype = torch.float16,
    device=0
    #device_map="auto"
)

Device set to use cuda:0


In [4]:
orientation_file_path = 'trainingset-ideology-power/orientation/orientation-tr-train.tsv'
power_file_path = 'trainingset-ideology-power/power/power-tr-train.tsv'

orientation_data = pd.read_csv(orientation_file_path, sep='\t')
power_data = pd.read_csv(power_file_path, sep='\t')

orientation_data.to_csv('orientation.tsv', sep='\t', index=False)
power_data.to_csv('power.tsv', sep='\t', index=False)
print("Orientation set size:", len(orientation_data), "Power set size:", len(power_data))

orientation_tr_data = orientation_data['text']
orientation_en_data = orientation_data['text_en']
power_tr_data = power_data['text']
power_en_data = power_data['text_en']

orientation_labels = torch.tensor(orientation_data['label'].tolist())
power_labels = torch.tensor(power_data['label'].tolist())

def classify_text_orientation(text):
    prompt = (
        f"The following is a parliamentary speech: '{text}'\n"
        "Classify the speech as Political Left (0) or Political Right (1). Answer with 0 for Political Left or 1 for Political Right: "
    )
    result = causal_pipeline(prompt, max_new_tokens=1)
    return result[0]['generated_text'][-1]

def classify_data_orientation(data):
    preds = [classify_text_orientation(row) for row in data]
    return preds

def classify_text_power(text):
    prompt = (
        f"The following is a parliamentary speech: '{text}'\n"
        "Classify the speech as Governing Party (0) or Opposing Party (1). Answer with 0 for Governing Party or 1 for Opposing Party: "
    )
    result = causal_pipeline(prompt, max_new_tokens=1)
    return result[0]['generated_text'][-1]

def classify_data_power(data):
    preds = [classify_text_power(row) for row in data]
    return preds


Orientation set size: 16138 Power set size: 17384


In [7]:
orientation_tr_preds = classify_data_orientation(orientation_tr_data[:4000])
orientation_tr_preds = [int(pred) for pred in orientation_tr_preds]
print("TR Orientation Classification Report:")
print(classification_report(orientation_labels[:4000], orientation_tr_preds, 
                            target_names=["Left", "Right"]))

TR Orientation Classification Report:
              precision    recall  f1-score   support

        Left       0.39      0.82      0.53      1562
       Right       0.61      0.18      0.27      2438

    accuracy                           0.43      4000
   macro avg       0.50      0.50      0.40      4000
weighted avg       0.52      0.43      0.37      4000



In [8]:
orientation_en_preds = classify_data_orientation(orientation_en_data[:4000])
orientation_en_preds = [int(pred) for pred in orientation_en_preds]
print("EN Orientation Classification Report:")
print(classification_report(orientation_labels[:4000], orientation_en_preds, 
                            target_names=["Left", "Right"]))

EN Orientation Classification Report:
              precision    recall  f1-score   support

        Left       0.39      0.98      0.56      1562
       Right       0.50      0.01      0.02      2438

    accuracy                           0.39      4000
   macro avg       0.44      0.50      0.29      4000
weighted avg       0.46      0.39      0.23      4000



In [9]:
power_tr_preds = classify_data_power(power_tr_data[:4000])
power_tr_preds = [int(pred) for pred in power_tr_preds]
print("TR Power Classification Report:")
print(classification_report(power_labels[:4000], power_tr_preds, 
                            target_names=["Governing", "Opposition"]))

TR Power Classification Report:
              precision    recall  f1-score   support

   Governing       0.49      0.84      0.62      1961
  Opposition       0.53      0.17      0.25      2039

    accuracy                           0.50      4000
   macro avg       0.51      0.51      0.44      4000
weighted avg       0.51      0.50      0.43      4000



In [10]:
power_en_preds = classify_data_power(power_en_data[:4000])
power_en_preds = [int(pred) for pred in power_en_preds]
print("EN Power Classification Report:")
print(classification_report(power_labels[:4000], power_en_preds, 
                            target_names=["Governing", "Opposition"]))

EN Power Classification Report:
              precision    recall  f1-score   support

   Governing       0.48      0.85      0.61      1961
  Opposition       0.45      0.12      0.19      2039

    accuracy                           0.48      4000
   macro avg       0.47      0.48      0.40      4000
weighted avg       0.47      0.48      0.40      4000

