In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_path = "./orientation/orientation-tr-train.tsv"
# The file is tab-separated, one speech per row
data = pd.read_csv(dataset_path, sep="\t")

# Display basic information
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would append a stray "None").
data.info()

        id                           speaker sex  \
0  tr00000  ca2031caa4032c51980160359953d507   M   
1  tr00001  4cee0addb3c69f6866869b180f90d45f   M   
2  tr00002  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr00003  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr00004  be82a4ade406ec6774a0a2e38f6957e3   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  24’üncü Yasama Dönemimizin tüm milletvekilleri...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      1  
1  Mr. President, members of lawmakers, as I spea...      1  
2  Mr. President, I'm here to share with you the ...      1  
3  Mr. President, under the principles determined...      1  
4  Mr. President, dear lawmakers, I ask 

In [3]:
# Keep only rows that have both an English translation and a label
required_columns = ['text_en', 'label']
data = data.dropna(subset=required_columns)

# Show how many speeches fall into each class
label_counts = data['label'].value_counts()
print(label_counts)


label
1    9390
0    6748
Name: count, dtype: int64


In [4]:
# Hold out 10% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed random_state makes the
# split repeatable.
split_options = {
    "test_size": 0.1,
    "stratify": data['label'],
    "random_state": 42,
}
train_data, test_data = train_test_split(data, **split_options)
print(f"Training size: {len(train_data)}, Test size: {len(test_data)}")

Training size: 14524, Test size: 1614


In [5]:
model_name = "xlm-roberta-base"
# Load the tokenizer that belongs to the checkpoint named above
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the English translations of a batch of examples.

    Args:
        examples: mapping with a "text_en" entry holding the text(s) to encode.

    Returns:
        The tokenizer output for those texts, produced with
        padding="max_length" and truncation enabled.
    """
    encoded = tokenizer(
        examples["text_en"],
        truncation=True,
        padding="max_length",
    )
    return encoded




In [6]:
# Columns dropped once tokenization is done: the raw English text and the
# "__index_level_0__" column (presumably the pandas index carried over by
# from_pandas -- confirm against the datasets version in use).
columns_to_drop = ["text_en", "__index_level_0__"]

# Wrap each pandas split in a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenize in batches, then discard the columns listed above
tokenized_train = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(columns_to_drop)
)
tokenized_test = (
    test_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(columns_to_drop)
)

# Have both splits return PyTorch tensors
for split in (tokenized_train, tokenized_test):
    split.set_format("torch")


Map: 100%|██████████| 14524/14524 [00:14<00:00, 1001.16 examples/s]
Map: 100%|██████████| 1614/1614 [00:01<00:00, 1132.44 examples/s]


In [10]:

# Inspect the tokenized test split (prints its feature names and row count).
# NOTE(review): the recorded output still lists 'text_en' and this cell's
# execution count is out of sequence, so it looks like it ran against a
# different kernel state -- re-check after Restart & Run All.
print(tokenized_test)

Dataset({
    features: ['id', 'speaker', 'sex', 'text_en', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1614
})


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a two-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Relative paths (resolved against the working directory) instead of absolute
# paths into one user's home directory, so the notebook runs on any machine.
output_dir = "./results"
log = "./logs"
training_args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=log,
    # Log the training loss every 50 optimizer steps
    logging_steps=50,
    # Keep at most two checkpoints on disk
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)
trainer.train()


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.den

{'loss': 0.6514, 'learning_rate': 2.944933920704846e-05, 'epoch': 0.06}


  4%|▎         | 100/2724 [44:14<19:13:49, 26.38s/it]

{'loss': 0.6637, 'learning_rate': 2.889867841409692e-05, 'epoch': 0.11}


  6%|▌         | 150/2724 [1:06:15<18:51:39, 26.38s/it]

{'loss': 0.653, 'learning_rate': 2.8348017621145374e-05, 'epoch': 0.17}


  7%|▋         | 200/2724 [1:28:15<18:30:14, 26.39s/it]

{'loss': 0.5978, 'learning_rate': 2.7797356828193832e-05, 'epoch': 0.22}


  9%|▉         | 250/2724 [1:50:14<18:07:50, 26.38s/it]

{'loss': 0.5442, 'learning_rate': 2.724669603524229e-05, 'epoch': 0.28}


 11%|█         | 300/2724 [2:12:14<17:46:14, 26.39s/it]

{'loss': 0.5446, 'learning_rate': 2.669603524229075e-05, 'epoch': 0.33}


 13%|█▎        | 350/2724 [2:34:14<17:24:02, 26.39s/it]

{'loss': 0.4874, 'learning_rate': 2.614537444933921e-05, 'epoch': 0.39}


 15%|█▍        | 400/2724 [2:56:12<17:02:36, 26.40s/it]

{'loss': 0.4971, 'learning_rate': 2.5594713656387664e-05, 'epoch': 0.44}


 17%|█▋        | 450/2724 [3:18:12<16:41:35, 26.43s/it]

{'loss': 0.494, 'learning_rate': 2.5044052863436125e-05, 'epoch': 0.5}


 18%|█▊        | 500/2724 [3:40:11<16:21:19, 26.47s/it]

{'loss': 0.4727, 'learning_rate': 2.4493392070484583e-05, 'epoch': 0.55}


 20%|██        | 550/2724 [4:02:20<15:55:44, 26.38s/it]

{'loss': 0.449, 'learning_rate': 2.394273127753304e-05, 'epoch': 0.61}


 22%|██▏       | 600/2724 [4:24:20<15:34:55, 26.41s/it]

{'loss': 0.4557, 'learning_rate': 2.33920704845815e-05, 'epoch': 0.66}


 24%|██▍       | 650/2724 [4:46:21<15:11:31, 26.37s/it]

{'loss': 0.3773, 'learning_rate': 2.2841409691629956e-05, 'epoch': 0.72}


 26%|██▌       | 700/2724 [5:08:22<14:52:28, 26.46s/it]

{'loss': 0.4437, 'learning_rate': 2.2290748898678414e-05, 'epoch': 0.77}


 28%|██▊       | 750/2724 [5:30:22<14:28:29, 26.40s/it]

{'loss': 0.4169, 'learning_rate': 2.1740088105726872e-05, 'epoch': 0.83}


 29%|██▉       | 800/2724 [5:52:21<14:05:22, 26.36s/it]

{'loss': 0.4249, 'learning_rate': 2.1189427312775333e-05, 'epoch': 0.88}


 31%|███       | 850/2724 [6:14:22<13:46:33, 26.46s/it]

{'loss': 0.4556, 'learning_rate': 2.063876651982379e-05, 'epoch': 0.94}


 33%|███▎      | 900/2724 [6:36:25<13:27:20, 26.56s/it]

{'loss': 0.3931, 'learning_rate': 2.0088105726872246e-05, 'epoch': 0.99}


                                                       
 33%|███▎      | 908/2724 [6:47:47<12:43:15, 25.22s/it]

{'eval_loss': 0.5057454109191895, 'eval_runtime': 476.8042, 'eval_samples_per_second': 3.385, 'eval_steps_per_second': 0.212, 'epoch': 1.0}


 35%|███▍      | 950/2724 [7:06:36<13:01:14, 26.42s/it] 

{'loss': 0.3755, 'learning_rate': 1.9537444933920703e-05, 'epoch': 1.05}


 37%|███▋      | 1000/2724 [7:28:36<12:33:20, 26.22s/it]

{'loss': 0.332, 'learning_rate': 1.8986784140969165e-05, 'epoch': 1.1}


 39%|███▊      | 1050/2724 [7:50:29<12:13:50, 26.30s/it]

{'loss': 0.4285, 'learning_rate': 1.8436123348017622e-05, 'epoch': 1.16}


 40%|████      | 1100/2724 [8:12:24<11:50:36, 26.25s/it]

{'loss': 0.3474, 'learning_rate': 1.788546255506608e-05, 'epoch': 1.21}


 42%|████▏     | 1150/2724 [8:34:14<11:29:23, 26.28s/it]

{'loss': 0.3056, 'learning_rate': 1.7334801762114538e-05, 'epoch': 1.27}


 44%|████▍     | 1200/2724 [8:56:08<11:07:05, 26.26s/it]

{'loss': 0.3398, 'learning_rate': 1.6784140969162996e-05, 'epoch': 1.32}


 46%|████▌     | 1250/2724 [9:18:02<10:46:38, 26.32s/it]

{'loss': 0.2935, 'learning_rate': 1.6233480176211454e-05, 'epoch': 1.38}


 48%|████▊     | 1300/2724 [9:39:56<10:23:07, 26.25s/it]

{'loss': 0.3105, 'learning_rate': 1.5682819383259912e-05, 'epoch': 1.43}


 50%|████▉     | 1350/2724 [10:01:51<10:02:00, 26.29s/it]

{'loss': 0.387, 'learning_rate': 1.5132158590308371e-05, 'epoch': 1.49}


 51%|█████▏    | 1400/2724 [10:23:48<9:40:24, 26.30s/it] 

{'loss': 0.3258, 'learning_rate': 1.458149779735683e-05, 'epoch': 1.54}


 53%|█████▎    | 1450/2724 [10:45:39<9:17:55, 26.28s/it]

{'loss': 0.2816, 'learning_rate': 1.4030837004405287e-05, 'epoch': 1.6}


 55%|█████▌    | 1500/2724 [11:07:32<8:56:14, 26.29s/it]

{'loss': 0.3673, 'learning_rate': 1.3480176211453745e-05, 'epoch': 1.65}


 57%|█████▋    | 1550/2724 [11:29:24<8:31:52, 26.16s/it]

{'loss': 0.3191, 'learning_rate': 1.2929515418502203e-05, 'epoch': 1.71}


 59%|█████▊    | 1600/2724 [11:51:15<8:11:55, 26.26s/it]

{'loss': 0.3596, 'learning_rate': 1.237885462555066e-05, 'epoch': 1.76}


 61%|██████    | 1650/2724 [12:13:07<7:49:24, 26.22s/it]

{'loss': 0.3099, 'learning_rate': 1.182819383259912e-05, 'epoch': 1.82}


 62%|██████▏   | 1700/2724 [12:34:58<7:27:34, 26.23s/it]

{'loss': 0.2852, 'learning_rate': 1.1277533039647576e-05, 'epoch': 1.87}


 64%|██████▍   | 1750/2724 [12:56:51<7:06:22, 26.26s/it]

{'loss': 0.2821, 'learning_rate': 1.0726872246696036e-05, 'epoch': 1.93}


 66%|██████▌   | 1800/2724 [13:18:45<6:45:00, 26.30s/it]

{'loss': 0.2863, 'learning_rate': 1.0176211453744494e-05, 'epoch': 1.98}


                                                        
 67%|██████▋   | 1816/2724 [13:33:34<6:17:27, 24.94s/it]

{'eval_loss': 0.34190812706947327, 'eval_runtime': 473.9843, 'eval_samples_per_second': 3.405, 'eval_steps_per_second': 0.213, 'epoch': 2.0}


 68%|██████▊   | 1850/2724 [13:48:43<6:21:38, 26.20s/it]  

{'loss': 0.2616, 'learning_rate': 9.625550660792952e-06, 'epoch': 2.04}


 70%|██████▉   | 1900/2724 [14:10:38<6:00:08, 26.22s/it]

{'loss': 0.2581, 'learning_rate': 9.074889867841411e-06, 'epoch': 2.09}


 72%|███████▏  | 1950/2724 [14:32:33<5:39:41, 26.33s/it]

{'loss': 0.1823, 'learning_rate': 8.524229074889867e-06, 'epoch': 2.15}


 73%|███████▎  | 2000/2724 [14:54:26<5:16:40, 26.24s/it]

{'loss': 0.2275, 'learning_rate': 7.973568281938327e-06, 'epoch': 2.2}


 75%|███████▌  | 2050/2724 [15:16:29<4:55:20, 26.29s/it]

{'loss': 0.2476, 'learning_rate': 7.422907488986785e-06, 'epoch': 2.26}


 77%|███████▋  | 2100/2724 [15:38:24<4:32:39, 26.22s/it]

{'loss': 0.2248, 'learning_rate': 6.8722466960352425e-06, 'epoch': 2.31}


 79%|███████▉  | 2150/2724 [16:00:18<4:11:07, 26.25s/it]

{'loss': 0.2244, 'learning_rate': 6.3215859030837e-06, 'epoch': 2.37}


 81%|████████  | 2200/2724 [16:22:13<3:49:37, 26.29s/it]

{'loss': 0.2052, 'learning_rate': 5.770925110132158e-06, 'epoch': 2.42}


 83%|████████▎ | 2250/2724 [16:44:07<3:27:31, 26.27s/it]

{'loss': 0.2597, 'learning_rate': 5.220264317180617e-06, 'epoch': 2.48}


 84%|████████▍ | 2300/2724 [17:06:00<3:05:48, 26.29s/it]

{'loss': 0.2073, 'learning_rate': 4.669603524229075e-06, 'epoch': 2.53}


 86%|████████▋ | 2350/2724 [17:28:06<2:43:58, 26.31s/it]

{'loss': 0.2462, 'learning_rate': 4.1189427312775335e-06, 'epoch': 2.59}


 88%|████████▊ | 2400/2724 [17:49:59<2:21:42, 26.24s/it]

{'loss': 0.2367, 'learning_rate': 3.568281938325991e-06, 'epoch': 2.64}


 90%|████████▉ | 2450/2724 [18:12:01<2:00:43, 26.44s/it]

{'loss': 0.2285, 'learning_rate': 3.0176211453744496e-06, 'epoch': 2.7}


 92%|█████████▏| 2500/2724 [18:34:00<1:35:41, 25.63s/it]

{'loss': 0.2115, 'learning_rate': 2.4669603524229075e-06, 'epoch': 2.75}


 94%|█████████▎| 2550/2724 [18:56:06<1:16:16, 26.30s/it]

{'loss': 0.1982, 'learning_rate': 1.9162995594713658e-06, 'epoch': 2.81}


 95%|█████████▌| 2600/2724 [19:17:47<52:48, 25.55s/it]  

{'loss': 0.2647, 'learning_rate': 1.3656387665198238e-06, 'epoch': 2.86}


 97%|█████████▋| 2650/2724 [19:39:29<32:17, 26.18s/it]

{'loss': 0.2218, 'learning_rate': 8.14977973568282e-07, 'epoch': 2.92}


 99%|█████████▉| 2700/2724 [20:01:37<10:41, 26.73s/it]

{'loss': 0.2203, 'learning_rate': 2.643171806167401e-07, 'epoch': 2.97}


                                                      
100%|██████████| 2724/2724 [20:20:21<00:00, 25.34s/it]

{'eval_loss': 0.3190705180168152, 'eval_runtime': 481.8603, 'eval_samples_per_second': 3.35, 'eval_steps_per_second': 0.21, 'epoch': 3.0}


100%|██████████| 2724/2724 [20:20:43<00:00, 25.34s/it]

{'train_runtime': 73243.8627, 'train_samples_per_second': 0.595, 'train_steps_per_second': 0.037, 'train_loss': 0.3518685877935827, 'epoch': 3.0}


100%|██████████| 2724/2724 [20:20:44<00:00, 26.89s/it]


TrainOutput(global_step=2724, training_loss=0.3518685877935827, metrics={'train_runtime': 73243.8627, 'train_samples_per_second': 0.595, 'train_steps_per_second': 0.037, 'train_loss': 0.3518685877935827, 'epoch': 3.0})

In [8]:
# Write the model and the tokenizer files into the same directory
save_dir = "./xlm_roberta_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./xlm_roberta_model\\tokenizer_config.json',
 './xlm_roberta_model\\special_tokens_map.json',
 './xlm_roberta_model\\tokenizer.json')

In [9]:
from sklearn.metrics import classification_report

# Run the trainer's model over the tokenized test split
predictions = trainer.predict(tokenized_test)

# Predicted class = index of the largest score along the last axis
logits = predictions.predictions
preds = np.argmax(logits, axis=-1)

# Compare against the labels of the pandas test split and print the report
labels = test_data['label'].values
report = classification_report(labels, preds)
print(report)


100%|██████████| 101/101 [07:58<00:00,  4.74s/it]

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       675
           1       0.91      0.88      0.89       939

    accuracy                           0.88      1614
   macro avg       0.87      0.88      0.88      1614
weighted avg       0.88      0.88      0.88      1614






In [2]:
import transformers
import torch
import huggingface_hub



import os

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Read the Hugging Face access token from the environment instead of pasting
# it into the notebook. When HF_TOKEN is unset this is None and the library
# falls back to any locally stored login.
hf_token = os.environ.get("HF_TOKEN")

# trust_remote_code is intentionally not enabled: this checkpoint uses an
# architecture shipped with transformers, so no repository code needs to run.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
    token=hf_token,
)

# Instruction given to the model as the system message. Built from adjacent
# string literals so no source-code indentation leaks into the prompt.
SYSTEM_PROMPT = (
    "Assume you are a politician who will be asked to classify the ideology of the "
    "speaker's party from the Turkish parliament. The speech will be given as text. "
    "In other words, this involves performing binary classification to determine whether "
    "the speaker's party leans left (0) or right (1). You are not allowed to answer "
    "anything else besides the number 1 or 0."
)


def classification_pipeline(speech):
    """Ask the Llama model whether a speech comes from a left or right party.

    Args:
        speech: text of the parliamentary speech to classify.

    Returns:
        The text generated by the model after the prompt. The system prompt
        asks for "0" (left) or "1" (right), but the output is returned as-is
        and is not validated.
    """
    # Use the standard chat roles: the instruction goes in the "system"
    # message and the speech in the "user" message.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": speech},
    ]

    # Render the conversation to a prompt string that ends with the
    # assistant header, so the model continues with its answer.
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop on either the tokenizer's EOS token or the end-of-turn token
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=100,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.01,
        top_p=0.9,
    )

    # generated_text contains prompt + completion; return only the completion
    return outputs[0]["generated_text"][len(prompt):]



In [None]:
# Iterate the pandas test split: `tokenized_test` is a Hugging Face Dataset
# (it has no `iterrows`), and its "text_en" column was removed after
# tokenization, so the raw speeches are only available in `test_data`.
for speech in test_data["text_en"]:
    print(classification_pipeline(speech))