In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch

In [14]:
# Load the tab-separated training file into a DataFrame
p_dataset_path = "./power/power-tr-train.tsv"
p_data = pd.read_csv(p_dataset_path, sep="\t")

# Display basic information
print(p_data.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
p_data.info()

        id                           speaker sex  \
0  tr18146  ca2031caa4032c51980160359953d507   M   
1  tr18147  4cee0addb3c69f6866869b180f90d45f   M   
2  tr18148  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr18149  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr18150  fcc61122f3553c57ae207adeb1a1af84   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  Usul tartışmasında 2 kişi lehte 2 kişi aleyhte...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      0  
1  Mr. President, members of lawmakers, as I spea...      0  
2  Mr. President, I'm here to share with you the ...      0  
3  Mr. President, under the principles determined...      0  
4  Two in favour of two in the legal deb

In [15]:
# Keep only the rows where both the speech text and its label are present
# (the `text_en` translation column is not checked at this point)
has_required_fields = p_data[['text', 'label']].notna().all(axis=1)
p_data = p_data[has_required_fields]

# Show how many examples fall into each class
label_counts = p_data['label'].value_counts()
print(label_counts)

label
1    8932
0    8452
Name: count, dtype: int64


In [16]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as a test set; the split is stratified on the label
# and uses a fixed random_state so it is reproducible.
p_splits = train_test_split(
    p_data,
    test_size=0.1,
    random_state=42,
    stratify=p_data['label'],
)
p_train_data, p_test_data = p_splits
print(f"Training size: {len(p_train_data)}, Test size: {len(p_test_data)}")

Training size: 15645, Test size: 1739


In [23]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are padded with ``padding="max_length"`` and truncated with
    ``truncation=True``, so every example comes back with the same length.
    """
    encoded_batch = tokenizer(examples["text"], padding="max_length", truncation=True)
    return encoded_batch


# Wrap the pandas splits as Hugging Face datasets
p_train_dataset = Dataset.from_pandas(p_train_data)
p_test_dataset = Dataset.from_pandas(p_test_data)

# Columns dropped after tokenization: the raw text and the pandas index
# column that `Dataset.from_pandas` carried over.
columns_to_drop = ["text", "__index_level_0__"]

# Tokenize each split in batches, then discard the columns listed above
tokenized_train_p = (
    p_train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(columns_to_drop)
)
tokenized_test_p = (
    p_test_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(columns_to_drop)
)

# Have both datasets return PyTorch tensors
tokenized_train_p.set_format("torch")
tokenized_test_p.set_format("torch")

Map: 100%|██████████| 15645/15645 [00:13<00:00, 1144.76 examples/s]
Map: 100%|██████████| 1739/1739 [00:01<00:00, 1264.91 examples/s]


In [24]:
print(tokenized_train_p)

Dataset({
    features: ['id', 'speaker', 'sex', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 15645
})


In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained encoder with a freshly initialised 2-class classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Write checkpoints and logs relative to the working directory (like the
# final model export) instead of a user-specific absolute Windows path, so
# the notebook runs unchanged on other machines.
output_dir = "./results"
log = "./logs"

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and deprecate `Trainer(tokenizer=...)` in favour of
# `processing_class=...` -- confirm against the installed version before
# switching.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",   # evaluate on eval_dataset after every epoch
    save_strategy="epoch",         # checkpoint at the same cadence
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=log,
    logging_steps=50,
    save_total_limit=2,            # keep at most two checkpoints on disk
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_p,
    eval_dataset=tokenized_test_p,
    tokenizer=tokenizer,
)
trainer.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  2%|▏         | 50/2934 [05:24<5:34:35,  6.96s/it]

{'loss': 0.6702, 'grad_norm': 5.592130184173584, 'learning_rate': 2.948875255623722e-05, 'epoch': 0.05}


  3%|▎         | 100/2934 [11:12<5:32:40,  7.04s/it]

{'loss': 0.554, 'grad_norm': 10.038786888122559, 'learning_rate': 2.8977505112474437e-05, 'epoch': 0.1}


  5%|▌         | 150/2934 [17:03<5:31:28,  7.14s/it]

{'loss': 0.4963, 'grad_norm': 7.311159610748291, 'learning_rate': 2.8466257668711656e-05, 'epoch': 0.15}


  7%|▋         | 200/2934 [23:04<5:23:12,  7.09s/it]

{'loss': 0.5499, 'grad_norm': 8.739221572875977, 'learning_rate': 2.7955010224948877e-05, 'epoch': 0.2}


  9%|▊         | 250/2934 [28:53<5:06:57,  6.86s/it]

{'loss': 0.4433, 'grad_norm': 15.393159866333008, 'learning_rate': 2.7443762781186092e-05, 'epoch': 0.26}


 10%|█         | 300/2934 [34:38<5:02:30,  6.89s/it]

{'loss': 0.405, 'grad_norm': 16.861812591552734, 'learning_rate': 2.6932515337423314e-05, 'epoch': 0.31}


 12%|█▏        | 350/2934 [40:23<4:57:41,  6.91s/it]

{'loss': 0.4208, 'grad_norm': 6.214046955108643, 'learning_rate': 2.6421267893660532e-05, 'epoch': 0.36}


 14%|█▎        | 400/2934 [46:08<4:48:23,  6.83s/it]

{'loss': 0.4149, 'grad_norm': 5.763175964355469, 'learning_rate': 2.591002044989775e-05, 'epoch': 0.41}


 15%|█▌        | 450/2934 [52:03<4:53:20,  7.09s/it]

{'loss': 0.4263, 'grad_norm': 10.247779846191406, 'learning_rate': 2.539877300613497e-05, 'epoch': 0.46}


 17%|█▋        | 500/2934 [57:58<4:41:35,  6.94s/it]

{'loss': 0.3687, 'grad_norm': 12.904627799987793, 'learning_rate': 2.488752556237219e-05, 'epoch': 0.51}


 19%|█▊        | 550/2934 [1:03:51<4:43:44,  7.14s/it]

{'loss': 0.4202, 'grad_norm': 11.398418426513672, 'learning_rate': 2.4376278118609406e-05, 'epoch': 0.56}


 20%|██        | 600/2934 [1:09:43<4:35:02,  7.07s/it]

{'loss': 0.4123, 'grad_norm': 6.457627773284912, 'learning_rate': 2.3865030674846628e-05, 'epoch': 0.61}


 22%|██▏       | 650/2934 [1:15:31<4:25:28,  6.97s/it]

{'loss': 0.4003, 'grad_norm': 7.628976821899414, 'learning_rate': 2.3353783231083846e-05, 'epoch': 0.66}


 24%|██▍       | 700/2934 [1:21:18<4:16:02,  6.88s/it]

{'loss': 0.3812, 'grad_norm': 5.1510910987854, 'learning_rate': 2.2842535787321064e-05, 'epoch': 0.72}


 26%|██▌       | 750/2934 [1:27:07<4:08:55,  6.84s/it]

{'loss': 0.3425, 'grad_norm': 10.644410133361816, 'learning_rate': 2.2331288343558283e-05, 'epoch': 0.77}


 27%|██▋       | 800/2934 [1:32:53<4:08:20,  6.98s/it]

{'loss': 0.3661, 'grad_norm': 12.402985572814941, 'learning_rate': 2.18200408997955e-05, 'epoch': 0.82}


 29%|██▉       | 850/2934 [1:38:39<3:54:52,  6.76s/it]

{'loss': 0.3633, 'grad_norm': 10.18461799621582, 'learning_rate': 2.130879345603272e-05, 'epoch': 0.87}


 31%|███       | 900/2934 [1:44:26<3:55:37,  6.95s/it]

{'loss': 0.3647, 'grad_norm': 6.4427995681762695, 'learning_rate': 2.0797546012269938e-05, 'epoch': 0.92}


 32%|███▏      | 950/2934 [1:50:12<3:50:19,  6.97s/it]

{'loss': 0.304, 'grad_norm': 19.184078216552734, 'learning_rate': 2.028629856850716e-05, 'epoch': 0.97}


 33%|███▎      | 978/2934 [1:53:20<3:10:21,  5.84s/it]
 33%|███▎      | 978/2934 [1:54:39<3:10:21,  5.84s/it]

{'eval_loss': 0.367783784866333, 'eval_runtime': 78.612, 'eval_samples_per_second': 22.121, 'eval_steps_per_second': 1.387, 'epoch': 1.0}


 34%|███▍      | 1000/2934 [1:57:37<3:45:01,  6.98s/it]

{'loss': 0.3586, 'grad_norm': 12.657052993774414, 'learning_rate': 1.9775051124744374e-05, 'epoch': 1.02}


 36%|███▌      | 1050/2934 [2:03:26<3:41:07,  7.04s/it]

{'loss': 0.2921, 'grad_norm': 11.38012409210205, 'learning_rate': 1.9263803680981596e-05, 'epoch': 1.07}


 37%|███▋      | 1100/2934 [2:09:11<3:30:31,  6.89s/it]

{'loss': 0.2403, 'grad_norm': 4.62935209274292, 'learning_rate': 1.8752556237218814e-05, 'epoch': 1.12}


 39%|███▉      | 1150/2934 [2:14:57<3:24:54,  6.89s/it]

{'loss': 0.2907, 'grad_norm': 16.685230255126953, 'learning_rate': 1.8241308793456033e-05, 'epoch': 1.18}


 41%|████      | 1200/2934 [2:20:44<3:22:14,  7.00s/it]

{'loss': 0.2504, 'grad_norm': 11.056654930114746, 'learning_rate': 1.773006134969325e-05, 'epoch': 1.23}


 43%|████▎     | 1250/2934 [2:26:34<3:16:47,  7.01s/it]

{'loss': 0.2545, 'grad_norm': 11.11767864227295, 'learning_rate': 1.7218813905930473e-05, 'epoch': 1.28}


 44%|████▍     | 1300/2934 [2:32:23<3:09:24,  6.95s/it]

{'loss': 0.2691, 'grad_norm': 4.227703094482422, 'learning_rate': 1.6707566462167688e-05, 'epoch': 1.33}


 46%|████▌     | 1350/2934 [2:38:11<3:06:37,  7.07s/it]

{'loss': 0.2732, 'grad_norm': 12.931123733520508, 'learning_rate': 1.619631901840491e-05, 'epoch': 1.38}


 48%|████▊     | 1400/2934 [2:43:57<2:55:46,  6.88s/it]

{'loss': 0.2526, 'grad_norm': 8.294759750366211, 'learning_rate': 1.5685071574642128e-05, 'epoch': 1.43}


 49%|████▉     | 1450/2934 [2:49:45<2:54:59,  7.08s/it]

{'loss': 0.2863, 'grad_norm': 8.18204116821289, 'learning_rate': 1.5173824130879344e-05, 'epoch': 1.48}


 51%|█████     | 1500/2934 [2:55:33<2:46:21,  6.96s/it]

{'loss': 0.2747, 'grad_norm': 7.145063400268555, 'learning_rate': 1.4662576687116564e-05, 'epoch': 1.53}


 53%|█████▎    | 1550/2934 [3:01:20<2:37:20,  6.82s/it]

{'loss': 0.2741, 'grad_norm': 14.762751579284668, 'learning_rate': 1.4151329243353784e-05, 'epoch': 1.58}


 55%|█████▍    | 1600/2934 [3:07:07<2:34:07,  6.93s/it]

{'loss': 0.2886, 'grad_norm': 1.9343256950378418, 'learning_rate': 1.3640081799591003e-05, 'epoch': 1.64}


 56%|█████▌    | 1650/2934 [3:12:53<2:26:00,  6.82s/it]

{'loss': 0.2468, 'grad_norm': 9.682028770446777, 'learning_rate': 1.3128834355828221e-05, 'epoch': 1.69}


 58%|█████▊    | 1700/2934 [3:18:37<2:19:08,  6.77s/it]

{'loss': 0.2414, 'grad_norm': 8.62824821472168, 'learning_rate': 1.2617586912065441e-05, 'epoch': 1.74}


 60%|█████▉    | 1750/2934 [3:24:22<2:16:31,  6.92s/it]

{'loss': 0.274, 'grad_norm': 16.790782928466797, 'learning_rate': 1.210633946830266e-05, 'epoch': 1.79}


 61%|██████▏   | 1800/2934 [3:30:06<2:09:52,  6.87s/it]

{'loss': 0.2929, 'grad_norm': 26.221187591552734, 'learning_rate': 1.1595092024539878e-05, 'epoch': 1.84}


 63%|██████▎   | 1850/2934 [3:35:50<2:05:12,  6.93s/it]

{'loss': 0.3066, 'grad_norm': 8.270427703857422, 'learning_rate': 1.1083844580777098e-05, 'epoch': 1.89}


 65%|██████▍   | 1900/2934 [3:41:32<1:57:30,  6.82s/it]

{'loss': 0.2118, 'grad_norm': 3.5100669860839844, 'learning_rate': 1.0572597137014316e-05, 'epoch': 1.94}


 66%|██████▋   | 1950/2934 [3:47:17<1:51:48,  6.82s/it]

{'loss': 0.2368, 'grad_norm': 14.030057907104492, 'learning_rate': 1.0061349693251534e-05, 'epoch': 1.99}


 67%|██████▋   | 1956/2934 [3:47:53<1:32:43,  5.69s/it]
 67%|██████▋   | 1956/2934 [3:49:08<1:32:43,  5.69s/it]

{'eval_loss': 0.3140605092048645, 'eval_runtime': 75.2491, 'eval_samples_per_second': 23.11, 'eval_steps_per_second': 1.449, 'epoch': 2.0}


 68%|██████▊   | 2000/2934 [3:54:25<1:46:16,  6.83s/it]

{'loss': 0.204, 'grad_norm': 8.990534782409668, 'learning_rate': 9.550102249488754e-06, 'epoch': 2.04}


 70%|██████▉   | 2050/2934 [4:00:08<1:40:59,  6.85s/it]

{'loss': 0.1557, 'grad_norm': 12.580320358276367, 'learning_rate': 9.038854805725971e-06, 'epoch': 2.1}


 72%|███████▏  | 2100/2934 [4:05:51<1:34:19,  6.79s/it]

{'loss': 0.1844, 'grad_norm': 12.073935508728027, 'learning_rate': 8.52760736196319e-06, 'epoch': 2.15}


 73%|███████▎  | 2150/2934 [4:11:36<1:29:58,  6.89s/it]

{'loss': 0.1618, 'grad_norm': 8.442102432250977, 'learning_rate': 8.01635991820041e-06, 'epoch': 2.2}


 75%|███████▍  | 2200/2934 [4:17:12<1:23:13,  6.80s/it]

{'loss': 0.18, 'grad_norm': 23.259061813354492, 'learning_rate': 7.5051124744376285e-06, 'epoch': 2.25}


 77%|███████▋  | 2250/2934 [4:22:51<1:17:48,  6.83s/it]

{'loss': 0.1854, 'grad_norm': 3.674595832824707, 'learning_rate': 6.993865030674846e-06, 'epoch': 2.3}


 78%|███████▊  | 2300/2934 [4:28:29<1:10:51,  6.71s/it]

{'loss': 0.2, 'grad_norm': 22.701766967773438, 'learning_rate': 6.482617586912065e-06, 'epoch': 2.35}


 80%|████████  | 2350/2934 [4:34:12<1:07:09,  6.90s/it]

{'loss': 0.1771, 'grad_norm': 1.8274117708206177, 'learning_rate': 5.971370143149284e-06, 'epoch': 2.4}


 82%|████████▏ | 2400/2934 [4:39:59<1:01:30,  6.91s/it]

{'loss': 0.2027, 'grad_norm': 3.2407474517822266, 'learning_rate': 5.4601226993865036e-06, 'epoch': 2.45}


 84%|████████▎ | 2450/2934 [4:45:46<56:36,  7.02s/it]  

{'loss': 0.1382, 'grad_norm': 30.341142654418945, 'learning_rate': 4.948875255623722e-06, 'epoch': 2.51}


 85%|████████▌ | 2500/2934 [4:51:41<50:57,  7.04s/it]

{'loss': 0.1616, 'grad_norm': 12.912763595581055, 'learning_rate': 4.437627811860941e-06, 'epoch': 2.56}


 87%|████████▋ | 2550/2934 [4:57:32<46:17,  7.23s/it]

{'loss': 0.162, 'grad_norm': 0.6402721405029297, 'learning_rate': 3.92638036809816e-06, 'epoch': 2.61}


 89%|████████▊ | 2600/2934 [5:03:25<39:43,  7.14s/it]

{'loss': 0.164, 'grad_norm': 4.863226413726807, 'learning_rate': 3.415132924335378e-06, 'epoch': 2.66}


 90%|█████████ | 2650/2934 [5:09:22<33:31,  7.08s/it]

{'loss': 0.2015, 'grad_norm': 9.845714569091797, 'learning_rate': 2.9038854805725973e-06, 'epoch': 2.71}


 92%|█████████▏| 2700/2934 [5:15:18<28:25,  7.29s/it]

{'loss': 0.2026, 'grad_norm': 2.560894250869751, 'learning_rate': 2.392638036809816e-06, 'epoch': 2.76}


 94%|█████████▎| 2750/2934 [5:21:14<22:04,  7.20s/it]

{'loss': 0.1713, 'grad_norm': 16.745559692382812, 'learning_rate': 1.881390593047035e-06, 'epoch': 2.81}


 95%|█████████▌| 2800/2934 [5:27:13<15:52,  7.11s/it]

{'loss': 0.1991, 'grad_norm': 41.98064422607422, 'learning_rate': 1.3701431492842536e-06, 'epoch': 2.86}


 97%|█████████▋| 2850/2934 [5:33:08<09:59,  7.14s/it]

{'loss': 0.2, 'grad_norm': 18.784271240234375, 'learning_rate': 8.588957055214724e-07, 'epoch': 2.91}


 99%|█████████▉| 2900/2934 [5:39:01<03:58,  7.01s/it]

{'loss': 0.169, 'grad_norm': 4.15236234664917, 'learning_rate': 3.4764826175869123e-07, 'epoch': 2.97}


100%|██████████| 2934/2934 [5:42:57<00:00,  6.02s/it]
100%|██████████| 2934/2934 [5:44:19<00:00,  6.02s/it]

{'eval_loss': 0.3779737651348114, 'eval_runtime': 64.5917, 'eval_samples_per_second': 26.923, 'eval_steps_per_second': 1.688, 'epoch': 3.0}


100%|██████████| 2934/2934 [5:44:34<00:00,  7.05s/it]

{'train_runtime': 20674.1867, 'train_samples_per_second': 2.27, 'train_steps_per_second': 0.142, 'train_loss': 0.29087191157870823, 'epoch': 3.0}





TrainOutput(global_step=2934, training_loss=0.29087191157870823, metrics={'train_runtime': 20674.1867, 'train_samples_per_second': 2.27, 'train_steps_per_second': 0.142, 'total_flos': 1.23491173833216e+16, 'train_loss': 0.29087191157870823, 'epoch': 3.0})

In [26]:
# Store the fine-tuned model and its tokenizer in the same directory so
# they can be reloaded together later.
export_dir = "./xlm_roberta_model_power_tr"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('./xlm_roberta_model_power_tr\\tokenizer_config.json',
 './xlm_roberta_model_power_tr\\special_tokens_map.json',
 './xlm_roberta_model_power_tr\\tokenizer.json')

In [None]:
import transformers
import torch
import huggingface_hub



model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# The helper defined in this cell prompts the model with a chat template,
# passes generation arguments (max_new_tokens, sampling settings) and reads
# `generated_text` from the result, so the pipeline must be built for the
# "text-generation" task rather than "text-classification".
#
# No access token is hard-coded here: authenticate once with
# `huggingface-cli login` or set the HF_TOKEN environment variable, which the
# Hugging Face hub client picks up on its own.
# `trust_remote_code` is left out: this model does not need to execute code
# downloaded from the hub.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

def classification_pipeline(speech):
    """Ask the chat model whether a speech comes from a governing or an opposition party.

    Parameters
    ----------
    speech : str
        The speech text that is sent to the model as the user message.

    Returns
    -------
    str
        The text generated by the model after the prompt; the instruction
        asks for a bare "0" (governing) or "1" (opposition).
    """
    # The Llama 3.1 chat format defines the roles "system" and "user";
    # the previous "instruction"/"input" roles are not part of that format.
    messages = [
        {"role": "system", "content": """Assume you are politician, which will be asked to classify the ideology of the
                                                speakers' party from turkish parliament. Speech will be given as a text. 
                                                In other words, this involves performing binary classification to determine whether
                                                the speaker’s party is governing (0) or oppositon (1). You are not allowed to answer 
                                                anything else besides the number 1 or 0."""},
        {"role": "user", "content": speech},
    ]

    # Render the conversation to a plain prompt string that ends with the
    # assistant header, so the model continues with its answer.
    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    # Stop on either the regular EOS token or the end-of-turn token
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=100,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.01,
        top_p=0.9,
    )

    # `generated_text` contains the prompt followed by the completion;
    # return only the completion.
    return outputs[0]["generated_text"][len(prompt):]



In [None]:
# Iterate over the pandas test split: `tokenized_test_p` is a Hugging Face
# Dataset (no `.iterrows()`), and its raw `text` column was removed after
# tokenization.
for speech in p_test_data['text']:
    print(classification_pipeline(speech))

In [None]:
# Iterate over the pandas test split: `tokenized_test_p` is a Hugging Face
# Dataset and has no `.iterrows()`.
# Rows without an English translation are skipped; the earlier cleaning step
# only drops rows missing `text` or `label`, so `text_en` may still be NaN.
for speech_en in p_test_data['text_en'].dropna():
    print(classification_pipeline(speech_en))