In [None]:
# 1. Loading
!pip install transformers datasets torch pandas scikit-learn bitsandbytes -q

# 2. Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    pipeline
)
from datasets import Dataset
import torch

# 3. Device selection: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompati

In [None]:
# 4. Analyze and load datasets
def load_and_analyze_data(file_path, task_name):
    """Read a tab-separated file into a DataFrame and print a short summary.

    Args:
        file_path: Path of the TSV file handed to ``pd.read_csv``.
        task_name: Label used in the printed summary header.

    Returns:
        The loaded DataFrame. The summary reads its ``'label'`` column, so a
        file without that column raises ``KeyError``.
    """
    frame = pd.read_csv(file_path, sep='\t')

    print(f"\n{task_name} dataset analysis:")
    n_samples = len(frame)
    print(f"Total sample number: {n_samples}")
    # Per-class counts, most frequent label first (value_counts ordering).
    label_counts = frame['label'].value_counts()
    print(f"Labels:\n{label_counts}")

    return frame

# Load the training TSVs for both tasks and print their label counts.
# The file names are relative to the notebook's working directory.
orientation_df = load_and_analyze_data('orientation-tr-train.tsv', 'Ideology')
power_df = load_and_analyze_data('power-tr-train.tsv', 'Power')




Ideology dataset analysis:
Total sample number: 16138
Labels:
label
1    9390
0    6748
Name: count, dtype: int64

Power dataset analysis:
Total sample number: 17384
Labels:
label
1    8932
0    8452
Name: count, dtype: int64


In [None]:
# 5. 90-10 stratified split
def split_data(df, task_name):
    """Split ``df`` 90/10 into train and validation sets, stratified on ``'label'``.

    The label proportions are printed before the split and for each resulting
    subset, followed by the subset sizes.

    Args:
        df: DataFrame with a ``'label'`` column.
        task_name: Label used in the printed headers.

    Returns:
        Tuple ``(train_data, val_data)`` as returned by ``train_test_split``.
    """
    def _show_label_share(header, frame):
        # Print a header followed by the normalized label frequencies.
        print(header)
        print(frame['label'].value_counts(normalize=True))

    _show_label_share(f"\n{task_name} - Before splitting:", df)

    # Fixed random_state keeps the split identical across runs.
    train_data, val_data = train_test_split(
        df, test_size=0.1, stratify=df['label'], random_state=42
    )

    _show_label_share(f"\n{task_name} - After splitting on training set:", train_data)
    _show_label_share(f"\n{task_name} - After splitting on validation set", val_data)

    print(f"\nTrain set size: {len(train_data)}")
    print(f"Validation set size: {len(val_data)}")

    return train_data, val_data

# Create the train/validation splits for both tasks; the *_val frames are
# also read by the zero-shot evaluation cells further down.
orientation_train, orientation_val = split_data(orientation_df, "Ideology")
power_train, power_val = split_data(power_df, "Power")


Ideology - Before splitting:
label
1    0.581856
0    0.418144
Name: proportion, dtype: float64

Ideology - After splitting on training set:
label
1    0.581865
0    0.418135
Name: proportion, dtype: float64

Ideology - After splitting on validation set
label
1    0.581784
0    0.418216
Name: proportion, dtype: float64

Train set size: 14524
Validation set size: 1614

Power - Before splitting:
label
1    0.513806
0    0.486194
Name: proportion, dtype: float64

Power - After splitting on training set:
label
1    0.513774
0    0.486226
Name: proportion, dtype: float64

Power - After splitting on validation set
label
1    0.514089
0    0.485911
Name: proportion, dtype: float64

Train set size: 15645
Validation set size: 1739


In [None]:
# Checkpoint name used for the tokenizer here and for the classifier built in train_model.
MODEL_NAME = "xlm-roberta-base"
# Notebook-level tokenizer; prepare_dataset reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def prepare_dataset(df, text_column, max_length=128):
    """Tokenize one text column of ``df`` and return a model-ready ``Dataset``.

    Args:
        df: pandas DataFrame holding ``text_column`` (and, for training, ``'label'``).
        text_column: Name of the column passed to the notebook-level ``tokenizer``.
        max_length: Every sequence is truncated or padded to this many tokens.

    Returns:
        A ``Dataset`` restricted to ``input_ids``, ``attention_mask`` and
        ``label``; every other column is removed.
    """
    kept_columns = ['input_ids', 'attention_mask', 'label']

    def tokenize_function(examples):
        # Fixed-length output: pad short inputs, cut long ones.
        return tokenizer(
            examples[text_column],
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )

    tokenized_dataset = Dataset.from_pandas(df).map(tokenize_function, batched=True)
    surplus_columns = [
        name for name in tokenized_dataset.column_names if name not in kept_columns
    ]
    return tokenized_dataset.remove_columns(surplus_columns)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 for a Trainer evaluation.

    Args:
        pred: Object exposing ``predictions`` (scores; argmax over the last
            axis gives the predicted class) and ``label_ids`` (gold labels).

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The dict is also printed.
    """
    predicted = pred.predictions.argmax(-1)
    gold = pred.label_ids

    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )
    metrics = dict(
        accuracy=accuracy_score(gold, predicted),
        f1=f1,
        precision=precision,
        recall=recall,
    )

    print(f"\nCurrent metrics: {metrics}")
    return metrics

def train_model(train_dataset, val_dataset, task_name, output_dir):
    """Fine-tune a two-label ``MODEL_NAME`` classifier with the HF ``Trainer``.

    Args:
        train_dataset: Tokenized training set.
        val_dataset: Tokenized set evaluated at the end of every epoch.
        task_name: Label used in the progress messages.
        output_dir: Output directory passed to ``TrainingArguments``.

    Returns:
        Tuple ``(trainer, model)``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    )
    model = classifier.to(device)

    training_args = TrainingArguments(
        output_dir=output_dir,
        # Optimisation schedule
        learning_rate=1e-5,
        weight_decay=0.05,
        warmup_ratio=0.1,
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # Evaluate and checkpoint once per epoch (the two strategies are kept equal)
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Reload the checkpoint with the highest validation F1 when training ends
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    print(f"\n{task_name} model training is starting...")
    train_result = trainer.train()

    print(f"\n{task_name} Final Training Metrics:")
    print(train_result.metrics)

    return trainer, model

In [None]:
# 8. Task 1: ideology classifier fine-tuned on the English text column ('text_en')
# Tokenize the train/validation splits, then fine-tune with ./ideology_model as output directory.
orientation_train_dataset = prepare_dataset(orientation_train, 'text_en')
orientation_val_dataset = prepare_dataset(orientation_val, 'text_en')

orientation_trainer, orientation_model = train_model(
    orientation_train_dataset,
    orientation_val_dataset,
    "Ideology",
    "./ideology_model"
)

Map:   0%|          | 0/14524 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Ideology model training is starting...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5569,0.470242,0.781289,0.80615,0.8322,0.781683
2,0.4493,0.426392,0.796778,0.816555,0.859835,0.777423
3,0.3638,0.440706,0.814126,0.841438,0.835257,0.84771
4,0.3474,0.599236,0.806691,0.831715,0.842623,0.821086
5,0.2893,0.86254,0.801115,0.83845,0.794847,0.887114
6,0.2227,0.964111,0.80917,0.830583,0.858931,0.804047
7,0.1791,0.976304,0.819083,0.845175,0.841605,0.848775
8,0.1167,1.218286,0.80855,0.827084,0.871462,0.787007
9,0.0958,1.304395,0.802354,0.823854,0.855505,0.794462
10,0.0751,1.406269,0.800496,0.827438,0.832794,0.822151



Current metrics: {'accuracy': 0.7812887236679058, 'f1': 0.8061504667764964, 'precision': 0.8321995464852607, 'recall': 0.7816826411075612}

Current metrics: {'accuracy': 0.7967781908302355, 'f1': 0.8165548098434005, 'precision': 0.8598351001177856, 'recall': 0.777422790202343}

Current metrics: {'accuracy': 0.8141263940520446, 'f1': 0.8414376321353065, 'precision': 0.8352570828961176, 'recall': 0.8477103301384451}

Current metrics: {'accuracy': 0.8066914498141264, 'f1': 0.8317152103559871, 'precision': 0.8426229508196721, 'recall': 0.8210862619808307}

Current metrics: {'accuracy': 0.8011152416356877, 'f1': 0.8384499245093106, 'precision': 0.7948473282442748, 'recall': 0.8871139510117146}

Current metrics: {'accuracy': 0.8091697645600991, 'f1': 0.8305830583058306, 'precision': 0.8589306029579067, 'recall': 0.8040468583599574}

Current metrics: {'accuracy': 0.8190830235439901, 'f1': 0.8451749734888653, 'precision': 0.8416050686378036, 'recall': 0.8487752928647497}

Current metrics: {'a

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 9. Task 2: power classifier fine-tuned on the original-language 'text' column (Turkish)
# Tokenize the train/validation splits, then fine-tune with ./power_model as output directory.
power_train_dataset = prepare_dataset(power_train, 'text')
power_val_dataset = prepare_dataset(power_val, 'text')

power_trainer, power_model = train_model(
    power_train_dataset,
    power_val_dataset,
    "Power",
    "./power_model"
)


Map:   0%|          | 0/15645 [00:00<?, ? examples/s]

Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Power model training is starting...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5809,0.479629,0.771708,0.784824,0.761304,0.809843
2,0.4493,0.437319,0.808511,0.816327,0.805223,0.82774
3,0.3687,0.4539,0.819436,0.825942,0.818681,0.833333
4,0.3487,0.581457,0.811386,0.822511,0.796646,0.850112
5,0.3027,0.906543,0.817711,0.830753,0.794688,0.870246
6,0.2552,0.965838,0.817711,0.832008,0.790534,0.878076
7,0.1815,0.980311,0.822887,0.825397,0.836782,0.814318
8,0.1104,1.09963,0.824037,0.833696,0.810782,0.857942
9,0.1003,1.167717,0.825187,0.83388,0.815171,0.853468
10,0.0638,1.301942,0.821162,0.831983,0.804598,0.861298



Current metrics: {'accuracy': 0.7717078780908568, 'f1': 0.7848238482384824, 'precision': 0.7613038906414301, 'recall': 0.8098434004474273}

Current metrics: {'accuracy': 0.8085106382978723, 'f1': 0.8163265306122449, 'precision': 0.8052230685527747, 'recall': 0.8277404921700223}

Current metrics: {'accuracy': 0.81943645773433, 'f1': 0.8259423503325942, 'precision': 0.8186813186813187, 'recall': 0.8333333333333334}

Current metrics: {'accuracy': 0.8113858539390454, 'f1': 0.8225108225108225, 'precision': 0.7966457023060797, 'recall': 0.8501118568232662}

Current metrics: {'accuracy': 0.8177113283496262, 'f1': 0.8307528029898559, 'precision': 0.7946884576098059, 'recall': 0.8702460850111857}

Current metrics: {'accuracy': 0.8177113283496262, 'f1': 0.8320084790673026, 'precision': 0.7905337361530715, 'recall': 0.8780760626398211}

Current metrics: {'accuracy': 0.8228867165037378, 'f1': 0.8253968253968254, 'precision': 0.8367816091954023, 'recall': 0.814317673378076}

Current metrics: {'acc

In [None]:
'''The prompt in this cell is not good, so the accuracy results are actually just the class distributions. Ignore this cell.'''
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# It was stored in the notebook in plain text, so treat it as leaked and revoke it.
# The token is now read from the HF_TOKEN environment variable; when that is
# unset, login() receives None and asks for the token itself (per huggingface_hub docs).
import os

login(token=os.environ.get("HF_TOKEN"))

def zero_shot_inference():
    """Zero-shot classify both validation sets with Llama-3.1-8B and print metrics.

    Reads the notebook-level frames ``orientation_val`` and ``power_val``
    (columns ``text_en``, ``text`` and ``label``).

    Returns:
        dict: ``results[task][lang]`` is a list of 0/1 predictions, with
        ``task`` in ``{'ideology', 'power'}`` and ``lang`` in ``{'en', 'original'}``.
    """
    # Local import: the notebook's import lines do not bring this name into scope.
    from transformers import BitsAndBytesConfig

    model_name = "meta-llama/Llama-3.1-8B"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        # Replaces the deprecated bare `load_in_4bit=True` argument, as the
        # library's deprecation warning asks.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True)
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        # A one-token budget can be used up before any digit appears, so allow a
        # few tokens and read the first 0/1 among them.
        max_new_tokens=4,
        temperature=0.01,
        # Return only the continuation. With the prompt included, the answer
        # parser would pick up the "0 or 1" that the prompt itself ends with.
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id,
        batch_size=1
    )

    def parse_answer(completion):
        # First '0'/'1' character of the generated continuation, or None.
        return next((int(ch) for ch in completion if ch in '01'), None)

    def process_batch(texts, task_type, batch_size=16):
        # Texts are sent to the model one at a time; batch_size only sets how
        # often the progress line is printed.
        results = []
        fallbacks = 0
        template = "You are an AI analyzing political speeches.\nSpeech: {}\nQuestion: Is the speaker from {} (0) or {} (1)?\nAnswer only with 0 or 1:"
        options = {
            "ideology": ("left-wing", "right-wing"),
            "power": ("government", "opposition")
        }

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            for text in batch:
                try:
                    prompt = template.format(text[:256], *options[task_type])
                    completion = pipe(prompt)[0]['generated_text']
                    answer = parse_answer(completion)
                except Exception as e:
                    print(f"Error: {str(e)[:100]}")
                    answer = None
                if answer is None:
                    # Keep the original best-effort default of 0, but count it
                    # so a run dominated by fallbacks is visible.
                    fallbacks += 1
                    answer = 0
                results.append(answer)
            print(f"Processed {i+len(batch)}/{len(texts)}")

        if fallbacks:
            print(f"Warning: {fallbacks}/{len(texts)} predictions defaulted to 0 "
                  f"(generation failed or produced no 0/1)")
        return results

    results = {
        'ideology': {
            'en': process_batch(orientation_val['text_en'].tolist(), 'ideology'),
            'original': process_batch(orientation_val['text'].tolist(), 'ideology')
        },
        'power': {
            'en': process_batch(power_val['text_en'].tolist(), 'power'),
            'original': process_batch(power_val['text'].tolist(), 'power')
        }
    }

    print("\nZero-shot Results:")
    for task in results:
        for lang in results[task]:
            true_labels = orientation_val['label'].tolist() if task == 'ideology' else power_val['label'].tolist()
            pred_labels = results[task][lang]

            # Tuple order from sklearn is (precision, recall, f1, support).
            metrics = precision_recall_fscore_support(true_labels, pred_labels, average='binary')
            acc = accuracy_score(true_labels, pred_labels)

            print(f"\n{task.upper()} - {lang}")
            print(f"Accuracy: {acc:.3f}")
            print(f"F1/Precision/Recall: {metrics[2]:.3f}/{metrics[0]:.3f}/{metrics[1]:.3f}")

    return results

results = zero_shot_inference()

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 16/1614
Processed 32/1614
Processed 48/1614
Processed 64/1614
Processed 80/1614
Processed 96/1614
Processed 112/1614
Processed 128/1614
Processed 144/1614
Processed 160/1614
Processed 176/1614
Processed 192/1614
Processed 208/1614
Processed 224/1614
Processed 240/1614
Processed 256/1614
Processed 272/1614
Processed 288/1614
Processed 304/1614
Processed 320/1614
Processed 336/1614
Processed 352/1614
Processed 368/1614
Processed 384/1614
Processed 400/1614
Processed 416/1614
Processed 432/1614
Processed 448/1614
Processed 464/1614
Processed 480/1614
Processed 496/1614
Processed 512/1614
Processed 528/1614
Processed 544/1614
Processed 560/1614
Processed 576/1614
Processed 592/1614
Processed 608/1614
Processed 624/1614
Processed 640/1614
Processed 656/1614
Processed 672/1614
Processed 688/1614
Processed 704/1614
Processed 720/1614
Processed 736/1614
Processed 752/1614
Processed 768/1614
Processed 784/1614
Processed 800/1614
Processed 816/1614
Processed 832/1614
Processed 848/1614

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, precision_score, recall_score

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# It was stored in the notebook in plain text, so treat it as leaked and revoke it.
# The token is now read from the HF_TOKEN environment variable; when that is
# unset, login() receives None and asks for the token itself (per huggingface_hub docs).
import os

login(token=os.environ.get("HF_TOKEN"))


def create_optimized_prompt(text, task_type):
    """Build the zero-shot prompt for one speech.

    Args:
        text: Speech text; only its first 1000 characters are inserted.
        task_type: ``"ideology"`` selects the ideology prompt; any other
            value selects the power-position prompt.

    Returns:
        The prompt string with the (truncated) speech filled in.
    """
    ideology_template = """Analyze this parliamentary speech for political ideology classification.

Speech: {text}

Consider these aspects:
- Economic policy positions
- Social policy stances
- State intervention views
- Cultural and traditional values
- References to specific political concepts

Based on these indicators, is this speech from a:
- Left-wing politician (0): Focus on social welfare, equality, state intervention
- Right-wing politician (1): Focus on traditional values, free market, conservative policies

Output only 0 or 1."""

    power_template = """Analyze this parliamentary speech for power position classification.

Speech: {text}

Consider these aspects:
- Tone towards current policies
- Criticism or support of government actions
- Discussion of implementation vs opposition
- References to governance responsibilities
- Legislative agenda stance

Based on these indicators, is this speech from a:
- Government party member (0): Implementation focus, policy defense
- Opposition party member (1): Policy criticism, alternative proposals

Output only 0 or 1."""

    snippet = text[:1000]
    chosen = ideology_template if task_type == "ideology" else power_template
    return chosen.format(text=snippet)

def zero_shot_inference():
    """Zero-shot classify both validation sets with Llama-3.1-8B.

    Reads the notebook-level frames ``orientation_val`` and ``power_val``
    (columns ``text_en`` and ``text``) and builds prompts with
    ``create_optimized_prompt``.

    Returns:
        dict: ``results[task][lang]`` is a list of 0/1 predictions, with
        ``task`` in ``{'ideology', 'power'}`` and ``lang`` in ``{'en', 'original'}``.
    """
    # Local import: the notebook's import lines do not bring this name into scope.
    from transformers import BitsAndBytesConfig

    model_name = "meta-llama/Llama-3.1-8B"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        # Replaces the deprecated bare `load_in_4bit=True` argument, as the
        # library's deprecation warning asks.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True)
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        # A one-token budget can be used up before any digit appears, so allow a
        # few tokens and read the first 0/1 among them.
        max_new_tokens=4,
        temperature=0.1,
        top_p=0.9,
        do_sample=True,
        # Return only the continuation. With the prompt included, the text ends
        # in prompt characters whenever the model emits whitespace, and every
        # prediction then falls back to 0.
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    def parse_answer(completion):
        # First '0'/'1' character of the generated continuation, or None.
        return next((int(ch) for ch in completion if ch in '01'), None)

    def process_batch(texts, task_type, batch_size=8):
        results = []
        fallbacks = 0

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_prompts = [create_optimized_prompt(text, task_type) for text in batch]

            try:
                outputs = pipe(batch_prompts)
                # Parse the whole batch before touching `results`, so an error
                # part-way through cannot leave partial predictions behind and
                # misalign predictions with labels.
                batch_predictions = [parse_answer(output[0]['generated_text']) for output in outputs]
            except Exception as e:
                print(f"Error in batch processing: {str(e)[:100]}")
                batch_predictions = [None] * len(batch)

            for prediction in batch_predictions:
                if prediction is None:
                    # Keep the original best-effort default of 0, but count it.
                    fallbacks += 1
                    prediction = 0
                results.append(prediction)

            print(f"Processed {i+len(batch)}/{len(texts)} examples")

        if fallbacks:
            print(f"Warning: {fallbacks}/{len(texts)} predictions defaulted to 0 "
                  f"(generation failed or produced no 0/1)")
        return results

    results = {
        'ideology': {
            'en': process_batch(orientation_val['text_en'].tolist(), 'ideology'),
            'original': process_batch(orientation_val['text'].tolist(), 'ideology')
        },
        'power': {
            'en': process_batch(power_val['text_en'].tolist(), 'power'),
            'original': process_batch(power_val['text'].tolist(), 'power')
        }
    }

    return results

results = zero_shot_inference()

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Device set to use cuda:0


Processed 8/1614 examples
Processed 16/1614 examples
Processed 24/1614 examples
Processed 32/1614 examples
Processed 40/1614 examples
Processed 48/1614 examples
Processed 56/1614 examples
Processed 64/1614 examples
Processed 72/1614 examples


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 80/1614 examples
Processed 88/1614 examples
Processed 96/1614 examples
Processed 104/1614 examples
Processed 112/1614 examples
Processed 120/1614 examples
Processed 128/1614 examples
Processed 136/1614 examples
Processed 144/1614 examples
Processed 152/1614 examples
Processed 160/1614 examples
Processed 168/1614 examples
Processed 176/1614 examples
Processed 184/1614 examples
Processed 192/1614 examples
Processed 200/1614 examples
Processed 208/1614 examples
Processed 216/1614 examples
Processed 224/1614 examples
Processed 232/1614 examples
Processed 240/1614 examples
Processed 248/1614 examples
Processed 256/1614 examples
Processed 264/1614 examples
Processed 272/1614 examples
Processed 280/1614 examples
Processed 288/1614 examples
Processed 296/1614 examples
Processed 304/1614 examples
Processed 312/1614 examples
Processed 320/1614 examples
Processed 328/1614 examples
Processed 336/1614 examples
Processed 344/1614 examples
Processed 352/1614 examples
Processed 360/1614 exam

In [None]:
def evaluate_results2(predictions, true_labels):
    """Score predictions against gold labels.

    Args:
        predictions: Predicted labels.
        true_labels: Gold labels, in the same order as ``predictions``.

    Returns:
        Dict with ``accuracy`` plus weighted-average ``f1``, ``precision``
        and ``recall``.
    """
    weighted_scorers = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    summary = {'accuracy': accuracy_score(true_labels, predictions)}
    for metric_name, scorer in weighted_scorers.items():
        summary[metric_name] = scorer(true_labels, predictions, average='weighted')
    return summary

# Report weighted metrics for every task/language pair stored in `results`.
print("\nZero-shot Results:")
for task in results:
    for lang in results[task]:
        # Gold labels come from the validation frame that matches the task.
        true_labels = orientation_val['label'].tolist() if task == 'ideology' else power_val['label'].tolist()
        metrics = evaluate_results2(results[task][lang], true_labels)

        print(f"\n{task.upper()} - {lang}")
        print(f"Accuracy: {metrics['accuracy']:.3f}")
        print(f"F1/Precision/Recall: {metrics['f1']:.3f}/{metrics['precision']:.3f}/{metrics['recall']:.3f}")


Zero-shot Results:

IDEOLOGY - en
Accuracy: 0.418
F1/Precision/Recall: 0.247/0.175/0.418

IDEOLOGY - original
Accuracy: 0.418
F1/Precision/Recall: 0.247/0.175/0.418

POWER - en
Accuracy: 0.486
F1/Precision/Recall: 0.318/0.236/0.486

POWER - original
Accuracy: 0.486
F1/Precision/Recall: 0.318/0.236/0.486


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# It was stored in the notebook in plain text, so treat it as leaked and revoke it.
# The token is now read from the HF_TOKEN environment variable; when that is
# unset, login() receives None and asks for the token itself (per huggingface_hub docs).
import os

login(token=os.environ.get("HF_TOKEN"))


def create_optimized_prompt(text, task_type):
    """Build the short zero-shot prompt for one speech.

    Args:
        text: Speech text; only its first 512 characters are inserted.
        task_type: ``"ideology"`` selects the ideology prompt; any other
            value selects the power-classification prompt.

    Returns:
        The prompt string with the (truncated) speech filled in.
    """
    ideology_template = """Analyze this parliamentary speech and determine if it represents left-wing (0) or right-wing (1) political ideology.
Speech: {text}
Output only 0 or 1:"""
    power_template = """Analyze this parliamentary speech and determine if it's from a government party member (0) or opposition party member (1).
Speech: {text}
Output only 0 or 1:"""

    snippet = text[:512]
    chosen = ideology_template if task_type == "ideology" else power_template
    return chosen.format(text=snippet)

def run_zero_shot_inference(orientation_val, power_val, model_name="meta-llama/Llama-3.1-8B"):
    """Run zero-shot inference using Llama model

    Args:
        orientation_val: Validation frame for the ideology task; the columns
            ``text_en``, ``text`` and ``label`` are read.
        power_val: Validation frame for the power task (same columns).
        model_name: Hub id of the causal LM to load.

    Returns:
        dict: ``results[task][text_field]`` is
        ``{'predictions': list of 0/1, 'metrics': dict}`` with ``task`` in
        ``{'ideology', 'power'}`` and ``text_field`` in ``{'text_en', 'text'}``.
    """
    # Local import: the notebook's import lines do not bring this name into scope.
    from transformers import BitsAndBytesConfig

    print("\nLoading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        # Replaces the deprecated bare `load_in_4bit=True` argument, as the
        # library's deprecation warning asks.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True)
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
    tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): sampling at temperature 0.9 makes predictions vary between
    # runs; consider a lower temperature or greedy decoding for evaluation.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        max_new_tokens=10,
        temperature=0.9,
        top_p=0.9,
        do_sample=True,
        # Return only the continuation so the answer parser never reads the
        # prompt (which itself contains "0" and "1").
        return_full_text=False,
        batch_size=8
    )

    def parse_answer(completion):
        # First '0'/'1' character of the generated continuation, or None.
        return next((int(ch) for ch in completion if ch in '01'), None)

    def weighted_metrics(true_labels, predictions):
        # Accuracy plus weighted-average F1 / precision / recall.
        return {
            'accuracy': accuracy_score(true_labels, predictions),
            'f1': f1_score(true_labels, predictions, average='weighted'),
            'precision': precision_score(true_labels, predictions, average='weighted'),
            'recall': recall_score(true_labels, predictions, average='weighted')
        }

    def process_batch(texts, true_labels, task_type, batch_size=8, metric_interval=32):
        """Process texts in batches with intermediate metrics"""
        results = []
        fallbacks = 0

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_prompts = [create_optimized_prompt(text, task_type) for text in batch]

            try:
                with torch.no_grad():
                    outputs = pipe(batch_prompts)
                # Parse the whole batch before touching `results`, so an error
                # part-way through cannot misalign predictions with labels.
                batch_preds = [parse_answer(output[0]['generated_text']) for output in outputs]
            except Exception as e:
                print(f"Error in batch {i}: {str(e)[:100]}")
                batch_preds = [None] * len(batch)

            for pred in batch_preds:
                if pred is None:
                    # Keep the original best-effort default of 0, but count it.
                    fallbacks += 1
                    pred = 0
                results.append(pred)

            if len(results) % metric_interval == 0:
                current_metrics = weighted_metrics(true_labels[:len(results)], np.array(results))

                print(f"\nIntermediate Results ({len(results)}/{len(texts)} examples):")
                print(f"Accuracy: {current_metrics['accuracy']:.3f}")
                print(f"F1: {current_metrics['f1']:.3f}")
                print(f"Precision: {current_metrics['precision']:.3f}")
                print(f"Recall: {current_metrics['recall']:.3f}")

        if fallbacks:
            print(f"Warning: {fallbacks}/{len(texts)} predictions defaulted to 0 "
                  f"(generation failed or produced no 0/1)")
        return results

    results = {}
    tasks = {
        'ideology': orientation_val,
        'power': power_val
    }

    for task_name, val_data in tasks.items():
        results[task_name] = {}

        for text_field in ['text_en', 'text']:
            print(f"\nProcessing {task_name} - {text_field}")
            true_labels = val_data['label'].values
            predictions = process_batch(
                texts=val_data[text_field].tolist(),
                true_labels=true_labels,
                task_type=task_name,
                batch_size=8,
                metric_interval=32
            )

            metrics = weighted_metrics(true_labels, predictions)

            results[task_name][text_field] = {
                'predictions': predictions,
                'metrics': metrics
            }

            print(f"\nFinal Results for {task_name} using {text_field}:")
            print(f"Accuracy: {metrics['accuracy']:.3f}")
            print(f"F1: {metrics['f1']:.3f}")
            print(f"Precision: {metrics['precision']:.3f}")
            print(f"Recall: {metrics['recall']:.3f}")

    return results

results = run_zero_shot_inference(orientation_val, power_val)


Loading model...


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Device set to use cuda:0



Processing ideology - text_en





Intermediate Results (32/1614 examples):
Accuracy: 0.438
F1: 0.402
Precision: 0.681
Recall: 0.438

Intermediate Results (64/1614 examples):
Accuracy: 0.359
F1: 0.302
Precision: 0.389
Recall: 0.359


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Intermediate Results (96/1614 examples):
Accuracy: 0.438
F1: 0.381
Precision: 0.466
Recall: 0.438

Intermediate Results (128/1614 examples):
Accuracy: 0.430
F1: 0.360
Precision: 0.464
Recall: 0.430

Intermediate Results (160/1614 examples):
Accuracy: 0.412
F1: 0.342
Precision: 0.481
Recall: 0.412

Intermediate Results (192/1614 examples):
Accuracy: 0.432
F1: 0.370
Precision: 0.498
Recall: 0.432

Intermediate Results (224/1614 examples):
Accuracy: 0.424
F1: 0.358
Precision: 0.501
Recall: 0.424

Intermediate Results (256/1614 examples):
Accuracy: 0.418
F1: 0.354
Precision: 0.485
Recall: 0.418

Intermediate Results (288/1614 examples):
Accuracy: 0.438
F1: 0.371
Precision: 0.490
Recall: 0.438

Intermediate Results (320/1614 examples):
Accuracy: 0.459
F1: 0.398
Precision: 0.514
Recall: 0.459

Intermediate Results (352/1614 examples):
Accuracy: 0.449
F1: 0.384
Precision: 0.519
Recall: 0.449

Intermediate Results (384/1614 examples):
Accuracy: 0.451
F1: 0.386
Precision: 0.515
Recall: 0.451

