In [None]:
!pip install datasets



In [None]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, pipeline
from datasets import Dataset
from google.colab import drive
from huggingface_hub import login
from tqdm import tqdm

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the TSV file
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/power-tr-train.tsv", sep="\t")

In [None]:
# Ensure classes are balanced when resampling
# Separate data by classes
class_0 = data[data['label'] == 0]
class_1 = data[data['label'] == 1]

# Calculate 15% of the total data size per class
subset_size_class_0 = int(0.10 * len(class_0))
subset_size_class_1 = int(0.10 * len(class_1))

# Resample each class to the desired size
resampled_class_0 = resample(class_0, n_samples=subset_size_class_0, random_state=42)
resampled_class_1 = resample(class_1, n_samples=subset_size_class_1, random_state=42)

# Combine the resampled classes
resampled_data = pd.concat([resampled_class_0, resampled_class_1]).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Convert resampled data to Hugging Face Dataset
dataset = Dataset.from_pandas(resampled_data)

In [None]:
model_name = "bigscience/bloom-560m"
# login("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")
generator = pipeline("text-generation", model=model_name, tokenizer=model_name, device=0)

Device set to use cuda:0


In [None]:
# Define a function for batch processing with progress tracking
def classify_batch(batch):
    results = []
    for text in tqdm(batch["text"], desc="Classifying texts"):
        prompt = (
            f"Classify the following parliamentary speech into two categories: 'governing' or 'opposition'. "
            f"Based on the content of the speech, determine whether the speaker’s party is currently governing (0) or in opposition (1).\n\n"
            f"Respond with only the category, and say 'the answer is 'governing.' or 'the answer is 'opposition.'. Use these examples as a guide:\n\n"
            f"Examples:\n"
            f"1. 'We are proud of the reforms our government has implemented to improve education.' => governing\n"
            f"2. 'This government has failed to address the housing crisis effectively.' => opposition\n"
            f"3. 'Our administration’s policies have boosted the national economy significantly.' => governing\n"
            f"4. 'The government must take immediate action to tackle rising inflation.' => opposition\n\n"
            f"Speech: {text}\n\n"
            f"Answer:"
        )
        output = generator(prompt, max_new_tokens=50, temperature=0.7, num_return_sequences=1, do_sample=True)
        raw_prediction = output[0]["generated_text"].strip()

        # Extract the response after "Answer:"
        prediction = raw_prediction.split("Answer:")[1].strip().lower()

        if "governing" in prediction.lower():
            results.append(0)
        else:
            results.append(1)

    return {"predictions": results}

In [None]:
# Apply batch processing
dataset = dataset.map(classify_batch, batched=True, batch_size=8)

Map:   0%|          | 0/1738 [00:00<?, ? examples/s]


Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:10,  1.44s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:03<00:11,  1.86s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:05<00:08,  1.67s/it][A
Classifying texts:  50%|█████     | 4/8 [00:06<00:05,  1.47s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:07<00:04,  1.33s/it][A
Classifying texts:  75%|███████▌  | 6/8 [00:08<00:02,  1.25s/it][A
Classifying texts:  88%|████████▊ | 7/8 [00:09<00:01,  1.36s/it][A
Classifying texts: 100%|██████████| 8/8 [00:11<00:00,  1.43s/it]

Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:08,  1.20s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:08,  1.44s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:03<00:06,  1.32s/it][A
Classifying texts:  50%|█████     | 4/8 [00:05<00:04,  1.23s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:06<00:03,  1.32s/it]

In [None]:
# Generate classification report
print(classification_report(dataset["label"], dataset["predictions"]))

              precision    recall  f1-score   support

           0       0.51      0.28      0.36       845
           1       0.52      0.75      0.62       893

    accuracy                           0.52      1738
   macro avg       0.52      0.51      0.49      1738
weighted avg       0.52      0.52      0.49      1738



Now, do the same steps for the English Texts:

In [None]:
# Ensure classes are balanced when resampling
# Separate data by classes
class_0 = data[data['label'] == 0]
class_1 = data[data['label'] == 1]

# Calculate 15% of the total data size per class
# Class 1 is longer!
subset_size_class_0 = int(0.10 * len(class_0))
subset_size_class_1 = int(0.10 * len(class_1))

# Resample each class to the desired size
resampled_class_0 = resample(class_0, n_samples=subset_size_class_0, random_state=42)
resampled_class_1 = resample(class_1, n_samples=subset_size_class_1, random_state=42)

# Combine the resampled classes
resampled_data = pd.concat([resampled_class_0, resampled_class_1]).sample(frac=1, random_state=42).reset_index(drop=True)

# Extract the 'text' column and labels
texts = resampled_data["text_en"].tolist()
labels = resampled_data["label"].tolist()
print(len(resampled_class_0))
print(len(resampled_class_1))

845
893


In [None]:
# Convert resampled data to Hugging Face Dataset
dataset_en = Dataset.from_pandas(resampled_data)

In [None]:
# Define a function for batch processing with progress tracking
def classify_batch_second(batch):
    results = []
    for text in tqdm(batch["text_en"], desc="Classifying texts"):
        prompt = (
            f"Classify the following parliamentary speech into two categories: 'governing' or 'opposition'. "
            f"Based on the content of the speech, determine whether the speaker’s party is currently governing (0) or in opposition (1).\n\n"
            f"Respond with only the category, and say 'the answer is 'governing.' or 'the answer is 'opposition.'. Use these examples as a guide:\n\n"
            f"Examples:\n"
            f"1. 'We are proud of the reforms our government has implemented to improve education.' => governing\n"
            f"2. 'This government has failed to address the housing crisis effectively.' => opposition\n"
            f"3. 'Our administration’s policies have boosted the national economy significantly.' => governing\n"
            f"4. 'The government must take immediate action to tackle rising inflation.' => opposition\n\n"
            f"Speech: {text}\n\n"
            f"Answer:"
        )
        output = generator(prompt, max_new_tokens=50, temperature=0.7, num_return_sequences=1, do_sample=True)
        raw_prediction = output[0]["generated_text"].strip()

        # Extract the response after "Answer:"
        prediction = raw_prediction.split("Answer:")[1].strip().lower()

        if "governing" in prediction.lower():
            results.append(0)  # Assuming 0 = left-leaning
        else:
            results.append(1)  # Assuming 1 = right-leaning

    return {"predictions": results}

In [55]:
# Apply batch processing
dataset_en = dataset_en.map(classify_batch_second, batched=True, batch_size=8)

Map:   0%|          | 0/1738 [00:00<?, ? examples/s]


Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:08,  1.26s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:08,  1.41s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:03<00:06,  1.32s/it][A
Classifying texts:  50%|█████     | 4/8 [00:05<00:04,  1.23s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:06<00:03,  1.18s/it][A
Classifying texts:  75%|███████▌  | 6/8 [00:07<00:02,  1.15s/it][A
Classifying texts:  88%|████████▊ | 7/8 [00:08<00:01,  1.21s/it][A
Classifying texts: 100%|██████████| 8/8 [00:09<00:00,  1.23s/it]

Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:08,  1.16s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:07,  1.24s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:03<00:06,  1.20s/it][A
Classifying texts:  50%|█████     | 4/8 [00:04<00:04,  1.19s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:06<00:03,  1.23s/it]

In [56]:
print(classification_report(dataset_en["label"], dataset_en["predictions"]))

              precision    recall  f1-score   support

           0       0.63      0.22      0.33       845
           1       0.54      0.88      0.67       893

    accuracy                           0.56      1738
   macro avg       0.58      0.55      0.50      1738
weighted avg       0.58      0.56      0.50      1738

