In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, pipeline
from datasets import Dataset
from google.colab import drive
from huggingface_hub import login
from tqdm import tqdm

In [None]:
# Attach Google Drive to the Colab runtime; the dataset is read from it below.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the tab-separated training file from the mounted Drive folder.
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/orientation-tr-train.tsv",
    sep="\t",
)

In [None]:
# Draw a small, roughly balanced evaluation subset, sampling each class separately.
class_0 = data[data['label'] == 0]
class_1 = data[data['label'] == 1]

# Class 1 has more rows than class 0, so a smaller fraction of it is taken
# (15% of class 0 vs. 10% of class 1) to keep the two subset sizes close.
subset_size_class_0 = int(0.15 * len(class_0))
subset_size_class_1 = int(0.10 * len(class_1))

# replace=False is required here: sklearn's resample bootstraps (samples WITH
# replacement) by default, which would put duplicate rows in the evaluation subset.
resampled_class_0 = resample(class_0, replace=False, n_samples=subset_size_class_0, random_state=42)
resampled_class_1 = resample(class_1, replace=False, n_samples=subset_size_class_1, random_state=42)

# Combine the per-class samples and shuffle them with a fixed seed
resampled_data = pd.concat([resampled_class_0, resampled_class_1]).sample(frac=1, random_state=42).reset_index(drop=True)

# Extract the 'text' column and labels
texts = resampled_data["text"].tolist()
labels = resampled_data["label"].tolist()
print(len(resampled_class_0))
print(len(resampled_class_1))

1012
939


In [None]:
# Wrap the sampled frame in a Hugging Face Dataset so it can be mapped over in batches.
dataset = Dataset.from_pandas(df=resampled_data)

In [None]:
# Build a text-generation pipeline around BLOOM-560m (model and tokenizer come
# from the same checkpoint); device=0 places it on the first CUDA device.
model_name = "bigscience/bloom-560m"
generator = pipeline(
    task="text-generation",
    model=model_name,
    tokenizer=model_name,
    device=0,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
# Define a function for batch processing with progress tracking
def classify_batch(batch, text_column="text", fallback_label=1):
    """Label every statement of a ``Dataset.map`` batch with the few-shot prompt.

    Args:
        batch: Mapping of column name -> list of values, as handed over by
            ``Dataset.map(..., batched=True)``.
        text_column: Name of the column holding the statements to classify.
        fallback_label: Label recorded when the generated text names neither
            category. Defaults to 1, which is what this function always did;
            pass a sentinel such as -1 (e.g. via ``fn_kwargs``) to make
            unparseable generations visible instead of counting them as
            right-leaning.

    Returns:
        ``{"predictions": [...]}`` with one integer per input statement:
        0 for left-leaning, 1 for right-leaning (the notebook assumes this
        matches the dataset's label encoding - confirm against the data docs).
    """
    results = []
    # leave=False clears each per-batch bar so the outer map progress stays readable.
    for text in tqdm(batch[text_column], desc="Classifying texts", leave=False):
        # NOTE(review): the quoting in the second instruction sentence is
        # unbalanced and it asks for two different answer formats. The prompt
        # is kept as-is here because editing it changes model behaviour.
        prompt = (
            f"Classify the following political statement into one of two categories: 'left-leaning' or 'right-leaning'. "
            f"Respond with only the category, and say 'the answer is 'right-leaning.' or 'the answer is 'left-leaning.'. Use these examples as a guide:\n\n"
            f"Examples:\n"
            f"1. 'The government should increase taxes on the wealthy.' => left-leaning\n"
            f"2. 'Reducing government regulations boosts business growth.' => right-leaning\n"
            f"3. 'Climate change policies are crucial for the environment.' => left-leaning\n"
            f"4. 'Lowering taxes stimulates economic growth.' => right-leaning\n\n"
            f"Statement: {text}\n\n"
            f"Answer:"
        )
        output = generator(
            prompt,
            max_new_tokens=50,
            # temperature is a sampling-only setting; decode greedily and say
            # so explicitly, so repeated runs give the same predictions.
            do_sample=False,
            num_return_sequences=1,
            # Return only the continuation. Splitting the full text on
            # "Answer:" picked the wrong segment whenever the statement
            # itself contained that marker.
            return_full_text=False,
        )
        answer = output[0]["generated_text"].lower()

        # The model often keeps writing further made-up examples, so the first
        # category it mentions is its answer; later mentions are ignored.
        left_at = answer.find("left-leaning")
        right_at = answer.find("right-leaning")
        if left_at == -1 and right_at == -1:
            results.append(fallback_label)
        elif right_at == -1 or (left_at != -1 and left_at < right_at):
            results.append(0)  # Assuming 0 = left-leaning
        else:
            results.append(1)  # Assuming 1 = right-leaning

    return {"predictions": results}

In [None]:
# Run the classifier over the whole subset, eight rows per call.
dataset = dataset.map(
    function=classify_batch,
    batched=True,
    batch_size=8,
)

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]


Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:11,  1.61s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:07,  1.28s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:03<00:05,  1.19s/it][A
Classifying texts:  50%|█████     | 4/8 [00:04<00:04,  1.21s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:06<00:03,  1.15s/it][A
Classifying texts:  75%|███████▌  | 6/8 [00:07<00:02,  1.19s/it][A
Classifying texts:  88%|████████▊ | 7/8 [00:08<00:01,  1.15s/it][A
Classifying texts: 100%|██████████| 8/8 [00:09<00:00,  1.18s/it]

Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:12,  1.75s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:08,  1.34s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:04<00:06,  1.32s/it][A
Classifying texts:  50%|█████     | 4/8 [00:05<00:04,  1.21s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:06<00:03,  1.16s/it]

In [None]:
# Per-class precision / recall / F1 of the predictions against the gold labels.
print(
    classification_report(
        y_true=dataset["label"],
        y_pred=dataset["predictions"],
    )
)

              precision    recall  f1-score   support

           0       0.51      0.08      0.14      1012
           1       0.48      0.92      0.63       939

    accuracy                           0.48      1951
   macro avg       0.49      0.50      0.38      1951
weighted avg       0.49      0.48      0.37      1951



Now repeat the same steps for the English translations of the texts (the `text_en` column):

In [None]:
# The English run has to be scored on exactly the rows used for the Turkish
# run, so reuse the subset drawn earlier in this notebook. The previous copy of
# this cell re-ran the identical seeded sampling and rebuilt the same frame.
assert "text_en" in resampled_data.columns, "expected an English 'text_en' column"

# Extract the English 'text_en' column and labels
texts = resampled_data["text_en"].tolist()
labels = resampled_data["label"].tolist()
print(len(resampled_class_0))
print(len(resampled_class_1))

1012
939


In [None]:
# Wrap the sampled frame in a second Hugging Face Dataset for the English run.
dataset_en = Dataset.from_pandas(df=resampled_data)

In [None]:
# Define a function for batch processing with progress tracking
def classify_batch_second(batch, text_column="text_en", fallback_label=1):
    """Label every English statement of a ``Dataset.map`` batch with the few-shot prompt.

    Args:
        batch: Mapping of column name -> list of values, as handed over by
            ``Dataset.map(..., batched=True)``.
        text_column: Name of the column holding the statements to classify.
        fallback_label: Label recorded when the generated text names neither
            category. Defaults to 1, which is what this function always did;
            pass a sentinel such as -1 (e.g. via ``fn_kwargs``) to make
            unparseable generations visible instead of counting them as
            right-leaning.

    Returns:
        ``{"predictions": [...]}`` with one integer per input statement:
        0 for left-leaning, 1 for right-leaning (the notebook assumes this
        matches the dataset's label encoding - confirm against the data docs).
    """
    results = []
    # leave=False clears each per-batch bar so the outer map progress stays readable.
    for text in tqdm(batch[text_column], desc="Classifying texts", leave=False):
        # NOTE(review): the quoting in the second instruction sentence is
        # unbalanced and it asks for two different answer formats. The prompt
        # is kept as-is here because editing it changes model behaviour.
        prompt = (
            f"Classify the following political statement into one of two categories: 'left-leaning' or 'right-leaning'. "
            f"Respond with only the category, and say 'the answer is 'right-leaning.' or 'the answer is 'left-leaning.'. Use these examples as a guide:\n\n"
            f"Examples:\n"
            f"1. 'The government should increase taxes on the wealthy.' => left-leaning\n"
            f"2. 'Reducing government regulations boosts business growth.' => right-leaning\n"
            f"3. 'Climate change policies are crucial for the environment.' => left-leaning\n"
            f"4. 'Lowering taxes stimulates economic growth.' => right-leaning\n\n"
            f"Statement: {text}\n\n"
            f"Answer:"
        )
        output = generator(
            prompt,
            max_new_tokens=50,
            # temperature is a sampling-only setting; decode greedily and say
            # so explicitly, so repeated runs give the same predictions.
            do_sample=False,
            num_return_sequences=1,
            # Return only the continuation. Splitting the full text on
            # "Answer:" picked the wrong segment whenever the statement
            # itself contained that marker.
            return_full_text=False,
        )
        answer = output[0]["generated_text"].lower()

        # The model often keeps writing further made-up examples, so the first
        # category it mentions is its answer; later mentions are ignored.
        left_at = answer.find("left-leaning")
        right_at = answer.find("right-leaning")
        if left_at == -1 and right_at == -1:
            results.append(fallback_label)
        elif right_at == -1 or (left_at != -1 and left_at < right_at):
            results.append(0)  # Assuming 0 = left-leaning
        else:
            results.append(1)  # Assuming 1 = right-leaning

    return {"predictions": results}

In [None]:
# Run the English classifier over the whole subset, eight rows per call.
dataset_en = dataset_en.map(
    function=classify_batch_second,
    batched=True,
    batch_size=8,
)

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]


Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:08,  1.27s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:06,  1.14s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:03<00:05,  1.11s/it][A
Classifying texts:  50%|█████     | 4/8 [00:04<00:04,  1.12s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:05<00:03,  1.10s/it][A
Classifying texts:  75%|███████▌  | 6/8 [00:06<00:02,  1.13s/it][A
Classifying texts:  88%|████████▊ | 7/8 [00:07<00:01,  1.12s/it][A
Classifying texts: 100%|██████████| 8/8 [00:08<00:00,  1.12s/it]

Classifying texts:   0%|          | 0/8 [00:00<?, ?it/s][A
Classifying texts:  12%|█▎        | 1/8 [00:01<00:09,  1.41s/it][A
Classifying texts:  25%|██▌       | 2/8 [00:02<00:07,  1.19s/it][A
Classifying texts:  38%|███▊      | 3/8 [00:03<00:05,  1.19s/it][A
Classifying texts:  50%|█████     | 4/8 [00:04<00:04,  1.13s/it][A
Classifying texts:  62%|██████▎   | 5/8 [00:05<00:03,  1.10s/it]

In [None]:
# Per-class precision / recall / F1 for the English-text predictions.
print(
    classification_report(
        y_true=dataset_en["label"],
        y_pred=dataset_en["predictions"],
    )
)

              precision    recall  f1-score   support

           0       0.40      0.04      0.07      1012
           1       0.48      0.94      0.63       939

    accuracy                           0.47      1951
   macro avg       0.44      0.49      0.35      1951
weighted avg       0.44      0.47      0.34      1951

