In [1]:
from tqdm import tqdm
import numpy as np
import os

In [2]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
!pip install kaggle

# Upload your kaggle.json file (API key)
from google.colab import files
#files.upload()

# Create the .kaggle directory and move the API key
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d surajkarakulath/labelled-corpus-political-bias-hugging-face

# Unzip the dataset (if it's a zip file)
!unzip labelled-corpus-political-bias-hugging-face.zip


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/surajkarakulath/labelled-corpus-political-bias-hugging-face
License(s): other
labelled-corpus-political-bias-hugging-face.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  labelled-corpus-political-bias-hugging-face.zip
replace Center Data/Center Data/0_0.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [3]:
!mkdir -p dataset

In [4]:
import os
import shutil
from datasets import load_dataset
from datasets import Dataset

In [5]:
left_data = os.listdir('Left Data/Left Data')
right_data = os.listdir('Right Data/Right Data')
center_data = os.listdir('Center Data/Center Data')

In [6]:
for i in range(3996): # downsampling (in this notebook, we do this to ensure the test set remains the same)
  shutil.copy(f'Left Data/Left Data/{left_data[i]}', f'dataset/left{left_data[i]}')
  shutil.copy(f'Right Data/Right Data/{right_data[i]}', f'dataset/right{right_data[i]}')
  shutil.copy(f'Center Data/Center Data/{center_data[i]}', f'dataset/center{center_data[i]}')

In [2]:
texts = []
data_dir='dataset'
for file_name in os.listdir(data_dir):
    if file_name.endswith(".txt"):
        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as file:
            if "left" in file_name:
                label = "left"
            elif "right" in file_name:
                label = "right"
            else:
                label = "center"
            texts.append({"file_name": file_name, "content": file.read(), "label": label})



In [5]:
dataset = Dataset.from_list(texts).train_test_split(test_size=0.1, seed = 42)
dataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'content', 'label'],
        num_rows: 10789
    })
    test: Dataset({
        features: ['file_name', 'content', 'label'],
        num_rows: 1199
    })
})

## Comparison between Aligned and Finetuned models for LLama-3.1-3B

### Aligned Model (llama)

In [68]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.1-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [9]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [10]:
test_dataset = dataset['test']

In [75]:
len(test_dataset)

1199

In [79]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

answers = []
true_answers = []
for i in tqdm(range(len(test_dataset))):
    sample_article = test_dataset['content'][i]
    sample_label = test_dataset['label'][i]
    input_prompt = alpaca_prompt.format(
        "Classify the given news article as left, right or center leaning in terms of political bias. Give response as 'left', 'right' or 'center' only.", # instruction
        sample_article, # input
        "", # output - leave this blank for generation!
    )
    inputs = tokenizer(
    [
        input_prompt
    ], truncation=True, max_length=max_seq_length, return_tensors = "pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    answers.append(response[0])
    true_answers.append(sample_label)


100%|██████████| 1199/1199 [03:35<00:00,  5.55it/s]


In [81]:
import re

answer_clean = [v.strip().strip(" '\"").lower() for v in answers]

for i in range(len(answer_clean)):
    if re.search(r"^(left|right|center)", answer_clean[i][:10]):
        answer_clean[i] = re.search(r"^(left|right|center)", answer_clean[i][:10]).group(1)


In [82]:
np.unique(true_answers, return_counts=True) # Should have an equal distribution

(array(['center', 'left', 'right'], dtype='<U6'), array([387, 399, 413]))

In [83]:
np.unique(answer_clean, return_counts=True)

(array([', said in a statement. “we will not wait for another election cycle to take action.” the committee released a 300-page report that detailed its findings and concluded that president trump had abused his power by soliciting foreign interference in the 2020 election, as well as by pressuring ukraine to investigate former vice president',
        '110 million, had preexisting conditions. but biden’s claim was accurate.\nthe aca requires health insurance companies to cover people with preexisting conditions. trump has repeatedly proposed repealing the law, which would likely leave millions of americans without coverage.\nthe\xa0american heart association, the american cancer society, and the american diabetes',
        'center', 'left',
        'may also be supported by other federal agencies, such as the department of veterans affairs, and states, tribes, territories, and local governments. hhrsa selected health centers based on factors such as the population served, according to

In [84]:
correct=0
for i in range(len(answer_clean)):
  if answer_clean[i]==true_answers[i]:
    correct+=1

accuracy=correct/len(answer_clean)
print("Overall Accuracy: ", accuracy)

Overall Accuracy:  0.36196830692243537


## Finetuned Model (llama)

In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 8192
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "rr4433/dl_final_project_r32",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
prompt = """Classify the bias present in these news articles. Output 'left' if the article is biased towards the left, 'right' if the article is biased towards the right, and 'center' if it is neutral. The article and expected outcome are given below:

### Article:
{}


### Output:
{}"""

In [22]:
test_dataset = dataset['test']

In [23]:
# Inference
FastLanguageModel.for_inference(model)
answers = []
true_answers = []
for i in tqdm(range(len(test_dataset))):
    sample_article = test_dataset['content'][i]
    sample_label = test_dataset['label'][i]
    input_prompt = prompt.format(
        sample_article,
        "", # output - leave this blank for generation! LLM willl generate is it is True or False
    )
    inputs = tokenizer(
    [
        input_prompt
    ], truncation=True, max_length=max_seq_length, return_tensors = "pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    answers.append(response[0])
    true_answers.append(sample_label)


100%|██████████| 1199/1199 [26:06<00:00,  1.31s/it]


In [28]:
correct=0
for i in range(len(answers)):
  if answers[i]==true_answers[i]:
    correct+=1

accuracy=correct/len(answers)
print("Overall Accuracy: ", accuracy)

Overall Accuracy:  0.9616346955796498


## Comparison between Aligned and Finetuned models for Phi-3.5 mini

### Aligned Model (Phi)

In [7]:
from unsloth import FastLanguageModel
import torch

In [13]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [9]:
test_dataset = dataset['test']

In [15]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

answers = []
true_answers = []
for i in tqdm(range(len(test_dataset))):
    sample_article = test_dataset['content'][i]
    sample_label = test_dataset['label'][i]
    input_prompt = alpaca_prompt.format(
        "Classify the given news article as left, right or center leaning in terms of political bias. Give response as 'left', 'right' or 'center' only.", # instruction
        sample_article, # input
        "", # output - leave this blank for generation!
    )
    inputs = tokenizer(
    [
        input_prompt
    ], truncation=True, max_length=max_seq_length, return_tensors = "pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    answers.append(response[0])
    true_answers.append(sample_label)


100%|██████████| 1199/1199 [11:42<00:00,  1.71it/s]


In [21]:
import re

answer_clean = [v.strip().strip(" '\"").lower() for v in answers]

for i in range(len(answer_clean)):
    match = re.search(r"^(left|right|center)", answer_clean[i][:10])
    if match:
        answer_clean[i] = match.group(1)
    else:
        answer_clean[i] = "Invalid" # Replace with "Invalid" if not found


In [23]:
correct=0
for i in range(len(answer_clean)):
  if answer_clean[i]==true_answers[i]:
    correct+=1

accuracy=correct/len(answer_clean)
print("Overall Accuracy: ", accuracy)

Overall Accuracy:  0.2902418682235196


## Finetuned Model (Phi)

In [8]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "adithengg/phi_adith",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/59.8M [00:00<?, ?B/s]

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [13]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

answers = []
true_answers = []
for i in tqdm(range(len(test_dataset))):
    sample_article = test_dataset['content'][i]
    sample_label = test_dataset['label'][i]
    input_prompt = prompt.format(
        sample_article,
        "", # output - leave this blank for generation! LLM willl generate is it is True or False
    )
    inputs = tokenizer(
    [
        input_prompt
    ], truncation=True, max_length=max_seq_length, return_tensors = "pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    answers.append(response[0])
    true_answers.append(sample_label)


100%|██████████| 1199/1199 [13:58<00:00,  1.43it/s]


In [15]:
import re

answer_clean = [v.strip().strip(" '\"").lower() for v in answers]

for i in range(len(answer_clean)):
    match = re.search(r"^(left|right|center)", answer_clean[i][:10])
    if match:
        answer_clean[i] = match.group(1)
    else:
        answer_clean[i] = "Invalid" # Replace with "Invalid" if not found


In [16]:
correct=0
for i in range(len(answer_clean)):
  if answer_clean[i]==true_answers[i]:
    correct+=1

accuracy=correct/len(answer_clean)
print("Overall Accuracy: ", accuracy)

Overall Accuracy:  0.9241034195162635


## Comparison between Aligned and finetuned models for Gemma 2

In [6]:
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [12]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b-it-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.12.4: Fast Gemma2 patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [28]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

answers = []
true_answers = []
for i in tqdm(range(len(test_dataset))):
    sample_article = test_dataset['content'][i]
    sample_label = test_dataset['label'][i]
    input_prompt = alpaca_prompt.format(
        "Classify the given news article as left, right or center leaning in terms of political bias. Give response as 'left', 'right' or 'center' only.", # instruction
        sample_article, # input
        "", # output - leave this blank for generation!
    )
    inputs = tokenizer(
    [
        input_prompt
    ], truncation=True, max_length=max_seq_length, return_tensors = "pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    answers.append(response[0])
    true_answers.append(sample_label)


100%|██████████| 1199/1199 [05:49<00:00,  3.42it/s]


In [30]:
import re

answer_clean = [v.strip().strip(" '\"").lower() for v in answers]

for i in range(len(answer_clean)):
    match = re.search(r"^(left|right|center)", answer_clean[i][:10])
    if match:
        answer_clean[i] = match.group(1)
    else:
        answer_clean[i] = "Invalid" # Replace with "Invalid" if not found


In [31]:
correct=0
for i in range(len(answer_clean)):
  if answer_clean[i]==true_answers[i]:
    correct+=1

accuracy=correct/len(answer_clean)
print("Overall Accuracy: ", accuracy)

Overall Accuracy:  0.4120603015075377


### Aligned Model

In [32]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "adithengg/gemma_train",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.12.4: Fast Gemma2 patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [33]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

answers = []
true_answers = []
for i in tqdm(range(len(test_dataset))):
    sample_article = test_dataset['content'][i]
    sample_label = test_dataset['label'][i]
    input_prompt = prompt.format(
        sample_article,
        "", # output - leave this blank for generation! LLM willl generate is it is True or False
    )
    inputs = tokenizer(
    [
        input_prompt
    ], truncation=True, max_length=max_seq_length, return_tensors = "pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    answers.append(response[0])
    true_answers.append(sample_label)


100%|██████████| 1199/1199 [05:54<00:00,  3.38it/s]


In [34]:
import re

answer_clean = [v.strip().strip(" '\"").lower() for v in answers]

for i in range(len(answer_clean)):
    match = re.search(r"^(left|right|center)", answer_clean[i][:10])
    if match:
        answer_clean[i] = match.group(1)
    else:
        answer_clean[i] = "Invalid" # Replace with "Invalid" if not found


In [35]:
correct=0
for i in range(len(answer_clean)):
  if answer_clean[i]==true_answers[i]:
    correct+=1

accuracy=correct/len(answer_clean)
print("Overall Accuracy: ", accuracy)

Overall Accuracy:  0.939115929941618
