In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get latest Unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

import pandas as pd
import torch

from unsloth import FastLanguageModel

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
#fourbit_models = [
#    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
#    "unsloth/gemma-7b-it-bnb-4bit",
#] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    #model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    #model_name = "unsloth/Llama-3.2-1B-bnb-4bit",
    #model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    #model_name = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
    #model_name = "unsloth/gemma-2-9b-it-bnb-4bit",
    max_seq_length = 8192,
    load_in_4bit = True,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

Mounted at /content/drive
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

In [4]:
#prompt = """
#You are an expert in European Union policies. I’m going to show you 2 EU policies, and I need to determine which policy is more economically left-leaning. Please analyze the beginning of the policies' preamble based on principles commonly associated with economically left policies, such as government intervention in the economy, redistribution of wealth, social welfare programs, progressive taxation, regulation of markets, and support for labor rights. Based on these principles, analyze which policy is more economically left-leaning and return only '1' or '2'. Do not include any explanation or additional text.#

#Example:

#EU Policy 1:
#[Preamble]

#EU Policy 2:
#[Preamble]

#Q: Which policy is more economically left? Answer with '1' or '2' based on the principles provided. Ensure your analysis is impartial and considers both policies equally.
#A: 2

#Now answer the question below:

#EU Policy 1:
#11 . 8 . 90 Official Journal of the European Communities No L 216/ 1 I (Acts whose publication is obligatory) COMMISSION REGULATION (EEC) No 2357/90 of 10 August 1990 fixing the import levies on cereals and on wheat or rye flour, groats and meal THE COMMISSION OF THE EUROPEAN COMMUNITIES, Having regard to the Treaty establishing the European Economic Community, Having regard to the Act of Accession of Spain and Portugal, Having regard to Council Regulation (EEC) No 2727/75 of 29 October 1975 on the common organization of the market in cereals ('), as last amended by Regulation (EEC) No 1340/90 (2), and in particular Article 13 (5) thereof, Having regard to Council Regulation (EEC) No 1676/85 of 11 June 1985 on the value of the unit of account and the exchange rates to be applied for the purposes of the common agricultural policy (3), as last amended by Regu ­ lation (EEC) No 2205/90 (4), and in particular Article 3 thereof, Having regard to the opinion of the Monetary Committee, Whereas the import levies on cereals, wheat and rye flour, and wheat groats and meal were fixed by Commission Regulation (EEC) No 1801 /90 0 and subsequent amending Regulations ; Whereas, if the levy system is to operate normally, levies should be calculated on the following basis :  in the case of currencies which are maintained in rela ­ tion to each other at any given moment within a band of 2,25 %, a rate of exchange based on their central rate, multiplied by the corrective factor provided for in the last paragraph of Article 3 ( 1 ) of Regulation (EEC) No 1676/85,  for other currencies, an exchange rate based on the arithmetic mean of the spot market rates of each of these currencies recorded for a given period in rela ­ tion to the Community currencies referred to in the previous indent, and the aforesaid coefficient ; Whereas these exchange rates being those recorded on 9 August 1990 ; Whereas the aforesaid corrective factor affects the entire calculation basis for the levies, including the equivalence coefficients ; Whereas it follows from applying the detailed rules contained in Regulation (EEC) No 1801 /90 to today's offer prices and quotations known to the Commission that the levies at present in force should be altered to the amounts set out in the Annex hereto, HAS

#EU Policy 2:
#27. 1 . 95 Official Journal of the European Communities No L 19/43 COMMISSION REGULATION (EC) No 136/95 of 26 January 1995 fixing the import levies on cereals and on wheat or rye flour, groats and meal January 1995, as regards floating currencies, should be used to calculate the levies ; Whereas it follows from applying the detailed rules contained in Regulation (EC) No 3035/94 to today's offer prices and quotations known to the Commission that the levies at present in force should be altered to the amounts set out in the Annex hereto, THE COMMISSION OF THE EUROPEAN COMMUNITIES, Having regard to the Treaty establishing the European Community, Having regard to Council Regulation (EEC) No 1766/92 of 30 June 1992 on the common organization of the market in cereals ('), as last amended by Regulation (EC) No 1866/94 (2), and in particular Articles 10 (5) and 11 (3) thereof, Having regard to Council Regulation (EEC) No 3813/92 of 28 December 1992 on the unit of account and the conversion rates to be applied for the purposes of the common agricultural policy (3), as amended by Regulation (EC) No 3528/93 (4), Whereas the import levies on cereals, wheat and rye flour, and wheat groats and meal were fixed by Commission Regulation (EC) No 3035/94 (*) and subsequent amending Regulations ; Whereas, in order to make it possible for the levy arrange ­ ments to function normally, the representative market rate established during the reference period from 25 HAS

#Q: Which policy is more economically left? Answer with '1' or '2' based on the principles provided. Ensure your analysis is impartial and considers both policies equally.
#A:"""

In [5]:
#messages = [
#    {"from": "human", "value": prompt},
#]

#inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")

#text_streamer = TextStreamer(tokenizer)
#_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 2, use_cache = True)

In [6]:
# File paths
input_file = "/content/drive/MyDrive/EU Policy Feedback/prompt_df_export.csv"
output_file = "/content/drive/MyDrive/EU Policy Feedback/llm_output.csv"

# Load the input CSV
prompt_df = pd.read_csv(input_file)

# Initialize a list to store the results
results = []

# Process each row in the DataFrame
for _, row in tqdm(prompt_df.iterrows(), total=len(prompt_df), desc="Processing prompts"):
    id_var = row['id_var']
    prompt = row['prompt_content_var']

    # Prepare the messages in the format required by the model
    messages = [{"from": "human", "value": prompt}]

    # Tokenize the input
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    # Generate the response
    output_ids = model.generate(
        input_ids=inputs,
        max_new_tokens=2,  # Adjust token limit as needed
        use_cache=True,
        top_k=2,        # Top-k sampling
        top_p=0.2        # Nucleus sampling
    )

    # Decode the response
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract the final line of the response
    final_line = output_text.strip().split("\n")[-1]

    # Append the result to the list
    results.append({"id_var": id_var, "output": final_line})

# Create a new DataFrame with the results
output_df = pd.DataFrame(results)

# Save the results to a new CSV
output_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")


Processing prompts:   0%|          | 0/500 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing prompts: 100%|██████████| 500/500 [01:09<00:00,  7.19it/s]


Results saved to /content/drive/MyDrive/EU Policy Feedback/llm_output.csv


In [None]:
torch.cuda.empty_cache()  # Free up GPU memory

In [None]:
#import pandas as pd
#from tqdm import tqdm
#import torch

# File paths
input_file = "/content/drive/MyDrive/EU Policy Feedback/prompt_df_export.csv"
output_file = "/content/drive/MyDrive/EU Policy Feedback/llm_output.csv"

# Load the input CSV
prompt_df = pd.read_csv(input_file)

# Extract prompts and IDs
prompts = prompt_df['prompt_content_var'].tolist()
id_vars = prompt_df['id_var'].tolist()

# Generation arguments
generation_args = {
    "max_new_tokens": 2,
    "top_k": 50,
    "top_p": 0.9,
    "use_cache": True,
}

# Batched inference function
def run_batched_inference(prompts, batch_size=1):
    outputs = []

    for i in tqdm(range(0, len(prompts), batch_size), desc="Processing prompts in batches"):
        # Get current batch
        batch_prompts = prompts[i:i + batch_size]

        # Tokenize the batch
        batch_inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

        # Generate responses for the batch
        batch_outputs = model.generate(
            input_ids=batch_inputs['input_ids'],
            attention_mask=batch_inputs['attention_mask'],
            max_new_tokens=generation_args["max_new_tokens"],
            top_k=generation_args["top_k"],
            top_p=generation_args["top_p"],
            use_cache=generation_args["use_cache"]
        )

        torch.cuda.empty_cache()  # Free up GPU memory
        #model.gradient_checkpointing_enable()

        # Decode responses and skip special tokens
        batch_decoded = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True)

        # Extract the final line from each output
        final_lines = [output.strip().split("\n")[-1] for output in batch_decoded]

        # Append the final lines to the outputs list
        outputs.extend(final_lines)

    return outputs

# Run batched inference
outputs = run_batched_inference(prompts, batch_size=10)

# Create a DataFrame with the results
output_df = pd.DataFrame({"id_var": id_vars, "output": outputs})

# Save to CSV
output_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")


In [None]:

# File paths
input_file = "/content/drive/MyDrive/EU Policy Feedback/prompt_df_export.csv"
output_file = "/content/drive/MyDrive/EU Policy Feedback/llm_output.csv"

# Load the input CSV
prompt_df = pd.read_csv(input_file)

# Extract prompts and IDs
prompts = prompt_df['prompt_content_var'].tolist()
id_vars = prompt_df['id_var'].tolist()

# Generation arguments
generation_args = {
    "max_new_tokens": 2,
    "top_k": 50,
    "top_p": 0.9,
    "use_cache": True,
}

# Batched inference function with mixed precision
def run_batched_inference(prompts, batch_size=1):
    outputs = []

    for i in tqdm(range(0, len(prompts), batch_size), desc="Processing prompts in batches"):
        # Get current batch
        batch_prompts = prompts[i:i + batch_size]

        # Tokenize the batch
        batch_inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

        # Use mixed precision for memory efficiency
        with torch.cuda.amp.autocast():  # Enable FP16 precision
            # Generate responses for the batch
            batch_outputs = model.generate(
                input_ids=batch_inputs['input_ids'],
                attention_mask=batch_inputs['attention_mask'],
                max_new_tokens=generation_args["max_new_tokens"],
                top_k=generation_args["top_k"],
                top_p=generation_args["top_p"],
                use_cache=generation_args["use_cache"]
            )

        torch.cuda.empty_cache()  # Free up GPU memory after each batch

        # Decode responses and skip special tokens
        batch_decoded = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True)

        # Extract the final line from each output
        final_lines = [output.strip().split("\n")[-1] for output in batch_decoded]

        # Append the final lines to the outputs list
        outputs.extend(final_lines)

    return outputs

# Run batched inference
outputs = run_batched_inference(prompts, batch_size=10)

# Create a DataFrame with the results
output_df = pd.DataFrame({"id_var": id_vars, "output": outputs})

# Save to CSV
output_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")


  with torch.cuda.amp.autocast():  # Enable FP16 precision
  with torch.cuda.amp.autocast():  # Enable FP16 precision
Processing prompts in batches:   2%|▏         | 1/50 [00:05<04:49,  5.90s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.50 GiB. GPU 0 has a total capacity of 39.56 GiB of which 7.27 GiB is free. Process 16366 has 32.29 GiB memory in use. Of the allocated memory 31.68 GiB is allocated by PyTorch, and 108.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)