In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import set_seed


In [None]:
# SEED EVERYTHING

def seed_setter(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds, in order: Python's ``random``, NumPy's legacy global RNG, torch
    (CPU), every CUDA device when CUDA is available, and finally calls the
    Hugging Face ``set_seed`` helper.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    cpu_seeders = (random.seed, np.random.seed, torch.manual_seed)
    for apply_seed in cpu_seeders:
        apply_seed(seed)

    # GPU generators only exist when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # HF helper, called last exactly as before.
    set_seed(seed)
seed_setter(seed=42)

In [None]:
# model setup
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" # model specification
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Decoder-only models continue from the last position of each row, so padding
# must go on the left when several prompts are generated in one batch.
# With batch_size=1 (the default used below) no padding happens and this is a no-op.
tokenizer.padding_side = "left"

# `load_in_4bit=True` as a direct keyword is deprecated (see the warning this
# cell used to print); the same 4-bit setting is now passed through an explicit
# BitsAndBytesConfig. `torch_dtype` is kept under its original name because the
# replacement keyword (`dtype`) only exists in recent transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",                                          # spreads across GPU(s) if needed
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # 4-bit quantization
    torch_dtype=torch.float16,                                  # same dtype as the original call
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00008-of-000008.safetensors:   0%|          | 0.00/4.07G [00:00<?, ?B/s]

model-00006-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00001-of-000008.safetensors:   0%|          | 0.00/8.79G [00:00<?, ?B/s]

model-00007-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00003-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00005-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00004-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00002-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Zero-shot prompt: lists the 27 candidate emotion labels, asks the model to
# reason inside <think>...</think> and to finish with "Answer: [emotion]".
# The single `{text}` placeholder is filled by `prompt_builder` via str.format.
zero_shot_template = """Identify the emotion from the list that is expressed in the text below. Begin your response with:

<think>
Provide detailed step-by-step reasoning here
</think>

and end your response with "Answer: [emotion]", where [emotion] is the correct emotion from the list of emotions provided.

List of emotions:
admiration
amusement
anger
annoyance
approval
caring
confusion
curiosity
desire
disappointment
disapproval
disgust
embarrassment
excitement
fear
gratitude
grief
joy
love
nervousness
optimism
pride
realization
relief
remorse
sadness
surprise

Text:
{text}
"""
def prompt_builder(text_input, prompt_template=zero_shot_template):
    """Return ``prompt_template`` with its ``{text}`` placeholder filled in.

    Args:
        text_input: Value substituted for ``{text}``.
        prompt_template: ``str.format``-style template containing a ``{text}``
            field; defaults to the zero-shot template.

    Returns:
        The formatted prompt string.
    """
    return prompt_template.format(text=text_input)


In [None]:
# User-turn template for the few-shot setting: only the text to classify,
# followed by an open "Response: " cue. `{text}` is filled via str.format.
few_shot_template2 = """Text: {text}

Response: """

In [None]:
# System-prompt header for the few-shot setting. It ends with "EXAMPLES:";
# `system_prompt_builder_few_shot` appends the sampled demonstrations after it.
# NOTE(review): the opening sentences describe a "sentiment rating along with a
# brief explanation", while the rest of the prompt (and the answer parser) asks
# for one emotion label from the list. Confirm whether this wording is intended
# before changing it, since editing the prompt changes experimental results.
system_prompt_few_shot_template = """You are an advanced sentiment analysis assistant. Your task is to analyze text and provide a sentiment rating along with a brief explanation. You will identify the emotion from the list that is expressed in the given text. Begin your response with:

<think>
Provide detailed step-by-step reasoning here
</think>

and end your response with "Answer: [emotion]", where [emotion] is the correct emotion from the list of emotions provided. Use the provided examples to guide your response.

LIST OF EMOTIONS:

admiration
amusement
anger
annoyance
approval
caring
confusion
curiosity
desire
disappointment
disapproval
disgust
embarrassment
excitement
fear
gratitude
grief
joy
love
nervousness
optimism
pride
realization
relief
remorse
sadness
surprise


EXAMPLES:
"""
def system_prompt_builder_few_shot(sampled_rows, prompt_template=system_prompt_few_shot_template):
    """Append few-shot demonstrations to the system-prompt header.

    Args:
        sampled_rows: DataFrame of demonstrations; each row is read through its
            ``'Text'`` and ``'Emotion'`` columns.
        prompt_template: Header text placed before the demonstrations.

    Returns:
        ``prompt_template``, a newline, then one block per row, with blocks
        separated by a blank line.
    """
    def _render_example(example):
        # One demonstration: the text, a placeholder reasoning section, the gold label.
        return (
            f"Text: {example['Text']}\n\n"
            "Response:\n\n"
            "<think>\n"
            "Detailed reasoning is provided here\n"
            "</think>\n\n"
            f"Answer: {example['Emotion']}"
        )

    rendered = [_render_example(example) for _, example in sampled_rows.iterrows()]
    return prompt_template + "\n" + "\n\n".join(rendered)

In [None]:
def extract_short_prediction(long_prediction):
    """Extract the predicted emotion label from a full model response.

    The label is looked up after the LAST occurrence of ``'Answer:'``. The first
    token after the marker that still contains characters once surrounding
    punctuation and markdown decoration are stripped is taken as the label, so
    ``Answer: joy.``, ``Answer: [joy]``, ``Answer: **Joy**`` and
    ``**Answer:** joy`` all yield ``'joy'``.

    Args:
        long_prediction: Full response text. Non-string values (e.g. NaN or
            None from a failed generation) are treated as missing.

    Returns:
        The lower-cased emotion label if it is one of the known emotions,
        otherwise the string ``"MISSING"``.
    """
    emotions_list = frozenset((
        'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
        'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
        'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
        'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
        'relief', 'remorse', 'sadness', 'surprise',
    ))
    marker = 'Answer:'
    # Original punctuation set plus common markdown decoration (* _ ` #).
    wrappers = ".,!?:;\"'()[]{}*_`#"

    # NaN/None/other non-strings have no .rfind; report them as missing
    # instead of raising inside DataFrame.apply.
    if not isinstance(long_prediction, str):
        return "MISSING"

    # Last occurrence of the marker, which should sit at the end of the response.
    ans_loc = long_prediction.rfind(marker)
    if ans_loc == -1:
        return "MISSING"

    tail = long_prediction[ans_loc + len(marker):].lower()
    for token in tail.split():
        candidate = token.strip(wrappers)
        if not candidate:
            # Pure decoration such as the closing "**" of "**Answer:**"; skip it.
            continue
        # Only the first real word counts; anything else is not a valid answer.
        return candidate if candidate in emotions_list else "MISSING"

    return "MISSING"

In [None]:
def evaluate_prediction(row):
    """Score one row: 1 if ``row['Prediction']`` equals ``row['Emotion']``, else 0.

    A missing (NA) prediction always scores 0.
    """
    predicted = row['Prediction']
    if pd.isna(predicted):
        # Prediction did not work as expected.
        return 0
    is_correct = predicted == row['Emotion']
    return int(is_correct)

In [None]:
def get_results(
    df_test,
    df_train=None,  # dataframe from which few shot examples are drawn if few_shot=True
    text_col_name_train='Text',
    text_col_name_test='Text',
    batch_size=1,
    max_new=1024,
    temperature=0.6,
    prompt_template=zero_shot_template,  # for zero-shot case
    few_shot=False,
    n=10  # no effect if few_shot = False
):
    """Generate one model response per row of ``df_test``.

    Uses the module-level ``tokenizer`` and ``model``.

    Zero-shot: each text is formatted with ``prompt_template`` and sent as a
    plain prompt. Few-shot: ``n`` rows are drawn (with replacement) from
    ``df_train`` once per batch, rendered into a system prompt, and each text is
    sent as a (system, user) chat through the tokenizer's chat template.

    Args:
        df_test: DataFrame holding the texts to classify.
        df_train: DataFrame supplying few-shot examples; required when
            ``few_shot`` is True. Its ``'Emotion'`` column provides the labels.
        text_col_name_train: Column of ``df_train`` used as the example text.
        text_col_name_test: Column of ``df_test`` holding the texts to classify.
        batch_size: Number of prompts generated together.
        max_new: Maximum number of newly generated tokens per prompt.
        temperature: Sampling temperature (sampling is always enabled).
        prompt_template: Zero-shot template; ignored when ``few_shot`` is True.
        few_shot: Whether to use the few-shot chat setup.
        n: Number of few-shot examples per batch; ignored for zero-shot.

    Returns:
        A list with one string per row of ``df_test``: the text generated by
        the model (the prompt itself is NOT included), or ``"<oom>"`` for every
        row of a batch that ran out of GPU memory.

    Raises:
        ValueError: If ``few_shot`` is True but ``df_train`` is None.
        RuntimeError: Any runtime error other than CUDA out-of-memory.
    """
    if few_shot and df_train is None:
        raise ValueError("df_train must be provided when few_shot=True")

    col_test = df_test[text_col_name_test]
    results = []

    if few_shot:
        # The example renderer reads the 'Text' column, so expose the requested
        # training text column under that name (the caller's frame is not modified).
        example_pool = df_train.assign(Text=df_train[text_col_name_train])

    # Use the id the tokenizer pads with; fall back to EOS when no pad token is defined.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    for batch_counter, i in enumerate(range(0, len(col_test), batch_size), start=1):
        current_batch_texts = col_test.iloc[i:i+batch_size]

        if few_shot:
            # sample few-shot examples ONCE per batch
            sampled_indices = np.random.randint(0, len(example_pool), size=n)
            system_prompt = system_prompt_builder_few_shot(
                sampled_rows=example_pool.iloc[sampled_indices],
                prompt_template=system_prompt_few_shot_template
            )

            # one (system, user) conversation per text, rendered to a string
            batch_texts = [
                tokenizer.apply_chat_template(
                    [
                        {"role": "system", "content": system_prompt},
                        {
                            "role": "user",
                            "content": prompt_builder(
                                text_input=t,
                                prompt_template=few_shot_template2,
                            ),
                        },
                    ],
                    tokenize=False,
                    add_generation_prompt=True,  # so it adds the Assistant + <think> etc.
                )
                for t in current_batch_texts
            ]
        else:
            # zero-shot - no system prompt
            batch_texts = [
                prompt_builder(
                    text_input=t,
                    prompt_template=prompt_template,
                )
                for t in current_batch_texts
            ]

        try:
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                # Chat-templated text already carries its special tokens;
                # adding them again would duplicate e.g. the BOS token.
                add_special_tokens=not few_shot,
            ).to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=pad_id,  # explicit, avoids the per-call warning
                )

            # Keep only the newly generated tokens. The prompt contains
            # "Answer: ..." itself (instructions and few-shot examples), so
            # decoding it would let the answer parser pick up an example label
            # whenever the model fails to produce its own answer.
            prompt_len = inputs["input_ids"].shape[1]
            results.extend(
                tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)
            )

        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print("OUT OF MEMORY ON BATCH STARTING AT INDEX:", i)
                torch.cuda.empty_cache()
                # keep the output aligned with df_test: one placeholder per row
                results.extend(["<oom>"] * len(batch_texts))
                continue
            raise

        # Clear cache every 10 successful batches if batch size > 1
        if batch_size > 1 and batch_counter % 10 == 0:
            torch.cuda.empty_cache()

    return results

In [None]:
def process_df_short(df_input):
    """Add the ``'Prediction'`` and ``'Evaluation'`` columns to ``df_input``.

    ``'Prediction'`` is the short label parsed from ``'Prediction_Long'``;
    ``'Evaluation'`` is the 0/1 score of that label against ``'Emotion'``.
    The frame is modified in place and also returned.
    """
    short_labels = df_input['Prediction_Long'].apply(extract_short_prediction)
    df_input['Prediction'] = short_labels

    row_scores = df_input.apply(evaluate_prediction, axis=1)
    df_input['Evaluation'] = row_scores
    return df_input

In [None]:
def apply_model_to_dataframe(
                            df_test,
                            file_save_name,
                            df_train=None,
                            text_col_name_train='Text',
                            text_col_name_test='Text',
                            batch_size=1, # unlikely to work on 24 GB GPU RAM or below if > 1
                            max_new=1024,
                            temperature=0.6,
                            prompt_template=zero_shot_template,
                            few_shot=False,
                            n=10):
    """Run the model over ``df_test``, score the predictions and save them as CSV.

    Args:
        df_test: DataFrame with the texts to classify; it is not modified.
        file_save_name: Output path WITHOUT extension; ``.csv`` is appended.
        df_train, text_col_name_train, text_col_name_test, batch_size, max_new,
        temperature, prompt_template, few_shot, n: Forwarded unchanged to
            ``get_results``.

    Returns:
        A copy of ``df_test`` with the added columns ``'Prediction_Long'``,
        ``'Prediction'`` and ``'Evaluation'``.
    """
    out_path = f"{file_save_name}.csv"

    # Create the target directory BEFORE the long inference run, so a missing
    # results folder cannot make the final to_csv fail after generation is done.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_copy = df_test.copy()
    results = get_results(df_test=df_test,
                          df_train=df_train,
                          text_col_name_train=text_col_name_train,
                          text_col_name_test=text_col_name_test,
                          batch_size=batch_size,
                          max_new=max_new,
                          temperature=temperature,
                          prompt_template=prompt_template,
                          few_shot=few_shot,
                          n=n)
    df_copy['Prediction_Long'] = results
    df_copy = process_df_short(df_copy)
    df_copy.to_csv(out_path, index=False) # make sure to change directory when moving to cluster
    return df_copy


In [None]:
# GoEmotions training split (sampled below for the 10-row smoke test).
df = pd.read_csv("/home/enaegele/data/GoEmotions-train.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/home/enaegele/data/GoEmotions-train.csv'

In [None]:
# Fixed 10-row sample so the smoke test is repeatable.
df_small = df.sample(
    n=10,
    random_state=42,
)

In [None]:
# Smoke test: zero-shot run on the 10-row sample, saved to small_train_template1.csv.
result = apply_model_to_dataframe(
    df_small,
    file_save_name="small_train_template1",
)
result

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for

Unnamed: 0,Text,Emotion,Prediction_Long,Prediction,Evaluation
1297,get the fuck out,anger,Identify the emotion from the list that is exp...,anger,1
7228,is there something i am missing here ? i do no...,confusion,Identify the emotion from the list that is exp...,confusion,1
21409,name is so old his favorite routes are to the ...,anger,Identify the emotion from the list that is exp...,realization,0
8358,wow great analogy exactly like that yes,admiration,Identify the emotion from the list that is exp...,admiration,1
20839,you can use your inhuman orang utan strength t...,approval,Identify the emotion from the list that is exp...,anger,0
8193,funny enough op is a gooner,amusement,Identify the emotion from the list that is exp...,approval,0
22552,lawdaddy and dadturion the ultimate combo,admiration,Identify the emotion from the list that is exp...,amusement,0
6072,i find the joke chill and reasoned,amusement,Identify the emotion from the list that is exp...,MISSING,0
10801,name damn that is the kind of biting commentar...,joy,Identify the emotion from the list that is exp...,amusement,0
15105,score is currently when th round begins and na...,annoyance,Identify the emotion from the list that is exp...,excitement,0


FULL SCALE RESULTS

In [None]:
# Clean (noise-free) GoEmotions test split.
df_test = pd.read_csv("/home/enaegele/data/GoEmotions-test.csv")

In [None]:
# "pseudo" noise variants of the test set, one file per noise setting.
df_noise_pseudo_10_1 = pd.read_csv("/home/enaegele/data/test_noise_pseudo_10_1.csv")
df_noise_pseudo_20_1 = pd.read_csv("/home/enaegele/data/test_noise_pseudo_20_1.csv")
df_noise_pseudo_20_2 = pd.read_csv("/home/enaegele/data/test_noise_pseudo_20_2.csv")
df_noise_pseudo_30_2 = pd.read_csv("/home/enaegele/data/test_noise_pseudo_30_2.csv")

In [None]:
# "mash" noise variants of the test set, one file per noise setting.
df_noise_mash_10_1 = pd.read_csv("/home/enaegele/data/test_noise_mash_10_1.csv")
df_noise_mash_20_1 = pd.read_csv("/home/enaegele/data/test_noise_mash_20_1.csv")
df_noise_mash_20_2 = pd.read_csv("/home/enaegele/data/test_noise_mash_20_2.csv")
df_noise_mash_30_2 = pd.read_csv("/home/enaegele/data/test_noise_mash_30_2.csv")

JOB 1 - PSEUDO JUNK

In [None]:
# the baseline - no noise, zero shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_test,
    file_save_name="/home/enaegele/results/0_shot_no_noise_results",
    df_train=None,
    few_shot=False,
    prompt_template=zero_shot_template,
)

In [None]:
# pseudo 10/1 noise, zero shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_pseudo_10_1,
    file_save_name="/home/enaegele/results/0_shot_noise_pseudo_10_1_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

In [None]:
# pseudo 20/1 noise, 0 shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_pseudo_20_1,
    file_save_name="/home/enaegele/results/0_shot_noise_pseudo_20_1_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

In [None]:
 #pseudo 20/2 noise, 0 shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_pseudo_20_2,
    file_save_name="/home/enaegele/results/0_shot_noise_pseudo_20_2_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

In [None]:
#pseudo 30/2 noise, 0 shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_pseudo_30_2,
    file_save_name="/home/enaegele/results/0_shot_noise_pseudo_30_2_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

JOB 2 - MASH JUNK

In [None]:
# mash 10/1 noise, zero shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_mash_10_1,
    file_save_name="/home/enaegele/results/0_shot_noise_mash_10_1_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

In [None]:
# mash 20/1 noise, 0 shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_mash_20_1,
    file_save_name="/home/enaegele/results/0_shot_noise_mash_20_1_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

In [None]:
# mash 20/2 noise, 0 shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_mash_20_2,
    file_save_name="/home/enaegele/results/0_shot_noise_mash_20_2_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)

In [None]:
# mash 30/2 noise, 0 shot
seed_setter()  # reseed so this run does not depend on earlier cells
apply_model_to_dataframe(
    df_test=df_noise_mash_30_2,
    file_save_name="/home/enaegele/results/0_shot_noise_mash_30_2_results",
    few_shot=False,
    text_col_name_test="NoisyText",
    prompt_template=zero_shot_template,
    n=50,  # has no effect while few_shot=False
)