# Class Oversampling

Using DeepSeek, we generate new examples by paraphrasing the existing text. To obtain different variations, each paraphrase is written in the voice of a randomly chosen 'persona'.
Two approaches are taken:

1. For each subclass, we create new samples. With this approach, we need 3 separate classifiers (one per subclass) rather than a single multilabel classifier.
2. We create a main zero class and then only expand the subsets.

**We do not oversample for the multilabel classifier.**

**The generated output needs some manual cleaning.**


In [None]:
import random
from typing import List

import ollama

# Pool of writing styles for paraphrasing. When this list is handed to
# OllamaTextOversampler, one entry is drawn at random for every generated
# sample and inserted into the prompt ('... as if you were "<style>"').
PERSONALITIES = [
    "friendly",
    "formal",
    "humorous",
    "poetic",
    "sarcastic",
    "dramatic",
    "scientific",
    "mysterious",
    "adventurous",
    "romantic",
    "philosophical",
    "historical",
    "technical",
    "casual",
    "business-like",
    "playful",
    "empathetic",
    "authoritative",
    "inquisitive",
    "optimistic",
    "pessimistic",
    "cynical",
    "realistic",
    "idealistic",
    "whimsical",
    "nostalgic",
    "sophisticated",
    "down-to-earth",
    "witty",
    "charming",
    "enigmatic",
    "intellectual",
    "artistic",
]


class OllamaTextOversampler:
    def __init__(
        self,
        api_url: str = "http://localhost:11434",
        model: str = "deepseek-r1:7b",
        personalities: List[str] = None,
    ) -> str | List[str]:
        self.api_url = api_url
        self.model = model
        self.personalities = personalities

    def generate_alternatives(self, text: str, n: int = 1):
        output_texts = []
        for _ in range(n):
            # Randomly select a personality:
            if len(self.personalities) > 0:
                personality = random.choice(self.personalities)
                prompt = f'Just answer with the text and nothing else, generate an alternate version of the following tweet as if you were "{personality}": {text}'  # noqa: E501
            else:
                personality = None
                prompt = f'Just answer with the text and nothing else, generate an alternate version of the following tweet: "{text}"'  # noqa: E501

            # Prepare the request payload
            response = ollama.chat(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                options={
                    "temperature": 0.7,
                    "max_tokens": 100,
                    "top_p": 0.9,
                    "frequency_penalty": 0.5,
                    "presence_penalty": 0.5,
                },
            )

            # Todo add support for custom cleanup method
            if "deepseek" in self.model:
                output_texts.append(response.message.content.split("</think>")[-1].strip())
            else:
                output_texts.append(response.message.content.strip())

        # Return the generated texts:
        if len(output_texts) == 1:
            return output_texts[0]
        else:
            return output_texts

In [None]:
# Get class balance for the following columns: "scientific_claim", "scientific_reference", "scientific_entities":
def get_class_balance(df, column):
    """Return the share of rows in ``df`` that fall into each value of ``column``.

    The result maps every distinct value reported by ``value_counts()`` to its
    count divided by the total number of rows in ``df``.
    """
    n_rows = len(df)
    shares = {}
    for label, occurrences in df[column].value_counts().items():
        shares[label] = occurrences / n_rows
    return shares


def get_class_balance_for_all_columns(df, columns):
    """Return ``{column: class balance}`` for every name in ``columns``.

    Each value is whatever ``get_class_balance(df, column)`` returns.
    """
    return {column: get_class_balance(df, column) for column in columns}

Class balance for scientific_claim:
{0.0: 0.7296416938110749, 1.0: 0.2703583061889251}

Class balance for scientific_reference:
{0.0: 0.8175895765472313, 1.0: 0.18241042345276873}

Class balance for scientific_entities:
{0.0: 0.750814332247557, 1.0: 0.249185667752443}


In [None]:
# Over-sample the minority classes using the paraphrasing model:

import pandas as pd
from tqdm.notebook import tqdm


def oversample_minority_classes(
    df: pd.DataFrame,
    text_column: str,
    class_column: str,
    oversampler: OllamaTextOversampler,
):
    """Generate paraphrases for every row of the least frequent class.

    The number of paraphrases per row is chosen so that the minority class
    ends up roughly as large as the most frequent class: the ratio between
    the largest and the smallest class count is rounded to the nearest
    integer, and one less than that is generated for each minority row
    (the original row already counts once).

    Args:
        df: Data containing the texts and their class labels.
        text_column: Name of the column holding the text to paraphrase.
        class_column: Name of the column holding the class labels.
        oversampler: Object whose ``generate_alternatives(text, n=...)``
            produces the paraphrases.

    Returns:
        A flat list with all newly generated texts (the original rows are not
        included). Empty when the column has no labels or the classes are
        already balanced.
    """
    class_counts = df[class_column].value_counts()
    if class_counts.empty:
        # No labels at all: there is no minority class to expand.
        return []

    minority_class = class_counts.idxmin()
    minority_rows = df[df[class_column] == minority_class]

    # True division, so the ratio really is rounded to the nearest integer.
    # The previous floor division made round() a no-op and always rounded
    # down. Plain ints keep round() returning a Python int.
    oversample_count = round(int(class_counts.max()) / int(class_counts.min()))
    n = oversample_count - 1
    print(f"Over-sampling an additional {n} times for the minority class: {minority_class}")

    if n <= 0:
        return []

    new_samples = []
    for text in tqdm(minority_rows[text_column], total=len(minority_rows)):
        new_texts = oversampler.generate_alternatives(text, n=n)

        # The oversampler returns a bare string for a single result and a
        # list otherwise; check the type rather than relying on n.
        if isinstance(new_texts, str):
            new_samples.append(new_texts)
        else:
            new_samples.extend(new_texts)

    return new_samples

## Oversampling for Each Sub-class


In [None]:
# Get training data:
# Create the dataset:
import os.path

import pandas as pd

# Start with empty frames so both names exist even if reading fails below.
subtask4a_train_df, subtask4a_dev_df = pd.DataFrame(), pd.DataFrame()

# Use the data directory under the current working directory when the
# training file is found there; otherwise look two directories further up.
if os.path.isfile("./data/processed/task4/subtask_4a/ct_train_clean.tsv"):
    train_path = "./data/processed/task4/subtask_4a/ct_train_clean.tsv"
    dev_path = "./data/processed/task4/subtask_4a/ct_dev_clean.tsv"
else:
    train_path = "../../data/processed/task4/subtask_4a/ct_train_clean.tsv"
    dev_path = "../../data/processed/task4/subtask_4a/ct_dev_clean.tsv"

subtask4a_train_df = pd.read_csv(train_path, sep="\t")
subtask4a_dev_df = pd.read_csv(dev_path, sep="\t")

In [None]:
# Paraphrasing helper used for all oversampling below: the deepseek-r1:7b
# model, configured with the local Ollama URL and the full PERSONALITIES pool.
oversampler = OllamaTextOversampler(
    api_url="http://localhost:11434",
    model="deepseek-r1:7b",
    personalities=PERSONALITIES,
)



In [None]:
# Compute the class balance of the three label columns and print each one
# under its own heading.
columns = ["scientific_claim", "scientific_reference", "scientific_entities"]
class_balances = get_class_balance_for_all_columns(subtask4a_train_df, columns)

for heading, balance_key in (
    ("Class balance for scientific_claim:", "scientific_claim"),
    ("\nClass balance for scientific_reference:", "scientific_reference"),
    ("\nClass balance for scientific_entities:", "scientific_entities"),
):
    print(heading)
    print(class_balances[balance_key])

In [None]:
# Generate paraphrases for the minority class of every label column and write
# each result immediately to a header-less TSV file in the training data
# directory.
new_samples = {}
for cl in [
    "scientific_claim",
    "scientific_reference",
    "scientific_entities",
]:
    if cl in new_samples:
        # Already generated for this column; do not run the model again.
        continue

    new_samples[cl] = oversample_minority_classes(
        subtask4a_train_df,
        text_column="text",
        class_column=cl,
        oversampler=oversampler,
    )

    # Write to the directory in which the training file is found.
    if os.path.isfile("./data/processed/task4/subtask_4a/ct_train_clean.tsv"):
        out_path = f"./data/processed/task4/subtask_4a/ct_train_oversamples_{cl}.tsv"
    else:
        out_path = f"../../data/processed/task4/subtask_4a/ct_train_oversamples_{cl}.tsv"

    # Every generated text gets the positive label 1.0 for this column.
    oversampled_frame = pd.DataFrame({"text": new_samples[cl], f"{cl}": 1.0})
    oversampled_frame.to_csv(out_path, sep="\t", index=False, header=False)

In [None]:
# Save the new samples as tsv files:
import os

# Write every generated sample set to a header-less TSV file in the directory
# where the training file is found; each text is labelled 1.0 for its column.
for cl, generated_texts in new_samples.items():
    if os.path.isfile("./data/processed/task4/subtask_4a/ct_train_clean.tsv"):
        target_path = f"./data/processed/task4/subtask_4a/ct_train_oversamples_{cl}.tsv"
    else:
        target_path = f"../../data/processed/task4/subtask_4a/ct_train_oversamples_{cl}.tsv"

    samples_frame = pd.DataFrame({"text": generated_texts, f"{cl}": 1.0})
    samples_frame.to_csv(target_path, sep="\t", index=False, header=False)