In [33]:
import os
import time

import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from google.generativeai.types import HarmBlockThreshold, HarmCategory
from langchain_google_genai import ChatGoogleGenerativeAI

# Load variables from a local .env file into the process environment
# (presumably the Google API key the Gemini client needs — confirm).
load_dotenv()

True

In [127]:
# Gemini chat model used to synthesize the augmented samples.
model_name = 'gemini-2.5-flash'

# High temperature plus frequency/presence penalties are passed to the model.
# Every listed harm category is set to BLOCK_NONE — presumably so the model can
# emit the offensive/hate text this notebook asks for (confirm).
model = ChatGoogleGenerativeAI(
    model=model_name,
    temperature=0.95,
    top_p=0.9,
    top_k=50,
    model_kwargs=dict(frequency_penalty=0.5, presence_penalty=0.4),
    safety_settings={
        category: HarmBlockThreshold.BLOCK_NONE
        for category in (
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    },
)

In [36]:
# Fetch the 'sonlam1102/vihsd' dataset via `load_dataset`; the bare name on the
# last line displays the loaded object as the cell output.
dataset = load_dataset('sonlam1102/vihsd')
dataset

DatasetDict({
    train: Dataset({
        features: ['free_text', 'label_id'],
        num_rows: 24048
    })
    validation: Dataset({
        features: ['free_text', 'label_id'],
        num_rows: 2672
    })
    test: Dataset({
        features: ['free_text', 'label_id'],
        num_rows: 6680
    })
})

In [37]:
# Materialize the train split as a DataFrame, then carve out one frame per
# label id (0, 1, 2) in a single unpacking step.
train = pd.DataFrame(dataset['train'])

clean_data, offensive_data, hate_data = (
    train[train['label_id'] == label_id] for label_id in (0, 1, 2)
)

# Report how many rows each class has.
print(f"Number of clean samples: {len(clean_data)}")
print(f"Number of offensive samples: {len(offensive_data)}")
print(f"Number of hate samples: {len(hate_data)}")

Number of clean samples: 19886
Number of offensive samples: 1606
Number of hate samples: 2556


In [129]:
# Lookup from numeric label id (0, 1, 2) to the training rows with that label.
label_df_mapping = dict(enumerate((clean_data, offensive_data, hate_data)))

# Upper-case class name for each numeric label id (0, 1, 2).
label_mapping = dict(enumerate(("CLEAN", "OFFENSIVE", "HATE")))

# One-line description of each class, keyed by label id. The text is inserted
# into the generation prompt next to the label name, so all three entries use
# the same "social media" framing.
label_descriptions = {
    0: "clean, non-offensive social media text without harmful content",
    1: "offensive social media text with inappropriate language, profanity, or rude behavior but not targeted hate",
    # Was "hate media speech ..." — a garbled phrase that reached the model verbatim.
    2: "social media hate speech targeting specific groups, individuals, or demographics with harmful, discriminatory intent"
}

# Per-label content themes, keyed by label id. create_prompt renders the tuple
# for the selected label as a "- item" bullet list that fills the
# {content_guidance} field under "Target themes" in the prompt template.
label_guidance = {
    0: (
        'Friendly conversations, compliments, questions, daily life topics',
        'Supportive comments, light humor, neutral observations',
        'Casual chats between friends, family interactions',
        'Entertainment discussions, hobby talks, positive reactions',
    ),
    1: (
        'Rude language, complaints, arguments, crude humor',
        'Mild insults, inappropriate jokes, vulgar expressions',
        'Frustrated reactions, sarcastic comments, disrespectful tone',
        'Personal attacks that are rude but not targeting protected groups',
    ),
    2: (
        'Discriminatory language targeting specific groups or individuals',
        'Hostile comments with intent to harm or demean',
        'Prejudiced statements based on identity, appearance, or characteristics',
        'Aggressive targeting with malicious intent',
    ),
}

# System message placed ahead of the user prompt in every generation request.
# Adjacent literals are joined by the parser into one single-line string.
system_prompt = (
    'You are an expert at generating authentic Vietnamese social media content '
    'that matches real-world online conversations. '
    'You understand Vietnamese internet culture, slang, teen code, and social media patterns. '
    'Generate content that reflects genuine Vietnamese online behavior and language use.'
)

# Few-shot generation template, filled with `str.format`. Fields:
#   label_name, label_desc  - class name and its one-line description
#   n_examples, examples_text - count and numbered list of sampled training texts
#   n_generate              - how many new sentences to request
#   content_guidance        - bullet list of themes for the label
# The teen-code hint lists "t/m (tao/mày)" so each abbreviation lines up with
# its word (t = tao, m = mày); it previously read "m/t (tao/mày)".
prompt = """You are generating authentic Vietnamese social media text for data augmentation. Carefully study the patterns, language style, and intensity levels from the examples below.

**LABEL:** {label_name} - {label_desc}

**TRAINING EXAMPLES** ({n_examples} total):
{examples_text}

**TASK:** Generate {n_generate} new Vietnamese sentences with the SAME LABEL ({label_name}).

## VIETNAMESE SOCIAL MEDIA AUTHENTICITY:

### 1. Vietnamese Internet Language Patterns
- **Common teen code:** k/ko (không), dc/đc (được), vs (với), trc (trước), cx (cũng), mik (mình), j (gì), r (rồi), t/m (tao/mày), etc.
- **Abbreviations:** fb (Facebook), zl (Zalo), haha/hehe, oke/ok, tks/ty (thanks), etc.

### 2. Emotional Expression
- **Emojis:** 😂, 😭, 👍, ❤️, 🤮, 😡, etc.
- **Emoticons:** :)), =)), :((, </3, <3, :v, etc.

### 3. Intensity Matching (CRITICAL)
- Carefully analyze the **emotional intensity level** of examples
- Match the **tone and aggression**
- Preserve the authentic **Vietnamese expression style**

### 4. Content Focus
**Target themes:** {content_guidance}

**OUTPUT FORMAT:**
Each sentence on a new line, numbered (1., 2., 3., ...)

**CRITICAL REQUIREMENTS:**
- Include emojis/emoticons or internet slang/abbreviations more often
- Include random names of people or groups more often
- Generate sentences with realistic Vietnamese social media length diversity
- DO NOT copy directly from examples
- Maintain exact label-appropriate intensity level
- Use authentic Vietnamese internet language

**GENERATE {n_generate} VIETNAMESE SENTENCES:**"""

In [130]:
def create_prompt(label: int, n_examples: int = 5, n_generate: int = 10) -> str:
    """Build the few-shot generation prompt for one label.

    Randomly samples training texts of the requested class, numbers them, and
    fills the module-level ``prompt`` template.

    Args:
        label: Label id; must be a key of ``label_mapping``,
            ``label_descriptions``, ``label_guidance`` and ``label_df_mapping``.
        n_examples: Number of training examples to show. Capped at the number
            of rows available for the label, because sampling is done without
            replacement and an oversized request would raise ``ValueError``.
        n_generate: Number of sentences the model is asked to produce.

    Returns:
        The formatted prompt text. The example count it reports is the number
        of examples actually included.

    Raises:
        KeyError: If ``label`` is not a known label id.
    """
    label_name = label_mapping[label]
    label_desc = label_descriptions[label]
    # A leading empty item makes the join emit "\n- " before every theme.
    content_guidance = '\n- '.join([''] + list(label_guidance[label]))

    label_df = label_df_mapping[label]
    sample_size = min(n_examples, len(label_df))
    examples = label_df.sample(n=sample_size)['free_text'].tolist()

    # Flatten embedded line breaks so each example occupies exactly one
    # numbered line, matching the output format the model is asked to follow.
    flattened = [' '.join(str(example).splitlines()) for example in examples]
    examples_text = '\n'.join(
        f'{position}. {text}' for position, text in enumerate(flattened, start=1)
    )

    return prompt.format(
        label_name=label_name,
        label_desc=label_desc,
        examples_text=examples_text,
        n_examples=len(flattened),
        n_generate=n_generate,
        content_guidance=content_guidance,
    )

def generate_augmented_text(prompt: str, max_retries: int = 3) -> list[str]:
    """Ask the chat model for new sentences and parse its numbered reply.

    Only lines of the form ``"<number>. text"`` or ``"<number>) text"`` are
    kept, with the numbering removed. Unnumbered lines (preambles, headings,
    stray tokens) are skipped, so they neither lose their first word nor make
    an otherwise usable reply fail.

    Args:
        prompt: User prompt sent to the model after ``system_prompt``.
        max_retries: Maximum number of attempts; a 3-second pause separates
            consecutive attempts.

    Returns:
        The parsed sentences, or an empty list when every attempt failed
        (including when ``max_retries`` is zero or negative).
    """
    import re  # local import keeps this cell runnable on its own

    # Item number, then "." or ")", then the sentence itself.
    numbered_line = re.compile(r'^\d+\s*[.)]\s*(.+)$')

    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': prompt},
    ]

    for attempt in range(max_retries):
        try:
            response = model.invoke(messages)

            sentences = []
            for raw_line in response.content.strip().split('\n'):
                found = numbered_line.match(raw_line.strip())
                if found:
                    sentences.append(found.group(1).strip())
            return sentences

        except Exception as e:
            # Best effort by design: any API or response error triggers a retry.
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(3)

    print(f"Failed to generate text after {max_retries} attempts.")
    return []

def augment_dataset(
    label: int,
    total_samples: int = 1000,
    n_examples: int = 5,
    n_generate: int = 10,
    output_dir: str = 'data',
) -> pd.DataFrame:
    """Generate synthetic samples for one label and persist them batch by batch.

    The request is split into batches of ``n_generate`` sentences, plus one
    smaller final batch when ``total_samples`` is not a multiple of
    ``n_generate``. Each non-empty batch is appended to
    ``<output_dir>/aug_llm_<label name>.csv`` (``|``-separated) as soon as it
    arrives, so completed batches survive an interruption.

    Args:
        label: Label id to generate for; must be a key of ``label_mapping``.
        total_samples: Number of sentences to request in total. The model may
            return more or fewer per batch, so this is a target, not a promise.
        n_examples: Training examples shown in each prompt.
        n_generate: Sentences requested per model call.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        DataFrame with columns ``free_text`` and ``label_id`` holding every
        sentence generated by this call. Batches that produced nothing are
        skipped and write nothing to disk.
    """
    os.makedirs(output_dir, exist_ok=True)

    file_name = f'aug_llm_{label_mapping[label].lower()}.csv'
    file_path = os.path.join(output_dir, file_name)

    # Full-size batches, then one short batch for any leftover samples.
    full_batches, remainder = divmod(max(total_samples, 0), n_generate)
    batch_sizes = [n_generate] * full_batches
    if remainder > 0:
        batch_sizes.append(remainder)

    # Collect the batches and concatenate once at the end; growing a frame
    # inside the loop is quadratic and relies on deprecated empty-frame concat.
    batches = []
    for batch_size in batch_sizes:
        batch_prompt = create_prompt(label, n_examples, batch_size)
        sentences = generate_augmented_text(batch_prompt)
        if not sentences:
            continue

        batch = pd.DataFrame({'free_text': sentences, 'label_id': label})
        batches.append(batch)

        # The header is needed only when the file is missing or still empty.
        write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
        batch.to_csv(
            file_path, sep='|', mode='a', index=False, header=write_header
        )

    if not batches:
        return pd.DataFrame(columns=['free_text', 'label_id'])
    return pd.concat(batches, ignore_index=True)

In [137]:
# Request 5000 OFFENSIVE (label 1) sentences: 50 batches of 100, each prompt
# showing 50 sampled training examples. Batches are appended to
# aug_llm_offensive.csv under the default 'data' directory, and the combined
# frame is displayed as the cell output.
augment_dataset(label=1, total_samples=5000, n_examples=50, n_generate=100)

Unnamed: 0,free_text,label_id
0,Mấy thằng lồn này rảnh quá hả? 😂,1
1,Clm tưởng hay lắm à? Ngu vãi ò.,1
2,Đm cái mồm nói chuyện như cái chuồng gà.,1
3,"Nhìn mặt đã thấy ghét rồi, sủa cái gì nữa?",1
4,"Thôi dẹp mẹ đi, nói nhiều nhức đầu.",1
...,...,...
4995,Ăn bám như chó.,1
4996,Đm bẩn thỉu.,1
4997,Thằng thất bại.,1
4998,Mày câm mẹ mày đi.,1


In [131]:
# Request 5000 HATE (label 2) sentences: 50 batches of 100, each prompt
# showing 50 sampled training examples. Batches are appended to
# aug_llm_hate.csv under the default 'data' directory, and the combined frame
# is displayed as the cell output.
augment_dataset(label=2, total_samples=5000, n_examples=50, n_generate=100)

Unnamed: 0,free_text,label_id
0,"Con nhỏ đó nhìn ghét vcl, đúng là loại não tàn!",2
1,Mấy thằng ăn hại này chỉ giỏi phá hoại đất nướ...,2
2,Thằng Ad mày làm ăn như clgv?,2
3,"Đm lũ ba que xỏ lá, cút về xứ tụi mày đi.",2
4,Mặt con *** đó nhìn đã thấy muốn đấm rồi.,2
...,...,...
4995,"Đm cái lũ phản động, bán nước hại dân.",2
4996,"Bọn nhà giàu mới nổi toàn đồ khoe của, não tàn.",2
4997,"Con bé này đúng là đồ phế vật, ko làm được tíc...",2
4998,"Mấy thằng ăn xin nhìn ghê vãi, toàn đồ lừa đảo.",2
