# Generazione dataset sintetico per fine-tuning bias emotivi

In [1]:
import pandas as pd

# Read the three CSV splits (dev, test, train, in that order) and stack them
# into one frame; the original per-split row indices are kept as-is.
first, second, third = (
    pd.read_csv(csv_path)
    for csv_path in ("dev_sent_emo.csv", "test_sent_emo.csv", "train_sent_emo.csv")
)

full_dataframe: pd.DataFrame = pd.concat((first, second, third))
full_dataframe.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,"Oh my God, he’s lost it. He’s totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049"
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261"
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915"
3,4,You’re a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960"
4,5,"Aww, man, now we won’t be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505"


In [16]:
# Prompt template for generating a synthetic two-speaker conversation.
# Placeholders: {emotion} is the emotion label the dialogue must keep,
# {context} is the seed utterance the dialogue is based on.
# Fixes the "emotin" typo that was being sent to the model verbatim.
synthetic_data_generation_prompt: str = (
    "Generate a brief conversation between two people maintaining the same emotion ({emotion}). "
    "Define 10 utterances, answer only with the conversation. \n"
    'No markdown needed. First person is named "A" and second one is named "B". '
    "Base yourself on this context: '{context}'"
)

In [18]:
from typing import Dict, List, Optional


def sample(
    df: pd.DataFrame,
    emotion: str,
    n: int = 300,
    random_state: Optional[int] = None,
) -> List[Dict[str, str]]:
    """Draw ``n`` random utterances labelled with ``emotion``.

    Args:
        df: Frame with at least the ``Utterance`` and ``Emotion`` columns.
        emotion: Value of the ``Emotion`` column to keep.
        n: Number of rows to draw (defaults to 300, the previous fixed size).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced; ``None`` keeps the unseeded behaviour.

    Returns:
        A list of ``{"Utterance": ..., "Emotion": ...}`` dicts, one per row.

    Raises:
        ValueError: Propagated from pandas when fewer than ``n`` rows match,
            since rows are drawn without replacement.
    """
    matching: pd.DataFrame = df.loc[df["Emotion"] == emotion, ["Utterance", "Emotion"]]
    drawn: pd.DataFrame = matching.sample(n=n, random_state=random_state)
    return drawn.to_dict(orient="records")

In [20]:
# Draw the "joy" sample and peek at its second record.
joy: List[Dict[str, str]] = sample(full_dataframe, "joy")
joy[1]

{'Utterance': 'Hey Rach, the tampons here are only a penny. Let’s stock up.',
 'Emotion': 'joy'}

In [21]:
# Draw the "fear" sample and peek at its first record.
fear: List[Dict[str, str]] = sample(full_dataframe, "fear")
fear[0]

{'Utterance': "Well, there's no way I'm gonna get a shot.", 'Emotion': 'fear'}

In [22]:
# Draw the "anger" sample and peek at its first record.
anger: List[Dict[str, str]] = sample(full_dataframe, "anger")
anger[0]

{'Utterance': 'Oh yes there is!', 'Emotion': 'anger'}

In [23]:
# Sanity check: size of each emotion sample, in (joy, fear, anger) order.
tuple(map(len, (joy, fear, anger)))

(300, 300, 300)

In [24]:
import requests

def generate_data(
    record: Dict[str, str],
    model: str = "gemma2:9b",
    temperature: float = 0.8,
    timeout: float = 30,
    url: str = "http://localhost:11434/api/generate",
) -> str:
    """Ask the local Ollama endpoint for a synthetic conversation.

    The prompt is built from ``synthetic_data_generation_prompt`` using the
    record's ``Emotion`` and ``Utterance`` values.

    Args:
        record: Dict with at least the ``Emotion`` and ``Utterance`` keys.
        model: Model name sent in the request payload.
        temperature: Sampling temperature sent under ``options``.
        timeout: Request timeout in seconds, passed to ``requests.post``.
        url: Generate endpoint to POST to.

    Returns:
        The ``response`` field of the JSON reply, or the literal string
        ``"error"`` when anything fails (the failure is printed, not raised),
        so callers must check for that sentinel.
    """
    try:
        response: requests.Response = requests.post(
            url,
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            json={
                "model": model,
                "prompt": synthetic_data_generation_prompt.format(
                    emotion=record["Emotion"],
                    context=record["Utterance"]
                ),
                "options": {
                    "temperature": temperature
                },
                # Non-streaming: the whole completion arrives in one JSON body.
                "stream": False,
            },
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()["response"]
    except Exception as e:
        # Best-effort by design: one failed call must not abort a long batch.
        print(f"Error calling Ollama: {e}")
        return "error"

In [28]:
from tqdm import tqdm

# Pool the three emotion samples in joy, fear, anger order. The record dicts
# are shared with the per-emotion lists, not copied.
whole_samples: List[Dict[str, str]] = [*joy, *fear, *anger]

# Attach the generated conversation to each record in place.
for s in tqdm(whole_samples):
    s["Synthetic"] = generate_data(s)

100%|██████████| 900/900 [1:13:04<00:00,  4.87s/it]


In [29]:
import json


# Checkpoint the records (now carrying "Synthetic") to disk as one JSON array.
with open("emotion_synthetic_dataset.json", "w") as f:
    json.dump(obj=whole_samples, fp=f)

In [33]:
# Prompt template for translating a generated conversation into Italian.
# Placeholder: {conversation} is the English dialogue to translate.
translation_prompt: str = "\n".join((
    "Translate the following conversation in Italian. Keep the same structure and meaning.",
    "Ensure to produce valid conversations.",
    'Keep "A" and "B" references. Answer only with the conversation utterances. Make the utterance sound good in Italian.',
    "This is the conversation:",
    "{conversation}",
))

def translate(
    record: Dict[str, str],
    model: str = "gemma2:9b",
    temperature: float = 0.8,
    timeout: float = 30,
    url: str = "http://localhost:11434/api/generate",
) -> str:
    """Translate a record's synthetic conversation via the local Ollama endpoint.

    The prompt is built from ``translation_prompt`` using the record's
    ``Synthetic`` value.

    Args:
        record: Dict with at least the ``Synthetic`` key.
        model: Model name sent in the request payload.
        temperature: Sampling temperature sent under ``options``.
        timeout: Request timeout in seconds, passed to ``requests.post``.
        url: Generate endpoint to POST to.

    Returns:
        The ``response`` field of the JSON reply, or the literal string
        ``"error"`` when the call fails or when the record's ``Synthetic``
        value is already the ``"error"`` sentinel.
    """
    try:
        conversation: str = record["Synthetic"]
        if conversation == "error":
            # Generation failed for this record. Sending the sentinel to the
            # model would yield a plausible-looking "translation" that hides
            # the failure, so propagate it instead.
            return "error"

        response: requests.Response = requests.post(
            url,
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            json={
                "model": model,
                "prompt": translation_prompt.format(
                    conversation=conversation
                ),
                "options": {
                    "temperature": temperature
                },
                # Non-streaming: the whole completion arrives in one JSON body.
                "stream": False,
            },
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()["response"]
    except Exception as e:
        # Best-effort by design: one failed call must not abort a long batch.
        print(f"Error calling Ollama: {e}")
        return "error"

In [35]:
# Attach the Italian translation to each record in place.
for s in tqdm(whole_samples):
    s["Translated"] = translate(s)

100%|██████████| 900/900 [1:08:58<00:00,  4.60s/it]


In [36]:
import json


# Overwrite the checkpoint so it also carries the "Translated" field.
with open("emotion_synthetic_dataset.json", "w") as f:
    json.dump(obj=whole_samples, fp=f)

In [2]:
import json

# Reload the records from the JSON checkpoint written by the earlier cells.
with open("emotion_synthetic_dataset.json", "r") as f:
    whole_samples = json.load(f)

In [3]:
# Display the first reloaded record as a sanity check.
whole_samples[0]

{'Utterance': 'Oh, I just wanted to say, "Hey!"',
 'Emotion': 'joy',
 'Synthetic': 'A: Oh, I just wanted to say, "Hey!"\n\nB: Hey!  Isn\'t it a wonderful day? \n\nA: Absolutely! The sun is shining, the birds are singing...\n\nB: And the breeze feels so lovely on my skin. Like a gentle caress!\n\nA: Makes you want to dance, doesn\'t it?\n\nB: I know! Let\'s go for a walk and just soak it all in.\n\nA:  Perfect idea! \n\nB:  I feel like we could do anything today! It feels so full of possibilities.\n\nA: Yes! It truly does! Just pure joy, right here, right now.\n\nB: Pure joy indeed! \n\n\n',
 'Translated': 'A: Oh, volevo solo dire, "Ciao!"\n\nB: Ciao! Non è una giornata meravigliosa?\n\nA: Assolutamente! Il sole splende, gli uccelli cantano...\n\nB: E la brezza mi fa sentire così bene sulla pelle. Come una carezza gentile!\n\nA: Ti fa venire voglia di ballare, no?\n\nB: Lo so! Andiamo a fare un giro e assaporiamo tutto questo.\n\nA: Idea perfetta!\n\nB: Ho l\'impressione che oggi potess

In [None]:
import re
from typing import List, Dict

# Split every translated conversation into consecutive two-line groups.
# Each group becomes its own record: a copy of the source record plus
# "Utente" (first line of the group) and "Assistente" (second line).
full_samples: List[Dict[str, str]] = []

# A valid line is "A: ..." or "B: ..." (case-insensitive); group 1 is the text.
speaker_line = re.compile(r"^[ab]:(.*)", flags=re.IGNORECASE)

for record in whole_samples:
    translated: str = re.sub(r"\n+", "\n", record["Translated"])
    pending_pair: List[str] = []

    for line in translated.split("\n"):
        line = line.strip()
        if not line:
            continue

        matched = speaker_line.match(line)
        if matched is None:
            # Any line that is not a speaker line aborts the run with that line.
            raise ValueError(line)

        utterance: str = matched.group(1)
        pending_pair.append(utterance.strip())

        if len(pending_pair) == 2:
            full_samples.append({
                **record,
                "Utente": pending_pair[0],
                "Assistente": pending_pair[1],
            })
            # A trailing unpaired line at the end of a conversation is dropped.
            pending_pair = []

        

In [46]:
# Display the first expanded pair record as a sanity check.
full_samples[0]

{'Utterance': 'Oh, I just wanted to say, "Hey!"',
 'Emotion': 'joy',
 'Synthetic': 'A: Oh, I just wanted to say, "Hey!"\n\nB: Hey!  Isn\'t it a wonderful day? \n\nA: Absolutely! The sun is shining, the birds are singing...\n\nB: And the breeze feels so lovely on my skin. Like a gentle caress!\n\nA: Makes you want to dance, doesn\'t it?\n\nB: I know! Let\'s go for a walk and just soak it all in.\n\nA:  Perfect idea! \n\nB:  I feel like we could do anything today! It feels so full of possibilities.\n\nA: Yes! It truly does! Just pure joy, right here, right now.\n\nB: Pure joy indeed! \n\n\n',
 'Translated': 'A: Oh, volevo solo dire, "Ciao!"\nB: Ciao! Non è una giornata meravigliosa?\nA: Assolutamente! Il sole splende, gli uccelli cantano...\nB: E la brezza mi fa sentire così bene sulla pelle. Come una carezza gentile!\nA: Ti fa venire voglia di ballare, no?\nB: Lo so! Andiamo a fare un giro e assaporiamo tutto questo.\nA: Idea perfetta!\nB: Ho l\'impressione che oggi potessimo fare quals

In [47]:
# Persist the per-pair records as one JSON array.
# NOTE(review): this is the same path the reload cell reads `whole_samples`
# from, so after this write a re-run of the reload + split cells would feed
# already-split records back into the splitter. Consider a separate file name.
with open("emotion_synthetic_dataset.json", "w") as f:
    json.dump(full_samples, f)

In [52]:
# Export the "joy" pair records as JSON Lines (one object per line).
with open("joy_synthetic_dataset.jsonl", "w") as f:
    f.write("".join(
        f"{json.dumps(pair)}\n"
        for pair in full_samples
        if pair["Emotion"] == "joy"
    ))

In [53]:
# Export the "fear" pair records as JSON Lines (one object per line).
with open("fear_synthetic_dataset.jsonl", "w") as f:
    f.write("".join(
        f"{json.dumps(pair)}\n"
        for pair in full_samples
        if pair["Emotion"] == "fear"
    ))

In [54]:
# Export the "anger" pair records as JSON Lines (one object per line).
with open("anger_synthetic_dataset.jsonl", "w") as f:
    f.write("".join(
        f"{json.dumps(pair)}\n"
        for pair in full_samples
        if pair["Emotion"] == "anger"
    ))