In [2]:
import json
import re
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from transformers import pipeline
from tqdm import tqdm

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def load_dataset(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Parse the JSON file from the Kaggle input directory into `data`.
data = load_dataset("/kaggle/input/dataset/dailydialog_test.json")

In [4]:
import pandas as pd
import re

# Your input data should already be defined as 'data'

def process_data(data):
    """Flatten nested conversation data into one DataFrame row per turn.

    Args:
        data: Mapping of conversation id -> list of dialogues, where each
            dialogue is a list of turn dicts carrying "turn", "speaker" and
            "utterance" keys, plus optional "emotion" and
            "expanded emotion cause span" keys.

    Returns:
        pandas.DataFrame with the columns conversation_id, turn, speaker,
        utterance, emotion (defaults to "neutral") and cause_evidence
        (defaults to an empty list).
    """
    rows = [
        {
            "conversation_id": conv_id,
            "turn": turn_info["turn"],
            "speaker": turn_info["speaker"],
            "utterance": turn_info["utterance"],
            # Optional annotations fall back to neutral / no evidence.
            "emotion": turn_info.get("emotion", "neutral"),
            "cause_evidence": turn_info.get("expanded emotion cause span", []),
        }
        for conv_id, dialogues in data.items()
        for dialogue in dialogues
        for turn_info in dialogue
    ]
    return pd.DataFrame(rows)

# Abbreviations (written with a space before the period) whose period must
# not be treated as a clause boundary.
protected_abbreviations = ["Mr .", "Mrs .", "Ms .", "Dr .", "Prof .", "Sr .", "Jr .", "St .", "vs ."]


def replace_abbreviations(text, reverse=False):
    """Swap protected abbreviations for placeholders, or swap them back.

    Args:
        text: String to transform.
        reverse: When False (default), every protected abbreviation becomes
            ``__ABBR<i>__`` where ``<i>`` is its index in
            ``protected_abbreviations``. When True, those placeholders are
            turned back into the abbreviations.

    Returns:
        The transformed string.
    """
    for i, abbr in enumerate(protected_abbreviations):
        placeholder = f"__ABBR{i}__"
        if reverse:
            text = text.replace(placeholder, abbr)
        else:
            # Anchor at a word boundary so an abbreviation is only protected
            # when it is a token of its own: a plain substring replace would
            # also fire inside longer words (e.g. "ATMs ." contains "Ms ."),
            # mangling the word and hiding a real sentence-ending period.
            # A callable replacement avoids any escape processing of the
            # placeholder text.
            text = re.sub(r"\b" + re.escape(abbr), lambda _match, p=placeholder: p, text)
    return text

def split_into_clauses(text):
    """Split an utterance into trimmed, non-empty clauses.

    The text is cut at every ``.``, ``?``, ``!``, ``,``, ``;`` and ``:``.
    Protected abbreviations are swapped for placeholders before the cut and
    restored afterwards, so their periods do not act as boundaries.

    Args:
        text: The utterance string.

    Returns:
        List of clause strings in their original order.
    """
    shielded = replace_abbreviations(text)
    trimmed_pieces = (piece.strip() for piece in re.split(r'[.?!,;:]', shielded))
    # Drop fragments that are empty once trimmed, then put the abbreviations back.
    return [
        replace_abbreviations(piece, reverse=True)
        for piece in trimmed_pieces
        if piece
    ]

# Flatten the raw conversations into one row per turn, then attach the
# clause list derived from each utterance.
df = process_data(data)
df['clauses'] = df['utterance'].map(split_into_clauses)

# Persist the processed frame as JSON records and preview the first rows.
df.to_json("processed_data.json", orient="records", indent=2)
print(df.head())


  conversation_id  turn speaker  \
0         tr_9708     1       A   
1         tr_9708     2       B   
2         tr_9708     3       A   
3         tr_9708     4       B   
4         tr_9708     5       A   

                                           utterance   emotion  \
0                         The blake's got divorced .   neutral   
1                                     Really ? Why ?   neutral   
2  Mr . black has been getting a little around as...   neutral   
3  I'm surprised . He does't look like a guy who'...  surprise   
4  No , he doesn't . But his wife found out he ha...   neutral   

                                      cause_evidence  \
0                                                 []   
1                                                 []   
2                                                 []   
3  [Mr . black has been getting a little around a...   
4                                                 []   

                                             clauses  


In [5]:
# NOTE(review): writes `df` to processed_data.json with the same arguments as
# the to_json call at the end of the previous cell.
df.to_json("processed_data.json", orient="records", indent=2)

In [6]:
import json
from tqdm import tqdm
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier. These are the
# space-separated spellings used as keys of the classifier's scores; the
# labels returned by classify_clause_with_hf use underscores instead.
LABELS = ["emotion clause", "cause clause", "neutral clause"]

# Load the zero-shot classification pipeline once at module level so every
# clause classification reuses the same model instance.
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-base-zeroshot-v1")

# Expanded emotion keywords (lower-case single words and short phrases)
emotion_keywords = set([
    "happy", "joyful", "joy", "sad", "angry", "surprised", "afraid", "disgusted", "anxious", "excited",
    "pleased", "delighted", "ecstatic", "elated", "thrilled", "content",
    "depressed", "unhappy", "upset", "miserable", "down", "gloomy",
    "furious", "irritated", "frustrated", "mad", "outraged",
    "shocked", "astonished", "amazed", "startled", "stunned",
    "scared", "fearful", "terrified", "nervous", "worried", "panicked",
    "disgusted", "repulsed", "nauseated",
    "hopeful", "grateful", "thankful", "lonely", "ashamed", "guilty",
    "embarrassed", "confused", "jealous", "envious", "proud", "relieved",
    "i'm happy", "i feel great", "i'm upset", "i'm worried", "i'm scared",
    "i'm nervous", "i feel bad", "i'm thankful", "i'm shocked", "i'm sad",
    "i feel embarrassed", "i feel guilty", "i'm afraid"
])

# One alternation over all keywords, built once from `emotion_keywords` as
# defined above. The lookarounds require each keyword to stand as a whole
# word/phrase: a plain substring test also fires inside unrelated words
# ("made" -> "mad", "download" -> "down", "contents" -> "content").
# Consequence: inflected forms such as "sadly" no longer count as a hit.
_EMOTION_KEYWORD_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(keyword) for keyword in sorted(emotion_keywords))
    + r")(?!\w)"
)


def contains_emotion_keywords(clause):
    """Tell whether a clause mentions any emotion keyword as a whole word.

    Args:
        clause: Clause text; matching is case-insensitive because the clause
            is lower-cased before the search.

    Returns:
        True if at least one entry of ``emotion_keywords`` occurs in the
        clause delimited by non-word characters (or the string edges),
        otherwise False.
    """
    return _EMOTION_KEYWORD_PATTERN.search(clause.lower()) is not None

def classify_clause_with_hf(utterance, clause, emotion):
    """Label one clause as "emotion_clause", "cause_clause" or "neutral_clause".

    A clause containing an emotion keyword is labelled "emotion_clause"
    without consulting the model. Otherwise the zero-shot classifier scores
    the clause against ``LABELS`` and the label is picked by thresholds:
    emotion score >= 0.5 -> "emotion_clause", else cause score >= 0.3 ->
    "cause_clause", else "neutral_clause".

    Args:
        utterance: Full utterance the clause came from (accepted but not
            used by this function).
        clause: Clause text to classify.
        emotion: Emotion annotated for the utterance (accepted but not used
            by this function).

    Returns:
        One of the three underscore-separated label strings.
    """
    # Keyword hit short-circuits the (expensive) model call.
    if contains_emotion_keywords(clause):
        return "emotion_clause"

    prediction = classifier(
        clause,
        LABELS,
        hypothesis_template="This clause functions as an {} in the context of emotion in dialogue.",
    )
    score_for = {
        label: score
        for label, score in zip(prediction["labels"], prediction["scores"])
    }

    # The emotion threshold is checked first; the cause score is only read
    # when the emotion score falls short.
    if score_for["emotion clause"] >= 0.5:
        return "emotion_clause"
    if score_for["cause clause"] >= 0.3:
        return "cause_clause"
    return "neutral_clause"

def process_dataset(data):
    """Classify every clause of every record and group results by conversation.

    Args:
        data: Iterable of record dicts. Each must provide "conversation_id",
            "turn" and "speaker"; "utterance", "emotion", "clauses" and
            "cause_evidence" are optional and default to "", "unknown", []
            and [] respectively.

    Returns:
        Dict mapping conversation id -> list of per-turn dicts with the keys
        "turn", "speaker", "utterance", "emotion", "clauses" (a list of
        {"clause", "label"} dicts) and "emotion_cause_mapping" (one
        {"emotion_clause", "caused_by"} dict per clause labelled
        "emotion_clause").
    """
    grouped = {}

    for record in tqdm(data, desc="Processing"):
        turns = grouped.setdefault(record["conversation_id"], [])

        utterance = record.get("utterance", "")
        emotion = record.get("emotion", "unknown")
        clause_texts = record.get("clauses", [])
        evidence_spans = record.get("cause_evidence", [])

        # Label the clauses in their original order.
        labelled = [
            {"clause": text, "label": classify_clause_with_hf(utterance, text, emotion)}
            for text in clause_texts
        ]

        # Every emotion clause of the turn is paired with the evidence spans
        # that occur verbatim inside this same utterance; spans that are not
        # substrings of the utterance are left out.
        mapping = [
            {
                "emotion_clause": entry["clause"],
                "caused_by": [span for span in evidence_spans if span in utterance],
            }
            for entry in labelled
            if entry["label"] == "emotion_clause"
        ]

        turns.append({
            "turn": record["turn"],
            "speaker": record["speaker"],
            "utterance": utterance,
            "emotion": emotion,
            "clauses": labelled,
            "emotion_cause_mapping": mapping
        })

    return grouped

# Script entry point: read the processed records, classify their clauses and
# write the grouped results to disk.
if __name__ == "__main__":
    # File written earlier in this notebook by df.to_json(...).
    input_path = "processed_data.json"
    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)

    results = process_dataset(input_data)

    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with open("hf_results_final.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print("🎉 Classification results saved to hf_results_final.json")


Device set to use cuda:0
Processing:   0%|          | 4/2405 [00:02<16:16,  2.46it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 2405/2405 [05:57<00:00,  6.73it/s]


🎉 Classification results saved to hf_results_final.json
