In [4]:
from datasets import load_dataset

# Download le dataset Empathetic Dialogues
# https://huggingface.co/datasets/empathetic_dialogues
dataset = load_dataset("empathetic_dialogues", trust_remote_code=True)

# View of the dataset first example
print(dataset['train'][0]) # type: ignore

# Save the dataset to disk
# This will create a folder named "empathetic_dialogues" in the current directory
dataset.save_to_disk("raw/empathetic_dialogues") # type: ignore


{'conv_id': 'hit:0_conv:1', 'utterance_idx': 1, 'context': 'sentimental', 'prompt': 'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.', 'speaker_idx': 1, 'utterance': 'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people_comma_ we felt like the only people in the world.', 'selfeval': '5|5|5_2|2|5', 'tags': ''}


Saving the dataset (1/1 shards): 100%|██████████| 76673/76673 [00:00<00:00, 942636.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12030/12030 [00:00<00:00, 599927.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10943/10943 [00:00<00:00, 534957.33 examples/s]


In [6]:
import pandas as pd
from datasets import Dataset

def clean_empathic_dialogues(dataset):
    """
    Cleans a HuggingFace-style EmpatheticDialogues dataset.

    Cleaning Steps:
    1. Convert the dataset to a pandas DataFrame.
    2. Display available columns and the initial number of samples.
    3. Remove exact duplicates based on 'utterance' and optionally 'context'.
    4. Drop rows with missing values in 'utterance' or 'emotion' (if these columns exist).
    5. Remove empty or whitespace-only utterances.
    6. Normalize emotion labels to lowercase and strip extra whitespace.
    7. Show basic statistics after cleaning.
    8. Convert the cleaned DataFrame back to a HuggingFace Dataset object.

    Parameters:
    - dataset: a HuggingFace DatasetDict object with a 'train' key.

    Returns:
    - A cleaned HuggingFace Dataset object (datasets.Dataset).
    """

    # 1. Convert the 'train' split to a pandas DataFrame
    df = pd.DataFrame(dataset['train'])

    # 2. Show available columns and initial sample count
    print(f"Available columns: {df.columns.tolist()}")
    print(f"Initial sample count: {len(df)}")

    # 3. Remove duplicate rows based on 'utterance' and 'context' if available
    subset_cols = ['utterance']
    if 'context' in df.columns:
        subset_cols.append('context')
    df = df.drop_duplicates(subset=subset_cols, keep='first')

    # 4. Drop rows with missing values in key columns (if those columns exist)
    for col in ['utterance', 'emotion']:
        if col in df.columns:
            missing = df[col].isnull().sum()
            df = df[df[col].notnull()]
            print(f"Removed {missing} missing values from '{col}'")

    # 5. Remove rows where utterance is empty or only whitespace
    df = df[df['utterance'].str.strip().astype(bool)]

    # 6. Normalize emotion labels (if the column exists)
    if 'emotion' in df.columns:
        df['emotion'] = df['emotion'].str.lower().str.strip()
        print(f"Emotion distribution:\n{df['emotion'].value_counts()}")

    # 7. Print final sample count
    print(f"Final sample count: {len(df)}")
    
    # 8. Convert cleaned DataFrame back to HuggingFace Dataset
    return Dataset.from_pandas(df, preserve_index=False)

# Example usage:
clean_dataset = clean_empathic_dialogues(dataset)


Available columns: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags']
Initial sample count: 76673
Removed 0 missing values from 'utterance'
Final sample count: 76163


In [8]:
!pip install  matplotlib

Collecting matplotlib
  Using cached matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.5-cp312-cp312-win_amd64.whl.metadata (109 kB)
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     -------------------------------------


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from datasets import DatasetDict

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Tokenization function
def tokenize_function(examples):
    tokenized = tokenizer(examples["utterance"], truncation=True, max_length=128)
    if "tags" in examples:
        tokenized["tags"] = examples["tags"]
    return tokenized

# Apply tokenization
tokenized_dataset = clean_dataset.map(tokenize_function, batched=True)


# --- Plot 1: Utterance Length Distribution ---
lengths = [len(x) for x in tokenized_dataset["input_ids"]]
plt.figure(figsize=(8, 5))
plt.hist(lengths, bins=50, color='skyblue', edgecolor='black')
plt.title("Utterance Token Length Distribution")
plt.xlabel("Token Count")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("images/length_distribution.png")
plt.close()

# --- Plot 2: Tag Distribution ---
if "tags" in tokenized_dataset.column_names:
    all_tags = []
    for tags in tokenized_dataset["tags"]:
        if isinstance(tags, list):
            all_tags.extend(tags)
        elif isinstance(tags, str):
            all_tags.append(tags)

    tag_series = pd.Series(all_tags)
    tag_counts = tag_series.value_counts()

    plt.figure(figsize=(10, 6))
    tag_counts.plot(kind='bar', color='orchid', edgecolor='black')
    plt.title("Tag Distribution")
    plt.xlabel("Tag")
    plt.ylabel("Count")
    plt.grid(axis='y')
    plt.tight_layout()
    plt.savefig("images/tag_distribution.png")
    plt.close()
else:
    print("⚠️ 'tags' column not found in the tokenized dataset.")

# --- Split into train/validation/test (80/10/10) ---
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
test_valid = split_dataset['test'].train_test_split(test_size=0.5)

# Create DatasetDict for HuggingFace compatibility
final_dataset = DatasetDict({
    'train': split_dataset['train'],
    'valid': test_valid['train'],
    'test': test_valid['test']
})

# --- Save processed dataset ---
final_dataset.save_to_disk("processed_empathetic_dialogues")
print("✅ Dataset successfully saved to: data/processed_empathetic_dialogues")


Map: 100%|██████████| 76163/76163 [00:03<00:00, 19261.24 examples/s]
  plt.tight_layout()
Saving the dataset (1/1 shards): 100%|██████████| 60930/60930 [00:01<00:00, 48779.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7616/7616 [00:00<00:00, 46174.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7617/7617 [00:00<00:00, 41889.64 examples/s]

✅ Dataset successfully saved to: data/processed_empathetic_dialogues





In [30]:
# Get dataset statistics
print(f"📊 Total samples: {len(tokenized_dataset)}")
print(f"📏 Average utterance length: {sum(lengths)/len(lengths):.1f} tokens")

# Show top tags if available
if "tags" in tokenized_dataset.column_names:
    print("🏷️ Top tags:")
    print(tag_counts.head(5))
else:
    print("⚠️ No 'tags' column available to show top labels.")


📊 Total samples: 76163
📏 Average utterance length: 17.2 tokens
🏷️ Top tags:
                       75500
<UNIGRAM>                297
<HI>                     155
<POLITICAL>              126
<UNIGRAM> <NUMERAL>       60
Name: count, dtype: int64


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Initialize model and tokenizer
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Create a text generation pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available, else CPU
)


# Function to create a structured prompt from context and emotion
def create_prompt(context, emotion):
    return f"""[Context] {context}
[Emotion] {emotion}
[Dialogue] 
Person 1: 
Person 2:"""


# Example test with a real context and emotion
test_context = "I lost my job this week"
test_emotion = "sadness"

prompt = create_prompt(test_context, test_emotion)

# Generate text with specific parameters
generated = generator(
    prompt,
    max_length=200,
    num_return_sequences=1,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id  # To avoid warnings with GPT-2
)

print(generated[0]['generated_text'])


# Simple function to calculate repetition rate in generated text
def calculate_repetition(text):
    words = text.split()
    unique_words = set(words)
    return 100 * (1 - len(unique_words) / len(words)) if words else 0


# Function to analyze generation results
def analyze_generation(original_prompt, generated_text):
    print("=== Original Prompt ===")
    print(original_prompt)
    print("\n=== Generated Text ===")
    print(generated_text)
    print("\n=== Analysis ===")
    print(f"Length: {len(generated_text.split())} words")
    print(f"Repetition rate: {calculate_repetition(generated_text):.2f}%")


# Analyze the generated output
analyze_generation(prompt, generated[0]['generated_text'])


# Test generation with different temperature values to control creativity
for temp in [0.5, 0.7, 1.0]:
    generated = generator(
        prompt,
        max_length=200,
        num_return_sequences=1,
        temperature=temp,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    print(f"\n--- Temperature {temp} ---")
    print(generated[0]['generated_text'][:200] + "...")


# Test multiple cases with different contexts and emotions
test_cases = [
    {"context": "My dog died", "emotion": "sadness"},
    {"context": "I passed my exam!", "emotion": "joy"}
]

for case in test_cases:
    prompt = create_prompt(case['context'], case['emotion'])
    generated = generator(
        prompt,
        max_length=200,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    print(f"\nContext: {case['context']} | Emotion: {case['emotion']}")
    print(generated[0]['generated_text'])


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
