# 🎭 The Empathy Engine - End-to-End Emotional AI Voice - Jupyter Notebook

**Transform text into emotionally expressive speech with interactive UI and downloadable audio!**

> *Features included:*
> - Multi-modal emotion analysis (Hugging Face + VADER)
> - Dynamic voice modulation (rate, pitch, volume, breathing)
> - 3D emotion space visualization
> - Responsive UI (Gradio) with MP3 audio output and quick examples
> - Single-file implementation: just run and deploy!


In [4]:
# Cell 1: Install dependencies (uncomment for Colab)
!pip install -q gradio transformers torch vaderSentiment pyttsx3 gtts soundfile pydub


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# Cell 2: Imports
import gradio as gr
import torch
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyttsx3
from gtts import gTTS
import tempfile, os, time
from pydub import AudioSegment
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Check if TTS engine is available
print("🎭 Empathy Engine - Libraries loaded successfully!")

🎭 Empathy Engine - Libraries loaded successfully!


In [6]:
# Cell 3: Emotion Analyzer Class
class EmotionAnalyzer:
    """Classify a text's emotion and place it in valence/arousal/dominance space.

    A Hugging Face text-classification pipeline supplies the emotion label and
    its confidence; VADER's compound polarity supplies an intensity factor that
    scales the label's hard-coded 3D coordinates.
    """

    def __init__(self):
        """Load the emotion classifier (on GPU when CUDA is available) and VADER."""
        self.emotion_classifier = pipeline(
            'text-classification',
            model='j-hartmann/emotion-english-distilroberta-base',
            device=0 if torch.cuda.is_available() else -1
        )
        self.vader = SentimentIntensityAnalyzer()
        # Hard-coded valence/arousal/dominance coordinates for each emotion label.
        self.emotion_3d_map = {
            'joy': {'valence': 0.8, 'arousal': 0.7, 'dominance': 0.6},
            'sadness': {'valence': -0.7, 'arousal': -0.4, 'dominance': -0.5},
            'anger': {'valence': -0.6, 'arousal': 0.8, 'dominance': 0.7},
            'fear': {'valence': -0.8, 'arousal': 0.6, 'dominance': -0.8},
            'surprise': {'valence': 0.2, 'arousal': 0.8, 'dominance': 0.1},
            'disgust': {'valence': -0.7, 'arousal': 0.3, 'dominance': 0.2},
            'love': {'valence': 0.9, 'arousal': 0.5, 'dominance': 0.3},
            'neutral': {'valence': 0.0, 'arousal': 0.0, 'dominance': 0.0}
        }

    def analyze(self, text):
        """Analyze ``text`` and return its emotion profile.

        Args:
            text: The input string to classify.

        Returns:
            dict with keys:
              'emotion'    - lower-cased label of the top classifier result,
              'confidence' - the classifier score for that label,
              'emotion_3d' - the label's valence/arousal/dominance coordinates
                             multiplied by ``intensity`` (labels missing from
                             the map fall back to the 'neutral' coordinates),
              'intensity'  - absolute value of VADER's compound score,
              'vader'      - the raw VADER polarity scores.
        """
        # truncation=True is forwarded to the tokenizer so that inputs longer
        # than the model's maximum sequence length are cut instead of raising.
        emotion_out = self.emotion_classifier(text, truncation=True)[0]
        emo = emotion_out['label'].lower()
        conf = emotion_out['score']
        vader_scores = self.vader.polarity_scores(text)
        intensity = abs(vader_scores['compound'])
        emo3d = self.emotion_3d_map.get(emo, self.emotion_3d_map['neutral'])
        scaled_emo3d = {k: v * intensity for k, v in emo3d.items()}
        return {'emotion': emo, 'confidence': conf, 'emotion_3d': scaled_emo3d, 'intensity': intensity, 'vader': vader_scores}
# Shared analyzer instance. Constructing it builds the Hugging Face pipeline
# (see EmotionAnalyzer.__init__), so it is created once here and reused by the
# visualization and pipeline cells that reference `emotion_analyzer`.
emotion_analyzer = EmotionAnalyzer()


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


In [7]:
# Cell 4: VoiceParameterCalculator
class VoiceParameterCalculator:
    """Map a valence/arousal/dominance point to speech-synthesis parameters."""

    def __init__(self):
        """Set the baseline voice that the emotion coordinates modulate."""
        self.base_rate = 200        # words per minute
        self.base_pitch = 0.5       # 0–1 scale
        self.base_volume = 0.8      # 0–1 scale

    def calculate(self, emotion_3d, emotion_name):
        """Derive voice parameters from an emotion's 3D coordinates.

        Args:
            emotion_3d: dict with 'valence', 'arousal' and 'dominance' numbers.
            emotion_name: emotion label; only 'fear' and 'sadness' get tremor.

        Returns:
            dict with 'rate' (int, clamped to 50..400), 'pitch' (0.1..0.9),
            'volume' (0.1..1.0), 'breath_freq' (always > 0), 'pause_len'
            (>= 0.1) and 'tremor' (0..0.4).
        """
        val = emotion_3d['valence']
        aro = emotion_3d['arousal']
        dom = emotion_3d['dominance']
        # Rate: scale by arousal
        rate = int(self.base_rate * (1 + aro * 0.5))
        rate = min(max(rate, 50), 400)
        # Pitch: combine valence & arousal
        pitch = self.base_pitch + val * 0.3 + aro * 0.2
        pitch = min(max(pitch, 0.1), 0.9)
        # Volume: combine dominance & arousal
        volume = self.base_volume + dom * 0.2 + aro * 0.15
        volume = min(max(volume, 0.1), 1.0)
        # Breathing frequency & pause length.
        # The TTS engines compute 1 / breath_freq, so keep it strictly positive
        # even for strongly negative arousal.
        breath = 0.1 + aro * 0.05 + (abs(val)*0.02 if val<0 else 0)
        breath = max(0.01, breath)
        pause = max(0.1, 0.5 - aro*0.3 - (abs(dom)*0.2 if dom<0 else 0))
        # Tremor for fear/sadness. Sadness has negative arousal, so use the
        # magnitude of arousal; otherwise the product (and tremor) is negative.
        if emotion_name in ('fear', 'sadness') and val < 0:
            tremor = min(0.4, abs(aro) * abs(val))
        else:
            tremor = 0
        return {
            'rate': rate,
            'pitch': pitch,
            'volume': volume,
            'breath_freq': breath,
            'pause_len': pause,
            'tremor': tremor
        }

voice_calculator = VoiceParameterCalculator()
print("🎛️ VoiceParameterCalculator ready")


🎛️ VoiceParameterCalculator ready


In [27]:
# Cell 5: AdvancedTTSEngine (gTTS only)
class AdvancedTTSEngine:
    """gTTS-backed synthesizer that fakes breathing/pauses with extra dots in the text."""

    def __init__(self):
        """Announce that the engine is ready; no state is kept."""
        print("🔊 AdvancedTTSEngine initialized (gTTS)")

    def _apply_effects(self, text, params):
        """Insert dot runs into ``text`` to simulate breathing and sentence pauses.

        Args:
            text: The text to speak.
            params: dict with 'breath_freq' and 'pause_len'.

        Returns:
            The words of ``text`` re-joined with single spaces, with "...."
            after every ``int(1 / breath_freq)`` words (when breath_freq > 0)
            and "........." after sentence-ending words when pause_len > 0.6.
        """
        breath_freq = params['breath_freq']
        # Only divide once breath_freq is known to be positive; a value of 0
        # means "no breaths" rather than a ZeroDivisionError.
        breath_every = max(1, int(1 / breath_freq)) if breath_freq > 0 else None
        # NOTE(review): with the emotion map defined in this notebook pause_len
        # appears to stay below 0.6, so this branch may never fire - confirm
        # the intended threshold.
        long_pause = params['pause_len'] > 0.6

        out, since_breath = [], 0
        for word in text.split():
            out.append(word)
            since_breath += 1
            if breath_every is not None and since_breath >= breath_every:
                out.append("....")  # short pause
                since_breath = 0
            if long_pause and word.endswith(('.', '!', '?')):
                out.append(".........")  # longer pause
        return " ".join(out)

    def synthesize(self, text, params, emo_name):
        """Render ``text`` to an MP3 file and return its path.

        Args:
            text: The text to speak.
            params: Voice parameters; only 'breath_freq' and 'pause_len' are used.
            emo_name: Emotion label (unused here; kept for interface parity).

        Returns:
            Path of a temporary ``.mp3`` file. The caller owns the file.
        """
        # 1) Apply text effects
        proc_text = self._apply_effects(text, params)

        # 2) Generate MP3 with gTTS
        tts = gTTS(proc_text, lang="en", slow=False)
        # Close our own handle before gTTS writes to the path so the descriptor
        # is not leaked; delete=False keeps the file for the caller.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
            tmp_mp3 = handle.name
        tts.save(tmp_mp3)

        return tmp_mp3

# Re-initialize TTS engine
tts_engine = AdvancedTTSEngine()


🔊 AdvancedTTSEngine initialized (gTTS)


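## Second version: audible breathing (this cell redefines `AdvancedTTSEngine` and replaces `tts_engine` from the cell above)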

In [67]:
# Cell 5: AdvancedTTSEngine (gTTS with audible breathing and 10% slower speech)

import os
import tempfile
from gtts import gTTS
from pydub import AudioSegment, effects

# Location of the breath sample mixed into the speech. Defaults to the Kaggle
# dataset path used originally; set the BREATH_SOUND_PATH environment variable
# to point at a local copy when running elsewhere.
BREATH_SOUND_PATH = os.environ.get(
    "BREATH_SOUND_PATH", "/kaggle/input/voicceee/Breathvpice-1.wav"
)
# Fail early with a clear message instead of a decoder error further down.
if not os.path.exists(BREATH_SOUND_PATH):
    raise FileNotFoundError(f"Breath sound file not found: {BREATH_SOUND_PATH}")
breath_sound = AudioSegment.from_file(BREATH_SOUND_PATH)

class AdvancedTTSEngine:
    """gTTS synthesizer that splices a breath sample and pauses between spoken chunks."""

    PAUSE_MS = 600          # silence inserted after a sentence-ending word
    BREATH_GAIN_DB = 2      # gain applied to the breath sample
    SPEED_FACTOR = 0.9      # playback speed of the final audio
    # NOTE(review): 50 is the original value; if pydub treats this as dB it is a
    # very large boost - confirm the intended gain.
    STRESS_GAIN_DB = 50

    def __init__(self):
        """Announce that the engine is ready; no state is kept."""
        print("🔊 AdvancedTTSEngine initialized (gTTS, audible breathing, 10% slower)")

    @staticmethod
    def _chunk_text(text, breath_freq):
        """Split ``text`` into (segment_text, marker) pairs; marker is "BREATH", "PAUSE" or None."""
        # Breaths come half as often as the base frequency implies; a
        # non-positive frequency disables them instead of dividing by zero.
        if breath_freq > 0:
            breath_interval = max(1, int(1 / breath_freq)) * 2
        else:
            breath_interval = None

        chunks, current = [], []
        for word in text.split():
            current.append(word)
            if breath_interval is not None and len(current) >= breath_interval:
                chunks.append((" ".join(current), "BREATH"))
                current = []
            if word.endswith((".", "!", "?")):
                # May be an empty segment when a breath was emitted on this same
                # word; synthesize() then adds only the pause, with no speech.
                chunks.append((" ".join(current), "PAUSE"))
                current = []
        if current:
            chunks.append((" ".join(current), None))
        return chunks

    @staticmethod
    def _speak(segment_text):
        """Synthesize one text segment with gTTS and return it as an AudioSegment."""
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
            tmp_path = handle.name
        try:
            gTTS(segment_text, lang="en", slow=False).save(tmp_path)
            return AudioSegment.from_file(tmp_path, format="mp3")
        finally:
            # The intermediate MP3 is only needed until it has been loaded.
            os.remove(tmp_path)

    def synthesize(self, text, params, emo_name):
        """Render ``text`` to an enhanced MP3 file and return its path.

        Args:
            text: The text to speak.
            params: Voice parameters; 'breath_freq' and 'tremor' are read
                (both default to 0 when absent).
            emo_name: Emotion label (unused here; kept for interface parity).

        Returns:
            Path of a temporary ``*_enhanced.mp3`` file. The caller owns the file.
        """
        breath_freq = params.get("breath_freq", 0)
        intensity = params.get("tremor", 0) + breath_freq

        final_audio = AudioSegment.empty()
        for segment_text, marker in self._chunk_text(text, breath_freq):
            # Empty segments carry only a marker; asking gTTS to speak an
            # empty string fails, so skip the speech part for them.
            if segment_text:
                seg_audio = self._speak(segment_text)
                # Amplify stressed segments (with '!' or '?') when intensity high
                if intensity > 0.8 and ("!" in segment_text or "?" in segment_text):
                    seg_audio += self.STRESS_GAIN_DB
                final_audio += seg_audio

            # Insert pauses or breaths
            if marker == "PAUSE":
                final_audio += AudioSegment.silent(duration=self.PAUSE_MS)
            elif marker == "BREATH":
                final_audio += breath_sound + self.BREATH_GAIN_DB

        # Slow down entire audio to 90% speed (playback duration ~1.11x longer):
        # reinterpret the samples at a lower frame rate, then convert back.
        final_audio = final_audio._spawn(final_audio.raw_data, overrides={
            "frame_rate": int(final_audio.frame_rate * self.SPEED_FACTOR)
        }).set_frame_rate(final_audio.frame_rate)

        # Normalize and compress
        final_audio = effects.normalize(final_audio)
        final_audio = effects.compress_dynamic_range(final_audio, threshold=-20.0, ratio=2.5)

        # Export enhanced MP3 (close our handle first; the file is kept for the caller)
        with tempfile.NamedTemporaryFile(suffix="_enhanced.mp3", delete=False) as handle:
            out_path = handle.name
        final_audio.export(out_path, format="mp3")

        return out_path

# Reinitialize TTS engine
tts_engine = AdvancedTTSEngine()


🔊 AdvancedTTSEngine initialized (gTTS, audible breathing, 10% slower)


In [41]:
# Cell 6: Visualization functions
def plot_3d(emotion_3d, emo, conf):
    """Plot the detected emotion among the reference emotions in 3D space.

    Args:
        emotion_3d: dict with 'valence', 'arousal' and 'dominance' numbers for
            the detected emotion.
        emo: Emotion label, shown title-cased next to the point.
        conf: Classifier confidence; shown as a percentage and used as the
            marker colour on a fixed 0..1 colour scale.

    Returns:
        A plotly Figure with one trace for the detected emotion and one trace
        ("refs") holding every entry of ``emotion_analyzer.emotion_3d_map``.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(
        x=[emotion_3d['valence']], y=[emotion_3d['arousal']],
        z=[emotion_3d['dominance']], mode='markers+text',
        # cmin/cmax pin the colour scale to the 0..1 confidence range so a
        # single point does not produce a degenerate colourbar.
        marker=dict(size=12, color=[conf], cmin=0, cmax=1,
                    colorscale='Viridis', showscale=True),
        text=[f"{emo.title()}<br>{conf:.0%}"], textposition="top center"
    ))
    # reference points: each map value is a dict, so read the coordinates by
    # key (tuple-unpacking the dict would yield its key names, not its values).
    ref = emotion_analyzer.emotion_3d_map
    names = list(ref)
    xs = [ref[name]['valence'] for name in names]
    ys = [ref[name]['arousal'] for name in names]
    zs = [ref[name]['dominance'] for name in names]
    fig.add_trace(go.Scatter3d(
        x=xs, y=ys, z=zs, mode='markers+text',
        marker=dict(size=6, color='lightgray', opacity=0.6),
        text=names, textposition="bottom right", name="refs"
    ))
    fig.update_layout(
        scene=dict(
            xaxis_title="Valence", yaxis_title="Arousal", zaxis_title="Dominance"
        ), width=600, height=500, title="3D Emotion Space"
    )
    return fig

def plot_radar(emotion_3d, emo):
    """Draw the emotion's valence/arousal/dominance as a filled radar chart.

    Each coordinate is shifted from [-1, 1] to [0, 1] via ``(v + 1) / 2`` and
    the first point is repeated at the end so the outline closes.

    Args:
        emotion_3d: dict with 'valence', 'arousal' and 'dominance' numbers.
        emo: Emotion label, used (title-cased) as the trace name.

    Returns:
        A plotly Figure containing a single Scatterpolar trace.
    """
    axis_labels = ['Valence', 'Arousal', 'Dominance']

    radii = []
    for label in axis_labels:
        radii.append((emotion_3d[label.lower()] + 1) / 2)

    # Repeat the first vertex to close the polygon.
    closed_radii = [*radii, radii[0]]
    closed_labels = [*axis_labels, axis_labels[0]]

    outline = go.Scatterpolar(
        r=closed_radii,
        theta=closed_labels,
        fill='toself',
        name=emo.title(),
        line={'color': 'deeppink'},
    )
    fig = go.Figure(outline)
    fig.update_layout(
        polar={'radialaxis': {'range': [0, 1]}},
        title="Emotion Radar",
        width=400,
        height=400,
    )
    return fig

print("📊 Visualization ready")


📊 Visualization ready


In [42]:
# Cell 7: process(text) → audio, markdown, plots, DataFrame
def process(text):
    """Run the full pipeline for one input text.

    Args:
        text: User input; ``None``, empty or whitespace-only input is rejected
            with a prompt instead of being analyzed.

    Returns:
        Tuple ``(audio_path, markdown, fig3d, fig_radar, dataframe)`` in the
        order the Gradio outputs expect. ``audio_path`` is ``None`` when the
        input is empty or when speech synthesis fails; in the latter case the
        markdown carries the error message and the analysis is still returned.
    """
    if text is None or not text.strip():
        return None, "Please enter some text.", None, None, pd.DataFrame()
    # Emotion analysis
    e = emotion_analyzer.analyze(text)
    emo, conf, emo3d = e['emotion'], e['confidence'], e['emotion_3d']
    # Voice params
    vp = voice_calculator.calculate(emo3d, emo)
    # Audio MP3. Synthesis depends on an external service, so a failure there
    # is reported in the summary instead of discarding the analysis and plots.
    try:
        audio_path = tts_engine.synthesize(text, vp, emo)
        audio_note = ""
    except Exception as exc:
        audio_path = None
        audio_note = f"**Audio unavailable:** {exc}\n"
    # Visuals
    fig3d = plot_3d(emo3d, emo, conf)
    figr = plot_radar(emo3d, emo)
    # Summary markdown (the two trailing spaces force markdown line breaks)
    md = f"""
**Emotion:** {emo.title()} ({conf:.1%})  
**Rate:** {vp['rate']} WPM • **Pitch:** {vp['pitch']:.1%} • **Volume:** {vp['volume']:.1%}  
{audio_note}"""
    # Table
    df = pd.DataFrame([
        ['Valence', f"{emo3d['valence']:.3f}"],
        ['Arousal', f"{emo3d['arousal']:.3f}"],
        ['Dominance', f"{emo3d['dominance']:.3f}"],
        ['Intensity', f"{e['intensity']:.3f}"]
    ], columns=['Metric','Value'])
    return audio_path, md, fig3d, figr, df

print("⚙️ Main pipeline ready")


⚙️ Main pipeline ready


In [68]:
# Cell 8: Gradio interface (fixed Audio component)
# Build the UI; every component below is created inside the Blocks context.
with gr.Blocks(title="🎭 The Empathy Engine") as demo:
    gr.Markdown("## 🎭 Empathy Engine: Emotional TTS")
    # Input row: free-text box next to the trigger button.
    with gr.Row():
        txt = gr.Textbox(lines=4, placeholder="Type your text here...")
        btn = gr.Button("Generate Speech")
    # type="filepath": process() returns the path of the generated MP3.
    out_audio = gr.Audio(type="filepath", label="Emotional Speech (MP3)")
    out_md    = gr.Markdown()
    # The two plots sit side by side.
    with gr.Row():
        plot3d = gr.Plot(label="3D Emotion Space")
        radar   = gr.Plot(label="Emotion Radar")
    table    = gr.DataFrame(headers=["Metric","Value"], label="Emotion Details")
    # The outputs list must stay in the same order as process()'s return
    # tuple: (audio_path, md, fig3d, figr, df).
    btn.click(fn=process, inputs=txt,
              outputs=[out_audio, out_md, plot3d, radar, table])
# share=True: the recorded output of this cell shows a public gradio.live URL
# in addition to the local one, so the app is reachable from outside this machine.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7875
* Running on public URL: https://0903f257802e2d7960.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Sample inputs to paste into the text box (kept as comments so this cell
# does not raise a SyntaxError under Restart & Run All):
#
# “Ugh, I’m stuck in traffic again and it’s driving me crazy!”
#
# “I just got promoted at work—I’m over the moon about this!”
#
# “I can’t believe they canceled the event last minute; I’m really disappointed.”
#
# “This new opportunity feels both thrilling and a bit nerve-wracking.”
#
# “I’m so proud of you for everything you’ve accomplished.”
#
# “The movie’s plot twist left me speechless and amazed.”
#
# “I’m worried about tomorrow’s presentation; what if I mess up?”
#
# “That joke was hilarious—I haven’t laughed that hard in ages!”
#
# “I feel a deep sense of loss after saying goodbye today.”
#
# “Wow, that surprise party was the best birthday gift ever!”