In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import torch
import numpy as np
import cv2
import librosa
import soundfile as sf
import subprocess
import json
import time
from PIL import Image
import torch.nn.functional as F
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Hugging Face & Model Imports
# --- ADDED AutoModelForImageClassification HERE ---
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    AutoFeatureExtractor, 
    AutoModelForAudioClassification,
    AutoModelForImageClassification 
)
from facenet_pytorch import MTCNN

print("‚úÖ Libraries imported (Safe Mode).")

# --- Setup Device ---
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"üöÄ Using device: {device}")

# --- Define Labels ---
# The position of each name in this list is its class id in both lookup tables.
labels_list = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
label2id = dict(zip(labels_list, range(len(labels_list))))
id2label = dict(enumerate(labels_list))
num_labels = len(labels_list)

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Libraries imported (Safe Mode).
üöÄ Using device: cuda


In [2]:
print("‚è≥ Loading models...")

# --- 1. FER (Face) - Your Fine-Tuned Frozen Model ---
fer_name = "trpakov/vit-face-expression"
fer_processor = AutoFeatureExtractor.from_pretrained(fer_name)
fer_model = AutoModelForImageClassification.from_pretrained(fer_name).to(device)
# Explicit inference mode, to match the TER model below.
fer_model.eval()

# --- 2. SER (Audio) - Your Fine-Tuned Model ---
# NOTE(review): the checkpoint ships a 4-class head. Requesting `num_labels`
# classes with ignore_mismatched_sizes=True replaces it with a freshly
# initialised classifier, and no fine-tuned weights are loaded for it in this
# cell. Until such weights are loaded, the audio probabilities come from an
# untrained head.
ser_name = "superb/wav2vec2-base-superb-er"
ser_processor = AutoFeatureExtractor.from_pretrained(ser_name)
ser_model = AutoModelForAudioClassification.from_pretrained(
    ser_name, num_labels=num_labels, ignore_mismatched_sizes=True, use_safetensors=True
).to(device)
ser_model.eval()


# --- 3. TER (Text) - Your Fine-Tuned Model ---
ter_name = "j-hartmann/emotion-english-distilroberta-base"
ter_tokenizer = AutoTokenizer.from_pretrained(ter_name)
ter_model = AutoModelForSequenceClassification.from_pretrained(
    ter_name, num_labels=num_labels, ignore_mismatched_sizes=True, use_safetensors=True
).to(device)
# Load your fine-tuned weights.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids executing arbitrary pickled code.
ter_state_dict = torch.load(
    '../models/ter_model_finetuned_expert.pth',
    map_location=device,
    weights_only=True,
)
ter_model.load_state_dict(ter_state_dict)
ter_model.eval()
print("‚úÖ TER (Text) model loaded.")

# --- 4. Face Detector ---
face_detector = MTCNN(device=device)
print("‚úÖ Face Detector loaded.")

‚è≥ Loading models...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([7, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  ter_model.load_state_dict(torch.load('../models/ter_model_finetuned_expert.pth', map_location=device))


‚úÖ TER (Text) model loaded.
‚úÖ Face Detector loaded.


  state_dict = torch.load(state_dict_path)
  state_dict = torch.load(state_dict_path)
  state_dict = torch.load(state_dict_path)


In [3]:
print("‚öôÔ∏è Assembling Fusion Brain (this takes ~30 seconds)...")

# 1. Load Helper Functions
def get_probs(text, audio, image):
    """Unimplemented placeholder that accepts the three modality inputs.

    It only documents the intended input structure; the arguments are ignored
    and the function always returns ``None``.
    """
    return None

def predict_single_modality(model, inputs):
    """Run ``model`` on ``inputs`` and return its class probabilities.

    Args:
        model: Callable invoked as ``model(**inputs)``; the result must expose
            a ``logits`` tensor.
        inputs: Mapping of keyword arguments forwarded to ``model``.

    Returns:
        A flattened NumPy array holding the softmax of the logits taken over
        dimension 1.
    """
    # The forward pass runs without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=1)
    return probabilities.cpu().numpy().flatten()

# 2. Load Validation Data to Train Fusion
val_df = pd.read_csv('../data/MELD_processed/dev_text.csv')
X_fusion = []
y_fusion = []

# Number of validation rows used to fit the demo fusion model (keeps the cell fast).
FUSION_TRAIN_SAMPLES = 200

print("   Extracting features from validation set...")
# "Warm start" for the demo: only the text model is actually run here.
# (In a real app, you'd load pre-computed features for all three modalities.)

# Audio and face are NOT evaluated in this loop; both are fed as a constant
# uniform distribution.
# NOTE(review): the fusion model therefore never sees real face/audio
# probabilities during training, so the coefficients it learns for those
# columns carry no information. Replace these placeholders with real features
# before relying on the fused prediction.
a_probs = np.full(num_labels, 1 / num_labels)
f_probs = np.full(num_labels, 1 / num_labels)

text_failures = 0
for _, row in val_df.head(FUSION_TRAIN_SAMPLES).iterrows():
    # Text: run the text model; fall back to a uniform distribution if the
    # row cannot be processed (e.g. a missing/non-string text cell).
    try:
        inputs = ter_tokenizer(row['text'], return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        t_probs = predict_single_modality(ter_model, inputs)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the loop.
        text_failures += 1
        t_probs = np.full(num_labels, 1 / num_labels)

    # Stack in the same [face, audio, text] order used at inference time.
    features = np.concatenate([f_probs, a_probs, t_probs])
    X_fusion.append(features)
    y_fusion.append(label2id[row['emotion']])

if text_failures:
    print(f"   Text inference failed on {text_failures} row(s); uniform probabilities were used for them.")

# 3. Train Logistic Regression
fusion_model = LogisticRegression(max_iter=1000, class_weight='balanced')
fusion_model.fit(X_fusion, y_fusion)

print("‚úÖ Fusion Brain is ready!")

‚öôÔ∏è Assembling Fusion Brain (this takes ~30 seconds)...
   Extracting features from validation set...
‚úÖ Fusion Brain is ready!


In [4]:
def recommend_song(emotion, db_path="songs.csv", max_songs=3):
    """Pick random songs tagged with ``emotion`` from a CSV song database.

    Args:
        emotion: Value compared for exact equality against the ``emotion`` column.
        db_path: Path of the CSV file; it must provide ``emotion``, ``song``
            and ``link`` columns.
        max_songs: Upper bound on the number of songs returned.

    Returns:
        A ``(songs, links)`` tuple of two equally long lists. If the database
        cannot be read or lacks the required columns the result is
        ``(["Database error"], [""])``; if no row matches it is
        ``(["No songs found"], [""])``.
    """
    # Load database. `Exception` (not a bare except) keeps Ctrl-C working.
    try:
        df = pd.read_csv(db_path)
    except Exception:
        return ["Database error"], [""]

    # A file with the wrong schema is reported like an unreadable one instead
    # of surfacing as a KeyError further down.
    if not {"emotion", "song", "link"}.issubset(df.columns):
        return ["Database error"], [""]

    # Filter
    options = df[df['emotion'] == emotion]
    if options.empty:
        return ["No songs found"], [""]

    # Pick up to `max_songs` random songs, never more than are available.
    n = max(0, min(max_songs, len(options)))
    picks = options.sample(n)

    return picks['song'].tolist(), picks['link'].tolist()

In [10]:
# --- HELPER FUNCTIONS ---
def capture_webcam(camera_index=0, warmup_frames=10):
    """Grab a single frame from a webcam and return it as an RGB PIL image.

    Args:
        camera_index: Device index handed to ``cv2.VideoCapture``.
        warmup_frames: Number of frames read and discarded before the frame
            that is kept (presumably to let the camera settle — confirm).

    Returns:
        A ``PIL.Image`` converted from BGR to RGB, or ``None`` when the camera
        cannot be opened or no frame could be read.
    """
    cap = cv2.VideoCapture(camera_index)
    try:
        # Bail out early instead of issuing reads against a camera that never opened.
        if not cap.isOpened():
            return None
        for _ in range(warmup_frames):
            cap.read()  # Warmup
        ret, frame = cap.read()
    finally:
        # Always hand the device back, even if a read raised.
        cap.release()

    if not ret or frame is None:
        return None
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

def run_recorder(filename="live_input.wav", duration=5):
    """Run ``recorder_v2.py`` in a subprocess and return the JSON it reports.

    The script is started with the current interpreter and receives
    ``filename`` and ``duration`` as command-line arguments. Its stdout is
    searched for a payload enclosed in ``---JSON_START---`` /
    ``---JSON_END---`` markers.

    Args:
        filename: First argument passed to the recorder script.
        duration: Second argument passed to the recorder script (as a string).

    Returns:
        The decoded JSON payload, or ``None`` when the markers are absent or
        the payload is not valid JSON.
    """
    import sys
    print(f"\nüé§ RECORDING ({duration}s)... SPEAK NOW!")
    result = subprocess.run(
        [sys.executable, "recorder_v2.py", filename, str(duration)],
        # errors='replace' keeps stray non-UTF-8 bytes from raising UnicodeDecodeError.
        capture_output=True, text=True, encoding='utf-8', errors='replace'
    )

    start_marker = "---JSON_START---"
    end_marker = "---JSON_END---"
    stdout = result.stdout or ""

    if start_marker not in stdout:
        # Surface the recorder's own error instead of failing silently.
        stderr_lines = (result.stderr or "").strip().splitlines()
        if result.returncode != 0 and stderr_lines:
            print(f"Recorder exited with code {result.returncode}: {stderr_lines[-1]}")
        return None

    json_str = stdout.split(start_marker)[1].split(end_marker)[0].strip()
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # A truncated or corrupt payload is treated like a missing one.
        return None

# --- MAIN APPLICATION (one record -> capture -> predict -> recommend pass) ---
print("="*50)
print("   ü§ñ TRISENSE EMOTION & MUSIC RECOMMENDER ü§ñ")
print("="*50)

# 1. Record Audio & Transcript
data = run_recorder(duration=5)
audio_path = None
transcript = ""

# Only trust a dict payload that reports success; .get() tolerates missing keys.
if isinstance(data, dict) and data.get("status") == "success":
    audio_path = data.get("filename")
    transcript = data.get("text") or ""
    print(f"‚úÖ Audio captured.")
    print(f"üó£Ô∏è Transcript: \"{transcript}\"")
else:
    print("‚ö†Ô∏è Audio failed.")

# Fallback text input
if not transcript:
    transcript = input("‚å®Ô∏è System couldn't hear you. Please TYPE your feeling: ")

# 2. Capture Face
print("üì∏ Capturing face...")
face_img = capture_webcam()

# 3. Get Predictions
# Each modality starts from a uniform distribution and is only overwritten
# when its input is available and its model runs successfully.

# -- Face --
f_probs = np.full(num_labels, 1 / num_labels)
if face_img is not None:
    boxes, _ = face_detector.detect(face_img)
    if boxes is not None:
        # Use the first box returned by the detector.
        face_crop = face_img.crop(boxes[0])
        inputs = fer_processor(images=face_crop, return_tensors="pt").to(device)
        with torch.no_grad():
            f_probs = F.softmax(fer_model(**inputs).logits, dim=1).cpu().numpy()[0]

# -- Audio --
a_probs = np.full(num_labels, 1 / num_labels)
if audio_path:
    try:
        audio, _ = librosa.load(audio_path, sr=16000)
        inputs = ser_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
        with torch.no_grad():
            a_probs = F.softmax(ser_model(**inputs).logits, dim=1).cpu().numpy()[0]
    except Exception as exc:
        # Best effort: keep the uniform fallback, but say why the audio was ignored.
        print(f"Audio emotion model failed ({exc!r}); using uniform probabilities.")

# -- Text --
t_probs = np.full(num_labels, 1 / num_labels)
if transcript:
    inputs = ter_tokenizer(transcript, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        t_probs = F.softmax(ter_model(**inputs).logits, dim=1).cpu().numpy()[0]

# 4. Fusion
# Feature order must match training: [face, audio, text].
final_input = np.concatenate([f_probs, a_probs, t_probs]).reshape(1, -1)
# Note: We skip scaling for this simple demo to avoid dependency on the previous notebook's scaler
# The Logistic Regression will handle the raw probs fine for a demo.
final_probs = fusion_model.predict_proba(final_input)[0]
# predict_proba columns follow fusion_model.classes_, which omits any label
# that was absent from the training rows; map the winning column back to its
# label id instead of assuming column index == label id.
final_pred_idx = int(fusion_model.classes_[np.argmax(final_probs)])
final_emotion = id2label[final_pred_idx]

# 5. Results
print("\n" + "-"*30)
print(f"üé≠ Face:  {id2label[np.argmax(f_probs)]} ({np.max(f_probs):.2f})")
print(f"üîä Audio: {id2label[np.argmax(a_probs)]} ({np.max(a_probs):.2f})")
print(f"üí¨ Text:  {id2label[np.argmax(t_probs)]} ({np.max(t_probs):.2f})")
print("-" * 30)
print(f"üß† FUSION VERDICT: {final_emotion.upper()}")
print("-" * 30)

# 6. RECOMMEND
songs, links = recommend_song(final_emotion)

print("\nüéµ DJ RECOMMENDATIONS FOR YOU üéµ")
for i, (song, link) in enumerate(zip(songs, links)):
    print(f"{i+1}. {song}")
    print(f"   üîó {link}")
print("="*50)

   ü§ñ TRISENSE EMOTION & MUSIC RECOMMENDER ü§ñ

üé§ RECORDING (5s)... SPEAK NOW!
‚úÖ Audio captured.
üó£Ô∏è Transcript: "I am not confused"
üì∏ Capturing face...

------------------------------
üé≠ Face:  anger (0.14)
üîä Audio: joy (0.15)
üí¨ Text:  neutral (0.43)
------------------------------
üß† FUSION VERDICT: NEUTRAL
------------------------------

üéµ DJ RECOMMENDATIONS FOR YOU üéµ
1. Sunday Morning - Maroon 5
   üîó https://open.spotify.com/track/5qII2n90lVdPDcgXEEVEJe
2. Sunflower - Post Malone
   üîó https://open.spotify.com/track/3KkXRkHbMCARz0aVfEt68P
3. Three Little Birds - Bob Marley
   üîó https://open.spotify.com/track/6A9mKXZCh00WgbCBEKKAmO
