<a href="https://colab.research.google.com/github/RudrikaSingh/interviewinsights/blob/main/CodeSample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install deepface

Collecting deepface
  Downloading deepface-0.0.93-py3-none-any.whl.metadata (30 kB)
Collecting flask-cors>=4.0.1 (from deepface)
  Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting mtcnn>=0.1.0 (from deepface)
  Downloading mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting retina-face>=0.0.1 (from deepface)
  Downloading retina_face-0.0.17-py3-none-any.whl.metadata (10 kB)
Collecting fire>=0.4.0 (from deepface)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gunicorn>=20.1.0 (from deepface)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting lz4>=4.3.3 (from mtcnn>=0.1.0->deepface)
  Downloading lz4-4.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading deepface-0.0.93-py3-none-any.whl (108 kB)
[2K   [90m━

In [None]:
import cv2  # OpenCV for video processing
import librosa #For audio processing
import librosa.display
import soundfile as sf  # For reading audio files
import numpy as np

def get_video_frame(video_path):
    """Return the first frame of a video.

    Args:
        video_path: Path (or any source accepted by ``cv2.VideoCapture``)
            of the video to open.

    Returns:
        The frame object produced by the first ``cap.read()`` call.

    Raises:
        ValueError: If the video cannot be opened or no frame can be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")
        ret, frame = cap.read()
    finally:
        # Always hand the capture back, including on the error paths above,
        # so a failed open/read never leaves the handle dangling.
        cap.release()

    if not ret:
        raise ValueError("Could not read frame from video.")
    return frame


def read_audio(audio_path):
    """Load an audio file through ``librosa.load``.

    Args:
        audio_path: Location of the audio file to load.

    Returns:
        A ``(waveform, sample_rate)`` tuple exactly as produced by
        ``librosa.load``.

    Raises:
        ValueError: If loading fails for any reason; the original error text
            is appended to the message.
    """
    try:
        waveform, sample_rate = librosa.load(audio_path)
    except Exception as e:
        raise ValueError(f"Could not read audio file: {audio_path} - {e}")
    else:
        return waveform, sample_rate



# Example usage
#video_frame = get_video_frame("path/to/your/video.mp4")
#audio, sample_rate = read_audio("path/to/your/audio.wav")

In [None]:
from deepface import DeepFace

def analyze_facial_expression(image):
    """Run DeepFace emotion analysis on an image.

    Args:
        image: Value forwarded to ``DeepFace.analyze`` as ``img_path``.

    Returns:
        ``(dominant_emotion, emotion_scores)`` taken from the first entry of
        the DeepFace result, or ``(None, None)`` if anything goes wrong (the
        error is printed, not raised).
    """
    try:
        # enforce_detection=False lets the analysis proceed even when no face
        # is detected, instead of raising.
        results = DeepFace.analyze(
            img_path=image,
            actions=['emotion'],
            enforce_detection=False,
        )
        first_result = results[0]
        return first_result['dominant_emotion'], first_result['emotion']
    except Exception as e:
        print(f"Error analyzing facial expression: {e}")
        return None, None

# Example usage:
#dominant_emotion, all_emotions = analyze_facial_expression(video_frame)

25-02-08 13:54:07 - Directory /root/.deepface has been created
25-02-08 13:54:07 - Directory /root/.deepface/weights has been created


In [None]:
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def extract_audio_features(audio, sample_rate):
    """Summarise a clip as one time-averaged MFCC vector.

    Args:
        audio: Signal passed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sampling rate passed as ``sr``.

    Returns:
        The mean of the 40 requested MFCC rows taken across the second axis
        of the MFCC matrix, i.e. one value per coefficient.
    """
    coefficients = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Transpose so each row is one frame, then average the frames away.
    per_frame = coefficients.T
    return np.mean(per_frame, axis=0)


def train_audio_classifier(feature_vectors, labels, test_size=0.2, random_state=42):
    """Train a linear SVM on audio features and report hold-out accuracy.

    You must supply your own labelled data.

    Args:
        feature_vectors: One feature vector per sample (for example the
            output of ``extract_audio_features``).
        labels: Emotion label for each feature vector.
        test_size: Fraction of the data held out for the accuracy report.
        random_state: Seed used for both the train/test split and the SVM.
            ``SVC(probability=True)`` fits its probability calibration with
            an internally shuffled cross-validation, so without a seed the
            values later returned by ``predict_proba`` vary between runs.

    Returns:
        The fitted ``SVC`` model, with probability estimates enabled.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        feature_vectors, labels, test_size=test_size, random_state=random_state
    )

    # probability=True is required for predict_proba at inference time.
    model = SVC(kernel='linear', probability=True, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Audio classifier accuracy: {accuracy}")

    return model

def analyze_voice_tone(audio, sample_rate, model):
    """Predict the emotion of a clip with an already-trained classifier.

    Args:
        audio: Audio signal handed to ``extract_audio_features``.
        sample_rate: Sampling rate of ``audio``.
        model: Trained classifier exposing ``predict`` and ``predict_proba``.

    Returns:
        ``(predicted_emotion, probabilities)`` for the single clip, where
        ``probabilities`` is the first row returned by ``predict_proba``.
    """
    # The classifier expects a 2-D batch, so wrap the single vector as 1 x n.
    sample = extract_audio_features(audio, sample_rate).reshape(1, -1)
    label = model.predict(sample)[0]
    class_probabilities = model.predict_proba(sample)[0]
    return label, class_probabilities

# Example Usage (after training):
#audio_emotion, probabilities = analyze_voice_tone(audio, sample_rate, audio_model)

In [None]:
!pip install SpeechRecognition


Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import speech_recognition as sr
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

# Speech-to-Text
def transcribe_audio(audio_path):
    """Transcribe an audio file with Google's speech recognition service.

    Args:
        audio_path: Path of the audio file opened via ``sr.AudioFile``.

    Returns:
        The recognised text. On failure no exception is raised; instead a
        human-readable message string is returned ("Could not understand
        audio", or a "Could not request results ..." message), so callers
        that need to tell a transcript from an error must compare against
        those strings.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        recording = recognizer.record(source)
        try:
            transcript = recognizer.recognize_google(recording)  # Requires internet connection
        except sr.UnknownValueError:
            return "Could not understand audio"
        except sr.RequestError as e:
            return f"Could not request results from Google Speech Recognition service; {e}"
        else:
            return transcript

# Embedding and Retrieval
def create_embeddings(sentences, model_name="sentence-transformers/all-mpnet-base-v2",
                      pooling="mean", normalize=True):
    """Create sentence embeddings using a transformer model.

    The default model is a sentence-transformers checkpoint whose published
    usage is attention-masked mean pooling followed by L2 normalisation.
    Taking only the first ([CLS]) token, as this function used to, does not
    match that recipe, so mean pooling is now the default. Pass
    ``pooling="cls", normalize=False`` to get the previous behaviour.

    Args:
        sentences: List of strings to embed.
        model_name: Hugging Face model identifier to load.
        pooling: ``"mean"`` to average token embeddings over non-padding
            positions, or ``"cls"`` to take the first token's embedding.
        normalize: If true, scale every embedding to unit L2 length.

    Returns:
        A NumPy array with one embedding row per input sentence.

    Raises:
        ValueError: If ``pooling`` is not ``"mean"`` or ``"cls"``.

    Note:
        The tokenizer and model are loaded on every call; callers embedding
        many batches may want to cache them.
    """
    if pooling not in ("mean", "cls"):
        raise ValueError(f"Unsupported pooling: {pooling!r} (expected 'mean' or 'cls')")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)
    token_embeddings = model_output[0]  # first output holds the per-token embeddings

    if pooling == "cls":
        sentence_embeddings = token_embeddings[:, 0]
    else:
        # Zero out padding positions so they do not drag the average around.
        mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
        sentence_embeddings = summed / token_counts

    if normalize:
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings.numpy()


def rag_word_classification(text, knowledge_base, k=3):
    """Retrieve the knowledge-base terms closest to ``text`` in embedding space.

    Both the text (typically one transcribed sentence) and every
    knowledge-base entry are embedded with ``create_embeddings``; the entries
    are put into a flat L2 faiss index and the nearest ones are returned.

    Args:
        text: Sentence to match against the knowledge base.
        knowledge_base: List of candidate terms (e.g. emotion words).
        k: Maximum number of terms to return.

    Returns:
        Up to ``k`` knowledge-base terms, nearest first. The list is shorter
        than ``k`` when the knowledge base has fewer entries, and empty when
        the knowledge base is empty or ``k`` is not positive.
    """
    # Never ask faiss for more neighbours than exist: it pads missing results
    # with index -1, which would silently select knowledge_base[-1].
    k = min(k, len(knowledge_base))
    if k <= 0:
        return []

    text_embedding = create_embeddings([text])  # embed the sentence to classify
    knowledge_embeddings = create_embeddings(knowledge_base)  # embed the KB

    dimension = knowledge_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact L2-distance index
    index.add(knowledge_embeddings)

    _distances, indices = index.search(text_embedding, k)

    # Drop any -1 padding defensively before indexing into the list.
    return [knowledge_base[i] for i in indices[0] if i >= 0]

# Example Usage
# audio_text = transcribe_audio("path/to/your/audio.wav")
# emotion_knowledge_base = ["happy", "joyful", "sad", "depressed", "angry", "frustrated", "scared", "anxious"]
# retrieved_emotions = rag_word_classification(audio_text, emotion_knowledge_base)
# print(f"Retrieved Emotions: {retrieved_emotions}")

In [None]:
import re

def count_emotion_words(text, emotion_words):
    """Count whole-word, case-insensitive occurrences of each emotion word.

    Args:
        text: Text to search.
        emotion_words: Iterable of words to look for.

    Returns:
        Dict mapping each entry of ``emotion_words`` (as given) to the number
        of times it appears in ``text`` as a whole word.
    """
    lowered_text = text.lower()

    # Lower-case the search term as well: the text is lower-cased, so a
    # capitalised emotion word would otherwise never match. The dict key
    # keeps the caller's original spelling.
    return {
        word: len(re.findall(r'\b' + re.escape(word.lower()) + r'\b', lowered_text))
        for word in emotion_words
    }

# Example usage:
# emotion_counts = count_emotion_words(audio_text, retrieved_emotions) #Use retrieved terms in text
# print(f"Emotion word counts: {emotion_counts}")

In [None]:
def fuse_emotions(facial_emotion, voice_emotion, text_emotions, facial_confidence=0.5,
                  voice_confidence=0.5, text_weight=0.1):
    """Fuse emotion predictions from the face, voice and text modalities.

    Each modality votes for an emotion label. The facial and voice votes are
    weighted by their confidences, and every text hit adds ``text_weight``.
    Scores are then divided by ``facial_confidence + voice_confidence``.

    Args:
        facial_emotion: Label from facial analysis, or a falsy value if none.
        voice_emotion: Label from voice analysis, or a falsy value if none.
        text_emotions: Mapping of emotion label to occurrence count in the
            transcript; ``None`` is treated as empty.
        facial_confidence: Weight of the facial vote.
        voice_confidence: Weight of the voice vote.
        text_weight: Score added per text occurrence (scales text down).

    Returns:
        ``(final_emotion, confidence)`` for the highest-scoring label, or
        ``(None, 0.0)`` when no modality supplied any positive evidence.
        Because text hits are not part of the normaliser, the confidence can
        exceed 1 when text counts are large.
    """
    scores = {}

    # Facial
    if facial_emotion:
        scores[facial_emotion] = scores.get(facial_emotion, 0) + facial_confidence

    # Voice
    if voice_emotion:
        scores[voice_emotion] = scores.get(voice_emotion, 0) + voice_confidence

    # Text: only terms that actually occurred contribute, so a zero-count
    # term can never be reported as the detected emotion.
    for emotion, count in (text_emotions or {}).items():
        if count:
            scores[emotion] = scores.get(emotion, 0) + count * text_weight

    # Normalize by the combined face/voice weight; skip when that weight is
    # zero, where the division would otherwise raise.
    total_confidence = facial_confidence + voice_confidence
    if total_confidence > 0:
        scores = {emotion: score / total_confidence for emotion, score in scores.items()}

    supported = {emotion: score for emotion, score in scores.items() if score > 0}
    if not supported:
        return None, 0.0

    final_emotion = max(supported, key=supported.get)
    return final_emotion, supported[final_emotion]
# Example Usage:
# final_emotion, confidence = fuse_emotions(dominant_emotion, audio_emotion, emotion_counts)
# print(f"Final Emotion: {final_emotion}, Confidence: {confidence}")

In [None]:
# Main Function
def emotion_detection_pipeline(video_path, audio_path, audio_model=None):
    """Run the full multimodal emotion-detection pipeline and print the result.

    Steps: read one video frame and the audio, analyse the facial expression,
    analyse the voice tone (if a classifier is available), transcribe the
    audio, retrieve related emotion terms, count them in the transcript, and
    fuse everything into one final emotion.

    Args:
        video_path: Path to the video file.
        audio_path: Path to the audio file.
        audio_model: Trained audio classifier (see ``train_audio_classifier``).
            If omitted, a notebook-level ``audio_model`` variable is used when
            one exists; otherwise the voice-tone step is skipped instead of
            failing with ``NameError``.
    """
    # 1. Input
    video_frame = get_video_frame(video_path)
    audio, sample_rate = read_audio(audio_path)

    # 2. Facial Expression
    dominant_emotion, all_emotions = analyze_facial_expression(video_frame)

    # 3. Voice Tone
    if audio_model is None:
        # Backward compatibility: earlier versions read a global `audio_model`.
        audio_model = globals().get("audio_model")
    if audio_model is None:
        print("No audio model provided; skipping voice tone analysis.")
        audio_emotion = None
    else:
        audio_emotion, probabilities = analyze_voice_tone(audio, sample_rate, audio_model)

    # 4. RAG-based Word Classification
    audio_text = transcribe_audio(audio_path)
    emotion_knowledge_base = ["happy", "joyful", "sad", "depressed", "angry", "frustrated", "scared", "anxious"]  # Expand!
    retrieved_emotions = rag_word_classification(audio_text, emotion_knowledge_base)
    emotion_counts = count_emotion_words(audio_text, retrieved_emotions)

    # 5. Emotion Fusion
    final_emotion, confidence = fuse_emotions(dominant_emotion, audio_emotion, emotion_counts)

    # 6. Output
    print(f"Final Emotion: {final_emotion}, Confidence: {confidence}")
    print(f"Emotion Word Counts: {emotion_counts}")


# Example usage
# emotion_detection_pipeline("path/to/video.mp4", "path/to/audio.wav")