In [1]:
#!pip install facenet-pytorch opencv-python

In [2]:
import cv2
from facenet_pytorch import MTCNN

# Face detection pass: run MTCNN on every frame of the input video, draw the
# detected boxes, save the annotated video and keep the raw per-frame boxes in
# `objects` for the speaker-count estimate made in a later cell.

# Loading the MTCNN (Multi-task Cascaded Convolutional Networks) face detection model
mtcnn = MTCNN(keep_all=True)

# Loading the video
video_capture = cv2.VideoCapture('Random Conversation Between Two Friends.mp4')
if not video_capture.isOpened():
    # Without this check a missing/unreadable file silently produces an empty run
    raise IOError("Could not open 'Random Conversation Between Two Friends.mp4'")

# Getting the video properties
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed of the written video relative to the original audio.
fps = video_capture.get(cv2.CAP_PROP_FPS)

# Creating a VideoWriter object to save the processed video.
# 'mp4v' matches the .mp4 container used for the output file ('XVID' is an AVI codec tag).
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
detection_output = cv2.VideoWriter('Face_detection_output.mp4', fourcc, fps, (frame_width, frame_height))

# One entry per processed frame: the array of boxes, or None when no face was found
objects = []

try:
    while True:
        # Reading each frame from the video
        ret, frame = video_capture.read()
        if not ret:
            break

        # Converting the frame to RGB (MTCNN requires RGB images)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detecting faces in the frame
        boxes, _ = mtcnn.detect(rgb_frame)
        objects.append(boxes)

        # Drawing rectangles around the detected faces
        if boxes is not None:
            for box in boxes:
                # The four values are used as two opposite corners of the rectangle
                x1, y1, x2, y2 = map(int, box)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)

        # Writing the frame to the output video
        detection_output.write(frame)

        # Displaying the frame with the detected faces
        cv2.imshow('Video', frame)

        # Breaking the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture, the writer and the preview window, even when
    # detection raises part-way through, so the output file is finalised.
    video_capture.release()
    detection_output.release()
    cv2.destroyAllWindows()

In [3]:
# Number of faces detected in each frame.
# Frames where detection returned None (no face found) are skipped, exactly as
# before, but the check is now explicit: any other unexpected entry in
# `objects` raises instead of being silently swallowed by a broad `except`.
no_of_object = [len(frame_boxes) for frame_boxes in objects if frame_boxes is not None]

In [4]:
import numpy as np

# Calculating total number of speakers: the rounded average number of faces
# seen per frame (only frames with at least one detection are in the list).
if no_of_object:
    # Cast to a plain int: np.round returns a NumPy float (e.g. 2.0), while a
    # speaker count is an integer quantity.
    no_of_speaker = int(np.round(np.mean(no_of_object)))
else:
    # No face was detected in any frame; np.mean([]) would give NaN.
    # NOTE(review): None is assumed to mean "not specified" for
    # `speakers_expected` in the transcription config - confirm against the SDK.
    no_of_speaker = None

In [5]:
import assemblyai as aai

import os

# API key - read from the environment instead of being hardcoded in the
# notebook, so the secret is never saved or committed with the file.
# Set ASSEMBLYAI_API_KEY before starting the kernel; a missing variable raises
# KeyError naming it. Any key that was previously stored in this notebook
# should be revoked and regenerated.
aai.settings.api_key = os.environ["ASSEMBLYAI_API_KEY"]

# Audio path
audio_url = 'temp.wav'

transcriber = aai.Transcriber()

In [6]:
# Transcribe the audio with speaker diarization enabled, hinting the number of
# speakers estimated from the face detection pass.
config = aai.TranscriptionConfig(speaker_labels = True, speakers_expected = no_of_speaker)

transcript = transcriber.transcribe(audio_url, config)

# A failed transcription is reported on the returned object; stop here with the
# service's message rather than failing later on a missing `text`.
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

# 'w' (not 'a') so re-running the cell replaces the file instead of appending a
# duplicate copy; explicit UTF-8 so non-ASCII text is written the same way on
# every platform.
with open('Subtitles.txt', 'w', encoding='utf-8') as f:
    f.write(transcript.text)

In [7]:
# Collect every diarized utterance into parallel lists (times in milliseconds,
# as provided by the transcript) and write a human-readable log of them.
start_time = []
end_time = []
speaker = []
text = []

# Open the log once, in 'w' mode: the original reopened it in append mode for
# every utterance, so re-running the cell duplicated its whole content.
with open('Speaker Diarization.txt', 'w', encoding='utf-8') as file:
    # `or []` keeps the cell from crashing if no utterances were returned
    for utterance in transcript.utterances or []:
        file.write(f"Speaker {utterance.speaker}: {utterance.text}\n")
        # Times are divided by 1000 to log them in seconds
        file.write(f"Start time: {utterance.start/1000} | End time: {utterance.end/1000}\n\n")

        start_time.append(utterance.start)
        end_time.append(utterance.end)
        speaker.append(utterance.speaker)
        text.append(utterance.text)


In [8]:
import pandas as pd 

# One row per utterance; pairing labels with their value lists keeps the
# column order Speaker, Text, Start time[ms], End time[ms].
df = pd.DataFrame(
    dict(
        zip(
            ['Speaker', 'Text', 'Start time[ms]', 'End time[ms]'],
            [speaker, text, start_time, end_time],
        )
    )
)

In [9]:
from moviepy.editor import AudioFileClip

# Open the full conversation audio; the per-utterance segments are cut from this clip
audio = AudioFileClip(filename='temp.wav')

In [10]:
import os

# Create one output folder per distinct speaker label found in the transcript
for i in df['Speaker'].unique():
    directory = f'C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection\\speaker\\speaker_{i}'
    # makedirs also creates the parent "speaker" folder when it is missing
    # (os.mkdir would fail), and exist_ok=True replaces the separate
    # exists() check, so re-running the cell is safe.
    os.makedirs(directory, exist_ok=True)

In [11]:
# Iterating over the utterances and cutting the matching slice out of the full
# audio, saving each slice into its speaker's folder as
# speaker_<label>_segment_<row index>.wav.
speaker_root = 'C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection\\speaker'

for index, row in df.iterrows():
    # Distinct local names: the original reused `start_time`, `end_time` and
    # `speaker`, overwriting the lists built in the earlier cell with scalars.
    segment_start = row['Start time[ms]']/1000   # Start time in seconds
    segment_end = row['End time[ms]']/1000       # End time in seconds
    segment_speaker = row['Speaker']

    # Extracting the segment
    segment = audio.subclip(segment_start, segment_end)

    # Exporting the segment straight to its full path. This replaces the
    # os.chdir() into the speaker folder (which left the kernel's working
    # directory changed if a write failed) and the redundant inner loop over
    # all speaker labels, of which exactly one ever matched.
    segment_path = os.path.join(
        speaker_root,
        f'speaker_{segment_speaker}',
        f"speaker_{segment_speaker}_segment_{index}.wav",
    )
    segment.write_audiofile(segment_path, codec='pcm_s16le')

# Kept from the original so the working directory after this cell is unchanged
os.chdir('C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection')

MoviePy - Writing audio in speaker_A_segment_0.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_1.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_2.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_3.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_4.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_5.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_6.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_7.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_8.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_9.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_10.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_11.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_12.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_13.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_14.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_15.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_A_segment_16.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_segment_17.wav


                                                                                                                       

MoviePy - Done.




In [12]:
from moviepy.editor import AudioFileClip, concatenate_audioclips
import os

import re

# Join all segments of each speaker into one <folder>_joined_audio.wav file.
speaker_root = 'C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection\\speaker'
output_root = 'C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection'


def _segment_sort_key(filename):
    """Return a key that orders '..._segment_<N>.wav' names by the integer N.

    Names without a segment number sort first; the name itself breaks ties so
    the order is deterministic.
    """
    match = re.search(r'_segment_(\d+)', filename)
    return (int(match.group(1)) if match else -1, filename)


for folder in os.listdir(speaker_root):
    folder_path = os.path.join(speaker_root, folder)
    if not os.path.isdir(folder_path):
        # Ignore stray files sitting next to the speaker folders
        continue

    # os.listdir gives no chronological guarantee, and plain alphabetical order
    # puts segment_10 before segment_2; sort on the numeric index so the joined
    # audio follows the order of the conversation.
    segment_files = sorted(os.listdir(folder_path), key=_segment_sort_key)
    if not segment_files:
        # Nothing to join for this folder
        continue

    # Load the audio files (a list named `clips`, so the `audio` clip loaded in
    # an earlier cell is no longer overwritten)
    clips = [AudioFileClip(os.path.join(folder_path, name)) for name in segment_files]
    try:
        # Concatenate the audio files
        joined_audio = concatenate_audioclips(clips)

        # Export the joined audio to a new file
        joined_audio.write_audiofile(os.path.join(output_root, f"{folder}_joined_audio.wav"))
    finally:
        # Release the file readers so the segment files are not left open
        for clip in clips:
            clip.close()


MoviePy - Writing audio in C:\Users\HOME\DS\0. PROJECTS\Dialogue conversation with face detection\speaker_A_joined_audio.wav


                                                                                                                       

MoviePy - Done.




MoviePy - Writing audio in C:\Users\HOME\DS\0. PROJECTS\Dialogue conversation with face detection\speaker_B_joined_audio.wav


                                                                                                                       

MoviePy - Done.




In [13]:
# Make the project folder the working directory, so relative paths used after
# this point resolve against it.
# NOTE(review): absolute, machine-specific path - confirm before running elsewhere.
os.chdir('C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection')

In [14]:
from moviepy.editor import AudioFileClip, concatenate_audioclips

# Build one audio file per speaker from the individual utterance segments.
# The eighteen copy-pasted AudioFileClip lines are replaced by a helper plus
# the list of segment numbers, so the same files are loaded in the same order.
speaker_root = "C:\\Users\\HOME\\DS\\0. PROJECTS\\Dialogue conversation with face detection\\speaker"


def _load_speaker_segments(label, segment_indices, root=speaker_root):
    """Open the segment clips of one speaker.

    label: speaker label used in the folder and file names (e.g. "A").
    segment_indices: iterable of segment numbers, in the order to be joined.
    root: folder containing the speaker_<label> sub-folders.
    Returns a list of AudioFileClip objects in the order of segment_indices.
    """
    return [
        AudioFileClip(f"{root}\\speaker_{label}\\speaker_{label}_segment_{number}.wav")
        for number in segment_indices
    ]


# Load the audio files.
# Speaker A's segments are the even numbers 0..16, speaker B's the odd numbers
# 1..17 (nine each); adjust these ranges if the transcript changes.
speaker_a_clips = _load_speaker_segments("A", range(0, 17, 2))
speaker_b_clips = _load_speaker_segments("B", range(1, 18, 2))

# Concatenate the audio files
joined_audio1 = concatenate_audioclips(speaker_a_clips)
joined_audio2 = concatenate_audioclips(speaker_b_clips)

# Export the joined audio to a new file (relative to the current working directory)
joined_audio1.write_audiofile("speaker_A_audio.wav")
joined_audio2.write_audiofile("speaker_B_audio.wav")


MoviePy - Writing audio in speaker_A_audio.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in speaker_B_audio.wav


                                                                                                                       

MoviePy - Done.


