In [1]:
import torch
import mediapipe as mp
import soundfile as sf
import cv2
import subprocess
import os
import sys
import subprocess

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
    # Facial and Upper  Body Detection - MediaPipe eken krnne
    mp_face_detection = mp.solutions.face_detection

    mp_pose = mp.solutions.pose
    # Initialize MediaPipe Face Detection and Pose
    face_detection = mp_face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5)
    pose = mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5)

In [3]:
def generate_lip_sync(video_path, audio_path, output_path):
    subprocess.run([
        sys.executable, './Wav2Lip/inference.py', '--checkpoint_path', 'checkpoints/wav2lip.pth',
        '--face', video_path, '--audio', audio_path, '--outfile', output_path
    ], check=True)
    print("Lip Sync Done")


In [4]:
#Hand gesture detection
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Process each frame for hand gestures
def detect_hand_gestures(image):
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    return results

In [6]:
def text_to_speech(text, audio_path):
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=text, return_tensors="pt")

    # load xvector containing speaker's voice characteristics from a dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    sf.write("speech2.wav", speech.numpy(), samplerate=16000)


In [7]:
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

In [8]:
def process_pdf(pdf_path, video_path, output_video_path):
    try:
        text = extract_text_from_pdf(pdf_path)
        print("Text extraction completed.")
        
        audio_path = "./train_audio.WAV"
        text_to_speech(text, audio_path)
        print(f"Audio file generated at: {audio_path}")

        if not os.path.exists(audio_path):
            print("Audio file was not generated. Please check the text_to_speech function.")
            return
        
        print("Text to Speech generation completed.")
        generate_lip_sync(video_path, audio_path, output_video_path)
        print("Lip sync process completed.")

        if not os.path.exists(output_video_path):
            print("Output video was not generated. Please check the lip sync process.")
            return

        print("Processing completed successfully. Check the output video.")

    except Exception as e:
        print(f"An error occurred: {e}")


In [9]:
if __name__ == "__main__":
    pdf_path = "./cognita_test_lite.pdf"
    video_path = "./train_vid.mp4"
    output_video_path = "./output_video.mp4"

    process_pdf(pdf_path, video_path, output_video_path)
    print("Processing complete. Check the output video.")

Text extraction completed.
Audio file generated at: ./train_audio.WAV
Audio file was not generated. Please check the text_to_speech function.
Processing complete. Check the output video.


In [1]:
import torch
print(torch.__version__)

2.3.1+cu121
