In [1]:
import sys

sys.path.append("./wav2lip")
import os

import openai
import requests

from pydub import AudioSegment
from dotenv import load_dotenv
from flask import Flask, request, url_for, Response
from flask_cors import CORS

from wav2lip.video import generate

Using cuda for inference.


In [2]:
# Load variables from a local .env file (python-dotenv) before anything reads
# the environment: Text looks up OPENAI_API_KEY and ElevenLabs looks up
# ELEVENLABS_KEY via os.getenv when they are instantiated.
load_dotenv()


class Video:
    """File locations and public URLs for the video rendered by ``generate``.

    Args:
        final_video_location: Path the rendered video is written to.
        idle_video_location: Path of the source video passed to ``generate``
            as ``face``.
        input_audio_location: Path of the audio track passed to ``generate``
            as ``input_audio``.
        url_prefix: Base URL under which the files are reachable by clients.
            A trailing slash is optional.
    """

    def __init__(self,
                 final_video_location='./static/result_video.mp4',
                 idle_video_location='./static/input_video.mp4',
                 input_audio_location='./static/input_audio.wav',
                 url_prefix="http://localhost:8081/static/"):
        self.final_video_location = final_video_location
        self.idle_video_location = idle_video_location
        self.input_audio_location = input_audio_location
        self.url_prefix = url_prefix

    def store_video(self):
        """Run ``generate`` on the configured inputs and report the outcome.

        Prints a confirmation when ``generate`` returns 0; any other return
        value is printed as well so that a run which did not report success
        is visible in the logs instead of passing silently.
        """
        status = generate(face=self.idle_video_location,
                          input_audio=self.input_audio_location,
                          checkpoint_path='./wav2lip/wav2lip_gan.pth',
                          outfile=self.final_video_location,
                          fps=25.,
                          resize_factor=1,
                          rotate=False,
                          crop=[0, -1, 0, -1])
        if status == 0:
            print('video is generated.')
        else:
            print('video generation did not report success '
                  '(status={!r}).'.format(status))

    def _static_url(self, location):
        """Join ``url_prefix`` with the file name component of ``location``."""
        return self.url_prefix.rstrip('/') + '/' + os.path.basename(location)

    def get_final_video_url(self):
        """Return the URL of the rendered video.

        The file name is taken from ``final_video_location`` so that a custom
        output path is reflected in the URL.
        """
        return self._static_url(self.final_video_location)

    def get_final_audio_url(self):
        """Return the URL of the audio track.

        The file name is taken from ``input_audio_location``.
        """
        return self._static_url(self.input_audio_location)


class Text:
    """Chat-completion wrapper that keeps the running conversation history.

    Args:
        model_name: Model identifier passed to ``openai.ChatCompletion.create``.
        system_setting: Content of the initial ``system`` message.
        user_template: Prefix prepended to every user message.

    Note:
        The default prompts are built with adjacent string literals. A
        backslash continuation *inside* a string literal would pull the next
        line's indentation into the prompt text.
    """

    def __init__(self,
                 model_name="gpt-3.5-turbo",
                 system_setting="You are a helpful assistant "
                                "that translates Chinese to English.",
                 user_template="Translate the following Chinese "
                               "text to English: "):
        # Load your API key from an environment variable
        # or secret management service
        openai.api_key = os.getenv("OPENAI_API_KEY")
        self.chat_history = [
            {"role": "system", "content": system_setting},
        ]
        self.model_name = model_name
        self.system_setting = system_setting
        self.user_template = user_template

    def response(self, user_message):
        """Send ``user_message`` to the model and return the reply text.

        The user turn and the model's reply are appended to ``chat_history``
        only after the API call succeeds, so a failed request does not leave
        an unanswered user message that would be re-sent on the next call.

        Args:
            user_message: Raw user text; ``user_template`` is prepended to it.

        Returns:
            The ``content`` of the first choice in the completion.

        Raises:
            Whatever ``openai.ChatCompletion.create`` raises; the history is
            left unchanged in that case.
        """
        user_entry = {
            "role": "user",
            "content": self.user_template + user_message
        }

        completion = openai.ChatCompletion.create(
            model=self.model_name,
            messages=self.chat_history + [user_entry]
        )

        reply = completion['choices'][0]['message']
        self.chat_history.extend([
            user_entry,
            {"role": reply['role'], "content": reply['content']},
        ])
        return reply['content']


class Speech:
    """Base type for speech back ends: holds a text and a no-op audio hook."""

    def __init__(self, text='Please say something'):
        # Stored on the instance; nothing in this base class reads it back.
        self.text = text

    def store_audio(self):
        """Do nothing and return ``None``; subclasses supply the real work."""
        return None


class ElevenLabs(Speech):
    """Speech back end that posts text to the ElevenLabs text-to-speech API.

    The API key is read from the ``ELEVENLABS_KEY`` environment variable. The
    last path segment of ``api_endpoint`` is presumably the voice id
    (NOTE(review): confirm against the ElevenLabs account in use).
    """

    # (connect, read) timeouts in seconds for the HTTP call. Without a timeout
    # ``requests`` waits indefinitely on a stalled connection.
    request_timeout = (10, 120)

    def __init__(self):
        super().__init__()
        self.api_endpoint = "https://api.elevenlabs.io/v1/text-to-speech/ErXwobaYiN019PkySvjV"
        self.api_key = os.getenv("ELEVENLABS_KEY")

    def store_audio(self, text, audio_location='./static/input_audio.mp3',
                    wav_location='./static/input_audio.wav'):
        """Synthesize ``text`` and store it as MP3 plus a WAV conversion.

        Args:
            text: Text to be spoken.
            audio_location: Path the MP3 returned by the API is written to.
            wav_location: Path the WAV conversion is exported to.

        Raises:
            requests.HTTPError: The API answered with an error status.
            requests.Timeout: The API did not answer within
                ``request_timeout``.
            ValueError: The response body is not ``audio/mpeg``.
        """
        headers = {
                'accept': 'audio/mpeg',
                'xi-api-key': self.api_key,
                'Content-Type': 'application/json'
        }
        data = {
            "text": text,
            "voice_settings": {
                "stability": 0,
                "similarity_boost": 0
            }
        }
        r = requests.post(self.api_endpoint, headers=headers, json=data,
                          timeout=self.request_timeout)
        r.raise_for_status()

        # Explicit check instead of ``assert``: assertions are removed under
        # ``python -O``. Only the media type is compared, so a header carrying
        # parameters (e.g. "audio/mpeg; ...") or different casing still passes,
        # and a missing header yields a clear error rather than a KeyError.
        content_type = r.headers.get("Content-Type", "")
        media_type = content_type.split(";")[0].strip().lower()
        if media_type != "audio/mpeg":
            raise ValueError(
                "unexpected Content-Type from text-to-speech API: "
                "{!r}".format(content_type))

        with open(audio_location, "wb") as file:
            file.write(r.content)

        # convert mp3 to wav
        sound = AudioSegment.from_mp3(audio_location)
        sound.export(wav_location, format="wav")
        print("audio is generated.")

In [3]:
# Sample Chinese input used to exercise the pipeline step by step in the
# following cells (text -> speech -> video).
user_message = "我要睡觉了"
print(user_message)

我要睡觉了


In [4]:
# user message -> bot message
# Text() is used with its default prompts (Chinese-to-English translation).
# The bare `response` on the last line makes the notebook display the reply.
response_generator = Text()
response = response_generator.response(user_message)
response

'"I\'m going to sleep now."'

In [5]:
# bot message -> speech
# With the default arguments this writes ./static/input_audio.mp3 and its
# WAV conversion ./static/input_audio.wav.
speech = ElevenLabs()
speech.store_audio(response)

audio is generated.


In [6]:
# speech + idle video -> active video
# With the default paths, ./static/input_video.mp4 and ./static/input_audio.wav
# are handed to wav2lip's generate(), which is asked to write
# ./static/result_video.mp4.
video_generator = Video()
video_generator.store_video()

Reading video frames...
Number of frames available for inference: 220
(80, 134)
Length of mel chunks: 37


100%|██████████| 3/3 [00:04<00:00,  1.52s/it]


Load checkpoint from: ./wav2lip/wav2lip_gan.pth
Model loaded


100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


video is generated.


In [3]:
# Server
# Flask application object used by the /chat route below. CORS(app) applies
# flask_cors to it, presumably so a browser front end served from another
# origin can call the API (the logs show OPTIONS preflight requests).
app = Flask(__name__)
CORS(app)


@app.route("/chat", methods=['POST'])
def chat():
    """Handle one chat turn: text reply, synthesized speech, rendered video.

    Expects a JSON body of the form ``{"user_message": "<text>"}``.

    Returns:
        ``(body, 200)`` with ``bot_message``, ``audio_source`` and
        ``video_source`` on success, or ``({"error": ...}, 400)`` when the
        body is not a JSON object carrying a non-empty string
        ``user_message``.
    """
    # Validate up front: a missing or non-string message would otherwise
    # reach ``user_template + user_message`` and fail with a TypeError (500).
    payload = request.get_json(silent=True)
    user_message = payload.get('user_message') if isinstance(payload, dict) else None
    if not isinstance(user_message, str) or not user_message.strip():
        return {
            "error": "Request body must be JSON with a non-empty string 'user_message'."
        }, 400
    print(user_message)

    # user message -> bot message
    # A fresh Text() is created per request, so no chat history is carried
    # over between requests.
    print('generating response...')
    response_generator = Text()
    response = response_generator.response(user_message)

    # bot message -> speech
    print('generating speech...')
    speech = ElevenLabs()
    speech.store_audio(response)

    # speech + idle video -> active video
    print('generating video...')
    video_generator = Video()
    video_generator.store_video()

    # NOTE(review): the URLs below are built from Video.url_prefix, which
    # points at port 8081, while this app is started on port 8080. Confirm
    # that the static files really are served from 8081; otherwise build the
    # links with url_for('static', filename=..., _external=True).
    return {
        "bot_message": response,
        "audio_source": video_generator.get_final_audio_url(),
        "video_source": video_generator.get_final_video_url()
    }, 200
    
    
if __name__ == '__main__':
    # Serve the app with Werkzeug's run_simple on localhost:8080. The call
    # blocks until interrupted, so later cells do not run while it is up.
    from werkzeug.serving import run_simple
    run_simple('localhost', 8080, app)

 * Running on http://localhost:8080
Press CTRL+C to quit
127.0.0.1 - - [11/Mar/2023 13:00:36] "OPTIONS /chat HTTP/1.1" 200 -


你好
generating response...
generating speech...
audio is generated.
generating video...
Reading video frames...
Number of frames available for inference: 220
(80, 278)
Length of mel chunks: 80


100%|██████████| 5/5 [00:07<00:00,  1.47s/it]


Load checkpoint from: ./wav2lip/wav2lip_gan.pth
Model loaded


100%|██████████| 1/1 [00:09<00:00,  9.57s/it]
127.0.0.1 - - [11/Mar/2023 13:00:49] "POST /chat HTTP/1.1" 200 -


video is generated.


127.0.0.1 - - [11/Mar/2023 13:01:42] "OPTIONS /chat HTTP/1.1" 200 -


今天是星期六
generating response...
generating speech...
audio is generated.
generating video...
Reading video frames...
Number of frames available for inference: 220
(80, 398)
Length of mel chunks: 116


100%|██████████| 8/8 [00:03<00:00,  2.61it/s]


Load checkpoint from: ./wav2lip/wav2lip_gan.pth
Model loaded


100%|██████████| 1/1 [00:04<00:00,  4.99s/it]
127.0.0.1 - - [11/Mar/2023 13:01:51] "POST /chat HTTP/1.1" 200 -


video is generated.


127.0.0.1 - - [11/Mar/2023 13:02:19] "OPTIONS /chat HTTP/1.1" 200 -


马上去吃午饭
generating response...
generating speech...
audio is generated.
generating video...
Reading video frames...
Number of frames available for inference: 220
(80, 354)
Length of mel chunks: 103


100%|██████████| 7/7 [00:03<00:00,  2.27it/s]


Load checkpoint from: ./wav2lip/wav2lip_gan.pth
Model loaded


100%|██████████| 1/1 [00:04<00:00,  4.91s/it]
127.0.0.1 - - [11/Mar/2023 13:02:29] "POST /chat HTTP/1.1" 200 -


video is generated.
