# Install what we need

In [None]:
!pip install gradio
!pip install openai-whisper
!pip install transformers # LM'

In [3]:
# Attach Google Drive to this Colab runtime so its files appear under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Our model

In [5]:
import os
import time
import json

import whisper
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

class Model:
    """Speech-to-label pipeline: Whisper transcription followed by text classification.

    Attributes:
        device:     torch device; CUDA when available, otherwise CPU.
        stt_model:  Whisper "base" checkpoint used for speech-to-text.
        lm:         sequence-classification model loaded from ``<cwd>/output``.
        lm_config:  the ``config.json`` of that model, parsed into a dict.
        tokenizer:  tokenizer named by ``lm_config['_name_or_path']``.
        id2label:   ``lm_config['id2label']``; looked up with *string* class ids.
    """

    def __init__(self) -> None:
        """Pick the device, load Whisper, then load the classifier and tokenizer."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.stt_model = whisper.load_model("base").to(self.device)
        self.lm = None
        self.lm_config = None
        self.tokenizer = None
        self.id2label = None

        self.load_model()

    def load_model(self):
        """Load the classifier, its tokenizer and the label map from ``<cwd>/output``.

        Raises:
            FileNotFoundError: if ``<cwd>/output/config.json`` does not exist.
        """
        path = os.path.join(os.getcwd(), 'output')

        with open(os.path.join(path, 'config.json'), 'r', encoding='utf-8') as f:
            self.lm_config = json.load(f)

        self.lm = AutoModelForSequenceClassification.from_pretrained(path).to(self.device)
        # Inference only: make sure dropout and similar layers are disabled.
        self.lm.eval()
        # The tokenizer is resolved from the base-model name recorded in the config,
        # not from the fine-tuned output directory.
        self.tokenizer = AutoTokenizer.from_pretrained(self.lm_config['_name_or_path'])
        self.id2label = self.lm_config['id2label']

    def stt(self, audio):
        """Transcribe a Korean audio file to text with Whisper.

        Args:
            audio: path to an audio file, or ``None``/empty when nothing was recorded.

        Returns:
            The decoded text; ``""`` when no audio was supplied.
        """
        if not audio:
            return ""

        waveform = whisper.load_audio(audio)
        # pad_or_trim fits the waveform to Whisper's fixed input length
        # (30 s by default), so longer recordings are truncated.
        waveform = whisper.pad_or_trim(waveform)

        mel = whisper.log_mel_spectrogram(waveform).to(self.device)

        options = whisper.DecodingOptions(
            task="transcribe",
            language="ko",
            # DecodingOptions defaults to fp16=True; half precision is only
            # appropriate on GPU, so fall back to fp32 on CPU.
            fp16=self.device.type == "cuda",
        )
        result = whisper.decode(self.stt_model, mel, options)

        return result.text

    def tti(self, text):
        """Classify ``text`` and return the label of the highest-scoring class.

        Args:
            text: the sentence to classify.

        Returns:
            The label string from ``id2label``; ``""`` when ``text`` is empty or
            whitespace-only (nothing to classify).
        """
        if not text or not text.strip():
            return ""

        inputs = self.tokenizer(
            text, padding="max_length", truncation=True, return_tensors="pt"
        ).to(self.device)

        # No gradients are needed for inference; skipping them saves memory and time.
        with torch.no_grad():
            logits = self.lm(**inputs)['logits']

        class_id = logits.argmax(dim=-1)[0].item()
        # id2label comes straight from JSON, so its keys are strings.
        return self.id2label[str(class_id)]

    def sti(self, audio):
        """Run the full pipeline on an audio file and print the elapsed time.

        Args:
            audio: path to an audio file, or ``None``/empty when nothing was recorded.

        Returns:
            ``(text, label)``; ``("", "")`` when no audio was supplied.
        """
        if not audio:
            return self.reset()

        start_time = time.perf_counter()

        text = self.stt(audio)
        label = self.tti(text)

        elapsed = time.perf_counter() - start_time
        print(f"걸린 시간: {elapsed}")
        return text, label

    def reset(self):
        """Return empty ``(text, label)`` values for clearing the UI outputs."""
        return "", ""

# Application

In [6]:
import gradio as gr

# Shut down any Gradio servers still running from earlier executions of this cell.
gr.close_all()

# Load the models once, before the UI is declared; Model creates no Gradio
# components, so it does not need to live inside the Blocks context.
model = Model()

with gr.Blocks() as app:
    audio = gr.Microphone(label="Voice", type="filepath")
    upload_button = gr.Button("Upload recording")
    text = gr.Textbox(label="Text")
    label = gr.Label()

    # Transcribe the recording and classify it in one step.
    upload_button.click(model.sti, inputs=audio, outputs=[text, label])

    # Re-classify when the *user* edits the transcript. `input` is used instead of
    # `change` because `change` also fires when the click handler above writes
    # the transcript, which would run the classifier a second time per upload.
    text.input(model.tti, inputs=text, outputs=label)

if __name__ == "__main__":
    app.launch(share=True, debug=True)


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 140MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://511676d8f0785794ad.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


걸린 시간: 5.575927257537842
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://511676d8f0785794ad.gradio.live
