In [None]:
%%writefile diarization.py

import whisperx
import gc
import pandas as pd
import base64

def diarize(base64_audio, number):
  """Transcribe, align and speaker-label a base64-encoded audio clip with WhisperX.

  The decoded audio is written to a uniquely named temporary file (so
  overlapping calls cannot overwrite each other's audio), loaded through
  ``whisperx.load_audio`` and the file is removed again before any model runs.

  Args:
    base64_audio: Base64-encoded contents of the audio file.
    number: Speaker count; passed as both ``min_speakers`` and
      ``max_speakers`` to the diarization pipeline.

  Returns:
    The value returned by ``whisperx.assign_word_speakers`` for the aligned
    transcript.

  Raises:
    binascii.Error: If ``base64_audio`` is incorrectly padded base64.
  """
  import os
  import tempfile

  device = "cuda"
  batch_size = 4
  compute_type = "float16"

  audio_bytes = base64.b64decode(base64_audio)

  # A fixed "output.wav" in the working directory is shared by every request;
  # use a private temp file instead and always clean it up.
  fd, wav_path = tempfile.mkstemp(suffix=".wav")
  try:
    with os.fdopen(fd, "wb") as wav_file:
      wav_file.write(audio_bytes)
    # The file is closed (flushed) before the loader reads it.
    audio = whisperx.load_audio(wav_path)
  finally:
    os.remove(wav_path)

  # 1. Transcribe.
  model = whisperx.load_model("large-v2", device, compute_type=compute_type)
  transcription = model.transcribe(audio, batch_size=batch_size)
  # Drop our reference to the ASR model before loading the next one to keep
  # peak memory use down.
  del model
  gc.collect()

  # 2. Align the transcript for the detected language.
  model_a, metadata = whisperx.load_align_model(
      language_code=transcription["language"], device=device)
  aligned = whisperx.align(
      transcription["segments"], model_a, metadata, audio, device,
      return_char_alignments=False)

  # 3. Diarize and attach speaker labels.
  # Prefer a token from the environment; the literal placeholder is kept as
  # the fallback so existing setups behave as before.
  hf_token = os.environ.get("HF_TOKEN", "HF_TOKEN")
  diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
  diarize_segments = diarize_model(audio, min_speakers=number, max_speakers=number)

  return whisperx.assign_word_speakers(diarize_segments, aligned)

In [None]:
%%writefile summarization.py

import os

import google.generativeai as genai

# Take the Gemini API key from the GOOGLE_API_KEY environment variable when it
# is set, so the secret does not have to live in the notebook; the literal
# placeholder remains the fallback.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "API_KEY"))

# Instruction block sent ahead of the user's text. The parts are joined with
# newlines: bare adjacent string literals would be concatenated with no
# separator at all ("...TASK IS TO:1. READ ... CAREFULLY.2. SUMMARIZE ...").
SYSTEM_PROMPT = "\n".join((
    "YOU ARE A SUMMARIZATION ASSISTANT. YOUR TASK IS TO:",
    "",
    "1. READ THE PROVIDED TEXT CAREFULLY.",
    "2. SUMMARIZE IT INTO A MAXIMUM OF 200 WORDS.",
    "3. IF YOU THINK THAT THE INPUT TEXT IS TOO SMALL FOR SUMMARIZATION DONT OUTPUT UNNECESSARY TEXT OR WARNINGS OR ERRORS RATHER JUST OUTPUT THE ORIGINAL TEXT, WITHOUT TAMPERING",
    "",
    "WRITE THE SUMMARY IN AN ABSTRACT STYLE, ENSURING THE CORE MESSAGE AND KEY IDEAS ARE RETAINED BUT WITHOUT COPYING ANY LINES VERBATIM FROM THE INPUT TEXT.",
    "OUTPUT RULES:",
    "",
    "1. RETURN ONLY THE SUMMARY.",
    "2. DO NOT INCLUDE ANY INTRODUCTIONS, EXPLANATIONS, OR FORMATTING IN YOUR RESPONSE.",
    "3. THE OUTPUT MUST CONSIST ONLY OF THE ABSTRACT SUMMARY.",
))

def summarize(text):
    """Summarize ``text`` with the Gemini model and return the cleaned reply.

    Args:
        text: The text to summarize.

    Returns:
        The model's reply with runs of spaces collapsed to a single space and
        surrounding whitespace removed. An empty string is returned, without
        calling the model, when ``text`` is empty or only whitespace.
    """
    import re

    # Nothing to summarize: skip the API round trip instead of asking the
    # model to summarize an empty prompt.
    if not text or not text.strip():
        return ""

    prompt = f"""### System Prompt:
    {SYSTEM_PROMPT}

    ### User Prompt:
    Text to be SUMMARIZED: {text}

    ### Assistant Response:"""

    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    # Collapse every run of spaces; a single replace("  ", " ") pass only
    # halves longer runs.
    return re.sub(r" {2,}", " ", response.text).strip()

In [None]:
!pip install whisperx pyngrok flask

In [None]:
from pyngrok import ngrok
from flask import Flask, request
from diarization import diarize
from summarization import summarize

In [None]:
import os  # needed only for the environment lookup below

# Local port the Flask app listens on and the ngrok tunnel forwards to.
port_no = 5000
app = Flask(__name__)
# Take the ngrok auth token from the NGROK_AUTHTOKEN environment variable when
# it is set, so the secret does not have to live in the notebook; the literal
# placeholder remains the fallback.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN", "NGROK_AUTH_TOKEN"))
# Placeholder for the public URL; not referenced by the other visible cells.
public_url = "NGROK_PUBLIC_URL"

@app.route("/generate", methods=["POST"])
def generate():
  """Diarize the posted audio and summarize what each speaker said.

  Expects a JSON body with:
    audio:  base64-encoded audio, forwarded to ``diarize``.
    number: positive integer speaker count.

  Returns:
    ``({"d_data": <diarize() result>, "summary": {label: summary}}, 201)``
    with one ``SPEAKER_NN`` entry for each index in ``range(number)``; a
    speaker with no attributed text gets an empty summary. A malformed request
    yields ``({"error": ...}, 400)``.
  """
  payload = request.get_json(silent=True)
  if not isinstance(payload, dict) or "audio" not in payload or "number" not in payload:
    return {"error": "JSON body with 'audio' and 'number' is required"}, 400

  base64_audio = payload["audio"]
  number = payload["number"]
  # bool is a subclass of int, so reject it explicitly.
  if isinstance(number, bool) or not isinstance(number, int) or number < 1:
    return {"error": "'number' must be a positive integer"}, 400

  diarized_data = diarize(base64_audio, number)

  # Single pass over the segments, grouping text by speaker label. Segments
  # without a "speaker" key are skipped rather than raising KeyError.
  text_by_speaker = {}
  for segment in diarized_data["segments"]:
    speaker = segment.get("speaker")
    if speaker is not None:
      text_by_speaker[speaker] = text_by_speaker.get(speaker, "") + " " + segment["text"]

  sum_dict = {}
  for i in range(number):
    # Labels are zero-padded to two digits: SPEAKER_00, SPEAKER_01, ...
    label = f"SPEAKER_{i:02d}"
    con_text = text_by_speaker.get(label, "")
    # Do not send an empty prompt to the summarizer for a silent speaker.
    sum_dict[label] = summarize(con_text) if con_text.strip() else ""

  return {"d_data": diarized_data, "summary": sum_dict}, 201

# Open the ngrok tunnel that forwards to the local Flask port.
ngrok_tunnel = ngrok.connect(port_no, bind_tls=True, hostname="NGROK_HOSTNAME")
if __name__ == '__main__':
  print(f"Ngrok tunnel: {ngrok_tunnel.public_url}")
  # debug stays off: this server is bound to 0.0.0.0 and published through the
  # tunnel, and Flask's debug mode enables the interactive debugger, which
  # must not be reachable from untrusted networks.
  app.run(host="0.0.0.0", debug=False, port=port_no, use_reloader=False)