# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# Upload audio file
# from google.colab import files
# uploaded = files.upload()
# path = next(iter(uploaded))

# Recording to process.
path = "/content/audio.wav"

# Number of distinct speakers the clustering step should produce.
num_speakers = 4

# Spoken language of the recording.
language = 'English'

# Whisper model size.
model_size = 'tiny'

# English recordings use the ".en" model variant, except for the 'large' size.
# NOTE(review): nothing visible in this file reads model_name afterwards.
model_name = (
    model_size
    if (language != 'English' or model_size == 'large')
    else model_size + '.en'
)

# Install required packages
#!pip install -q git+https://github.com/openai/whisper.git > /dev/null
#!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null

import whisper
import subprocess
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
import random
from pydub import AudioSegment
import pyaudio
import os
from datetime import timedelta
import requests
from transformers import pipeline
import numpy as np
from sklearn.cluster import AgglomerativeClustering


# Define a function to generate a random name
def generate_name():
    """Return a random ``"First Last"`` name built from two fixed name pools.

    One given name and one family name are picked independently with
    ``random.choice`` (given name first) and joined by a single space.
    """
    given_pool = (
        "Emma", "Liam", "Ava", "Noah", "Sophia",
        "William", "Isabella", "James", "Mia", "Benjamin",
        "Charlotte", "Oliver", "Amelia", "Evelyn", "Henry",
        "Harper", "Ella", "Alexander", "Abigail", "Michael",
    )
    family_pool = (
        "Smith", "Johnson", "Brown", "Garcia", "Miller",
        "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez",
        "Gonzalez", "Perez", "Taylor", "Anderson", "Wilson",
        "Jackson", "White", "Harris", "Martin", "Thompson",
    )
    # Tuple elements are evaluated left to right, so the given name is drawn first.
    return " ".join((random.choice(given_pool), random.choice(family_pool)))

def speaker_diarization(path, num_speakers, language, model_size):
    """Transcribe a recording, attribute each segment to a speaker, and summarize it.

    The recording is transcribed with Whisper. Every transcript segment is
    embedded with the pretrained ``speechbrain/spkrec-ecapa-voxceleb`` model
    and the embeddings are grouped with agglomerative clustering; cluster
    ``k`` becomes the label ``SPEAKER k+1``. Each label is also given a random
    display name from ``generate_name()``.

    Files written to the current working directory:

    * ``audio.wav`` -- only when ``path`` is not a ``.wav`` file (ffmpeg output).
    * ``transcript.txt`` -- full transcript with a header line per speaker turn.
    * ``SPEAKER <n>.txt`` -- all lines spoken by that label, one per segment.
    * ``<name>.mp3`` -- the first segment spoken by each label.
    * ``summary.txt`` / ``<name>_summary.txt`` -- summaries of the whole
      transcript and of each speaker's lines.
    * ``actions.txt`` / ``<name>_actions.txt`` -- ``str()`` of the list of
      action items One AI extracts from the transcript / speaker summary.

    Args:
        path: Path to the input audio file.
        num_speakers: Number of clusters (speakers) to look for. It is capped
            at the number of transcript segments.
        language: ``'English'`` selects the ``.en`` Whisper variant when one
            exists for ``model_size``; any other value leaves it unchanged.
        model_size: Whisper model name, e.g. ``'tiny'``.

    Raises:
        ValueError: If Whisper returns no segments for the recording.
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
        requests.HTTPError: If One AI answers with an error status.
    """
    # Use the GPU only when one is present so the function also runs on CPU.
    has_cuda = torch.cuda.is_available()
    embedding_model = PretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device=torch.device("cuda" if has_cuda else "cpu"),
    )

    # The wave module below only reads WAV, so convert anything else first.
    if not path.lower().endswith('.wav'):
        subprocess.run(['ffmpeg', '-i', path, 'audio.wav', '-y'], check=True)
        path = 'audio.wav'

    # Honour `language`: prefer the English-only checkpoint when Whisper
    # actually ships one for this size.
    model_name = model_size
    english_variant = model_size + '.en'
    if language == 'English' and english_variant in whisper.available_models():
        model_name = english_variant
    model = whisper.load_model(model_name)

    segments = model.transcribe(path)["segments"]
    if not segments:
        raise ValueError(f"No speech segments were transcribed from {path!r}")

    # Total duration, used to clamp segment ends that run past the file end.
    with contextlib.closing(wave.open(path, 'r')) as wav_file:
        duration = wav_file.getnframes() / float(wav_file.getframerate())

    # One embedding row per segment. The width of 192 is hard-coded here;
    # presumably it is the output size of the embedding model above.
    audio = Audio()
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        clip = Segment(segment["start"], min(duration, segment["end"]))
        waveform, _sample_rate = audio.crop(path, clip)
        embeddings[i] = embedding_model(waveform[None])
    embeddings = np.nan_to_num(embeddings)

    # Clustering needs at least two samples and cannot produce more clusters
    # than there are samples.
    if len(segments) >= 2:
        n_clusters = max(1, min(num_speakers, len(segments)))
        cluster_ids = AgglomerativeClustering(n_clusters).fit(embeddings).labels_
    else:
        cluster_ids = np.zeros(len(segments), dtype=int)
    for segment, cluster_id in zip(segments, cluster_ids):
        segment["speaker"] = 'SPEAKER ' + str(cluster_id + 1)

    # Build the transcript, and collect per-speaker data in the same pass:
    # the first segment of each speaker (for the audio sample) and every line
    # they spoke (accumulated across all of their turns).
    transcript_parts = []
    first_segments = {}
    dialogues = {}
    previous_speaker = None
    for segment in segments:
        speaker = segment["speaker"]
        # lstrip() removes the leading space when the segment text has one,
        # without eating a real character when it does not.
        spoken = segment["text"].lstrip()
        if speaker != previous_speaker:
            transcript_parts.append(
                "\n" + speaker + ' ' + str(_format_timestamp(segment["start"])) + '\n'
            )
            previous_speaker = speaker
        transcript_parts.append(spoken + ' ')
        first_segments.setdefault(speaker, (segment["start"], segment["end"]))
        dialogues.setdefault(speaker, []).append(spoken)

    transcript = "".join(transcript_parts)
    with open("transcript.txt", "w", encoding='utf-8') as f:
        f.write(transcript)

    # Give every speaker label a display name and export their first segment.
    full_audio = AudioSegment.from_wav(path)  # decode once, slice per speaker
    speaker_names = {}
    for speaker, (start_time, end_time) in first_segments.items():
        name = generate_name()
        if name in speaker_names.values():
            # Random names can repeat; disambiguate so files are not overwritten.
            name = f"{name} ({speaker})"
        speaker_names[speaker] = name
        sample = full_audio[int(start_time * 1000):int(end_time * 1000)]
        sample.export(name + ".mp3", format="mp3")

    # Save dialogues for each speaker to separate files
    for speaker, speaker_dialogues in dialogues.items():
        with open(f"{speaker}.txt", "w", encoding='utf-8') as f:
            f.write("\n".join(speaker_dialogues))

    # Summarize the whole transcript.
    hf_name = 'pszemraj/led-large-book-summary'
    summarizer = pipeline(
        "summarization",
        hf_name,
        device=0 if has_cuda else -1,
        encoder_no_repeat_ngram_size=3,
    )
    summary = summarizer(transcript, max_length=150)  # You can adjust max_length as needed
    with open("summary.txt", "w", encoding='utf-8') as f:
        f.write(summary[0]["summary_text"])

    # Summarize each speaker's own lines.
    speaker_summaries = {}
    for speaker, name in speaker_names.items():
        speaker_text = "\n".join(dialogues[speaker])
        if not speaker_text.strip():
            continue  # nothing to summarize for this speaker
        speaker_summary = summarizer(speaker_text)[0]["summary_text"]
        speaker_summaries[name] = speaker_summary
        with open(f"{name}_summary.txt", "w", encoding='utf-8') as f:
            f.write(speaker_summary)

    # One AI API call.
    # NOTE(security): the fallback key below is committed in source. Rotate it
    # and supply the key through the ONEAI_API_KEY environment variable.
    api_key = os.environ.get("ONEAI_API_KEY", "f4e72987-82dd-44d4-8d57-bea926956674")
    url = "https://api.oneai.com/api/v0/pipeline"
    headers = {
        "api-key": api_key,
        "content-type": "application/json"
    }

    # Action items for the main transcript.
    with open("actions.txt", "w", encoding='utf-8') as f:
        f.write(str(_fetch_action_items(transcript, url, headers)))

    # Action items for each speaker's summary.
    for name, speaker_summary in speaker_summaries.items():
        speaker_actions = _fetch_action_items(speaker_summary, url, headers)
        with open(f"{name}_actions.txt", "w", encoding='utf-8') as f:
            f.write(str(speaker_actions))


def _format_timestamp(secs):
    """Return ``secs`` rounded to whole seconds as a ``timedelta``."""
    return timedelta(seconds=round(secs))


def _fetch_action_items(text, url, headers):
    """POST ``text`` to the One AI pipeline and return its action-item values.

    Raises ``requests.HTTPError`` on an error status; a response without the
    expected ``output[0]['labels']`` structure raises ``KeyError``/``IndexError``.
    """
    payload = {
        "input": text,
        "input_type": "article",
        "output_type": "json",
        "multilingual": {
            "enabled": True
        },
        "steps": [
            {
                "skill": "action-items"
            }
        ],
    }
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()
    labels = response.json()['output'][0]['labels']
    return [label['value'] for label in labels if label['type'] == 'action-item']

# Example usage: run the pipeline with the settings defined at the top of the file.
speaker_diarization(
    path=path,
    num_speakers=num_speakers,
    language=language,
    model_size=model_size,
)
