In [None]:
!pip install yt-dlp
!apt install ffmpeg

import yt_dlp
import subprocess
import os

# Source video whose audio track is downloaded below.
url = "https://youtu.be/6TbZsShiK40"
# Working directory for the downloaded and converted audio files.
output_path = "/content/temporary"
# File name of the converted MP3 inside output_path.
audio_filename = "konandeath.mp3"
# Full path of the converted MP3; the transcription cell loads the audio from here.
audio_file = f"{output_path}/{audio_filename}"

# Ensure the output path exists
os.makedirs(output_path, exist_ok=True)

# Intermediate download target and final MP3 path. Both are defined before the
# try block so the cleanup in `finally` can always refer to webm_file.
webm_file = os.path.join(output_path, 'audio.webm')
mp3_file = os.path.join(output_path, audio_filename)

try:
    # Ask yt-dlp for the best audio format (falling back to the best overall
    # format) of this single video, written to webm_file.
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': webm_file,
        'noplaylist': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
        print(f"Download completed! The file is saved in {webm_file}.")

    # Convert the downloaded audio to MP3 at 128 kbit/s.
    # -y makes ffmpeg overwrite an MP3 left behind by an earlier run; without
    # it a re-run of this cell cannot replace the file and the check at the
    # bottom would report a stale MP3 as if it were fresh.
    result = subprocess.run([
        'ffmpeg', '-y', '-i', webm_file,
        '-b:a', '128k', mp3_file
    ], capture_output=True, text=True)

    if result.returncode == 0:
        print(f"Conversion to MP3 completed! The file is saved in {mp3_file}.")
    else:
        print(f"Conversion failed. Error: {result.stderr}")

except Exception as e:
    # Best effort: report the problem and let the existence check below tell
    # the reader whether a usable MP3 is available.
    print(f"An error occurred: {e}")

finally:
    # Clean up the WebM file, including when the download or conversion raised.
    if os.path.exists(webm_file):
        os.remove(webm_file)

# Verify the MP3 file exists
if os.path.exists(audio_file):
    print(f"MP3 file exists at {audio_file}")
else:
    print(f"MP3 file does not exist at {audio_file}")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
[youtube] Extracting URL: https://youtu.be/6TbZsShiK40
[youtube] 6TbZsShiK40: Downloading webpage
[youtube] 6TbZsShiK40: Downloading ios player API JSON
[youtube] 6TbZsShiK40: Downloading web creator player API JSON
[youtube] 6TbZsShiK40: Downloading player bd3293c9
[youtube] 6TbZsShiK40: Downloading m3u8 information
[info] 6TbZsShiK40: Downloading 1 format(s): 251
[download] Destination: /content/temporary/audio.webm
[download] 100% of    6.00MiB in 00:00:00 at 25.39MiB/s  
Download completed! The file is saved in /content/temporary/audio.webm.
Conversion to MP3 completed! The file is saved in /content/temporary/konandeath.mp3.
MP3 file exists at /content/temporary/konandeath.mp3


In [None]:
!pip install git+https://github.com/m-bain/whisperx.git

import whisperx



Collecting git+https://github.com/m-bain/whisperx.git
  Cloning https://github.com/m-bain/whisperx.git to /tmp/pip-req-build-b8fadgvy
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /tmp/pip-req-build-b8fadgvy
  Resolved https://github.com/m-bain/whisperx.git to commit 58f00339af7dcc9705ef49d97a1f40764b7cf555
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Settings shared by the model-loading, transcription, alignment and
# diarization cells below.
# Model identifier handed to whisperx.load_model.
model_name = "medium"
# Device string passed to model loading, alignment and diarization.
device = "cpu"
# Batch size handed to model.transcribe.
batch_size = 14
# Compute type handed to whisperx.load_model.
compute_type = "int8"

In [None]:
# Load the transcription model once; the transcription cell reuses `model`.
# No language is passed here, so it is detected from the audio at
# transcription time (see the log message printed by this cell).
model = whisperx.load_model(model_name, device, compute_type=compute_type)

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [None]:
# Load the MP3 produced by the download cell and transcribe it.
# NOTE: `result` is rebound by the alignment and speaker-assignment cells
# further down, so after those run it no longer holds this raw transcription.
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)

Detected language: en (0.84) in first 30s of audio...


In [None]:
# Show the raw transcription segments (text with start/end times).
print(result["segments"])

[{'text': " Are you trying to find something? Yes. Just tell me where you've hidden Nagato's corpse.", 'start': 7.568, 'end': 14.804}, {'text': " I was sure that you would show up looking for it sooner or later. I've been waiting for you. Now that you're here, I will stop you. Naruto Uzumaki. You truly believe he's worthy of all this? He is the light. That's why we all can carry flowers of hope. You bare your fangs at me, yet you wear that cloak. You still care for the Akatsuki.", 'start': 44.292, 'end': 72.073}, {'text': " The red clouds on these cloaks symbolize the unending wars that rained blood down upon the hidden rain. Yahiko was the one who founded the Akatsuki, not you. These cloaks are our legacy, not yours. And the Rinnegan is something awakened by Nagato. Again, it doesn't belong to you. They are this village's treasure!", 'start': 72.346, 'end': 91.8}, {'text': " Since you're about to die, I shall enlighten you. You're mistaken on two points. I encouraged and pushed Yahiko

In [None]:
# Display the whole transcription result via the notebook's rich repr.
result

{'segments': [{'text': " Are you trying to find something? Yes. Just tell me where you've hidden Nagato's corpse.",
   'start': 7.568,
   'end': 14.804},
  {'text': " I was sure that you would show up looking for it sooner or later. I've been waiting for you. Now that you're here, I will stop you. Naruto Uzumaki. You truly believe he's worthy of all this? He is the light. That's why we all can carry flowers of hope. You bare your fangs at me, yet you wear that cloak. You still care for the Akatsuki.",
   'start': 44.292,
   'end': 72.073},
  {'text': " The red clouds on these cloaks symbolize the unending wars that rained blood down upon the hidden rain. Yahiko was the one who founded the Akatsuki, not you. These cloaks are our legacy, not yours. And the Rinnegan is something awakened by Nagato. Again, it doesn't belong to you. They are this village's treasure!",
   'start': 72.346,
   'end': 91.8},
  {'text': " Since you're about to die, I shall enlighten you. You're mistaken on two p

In [None]:
# Keep a handle on the raw transcription. `result` is rebound to the aligned
# output at the end of this cell, so on a re-run it is no longer the dict
# returned by model.transcribe; without this guard a second run would look up
# "language" on, and re-align, the already-aligned data.
if "language" in result:
    transcription = result

# Load the alignment model for the detected language, then align the raw
# segments against the audio to obtain word-level timestamps.
model_a, metadata = whisperx.load_align_model(language_code=transcription["language"], device=device)
result = whisperx.align(transcription["segments"], model_a,
                        metadata,
                        audio,
                        device,
                        return_char_alignments=False)

print(result["segments"]) # after alignment

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:03<00:00, 107MB/s] 


[{'start': 7.648, 'end': 9.172, 'text': ' Are you trying to find something?', 'words': [{'word': 'Are', 'start': 7.648, 'end': 7.748, 'score': 0.94}, {'word': 'you', 'start': 7.768, 'end': 7.889, 'score': 0.992}, {'word': 'trying', 'start': 7.929, 'end': 8.189, 'score': 0.863}, {'word': 'to', 'start': 8.229, 'end': 8.31, 'score': 0.832}, {'word': 'find', 'start': 8.41, 'end': 8.67, 'score': 0.716}, {'word': 'something?', 'start': 8.771, 'end': 9.172, 'score': 0.847}]}, {'start': 9.733, 'end': 10.154, 'text': 'Yes.', 'words': [{'word': 'Yes.', 'start': 9.733, 'end': 10.154, 'score': 0.784}]}, {'start': 11.537, 'end': 14.443, 'text': "Just tell me where you've hidden Nagato's corpse.", 'words': [{'word': 'Just', 'start': 11.537, 'end': 11.817, 'score': 0.686}, {'word': 'tell', 'start': 11.918, 'end': 12.218, 'score': 0.83}, {'word': 'me', 'start': 12.258, 'end': 12.379, 'score': 0.666}, {'word': 'where', 'start': 12.399, 'end': 12.699, 'score': 0.584}, {'word': "you've", 'start': 12.739,

In [None]:
# Inspect the aligned segments; each one now carries a per-word 'words' list.
result["segments"]

[{'start': 7.648,
  'end': 9.172,
  'text': ' Are you trying to find something?',
  'words': [{'word': 'Are', 'start': 7.648, 'end': 7.748, 'score': 0.94},
   {'word': 'you', 'start': 7.768, 'end': 7.889, 'score': 0.992},
   {'word': 'trying', 'start': 7.929, 'end': 8.189, 'score': 0.863},
   {'word': 'to', 'start': 8.229, 'end': 8.31, 'score': 0.832},
   {'word': 'find', 'start': 8.41, 'end': 8.67, 'score': 0.716},
   {'word': 'something?', 'start': 8.771, 'end': 9.172, 'score': 0.847}]},
 {'start': 9.733,
  'end': 10.154,
  'text': 'Yes.',
  'words': [{'word': 'Yes.', 'start': 9.733, 'end': 10.154, 'score': 0.784}]},
 {'start': 11.537,
  'end': 14.443,
  'text': "Just tell me where you've hidden Nagato's corpse.",
  'words': [{'word': 'Just', 'start': 11.537, 'end': 11.817, 'score': 0.686},
   {'word': 'tell', 'start': 11.918, 'end': 12.218, 'score': 0.83},
   {'word': 'me', 'start': 12.258, 'end': 12.379, 'score': 0.666},
   {'word': 'where', 'start': 12.399, 'end': 12.699, 'score':

In [None]:
import getpass
import os

# Access tokens must never be committed to a notebook. Read the Hugging Face
# token from the HF_TOKEN environment variable, or prompt for it (the input is
# not echoed and is not stored in the notebook).
# NOTE: the token that used to be hard-coded in this cell has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")

diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token,
                                             device=device)

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

In [None]:
# Optional lower/upper bound on the number of speakers, exposed as Colab form
# fields and passed to the diarization call in the next cell.
# NOTE(review): 0 appears to act as "no bound" - the recorded run still found
# two speakers with both set to 0; confirm against the diarization pipeline.
minspks = 0 #@param {type:"integer"}
maxspks = 0 #@param {type:"integer"}

In [None]:
# The form fields use 0 for "not specified". Translate that to None, the
# pipeline's own "unspecified" value, instead of passing a literal bound of 0
# and relying on the library to treat it as falsy.
diarize_segments = diarize_model(audio,
                                 min_speakers=minspks or None,
                                 max_speakers=maxspks or None)

In [None]:
# Display the diarization table (one row per detected speech turn).
diarize_segments

Unnamed: 0,segment,label,speaker,start,end
0,[ 00:00:07.563 --> 00:00:09.210],A,SPEAKER_01,7.563667,9.210526
1,[ 00:00:09.702 --> 00:00:10.466],B,SPEAKER_00,9.702886,10.466893
2,[ 00:00:11.434 --> 00:00:14.660],C,SPEAKER_00,11.434635,14.660441
3,[ 00:00:44.269 --> 00:00:47.614],D,SPEAKER_01,44.269949,47.614601
4,[ 00:00:47.767 --> 00:00:49.176],E,SPEAKER_01,47.767402,49.176570
...,...,...,...,...,...
72,[ 00:06:12.062 --> 00:06:16.833],BU,SPEAKER_01,372.062818,376.833616
73,[ 00:06:33.981 --> 00:06:35.865],BV,SPEAKER_00,393.981324,395.865874
74,[ 00:06:36.782 --> 00:06:37.224],BW,SPEAKER_00,396.782683,397.224109
75,[ 00:06:37.682 --> 00:06:41.332],BX,SPEAKER_00,397.682513,401.332767


In [None]:
# List the distinct speaker labels found by the diarization step.
diarize_segments.speaker.unique()

array(['SPEAKER_01', 'SPEAKER_00'], dtype=object)

In [None]:
# Attach a speaker label to the aligned words and segments.
# NOTE(review): the frame printed here has 'intersection' and 'union' columns
# that the earlier display of diarize_segments did not, so this call appears
# to modify diarize_segments in place - confirm.
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"])

                              segment label     speaker       start  \
0   [ 00:00:07.563 -->  00:00:09.210]     A  SPEAKER_01    7.563667   
1   [ 00:00:09.702 -->  00:00:10.466]     B  SPEAKER_00    9.702886   
2   [ 00:00:11.434 -->  00:00:14.660]     C  SPEAKER_00   11.434635   
3   [ 00:00:44.269 -->  00:00:47.614]     D  SPEAKER_01   44.269949   
4   [ 00:00:47.767 -->  00:00:49.176]     E  SPEAKER_01   47.767402   
..                                ...   ...         ...         ...   
72  [ 00:06:12.062 -->  00:06:16.833]    BU  SPEAKER_01  372.062818   
73  [ 00:06:33.981 -->  00:06:35.865]    BV  SPEAKER_00  393.981324   
74  [ 00:06:36.782 -->  00:06:37.224]    BW  SPEAKER_00  396.782683   
75  [ 00:06:37.682 -->  00:06:41.332]    BX  SPEAKER_00  397.682513   
76  [ 00:06:41.825 -->  00:06:41.876]    BY  SPEAKER_01  401.825127   

           end  intersection       union  
0     9.210526   -391.646474  393.494333  
1    10.466893   -390.390107  391.355114  
2    14.660441   -

In [None]:
# Display the result after speaker assignment; words and segments now carry a
# 'speaker' key.
result

{'segments': [{'start': 7.648,
   'end': 9.172,
   'text': ' Are you trying to find something?',
   'words': [{'word': 'Are',
     'start': 7.648,
     'end': 7.748,
     'score': 0.94,
     'speaker': 'SPEAKER_01'},
    {'word': 'you',
     'start': 7.768,
     'end': 7.889,
     'score': 0.992,
     'speaker': 'SPEAKER_01'},
    {'word': 'trying',
     'start': 7.929,
     'end': 8.189,
     'score': 0.863,
     'speaker': 'SPEAKER_01'},
    {'word': 'to',
     'start': 8.229,
     'end': 8.31,
     'score': 0.832,
     'speaker': 'SPEAKER_01'},
    {'word': 'find',
     'start': 8.41,
     'end': 8.67,
     'score': 0.716,
     'speaker': 'SPEAKER_01'},
    {'word': 'something?',
     'start': 8.771,
     'end': 9.172,
     'score': 0.847,
     'speaker': 'SPEAKER_01'}],
   'speaker': 'SPEAKER_01'},
  {'start': 9.733,
   'end': 10.154,
   'text': 'Yes.',
   'words': [{'word': 'Yes.',
     'start': 9.733,
     'end': 10.154,
     'score': 0.784,
     'speaker': 'SPEAKER_00'}],
   'sp

In [None]:
def extract_text_by_speaker(transcription):
    """Print every speaker-attributed segment as ``SPEAKER: text``.

    Args:
        transcription: Mapping with an optional ``'segments'`` list. Each
            segment is a mapping that may carry ``'speaker'`` and ``'text'``.

    Returns:
        None. One line per segment is written to stdout; segments without a
        speaker label are skipped.
    """
    for segment in transcription.get('segments', []):
        speaker = segment.get('speaker')
        if not speaker:
            continue

        # Segment text can start with a space (the first sentence of each raw
        # segment does), which used to show up as a double space after the
        # colon. A missing or None text is printed as an empty string.
        text = (segment.get('text') or '').strip()
        print(f"{speaker}: {text}")


In [None]:
# Print the speaker-attributed transcript, one segment per line.
extract_text_by_speaker(result)

SPEAKER_01:  Are you trying to find something?
SPEAKER_00: Yes.
SPEAKER_00: Just tell me where you've hidden Nagato's corpse.
SPEAKER_01:  I was sure that you would show up looking for it sooner or later.
SPEAKER_01: I've been waiting for you.
SPEAKER_01: Now that you're here, I will stop you.
SPEAKER_00: Naruto Uzumaki.
SPEAKER_00: You truly believe he's worthy of all this?
SPEAKER_01: He is the light.
SPEAKER_01: That's why we all can carry flowers of hope.
SPEAKER_00: You bare your fangs at me, yet you wear that cloak.
SPEAKER_00: You still care for the Akatsuki.
SPEAKER_01:  The red clouds on these cloaks symbolize the unending wars that rained blood down upon the hidden rain.
SPEAKER_01: Yahiko was the one who founded the Akatsuki, not you.
SPEAKER_01: These cloaks are our legacy, not yours.
SPEAKER_01: And the Rinnegan is something awakened by Nagato.
SPEAKER_01: Again, it doesn't belong to you.
SPEAKER_01: They are this village's treasure!
SPEAKER_00:  Since you're about to die,

In [None]:
import re
import spacy

# Load the spaCy model; it is used below to split segment text into sentences.
# NOTE(review): no install step for 'en_core_web_sm' is visible in this
# notebook - presumably the environment already provides it; confirm.
nlp = spacy.load('en_core_web_sm')

def clean_transcription(transcription, min_words=3, sentence_splitter=None):
    """Build a ``SPEAKER: sentence`` transcript from speaker-labelled segments.

    Each segment's text is split into sentences. Sentences with fewer than
    ``min_words`` whitespace-separated words are dropped, and trailing
    ``. , ! ?`` characters are removed from the ones that are kept. Every
    sentence inherits the speaker of the segment it came from. No filler-word
    or repetition removal is performed.

    Args:
        transcription: Mapping with an optional ``'segments'`` list. Each
            segment is a mapping that may carry ``'speaker'`` and ``'text'``.
        min_words: Minimum number of words a sentence needs to be kept.
        sentence_splitter: Callable taking a string and returning an object
            whose ``.sents`` iterates over items with a ``.text`` attribute.
            Defaults to the module-level spaCy pipeline ``nlp``.

    Returns:
        The kept sentences joined with newlines, one ``SPEAKER: sentence``
        per line; an empty string when nothing qualifies.
    """
    splitter = nlp if sentence_splitter is None else sentence_splitter

    def filter_text(text):
        # Drop very short sentences (e.g. one-word replies).
        if len(text.split()) < min_words:
            return ''
        # Strip surrounding whitespace and any run of trailing punctuation.
        return re.sub(r'[.,!?]+$', '', text.strip())

    dialogue = []
    for segment in transcription.get('segments', []):
        speaker = segment.get('speaker')
        # A missing or None text is treated as empty instead of crashing.
        text = (segment.get('text') or '').strip()
        if not (speaker and text):
            continue

        for sent in splitter(text).sents:
            sentence = filter_text(sent.text)
            if sentence:
                dialogue.append(f"{speaker}: {sentence}")

    return "\n".join(dialogue)

# Build the cleaned, sentence-level transcript from the speaker-labelled
# result and print it; the DataFrame cell below parses this string.
cleaned_transcription = clean_transcription(result)
print(cleaned_transcription)

SPEAKER_01: Are you trying to find something
SPEAKER_00: Just tell me where you've hidden Nagato's corpse
SPEAKER_01: I was sure that you would show up looking for it sooner or later
SPEAKER_01: I've been waiting for you
SPEAKER_01: Now that you're here, I will stop you
SPEAKER_00: You truly believe he's worthy of all this
SPEAKER_01: He is the light
SPEAKER_01: That's why we all can carry flowers of hope
SPEAKER_00: You bare your fangs at me, yet you wear that cloak
SPEAKER_00: You still care for the Akatsuki
SPEAKER_01: The red clouds on these cloaks symbolize the unending wars that rained blood down upon the hidden rain
SPEAKER_01: Yahiko was the one who founded the Akatsuki, not you
SPEAKER_01: These cloaks are our legacy, not yours
SPEAKER_01: And the Rinnegan is something awakened by Nagato
SPEAKER_01: Again, it doesn't belong to you
SPEAKER_01: They are this village's treasure
SPEAKER_00: Since you're about to die, I shall enlighten you
SPEAKER_00: You're mistaken on two points


In [None]:
import pandas as pd

In [None]:
def parse_text_to_df(text):
    """Turn ``SPEAKER: sentence`` lines into a two-column DataFrame.

    Args:
        text: Newline-separated transcript. A line is used when it starts
            with a run of non-whitespace characters followed by a colon;
            every other line is ignored.

    Returns:
        DataFrame with the columns ``speaker`` and ``text``, one row per
        matching line, in input order.
    """
    line_pattern = r'(\S+):\s*(.*)'

    # Try the pattern on every line and keep the (speaker, text) pairs of
    # the lines that match.
    attempts = (re.match(line_pattern, raw_line) for raw_line in text.strip().split('\n'))
    pairs = [hit.groups() for hit in attempts if hit]

    columns = {
        'speaker': [who for who, _ in pairs],
        'text': [said for _, said in pairs],
    }
    return pd.DataFrame(columns)

# Parse the cleaned transcript string into a speaker/text DataFrame.
df = parse_text_to_df(cleaned_transcription)

# Print the DataFrame (plain-text preview; long text is truncated by pandas).
print(df)

       speaker                                               text
0   SPEAKER_01                   Are you trying to find something
1   SPEAKER_00   Just tell me where you've hidden Nagato's corpse
2   SPEAKER_01  I was sure that you would show up looking for ...
3   SPEAKER_01                          I've been waiting for you
4   SPEAKER_01              Now that you're here, I will stop you
..         ...                                                ...
65  SPEAKER_01     I too believe in Naruto, and it's his turn now
66  SPEAKER_01                 Naruto will be the bridge to peace
67  SPEAKER_01  I am expendable, I am only a mere flower, but ...
68  SPEAKER_00         When Genjutsu is finished, you will be too
69  SPEAKER_00  After I've made you tell me where the Rinnegan is

[70 rows x 2 columns]


In [None]:
# Map the diarization labels to character names. The names were blank in the
# saved notebook, which would leave the speaker column empty and make the
# later `!= 'Tobi'` filter keep every row. The values below are the ones the
# recorded output shows (SPEAKER_01 -> Konan, SPEAKER_00 -> Tobi).
# NOTE(review): presumably the label-to-voice assignment can differ between
# diarization runs - check the transcript before trusting this mapping.
speaker_mapping = {
    'SPEAKER_01': 'Konan',
    'SPEAKER_00': 'Tobi'
}

df['speaker'] = df['speaker'].replace(speaker_mapping)

print(df)

   speaker                                               text
0    Konan                   Are you trying to find something
1     Tobi   Just tell me where you've hidden Nagato's corpse
2    Konan  I was sure that you would show up looking for ...
3    Konan                          I've been waiting for you
4    Konan              Now that you're here, I will stop you
..     ...                                                ...
65   Konan     I too believe in Naruto, and it's his turn now
66   Konan                 Naruto will be the bridge to peace
67   Konan  I am expendable, I am only a mere flower, but ...
68    Tobi         When Genjutsu is finished, you will be too
69    Tobi  After I've made you tell me where the Rinnegan is

[70 rows x 2 columns]


In [None]:
# Speaker whose lines are left out of the final transcript.
speaker_to_remove = 'Tobi'

# Keep every row spoken by anyone else; the original row labels are preserved.
filtered_df = df.loc[df['speaker'].ne(speaker_to_remove)]

# Show what is left.
print(filtered_df)

   speaker                                               text
0    Konan                   Are you trying to find something
2    Konan  I was sure that you would show up looking for ...
3    Konan                          I've been waiting for you
4    Konan              Now that you're here, I will stop you
6    Konan                                    He is the light
7    Konan        That's why we all can carry flowers of hope
10   Konan  The red clouds on these cloaks symbolize the u...
11   Konan  Yahiko was the one who founded the Akatsuki, n...
12   Konan             These cloaks are our legacy, not yours
13   Konan   And the Rinnegan is something awakened by Nagato
14   Konan                    Again, it doesn't belong to you
15   Konan                   They are this village's treasure
22   Konan             I'll take you with me to the afterlife
29   Konan                          Madara, I have a question
30   Konan  Do you want me to tell you why Nagato and I de...
31   Kon

In [None]:
# Keep only the text column and renumber the rows from zero.
text_only = filtered_df.drop(columns=['speaker']).reset_index(drop=True)

# Prefix every line with its 1-based position in the filtered transcript.
newdf = text_only.assign(
    text=[f"{position}. {line}" for position, line in enumerate(text_only['text'], start=1)]
)

print(newdf)

                                                 text
0                 1. Are you trying to find something
1   2. I was sure that you would show up looking f...
2                        3. I've been waiting for you
3            4. Now that you're here, I will stop you
4                                  5. He is the light
5      6. That's why we all can carry flowers of hope
6   7. The red clouds on these cloaks symbolize th...
7   8. Yahiko was the one who founded the Akatsuki...
8           9. These cloaks are our legacy, not yours
9   10. And the Rinnegan is something awakened by ...
10                11. Again, it doesn't belong to you
11               12. They are this village's treasure
12         13. I'll take you with me to the afterlife
13                      14. Madara, I have a question
14  15. Do you want me to tell you why Nagato and ...
15     16. And why instead we put our faith in Naruto
16      17. I realized something after meeting Naruto
17                          

In [None]:
# This cell was a line-for-line copy of the previous one and rebuilt the same
# `newdf` from `filtered_df`. The frame already exists at this point, so it is
# only shown again instead of being recomputed.
print(newdf)

                                                 text
0                 1. Are you trying to find something
1   2. I was sure that you would show up looking f...
2                        3. I've been waiting for you
3            4. Now that you're here, I will stop you
4                                  5. He is the light
5      6. That's why we all can carry flowers of hope
6   7. The red clouds on these cloaks symbolize th...
7   8. Yahiko was the one who founded the Akatsuki...
8           9. These cloaks are our legacy, not yours
9   10. And the Rinnegan is something awakened by ...
10                11. Again, it doesn't belong to you
11               12. They are this village's treasure
12         13. I'll take you with me to the afterlife
13                      14. Madara, I have a question
14  15. Do you want me to tell you why Nagato and ...
15     16. And why instead we put our faith in Naruto
16      17. I realized something after meeting Naruto
17                          

In [None]:
# Define the text file path
direct_text_file_path = 'Konan6Reg.txt'

# Write one numbered line per row. The encoding is fixed to UTF-8 so that
# non-ASCII characters in the transcript are written the same way on every
# platform instead of depending on the locale's default encoding.
with open(direct_text_file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{line}\n" for line in newdf['text'])

print(f"DataFrame saved directly as text file at {direct_text_file_path}")

DataFrame saved directly as text file at Konan6Reg.txt
