In [None]:
!pip install pydub
!apt-get install ffmpeg -y

import os
import librosa
import pandas as pd
from google.colab import drive
from transformers import pipeline
from sklearn.model_selection import train_test_split
from pydub import AudioSegment

# Mount Google Drive so the audio directory below is reachable from Colab.
drive.mount('/content/drive')

# Whisper-medium speech-recognition pipeline; device=-1 selects the CPU.
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=-1)
# Pin decoding to English transcription instead of relying on language
# auto-detection.
# NOTE(review): newer transformers releases prefer passing language/task via
# generate_kwargs; confirm this assignment still takes effect on the installed
# version.
asr_pipeline.model.config.forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe")

audio_dir = "/content/drive/MyDrive/audiototext/audiomp3"

# Convert MP3 files to WAV format (the WAV is written next to the MP3,
# overwriting any previous export with the same stem).
# The listing is sorted because os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(audio_dir)):
    if file.endswith(".mp3"):
        mp3_path = os.path.join(audio_dir, file)
        wav_path = os.path.join(audio_dir, os.path.splitext(file)[0] + ".wav")
        audio = AudioSegment.from_mp3(mp3_path)
        audio.export(wav_path, format="wav")
        print(f"Converted {file} to {wav_path}")

# Filter for WAV files only. Sorting gives train_test_split a stable input
# order, so random_state=42 actually yields the same split on every run.
audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith(".wav"))

# Split files into train and test sets (80% / 20%)
train_files, test_files = train_test_split(audio_files, test_size=0.2, random_state=42)

# Function to transcribe audio files
def transcribe_files(file_list):
    """Transcribe every audio file in *file_list* with the module-level ASR pipeline.

    Args:
        file_list: Iterable of file names, resolved relative to the
            module-level ``audio_dir``.

    Returns:
        A list of ``{"filename": ..., "transcription": ...}`` dicts, one per
        input file, in the same order as *file_list*.
    """
    transcriptions = []
    for audio_file in file_list:
        audio_path = os.path.join(audio_dir, audio_file)
        # Load as a mono float array resampled to 16 kHz; the bare array is
        # passed to the pipeline, which then assumes it is already at the
        # model's sampling rate.
        audio, _ = librosa.load(audio_path, sr=16000)

        # Whisper processes at most 30 seconds per forward pass. Chunking lets
        # the pipeline cover recordings longer than that instead of dropping
        # everything past the first window.
        transcription = asr_pipeline(audio, chunk_length_s=30)["text"]
        transcriptions.append({"filename": audio_file, "transcription": transcription})
        print(f"Processed {audio_file}: {transcription}")
    return transcriptions

# Transcribe training and testing sets
train_transcriptions = transcribe_files(train_files)
test_transcriptions = transcribe_files(test_files)

# Convert transcriptions to DataFrames
df_train = pd.DataFrame(train_transcriptions)
df_test = pd.DataFrame(test_transcriptions)

# Save each split twice: as CSV and as plain "filename: transcription" lines.
output_dir = "/content/drive/MyDrive/audiototext"
for split_name, split_df in (("train", df_train), ("test", df_test)):
    split_df.to_csv(f"{output_dir}/{split_name}_transcriptions.csv", index=False)

    # Explicit UTF-8 so transcriptions with non-ASCII characters are written
    # the same way regardless of the runtime's locale (to_csv already
    # defaults to UTF-8).
    with open(f"{output_dir}/{split_name}_transcriptions.txt", "w", encoding="utf-8") as f:
        for index, row in split_df.iterrows():
            f.write(f"{row['filename']}: {row['transcription']}\n")

print("Training and testing transcriptions saved to train_transcriptions.csv, test_transcriptions.csv, train_transcriptions.txt, and test_transcriptions.txt")
