In [1]:
# Install necessary libraries
# This cell uses apt-get, so it assumes a Debian/Ubuntu-based runtime.

# System packages: PortAudio development files and the distribution's PyAudio build
!apt-get update
!apt-get install portaudio19-dev python3-pyaudio
# Python packages for speech recognition, audio conversion and microphone capture
# NOTE(review): `wave` is imported later as a standard-library module; confirm
# that installing a PyPI package with the same name is really intended here.
!pip install SpeechRecognition pydub pyaudio wave keyboard
!pip install sounddevice soundfile
# Notebook widgets
!pip install ipywidgets
# Hugging Face Transformers, used for the GPT-2 model and tokenizer
!pip install transformers


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Ign:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [2,858 kB]
Get:14 http://archive.u

In [2]:
# Import the necessary libraries
import speech_recognition as sr
import json
import pyaudio
import wave
import keyboard
from pydub import AudioSegment
from datetime import datetime
import hashlib
import sounddevice as sd
from google.colab import output
from IPython.display import display, Javascript
import base64
import io
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import os

In [4]:
def load_config(json_file):
    """
    Load the pipeline configuration from a JSON file.

    The file is opened explicitly as UTF-8 so that non-ASCII values (for
    example a Spanish prompt template) are decoded the same way regardless of
    the platform's default locale encoding.

    Parameters:
    json_file (str): Path to the JSON file containing configuration.

    Returns:
    dict: Configuration dictionary (the parsed top-level JSON value).

    Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)

def speech_recognition_to_text(audio_file_path, language):
    """
    Convert audio to text using SpeechRecognition.

    The input is first converted to WAV with pydub, written to a private
    temporary file, transcribed through the Google Web Speech API, and the
    temporary file is removed again whether or not transcription succeeds.

    Parameters:
    audio_file_path (str): Path to the audio file.
    language (str): Language code passed to the recognizer (e.g. "es-ES").

    Returns:
    str: Recognized text from the audio.

    Raises:
    Any exception raised by pydub while decoding or by the recognizer while
    transcribing is propagated to the caller unchanged.
    """
    # Imported locally because the notebook's import cell does not import tempfile.
    import tempfile

    recognizer = sr.Recognizer()

    # A unique temporary path avoids clobbering (or leaving behind) a shared
    # "temp.wav" in the working directory.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # export() hands back the output file handle; close it so the data is
        # flushed before the recognizer reads the file.
        exported = AudioSegment.from_file(audio_file_path).export(wav_path, format="wav")
        exported.close()

        # Use the converted file as the audio source
        with sr.AudioFile(wav_path) as source:
            audio = recognizer.record(source)
    finally:
        os.remove(wav_path)

    # Recognize speech using Google Web Speech API
    return recognizer.recognize_google(audio, language=language)

def whisper_to_text(audio_file_path):
    """
    Placeholder for converting audio to text using Whisper.

    The Whisper backend is not implemented yet. Instead of silently returning
    None (which only fails later, far from the cause, when the missing
    transcription is used as a string), the placeholder fails immediately.

    Parameters:
    audio_file_path (str): Path to the audio file.

    Raises:
    NotImplementedError: Always, until Whisper support is added.
    """
    raise NotImplementedError("Whisper transcription is not implemented yet")

def audio_to_text(audio_file_path, config):
    """
    Convert audio to text based on the specified configuration.

    Parameters:
    audio_file_path (str): Path to the audio file.
    config (dict): Configuration dictionary specifying the model to use.

    Returns:
    str: Recognized text from the audio.
    """
    if config['model'] == 'SpeechRecognition':
        language = config['language']
        return speech_recognition_to_text(audio_file_path, language)
    elif config['model'] == 'Whisper':
        return whisper_to_text(audio_file_path)
    else:
        raise ValueError("Unsupported model specified in the config file")

def generate_audio_filename():
    """
    Generate a unique filename for a new audio recording.

    The name is the MD5 hex digest of the current timestamp (microsecond
    resolution) combined with random bytes from os.urandom. Hashing a
    second-resolution timestamp alone would give the same name to every
    recording made within one second; the random salt prevents that. The
    digest serves only as an identifier, not for security.

    Returns:
    str: Generated filename of the form "audio-<32 hex characters>.wav".
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    salt = os.urandom(8).hex()
    digest = hashlib.md5(f"{timestamp}-{salt}".encode()).hexdigest()
    return f"audio-{digest}.wav"

def record_audio():
    """
    Record audio from the microphone and save it as a WAV file with a unique filename.
    Recording will continue until the user presses 'q' to stop.

    The stream is opened at 44100 Hz, mono, on input device index 0; the list
    of available devices is printed first so the index can be adjusted.

    Returns:
    str: Path to the saved audio file.
    """
    # Imported locally: the notebook's import cell binds neither `np` nor `sf`,
    # so without these lines the function fails with NameError after recording.
    import numpy as np
    import soundfile as sf

    filename = generate_audio_filename()
    samplerate = 44100  # Hertz
    channels = 1

    # List all available audio devices
    print(sd.query_devices())

    # Choose the appropriate input device (you may need to change the index)
    input_device_index = 0  # Change this to the correct index for your input device

    print('Recording. Press "q" to stop recording.')
    recording = []

    def callback(indata, frames, time, status):
        # Copy the block: the buffer passed in is only valid during the callback.
        recording.append(indata.copy())
        if keyboard.is_pressed('q'):
            raise sd.CallbackAbort

    with sd.InputStream(samplerate=samplerate, channels=channels, callback=callback,
                        device=input_device_index) as stream:
        # CallbackAbort raised inside the callback stops the stream but is not
        # re-raised in this thread, so poll the stream state instead of
        # sleeping for a fixed, very long time.
        while stream.active:
            sd.sleep(100)

    print('Finished recording')

    recording = np.concatenate(recording)
    sf.write(filename, recording, samplerate)

    return filename



In [5]:
# JavaScript to record audio
#
# Executed in the browser. On first run it appends "Start Recording" and
# "Stop Recording" buttons to the page (skipped when an element with id
# 'startButton' already exists). Clicking Start requests microphone access,
# records with MediaRecorder until Stop is clicked, converts the recorded blob
# to a data URL, and passes that string to the kernel callback registered as
# 'notebook.uploadAudio'.
#
# The `sleep` helper is defined but not used by this snippet.
#
# NOTE(review): the blob is labelled 'audio/wav', but MediaRecorder is created
# without an explicit mimeType, so the actual container is whatever the browser
# chooses - confirm that downstream decoding does not rely on the WAV label.
RECORD_AUDIO_JS = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(reader.result)
  reader.readAsDataURL(blob)
})

var record = () => new Promise(resolve => {
  navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
    const recorder = new MediaRecorder(stream)
    const data = []
    recorder.ondataavailable = event => data.push(event.data)
    recorder.start()
    const stop = () => {
      recorder.stop()
      stream.getAudioTracks()[0].stop()
    }
    const startButton = document.getElementById('startButton')
    const stopButton = document.getElementById('stopButton')
    startButton.disabled = true
    stopButton.disabled = false
    stopButton.onclick = () => {
      stop()
      startButton.disabled = false
      stopButton.disabled = true
    }
    recorder.onstop = async () => {
      const audioBlob = new Blob(data, { type: 'audio/wav' })
      const text = await b2text(audioBlob)
      resolve(text)
    }
  })
})

if (!document.getElementById('startButton')) {
  const startButton = document.createElement('button')
  startButton.id = 'startButton'
  startButton.textContent = 'Start Recording'
  document.body.appendChild(startButton)
  const stopButton = document.createElement('button')
  stopButton.id = 'stopButton'
  stopButton.textContent = 'Stop Recording'
  stopButton.disabled = true
  document.body.appendChild(stopButton)
  startButton.onclick = async () => {
    const data = await record()
    google.colab.kernel.invokeFunction('notebook.uploadAudio', [data], {})
  }
}
"""

def generate_audio_filename():
    """
    Generate a unique filename for a new audio recording.

    The name is the MD5 hex digest of the current timestamp (microsecond
    resolution) combined with random bytes from os.urandom. Hashing a
    second-resolution timestamp alone would give the same name to every
    recording made within one second; the random salt prevents that. The
    digest serves only as an identifier, not for security.

    Returns:
    str: Generated filename of the form "audio-<32 hex characters>.wav".
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    salt = os.urandom(8).hex()
    digest = hashlib.md5(f"{timestamp}-{salt}".encode()).hexdigest()
    return f"audio-{digest}.wav"

def upload_audio(b64_audio):
    """
    Decode a base64-encoded recording and save it under a generated filename.

    This is the kernel-side callback invoked by the recording JavaScript with
    the recorded audio encoded as a data URL.

    Parameters:
    b64_audio (str): The audio as a data URL ("<header>,<base64 payload>").
        A bare base64 string without the header is accepted as well.

    Returns:
    None. The chosen filename is printed.
    """
    # Keep only what follows the first comma (the data-URL header); when there
    # is no comma the whole string is treated as the payload instead of
    # failing with IndexError.
    payload = b64_audio.split(',', 1)[-1]
    audio_data = base64.b64decode(payload)

    filename = generate_audio_filename()
    with open(filename, "wb") as f:
        f.write(audio_data)
    print(f"Recording saved as {filename}")



In [17]:
def load_gpt2_model(model_name='gpt2'):
    """
    Load a GPT-2 model and its tokenizer.

    Parameters:
    model_name (str): Name or path of the pretrained checkpoint to load.
        Defaults to 'gpt2', the checkpoint this function always used before
        the parameter existed.

    Returns:
    model: The GPT-2 model.
    tokenizer: The GPT-2 tokenizer.
    """
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    return model, tokenizer

def generate_text_gpt2(prompt, reserved_new_tokens=64):
    """
    Generate text based on a given prompt using GPT-2.

    The prompt is truncated so that at least `reserved_new_tokens` positions
    of the model's context window stay free; generation then fills whatever
    room remains. Without that reserve, a prompt that fills the window leaves
    a budget of zero new tokens.

    Parameters:
    prompt (str): The input text prompt.
    reserved_new_tokens (int): Minimum number of context positions kept free
        for generated tokens. Clamped to the range [1, context window - 1].
        Default is 64.

    Returns:
    str: The decoded first output sequence.
        NOTE(review): with Hugging Face causal language models the output
        sequence normally starts with the prompt tokens - confirm callers
        expect the prompt to be part of the returned text.
    """
    model, tokenizer = load_gpt2_model()

    context_window = model.config.n_positions  # GPT-2 max position embeddings
    reserved = max(1, min(int(reserved_new_tokens), context_window - 1))

    # Truncate at tokenization time so the prompt can never use up the room
    # reserved for generation.
    inputs = tokenizer.encode(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=context_window - reserved,
    )
    max_new_tokens = context_window - inputs.shape[1]

    # Check if pad_token_id is defined, otherwise use eos_token_id
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    # A single unpadded prompt has no padding positions, so every token is
    # attended to. Comparing against pad_token_id would wrongly mask genuine
    # end-of-text tokens whenever the pad id falls back to the EOS id.
    attention_mask = inputs.new_ones(inputs.shape)

    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def generate_text_gpt4(prompt, max_length=150):
    """
    Placeholder for generating text using GPT-4.

    The GPT-4 backend is not implemented yet. Instead of silently returning
    None (which would later be handled as if it were report text), the
    placeholder fails immediately.

    Parameters:
    prompt (str): The input text prompt.
    max_length (int): The maximum length of the generated text. Default is 150.

    Raises:
    NotImplementedError: Always, until GPT-4 support is added.
    """
    raise NotImplementedError("GPT-4 text generation is not implemented yet")

def adapt_transcribed_text_to_report(transcribed_text, config):
    """
    Turn transcribed text into a radiology report using the configured LLM.

    The prompt is built from config['prompt_template'] by substituting the
    transcription for the marker "[Insert the transcribed text here]", printed,
    and then passed to the generator selected by config['llm'].

    Parameters:
    transcribed_text (str): The transcribed text.
    config (dict): Configuration dictionary with the keys 'prompt_template'
        and 'llm' ('gpt2' or 'gpt4').

    Returns:
    str: The adapted report text.

    Raises:
    ValueError: If config['llm'] names an unsupported model.
    """
    template = config['prompt_template']
    prompt = template.replace("[Insert the transcribed text here]", transcribed_text)
    print("Prompt:\n", prompt)

    llm_name = config['llm']

    if llm_name == 'gpt2':
        return generate_text_gpt2(prompt)

    if llm_name == 'gpt4':
        return generate_text_gpt4(prompt)

    raise ValueError("Unsupported LLM specified in the config file")

def save_text_to_file(text, base_filename, suffix):
    """
    Save text to a file with a specific suffix, preserving the base filename.

    The file is written as UTF-8 so accented and other non-ASCII characters
    are stored the same way regardless of the platform's default encoding.

    Parameters:
    text (str): The text to save.
    base_filename (str): The base filename to use for the saved file.
    suffix (str): The suffix to add to the base filename for the saved file.

    Returns:
    str: The path to the saved file, "<base_filename>_<suffix>.txt".
    """
    filename = f"{base_filename}_{suffix}.txt"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(text)
    return filename

In [6]:
def main(json_config_path, audio_file_path=None, record_new_audio=False):
    """
    Run the full pipeline: load configuration, optionally record audio,
    transcribe it, adapt the transcription into a report, and save both texts.

    Parameters:
    json_config_path (str): Path to the JSON configuration file.
    audio_file_path (str): Path, including filename, to the audio file to
        transcribe. Required unless record_new_audio is True.
    record_new_audio (bool): Whether to record new audio with record_audio()
        before transcribing. When True, the freshly recorded file (whose name
        record_audio() generates) is used instead of audio_file_path.
        Default is False.

    Returns:
    int: Always 1.

    Raises:
    ValueError: If no audio file is given and no recording is requested.
    """
    if not record_new_audio and audio_file_path is None:
        raise ValueError("audio_file_path is required when record_new_audio is False")

    # Load the configuration before recording so a broken config fails fast.
    config = load_config(json_config_path)

    if record_new_audio:
        audio_file_path = record_audio()

    transcribed_text = audio_to_text(audio_file_path, config)
    print("You said:", transcribed_text)

    # Generate the report once and reuse it for both display and saving;
    # running the language model a second time only repeats the work.
    adapted_report = adapt_transcribed_text_to_report(transcribed_text, config)
    print("Adapted Report:\n", adapted_report)

    # Both output files sit next to the audio file and share its base name.
    base_filename = os.path.splitext(audio_file_path)[0]
    transcribed_text_filename = save_text_to_file(transcribed_text, base_filename, "transcription")
    adapted_report_filename = save_text_to_file(adapted_report, base_filename, "report")

    print(f"Transcription saved as: {transcribed_text_filename}")
    print(f"Adapted report saved as: {adapted_report_filename}")

    return 1

In [7]:
# Recording audio
# Register upload_audio under the name the JavaScript invokes
# ('notebook.uploadAudio'), then run the recorder script, which adds the
# Start/Stop buttons and sends the finished recording to that callback.
output.register_callback('notebook.uploadAudio', upload_audio)
display(Javascript(RECORD_AUDIO_JS))

<IPython.core.display.Javascript object>

Recording saved as audio-085b8f83d6ea36f35c8f985a22595cbc.wav


In [20]:
# Example usage
# Save your configuration in a JSON file, e.g., config.json. The pipeline reads
# all four keys below ("language" is the recognizer language, here Spanish).
# JSON does not allow comments, so do not put any in the file itself:
# {
#     "model": "SpeechRecognition",
#     "language": "es-ES",
#     "llm": "gpt2",
#     "prompt_template": "... [Insert the transcribed text here] ..."
# }
json_config_path = "/content/config.json"
audio_file_path = "/content/audio-085b8f83d6ea36f35c8f985a22595cbc.wav"

# Transcribe the existing recording above and adapt it into a report
main(json_config_path, audio_file_path)

# To process a different existing audio file:
# audio_file_path = "path/to/your/audio_file.wav"
# main(json_config_path, audio_file_path)

You said: informe de resonancia magnética de la columna lumbar paciente Juan Pérez fecha de nacimiento 15 de marzo de 1975 fecha del Estudio 23 de julio de 2024 tipo de estudios resonancia magnética RM de la columna lumbar indicación clínica dolor lumbar persistente y radiculopatía los hallazgos son vértebras vértebras lumbares presentan Arturo en relación normal no se observan fracturas ni lesiones óseas focales discos intervertebrales l1 L2 disco Arturo y señales normal L2 l3 mínima disminución de la altura del disco con leve de producción discal sin compromiso significativo del Canal espinal o forámenes neurales l3 l4 producción discal central y para medial derecha con contacto leve a la raíz nerviosa l4 derecha l4 l5 producción discal difusa que contacte de forma el saco tocalli a la izquierda que comprime las likes nerviosa s1 izquierda conclusión el Mariscal para mediar izquierda l5 s1 con comprensión de la raíz nerviosa s1 izquierda recomendaciones se ensucie se sugiere correlac

1