In [None]:
# Step 1 : Setup Your Environment
!pip install torch torchvision transformers torchaudio pillow matplotlib

In [None]:
!pip install numpy --upgrade

In [None]:
!pip install --upgrade torch torchvision torchaudio transformers

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Step 2: Load Required Libraries
import torch
from PIL import Image

In [None]:
pip install numpy==1.23.5

In [None]:
from transformers import CLIPProcessor, CLIPModel

In [None]:
import torchaudio
import torchaudio.transforms as T

In [None]:
# The model and its processor are loaded from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [None]:
from torchaudio.models.tacotron2 import Tacotron2

In [None]:
pip install transformers

In [None]:
pip install TTS

In [None]:
pip install numpy==1.23.5

In [None]:
from TTS.api import TTS
import torch

# Text-to-speech engine shared by the voice-output helpers; the GPU flag
# simply mirrors whether CUDA is available to torch.
tts = TTS(
    model_name="tts_models/en/ljspeech/tacotron2-DDC",
    gpu=torch.cuda.is_available(),
)

def generate_voice_output_hf(medicine_name):
    """Synthesize ``medicine_name`` to a fixed WAV file and report the file name.

    Args:
        medicine_name (str): Text handed to the module-level ``tts`` engine.
    """
    wav_path = "medicine_output.wav"
    # The destination is the same path on every call.
    tts.tts_to_file(medicine_name, file_path=wav_path)
    print(f"Voice output saved as '{wav_path}'")

In [None]:
from PIL import Image

In [None]:
 # Step 3: Dataset Preparation
 def preprocess_image(image_path):
     """Load an image file and run it through the CLIP processor.

     Args:
         image_path: Location of the image, passed straight to ``Image.open``.

     Returns:
         Whatever ``clip_processor(images=..., return_tensors="pt")`` returns
         for the RGB-converted image.
     """
     # The context manager closes the underlying file as soon as the pixels
     # have been converted; convert() hands back a separate image object, so
     # the result stays usable after the file is closed.
     with Image.open(image_path) as source_image:
         rgb_image = source_image.convert("RGB")
     return clip_processor(images=rgb_image, return_tensors="pt")

In [None]:
# Step 4: Encode Text Prompts

# Candidate labels, followed by one caption per label in the same order.
medicine_names = [
    "Paracetamol",
    "Ibuprofen",
    "Antazol",
    "Zimacal",
]
text_prompts = ["A photo of " + name for name in medicine_names]

def encode_text(prompts):
    """Encode text prompts into CLIP text features.

    Args:
        prompts: A single prompt string or an iterable of prompt strings.

    Returns:
        The result of ``clip_model.get_text_features`` for the tokenized
        prompts.

    Raises:
        ValueError: If no prompt is supplied.
    """
    # Normalize to a list so a lone string is treated as one prompt and
    # one-shot iterables can still be checked for emptiness.
    if isinstance(prompts, str):
        prompts = [prompts]
    else:
        prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one text prompt")

    # Encode the caller's prompts, not the module-level ``text_prompts``.
    text_inputs = clip_processor(text=prompts, return_tensors="pt", padding=True)
    return clip_model.get_text_features(**text_inputs)

In [None]:
# Step 5: Image Recognition
def recognize_medicine(image_path, text_features):
    """Return the entry of ``medicine_names`` that best matches the image.

    Args:
        image_path: Image location forwarded to ``preprocess_image``.
        text_features: 2-D tensor of text embeddings. The winning index is
            used directly as an index into ``medicine_names``, so the rows
            are assumed to follow that list's order (verify against caller).

    Returns:
        The element of ``medicine_names`` with the highest cosine similarity
        to the image embedding.
    """
    image_inputs = preprocess_image(image_path)

    # Pure inference: no_grad avoids building an autograd graph that is
    # never used, since only the argmax index leaves this function.
    with torch.no_grad():
        image_features = clip_model.get_image_features(**image_inputs)

        # Unit-normalize both sides so the dot product is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        similarity = torch.matmul(image_features, text_features.T)

    predicted_index = similarity.argmax().item()
    return medicine_names[predicted_index]

In [None]:
# Step 6: Generate Voice Output
import os
# Folder that receives the generated .wav files; created here if it is missing.
output_dir = "medicine_audio_outputs"
os.makedirs(name=output_dir, exist_ok=True)

def generate_voice_output(medicine_name):
    """
    Generate speech audio for a given medicine name and save it as a .wav file.

    Args:
        medicine_name (str): The name of the medicine to convert to speech.
            It is also used as the file name inside ``output_dir``.

    Returns:
        str: Path of the .wav file that was written.

    Raises:
        ValueError: If ``medicine_name`` is empty or only whitespace.
    """
    if not medicine_name or not str(medicine_name).strip():
        raise ValueError("medicine_name must be a non-empty string")

    # Re-check the folder on every call so the function still works if the
    # directory was removed after the setup cell ran.
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"{medicine_name}.wav")
    tts.tts_to_file(text=medicine_name, file_path=output_path)
    print(f"Voice output for '{medicine_name}' saved to {output_path}")
    return output_path

In [None]:
# Step 7: Complete Pipeline

def pipeline(image_path):
    """Recognize the medicine shown in an image and speak its name to a file.

    Args:
        image_path: Location of the image to classify.

    Returns:
        The recognized medicine name.
    """
    # Encode text prompts
    text_features = encode_text(text_prompts)

    # Recognize medicine
    medicine_name = recognize_medicine(image_path, text_features)
    print(f"Recognized Medicine: {medicine_name}")

    # Generate voice output. generate_voice_output writes the .wav file
    # itself and reports its location; it does not return a waveform, so
    # there is nothing further to save here.
    generate_voice_output(medicine_name)

    return medicine_name

In [None]:
if __name__ == "__main__":
    # Sample image to run through the pipeline.
    # NOTE(review): the path looks environment-specific; adjust it when running elsewhere.
    image_path = "/content/medicine.jpg"
    pipeline(image_path=image_path)