In [None]:
!pip install --upgrade transformers

In [None]:
!pip install numpy==1.23.5

In [None]:
import torch

In [None]:
import kagglehub

# Fetch the latest published version of the dataset and keep the returned location
path = kagglehub.dataset_download(
    "pkdarabi/the-drug-name-detection-dataset"
)

print(f"Path to dataset files: {path}")

In [None]:
# Dataset root: reuse the location returned by `kagglehub.dataset_download`
# instead of a hard-coded absolute cache path, so the notebook keeps working
# when the cache lives elsewhere or a newer dataset version is downloaded.
data_dir = path

In [None]:
from torchvision import datasets, transforms
# Base preprocessing: fixed 224x224 size, tensor conversion, then per-channel
# normalisation with mean 0.5 / std 0.5
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
])

In [None]:
# Read the images under `data_dir` as an ImageFolder, applying the base transform
dataset = datasets.ImageFolder(root=data_dir, transform=transform)

In [None]:
# Split dataset: 70% train, 20% validation, remainder test
train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size
# A seeded generator makes the partition identical after every kernel restart,
# so the reported accuracies are comparable across runs.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
from torch.utils.data import DataLoader

# One loader per split; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Function to visualize a few images from the dataset
def show_sample_images(loader, classes, num_images=6):
    """Plot the first images of one batch from `loader`, titled with class names.

    Args:
        loader: iterable yielding `(images, labels)` batches; images are
            channels-first tensors normalised with mean 0.5 / std 0.5.
        classes: sequence mapping a label index to its class name.
        num_images: maximum number of images to draw (default 6).
    """
    images, labels = next(iter(loader))
    # A batch may hold fewer samples than requested (small dataset / last batch).
    count = min(num_images, len(images))
    if count == 0:
        return

    # squeeze=False keeps `axes` two-dimensional even when count == 1.
    fig, axes = plt.subplots(1, count, figsize=(15, 5), squeeze=False)
    for ax, image, label in zip(axes[0], images[:count], labels[:count]):
        img = image.permute(1, 2, 0).numpy() * 0.5 + 0.5  # Denormalize
        # Clip so floating-point error cannot leave the [0, 1] range imshow expects.
        ax.imshow(np.clip(img, 0.0, 1.0))
        ax.set_title(classes[int(label)])
        ax.axis('off')
    plt.show()

# Visualize samples
show_sample_images(train_loader, dataset.classes)


In [None]:
# Augmentation pipeline used for training only
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# The train/val/test subsets produced by `random_split` all wrap the same
# `dataset` object, so assigning to `train_dataset.dataset.transform` would
# also apply the random augmentations to validation and test images.
# Instead, point only the training subset at its own ImageFolder over the same
# directory; the subset's indices are reused against it, and `train_loader`
# (which holds this subset object) picks the change up.
train_dataset.dataset = datasets.ImageFolder(data_dir, transform=train_transform)
assert len(train_dataset.dataset) == len(dataset), "augmented view must index the same files"

In [None]:
from collections import Counter

# Count samples per label index, then print each count next to its class name
class_counts = Counter(label for _path, label in dataset.samples)
for cls, count in class_counts.items():
    class_name = dataset.classes[cls]
    print(f"{class_name}: {count}")


In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import numpy
from transformers import CLIPProcessor, CLIPModel
import torch

In [None]:
# Step 2: fine-tune the CLIP model
from torch.optim import AdamW
from transformers import CLIPProcessor, CLIPModel
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import torch.nn as nn

# Pre-trained CLIP checkpoint: the preprocessor and the model come from the same name
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Nothing is frozen here: the optimizer below receives every parameter of the model.

# Fine-tuning settings
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
clip_model = clip_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=clip_model.parameters(), lr=1e-4)

In [None]:
# De-normalize images for CLIPProcessor
def denormalize(images, mean, std):
    """Invert a per-channel `(x - mean) / std` normalisation.

    Args:
        images: tensor with the channel axis second (e.g. batch, channel, H, W).
        mean: per-channel means, one value per channel.
        std: per-channel standard deviations, one value per channel.

    Returns:
        `images * std + mean`, broadcast over the channel axis.
    """
    # view(1, -1, 1, 1) derives the channel count from `mean`/`std`, so the
    # helper is not limited to 3-channel images.
    mean = torch.tensor(mean, device=images.device).view(1, -1, 1, 1)
    std = torch.tensor(std, device=images.device).view(1, -1, 1, 1)
    return images * std + mean

epochs = 10

# Training loop
# NOTE(review): cross-entropy is applied directly to the output of
# `get_image_features`, i.e. the embedding dimensions are used as class logits
# and no classification head is defined in this notebook. This presumably relies
# on the embedding width being at least the number of classes -- confirm this is
# intended, or add an explicit linear head.
for epoch in range(epochs):
    clip_model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        # De-normalize images back to [0, 1]; clamp as the evaluation cell does
        images = denormalize(images, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        images = torch.clamp(images, min=0.0, max=1.0)

        # Values are already in [0, 1], so the processor must not rescale them again
        inputs = clip_processor(images=images, return_tensors="pt", do_rescale=False).to(device)
        labels = labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        image_features = clip_model.get_image_features(pixel_values=inputs["pixel_values"])
        loss = criterion(image_features, labels)

        # Backward pass
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean loss over the epoch rather than only the final batch's loss
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {running_loss / max(len(train_loader), 1):.4f}")

In [None]:
from sklearn.metrics import accuracy_score
import torch

# Switch the model to evaluation mode before scoring the test split
clip_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        # Undo the dataset normalisation and keep pixel values inside [0, 1]
        images = denormalize(images, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        images = images.clamp(min=0.0, max=1.0)

        # Hand channels-last CPU tensors to the processor; rescaling is disabled
        # because the values are already in [0, 1]
        inputs = clip_processor(
            images=images.permute(0, 2, 3, 1).cpu(),
            return_tensors="pt",
            do_rescale=False,
        )
        pixel_values = inputs["pixel_values"].to(device)

        # The index of the largest image-feature component is taken as the class
        image_features = clip_model.get_image_features(pixel_values=pixel_values)
        preds = image_features.argmax(dim=-1)

        # Collect predictions and ground-truth labels batch by batch
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# Overall accuracy on the test split
accuracy = accuracy_score(all_labels, all_preds)
print(f"Image Recognition Accuracy: {accuracy * 100:.2f}%")



In [None]:
from TTS.api import TTS

# Tacotron2 voice model; the GPU is used whenever CUDA is available
tts = TTS(
    model_name="tts_models/en/ljspeech/tacotron2-DDC",
    gpu=torch.cuda.is_available(),
)

def generate_voice_output(medicine_name, output_path="medicine_output.wav"):
    """Synthesise `medicine_name` with the module-level `tts` and write it to `output_path`."""
    tts.tts_to_file(text=medicine_name, file_path=output_path)
    print(f"Voice output saved as {output_path}")

# Example usage
generate_voice_output("Paracetamol")


In [None]:
import numpy as np

# Example ratings, one list per listener
listener_scores = dict(
    listener_1=[5, 4, 4],
    listener_2=[4, 4, 5],
    listener_3=[5, 5, 4],
)

# Mean Opinion Score: average each listener's ratings, then average those means
mos_score = np.mean(list(map(np.mean, listener_scores.values())))
print(f"Mean Opinion Score (MOS): {mos_score:.1f}")

In [None]:
pip install gTTS

In [None]:
!pip install TTS

In [None]:
# Resnet

import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet-50
class ResNetClassifier(nn.Module):
    """ResNet-50 backbone whose final fully connected layer is replaced by a
    `num_classes`-way linear classifier."""

    def __init__(self, num_classes):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; the explicit weights
        # enum selects the same ImageNet checkpoint.
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, num_classes)

    def forward(self, x):
        """Return the backbone's output for the image batch `x`."""
        return self.backbone(x)

# Initialize ResNet model
# Use the dataset object to get the number of classes
resnet_model = ResNetClassifier(num_classes=len(dataset.classes)).to(device)
resnet_optimizer = torch.optim.Adam(resnet_model.parameters(), lr=1e-4)
resnet_criterion = nn.CrossEntropyLoss()

In [None]:
# Training loop for ResNet
epochs = 10
for epoch in range(epochs):
    resnet_model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        resnet_optimizer.zero_grad()
        outputs = resnet_model(images)
        loss = resnet_criterion(outputs, labels)

        # Backward pass
        loss.backward()
        resnet_optimizer.step()
        running_loss += loss.item()

    # Report the epoch's mean loss (as the ViT loop does) instead of the last
    # batch's loss, which is a noisy single-batch estimate.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss / max(len(train_loader), 1):.4f}")

In [None]:
from sklearn.metrics import accuracy_score

# Score the fine-tuned ResNet on the test split
resnet_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # The predicted class is the index of the largest output
        outputs = resnet_model(images)
        preds = outputs.argmax(dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy_resnet = accuracy_score(all_labels, all_preds)
print(f"ResNet Image Recognition Accuracy: {accuracy_resnet * 100:.2f}%")


In [None]:
from gtts import gTTS
import os

def basic_tts(medicine_name, output_path="basic_tts_output.mp3"):
    """Speak `medicine_name` with gTTS (English) and save the audio to `output_path`.

    gTTS writes MP3 data, so the default filename uses the `.mp3` extension; a
    `.wav` name would produce a mislabelled file.
    """
    speech = gTTS(text=medicine_name, lang='en')
    speech.save(output_path)
    print(f"Basic TTS Voice Output Saved: {output_path}")

# Example usage
basic_tts("Paracetamol")

In [None]:
import numpy as np

# Example ratings, one list per listener
listener_scores_resnet = dict(
    listener_1=[3, 4, 3],
    listener_2=[4, 4, 3],
    listener_3=[3, 4, 4],
)

# MOS: average each listener's ratings, then average those means
mos_resnet = np.mean(list(map(np.mean, listener_scores_resnet.values())))
print(f"Mean Opinion Score (MOS) for ResNet + Basic TTS: {mos_resnet:.1f}")


In [None]:
# Baseline SOTA
!pip install timm

In [None]:
!pip install numpy==1.23.5

In [None]:
import torch.nn as nn
from torch.optim import Adam
from torchvision import transforms, datasets
from transformers import ViTForImageClassification, ViTImageProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize Vision Transformer (ViT)
model_name = "google/vit-base-patch16-224-in21k"
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
vit_model = ViTForImageClassification.from_pretrained(model_name, num_labels=len(dataset.classes)).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(vit_model.parameters(), lr=1e-4)

# Training Loop
epochs = 10
for epoch in range(epochs):
    vit_model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        labels = labels.to(device)

        # The loader yields tensors normalised to [-1, 1]. Map them back to
        # [0, 1] exactly as the evaluation cell does; clamping the normalised
        # values directly would zero every pixel darker than mid-grey.
        images = torch.clamp((images + 1) / 2, 0, 1)

        # Process channels-last images with the feature extractor. The values
        # are already in [0, 1], hence do_rescale=False. The images stay on the
        # CPU until the processor has run.
        inputs = feature_extractor(images=images.permute(0, 2, 3, 1), return_tensors="pt", do_rescale=False).to(device)
        optimizer.zero_grad()

        # Forward pass
        outputs = vit_model(**inputs)
        loss = criterion(outputs.logits, labels)

        # Backward pass
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {running_loss/len(train_loader):.4f}")



In [None]:
from sklearn.metrics import accuracy_score

vit_model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for images, labels in test_loader:
        # Rescale images to the expected range [0, 1]
        images = torch.clamp((images + 1) / 2, 0, 1)  # Convert from [-1, 1] to [0, 1]

        # Convert images for the ViT feature extractor. The values are already
        # in [0, 1], so rescaling must be disabled; otherwise the processor
        # would apply its own rescale on top and shrink the inputs a second time.
        inputs = feature_extractor(
            images=images.permute(0, 2, 3, 1),
            return_tensors="pt",
            do_rescale=False,
        ).to(device)

        # Pass through the ViT model
        outputs = vit_model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)

        # Store predictions and labels (labels never leave the CPU)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"ViT Image Recognition Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Generate Voice Output with gTTS
def generate_voice_output_gtts(text, output_path="output.mp3"):
    """Convert `text` to English speech with gTTS and save it to `output_path`."""
    gTTS(text=text, lang="en").save(output_path)
    print(f"Voice output saved at {output_path}")

# Example Usage
medicine_name = "Paracetamol"
generate_voice_output_gtts(
    f"The medicine name is {medicine_name}.",
    output_path="medicine_name.mp3",
)


In [None]:
import numpy as np

# Example ratings, one list per listener
listener_scores = dict(
    listener_1=[4, 5, 4, 4],
    listener_2=[5, 4, 4, 5],
    listener_3=[4, 4, 5, 4],
)

# MOS: average each listener's ratings, then average those means
mos_score = np.mean(list(map(np.mean, listener_scores.values())))
print(f"Mean Opinion Score (MOS) for ViT: {mos_score:.1f}")


In [None]:
import matplotlib.pyplot as plt

models = ["CLIP + Tacotron2", "ResNet + Basic TTS", "Baseline (SOTA)"]
accuracy = [72.13, 56.28, 67.76]  # Replace with your accuracy values

plt.figure(figsize=(8, 6))
plt.bar(models, accuracy, color=['blue', 'orange', 'green'])
plt.ylabel("Image Recognition Accuracy (%)")
plt.title("Accuracy Comparison Across Models")
plt.ylim(0, 100)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()


In [None]:
mos_scores = [4.4, 3.8, 4.3]  # Replace with your MOS values

plt.figure(figsize=(8, 6))
plt.bar(models, mos_scores, color=['blue', 'orange', 'green'])
plt.ylabel("Mean Opinion Score (MOS)")
plt.title("MOS Score Comparison Across Models")
plt.ylim(0, 5)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
