In [None]:
!git clone https://github.com/dame-cell/clip-hindi.git
%cd  clip-hindi
!pip install -r requirements.txt

In [None]:

from huggingface_hub import hf_hub_download
# Download the pretrained checkpoint file into the local "model" folder.
hf_hub_download(
    repo_id="damerajee/clip-hindi",
    filename="model.pt",
    local_dir="model",
)

In [None]:
import torch 

import os

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# The checkpoint is downloaded to the relative folder "model" (local_dir of
# hf_hub_download), so resolve it against the current working directory
# instead of hard-coding a machine-specific absolute path.
MODEL_PATH = os.path.abspath(os.path.join("model", "model.pt"))

In [None]:
from datasets import load_dataset

# Load the "train" split of the cats-vs-dogs image dataset.
ds = load_dataset(path="microsoft/cats_vs_dogs", split="train")

In [None]:
def convert_labels(example):
    """Add a Hindi class name to a record, next to its numeric label.

    Args:
        example: Mapping with a ``'labels'`` entry (0 for cat, 1 for dog).

    Returns:
        The same ``example`` object, with ``'string_labels'`` set to the
        Hindi word for the class, or ``"Unknown"`` for any other label value.
        The numeric ``'labels'`` entry is left untouched.
    """
    hindi_names = {
        0: "बिल्ली",  # "billi" (cat)
        1: "कुत्ता",   # "kutta" (dog)
    }

    class_id = example['labels']
    if class_id in hindi_names:
        hindi_name = hindi_names[class_id]
    else:
        hindi_name = "Unknown"

    example['string_labels'] = hindi_name
    return example

# Attach the Hindi class name to every record (one example at a time).
df = ds.map(convert_labels, batched=False)

# Hold out 40% of the records for validation. The seed is fixed so that the
# split (and therefore the reported validation accuracy) is reproducible.
data = df.train_test_split(test_size=0.4, seed=42)
train_dataset = data['train']
val_dataset = data['test']

In [None]:
from torch.utils.data import Dataset
from torchvision import transforms

class CatDog_Dataset(Dataset):
    """Map-style dataset yielding ``(transformed_image, class_index)`` pairs.

    Wraps an indexable collection of records, each carrying an ``'image'``
    entry (an object exposing ``convert``) and a Hindi ``'string_labels'``
    entry.
    """

    def __init__(self, data):
        """Store the records and build the preprocessing pipeline.

        Args:
            data: Indexable, sized collection of records.
        """
        self.data = data

        # Hindi class name -> integer class index.
        self.label_mapping = {"बिल्ली": 0, "कुत्ता": 1}

        # Resize to 224x224, convert to a tensor, then normalise per channel.
        channel_mean = (0.48145466, 0.4578275, 0.40821073)
        channel_std = (0.26862954, 0.26130258, 0.27577711)
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ])

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the preprocessed image and integer label of record ``idx``."""
        record = self.data[idx]
        rgb_image = record['image'].convert("RGB")
        class_index = self.label_mapping[record['string_labels']]
        return self.transform(rgb_image), class_index


In [None]:
from torch.utils.data import DataLoader

# Batches of 32: the training loader reshuffles every epoch, the validation
# loader keeps the records in their stored order.
train_loader = DataLoader(
    dataset=CatDog_Dataset(train_dataset),
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=CatDog_Dataset(val_dataset),
    batch_size=32,
    shuffle=False,
)


In [None]:

from clip.modeling_clip import CLIPModel

import torch.nn as nn
# Build the CLIP model on the target device and restore the pretrained weights.
model = CLIPModel().to(DEVICE)
# The checkpoint was downloaded from the internet: weights_only=True restricts
# unpickling to tensors and plain containers, so loading it cannot execute
# arbitrary code embedded in the file.
model.load_state_dict(
    torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True)
)


# Wrap the pretrained model with a trainable linear classification head
class CLIPFineTuner(nn.Module):
    """Linear classifier on top of a frozen image encoder.

    Only ``self.classifier`` is trained: the encoder runs under
    ``torch.no_grad()`` and is kept in evaluation mode at all times.

    Args:
        model: Module exposing an ``image_encoder`` callable that maps a batch
            of images to feature vectors of size ``hidden_dim``.
        hidden_dim: Size of the feature vectors produced by the encoder.
        num_classes: Number of output classes (logits per sample).
    """

    def __init__(self, model, hidden_dim, num_classes):
        super().__init__()
        self.model = model
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def train(self, mode=True):
        """Switch the head to ``mode`` while keeping the backbone in eval mode.

        ``nn.Module.train`` recurses into every sub-module, which would enable
        dropout / batch-norm statistic updates inside the frozen backbone and
        make the extracted features noisy and drift during training.
        """
        super().train(mode)
        self.model.eval()
        return self

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for images ``x``."""
        # No gradients are needed for the frozen feature extractor.
        with torch.no_grad():
            features = self.model.image_encoder(x).float()  # Convert to float32
        return self.classifier(features)




In [None]:
# Two target classes: cat (0) and dog (1).
num_classes = 2

# Put a linear head on the 2048-wide image features, then move the whole
# wrapper to the target device.
model_ft = CLIPFineTuner(model=model, hidden_dim=2048, num_classes=num_classes)
model_ft = model_ft.to(DEVICE)


In [None]:
import torch.optim as optim

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
# Adam expects an iterable of parameters, not the module itself. Only the
# linear head is optimised: the encoder runs under torch.no_grad() in
# CLIPFineTuner.forward and therefore never receives gradients.
optimizer = optim.Adam(model_ft.classifier.parameters(), lr=1e-4)

In [None]:
from tqdm.notebook import tqdm

NUM_EPOCHS = 1

for epoch in range(NUM_EPOCHS):
    # ---- Training pass -----------------------------------------------------
    model_ft.train()
    # The backbone is only used as a frozen feature extractor (it is called
    # under torch.no_grad()), so keep it in eval mode: train mode would enable
    # dropout / batch-norm statistic updates inside it.
    model_ft.model.eval()

    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for step, (images, labels) in enumerate(progress, start=1):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model_ft(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()  # Accumulate the per-batch loss
        # Show the live mean loss in the progress bar.
        progress.set_postfix(loss=f"{running_loss / step:.4f}")

    print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {running_loss/len(train_loader):.4f}')

    # ---- Validation pass ---------------------------------------------------
    model_ft.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model_ft(images)
            predicted = outputs.argmax(dim=1)  # Index of the largest logit per sample
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Validation Accuracy: {100 * correct / total:.2f}%')

In [None]:
# Index the row first, then the column: ds['image'][i] would materialise
# (and decode) the entire image column just to pick a single picture.
image = ds[23000]['image']

In [None]:
from torchvision import transforms
import torch

# Class index -> Hindi class name
class_labels = {0: "बिल्ली", 1: "कुत्ता"}

# Same preprocessing as the one used to build the training batches:
# resize, convert to a tensor, normalise per channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])

# Set model to evaluation mode
model_ft.eval()

# No gradients are needed for inference
with torch.no_grad():
    # Force three channels, exactly as CatDog_Dataset does: a grayscale or
    # RGBA picture would otherwise break the 3-channel normalisation.
    # unsqueeze(0) adds the batch dimension.
    preprocessed_image = transform(image.convert("RGB")).unsqueeze(0).to(DEVICE)

    # Perform inference
    outputs = model_ft(preprocessed_image)

    # Index of the largest logit = predicted class
    predicted_class = outputs.argmax(dim=1)

    # Convert logits to probabilities
    probabilities = torch.softmax(outputs, dim=1)

# Get the label for the predicted class
predicted_label = class_labels[predicted_class.item()]

# Print the predicted class label
print(f"Predicted class: {predicted_label}")

# Probabilities for each class
print(f"Class probabilities: {probabilities.cpu().numpy()}")
