In [22]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from PIL import Image

In [7]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained ResNet-50, switched to inference mode straight away
# (`.eval()` returns the module itself).
model = models.resnet50(pretrained=True).eval()

# Drop the last child module and chain the remaining ones into a single
# feature extractor that lives on `device`.
modules = list(model.children())
del modules[-1]
model = nn.Sequential(*modules).to(device)

In [8]:
# Image preprocessing applied before the encoder, in order:
# resize to 256, centre-crop to 224, convert to a tensor, then normalise
# each channel with the per-channel mean / std constants below.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]
_preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
preprocess = transforms.Compose(_preprocess_steps)
def get_img_embed(img_path):
    """Return the encoder feature vector for the image stored at ``img_path``.

    The image is opened, converted to RGB, passed through the module-level
    ``preprocess`` pipeline and then through the module-level ``model`` with
    gradient tracking disabled.

    NOTE: ``model`` is looked up when this function is *called*. A later cell
    in this notebook rebinds ``model`` to the caption decoder, so all image
    embeddings have to be computed before that cell runs.

    Args:
        img_path: Anything accepted by ``PIL.Image.open`` (typically a path).

    Returns:
        A 1-D tensor on ``device`` containing the flattened encoder output
        for this single image (no batch axis, no singleton spatial axes).
    """
    # The context manager releases the file handle as soon as the pixels have
    # been copied into the converted RGB image.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert('RGB')

    # unsqueeze(0) adds the batch axis the encoder expects.
    img_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        img_emb = model(img_tensor)

    # squeeze(0) alone would leave the trailing size-1 axes in place;
    # flatten so callers get a plain vector they can stack into (batch, dim).
    return img_emb.flatten()

In [9]:
# Smoke test: embed one sample image and report the length of the result.
img_path = '/Users/deepmalikpalthya/Downloads/Designer.jpeg'
sample_embed = get_img_embed(img_path)
print(len(sample_embed))

2048


In [26]:
class Decoder(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first input step, followed by
    the embedded caption tokens (teacher forcing).

    Args:
        embed_size: Dimension of the word embeddings (and of the LSTM input).
        hidden_size: Dimension of the LSTM hidden state.
        vocab_size: Number of distinct tokens; size of the output logits.
        num_layers: Number of stacked LSTM layers.
        feature_size: Dimension of the incoming image feature when it is not
            already ``embed_size``. Defaults to 2048, the length of the
            embedding printed for the encoder earlier in this notebook.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, feature_size=2048):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Created last so the initialisation of the layers above is unchanged
        # for a given random seed.
        self.feature_proj = nn.Linear(feature_size, embed_size)

    def forward(self, features, captions):
        """Compute next-token logits for a batch of captions.

        Args:
            features: Image features of shape ``(batch, embed_size)`` or
                ``(batch, feature_size)``; any trailing axes are flattened,
                so ``(batch, feature_size, 1, 1)`` is accepted as well.
            captions: Integer token ids of shape ``(batch, T)``.

        Returns:
            Logits of shape ``(batch, T - 1, vocab_size)``; position ``t``
            scores ``captions[:, t + 1]`` given the image and
            ``captions[:, :t + 1]``.
        """
        # The last token is never used as an input: nothing follows it.
        captions = captions[:, :-1]
        embeddings = self.embed(captions)

        # Collapse trailing singleton axes, then map the feature into the
        # embedding space unless it already has that size.
        features = features.flatten(start_dim=1)
        if features.size(1) != self.embed.embedding_dim:
            features = self.feature_proj(features)

        # Sequence fed to the LSTM: [image, c_0, c_1, ..., c_{T-2}].
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        hiddens, _ = self.lstm(embeddings)

        # Skip the image-only state (index 0): the state after reading c_k is
        # the one that must predict c_{k+1}.
        outputs = self.linear(hiddens[:, 1:, :])
        return outputs

In [36]:
# Build `captions`: image filename -> list of raw caption strings.
# Each non-blank line of captions.txt is split at its FIRST comma only, so
# commas inside the caption text are preserved. A non-blank line without any
# comma still raises ValueError, as before.
# NOTE(review): if captions.txt starts with a header row, that row is stored
# like any other entry -- confirm against the actual file.
captions = {}
# Explicit encoding so the file is decoded the same way on every platform.
with open('captions.txt', 'r', encoding='utf-8') as f:
    for line in f:
        record = line.strip()
        if not record:
            continue
        filename, caption = record.split(',', 1)
        captions.setdefault(filename, []).append(caption)

In [37]:
# Tokenise every caption on whitespace, collect the set of distinct words,
# and replace each caption string in `captions` by its list of words.
vocabulary = set()
for filename, caption_list in captions.items():
    tokenized_captions = []
    for caption in caption_list:
        # `captions` is rewritten in place below, so on a second run of this
        # cell the entries are already word lists; keep those as they are
        # instead of failing on `.strip()`.
        if isinstance(caption, str):
            words = caption.strip().split()
        else:
            words = list(caption)
        tokenized_captions.append(words)
        vocabulary.update(words)
    # Only the value of an existing key changes, which is safe while iterating.
    captions[filename] = tokenized_captions

# Sentence boundary markers used when captions are turned into index sequences.
vocabulary.update(('<start>', '<end>'))

In [38]:
# Decoder dimensions; the output size follows the vocabulary built above.
embed_size, hidden_size = 256, 512
vocab_size = len(vocabulary)

# Optimisation settings.
num_epochs, batch_size = 10, 32
learning_rate = 1e-3

# Same device selection as for the encoder: CUDA when available, else CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [39]:
# Word <-> index lookup tables. `vocabulary` is a set of strings, whose
# iteration order can differ between interpreter runs; sorting makes the
# mapping reproducible so saved weights can be matched to words again later.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocabulary))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# One (image embedding, caption index tensor) pair per caption. The image
# embedding is computed once per file and shared by all of its captions.
data_loader = []
# The loop variable must not be called `captions`: that would replace the
# filename -> captions dict with the last image's caption list.
for filename, caption_list in captions.items():
    img_embed = get_img_embed(f'Images/{filename}')
    for caption in caption_list:
        tokens = ['<start>'] + caption + ['<end>']
        caption_idx = [word_to_idx[word] for word in tokens]
        data_loader.append((img_embed, torch.tensor(caption_idx)))

# NOTE: this rebinds `model`, the name get_img_embed reads at call time, so
# every image embedding has to be computed before this line (done above).
model = Decoder(embed_size, hidden_size, vocab_size, num_layers=1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

KeyboardInterrupt: 

In [None]:
# Train the decoder on fixed, consecutive slices of `data_loader`, then save
# its weights to model.pth.

# The slices do not change between epochs, so build them once.
batches = [data_loader[i:i + batch_size] for i in range(0, len(data_loader), batch_size)]

# Captions differ in length, so they cannot be stacked directly. Inputs are
# right-padded with the <end> index (a valid embedding row); targets are
# right-padded with the criterion's ignore_index so padded positions do not
# contribute to the loss.
end_idx = word_to_idx['<end>']
ignore_idx = criterion.ignore_index

for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    epoch_loss = 0.0
    for batch in batches:
        optimizer.zero_grad()

        img_embeds = torch.stack([data[0] for data in batch]).to(device)
        caption_seqs = [data[1] for data in batch]
        caption_batch = nn.utils.rnn.pad_sequence(
            caption_seqs, batch_first=True, padding_value=end_idx
        ).to(device)
        # Targets are the captions shifted left by one token.
        targets = nn.utils.rnn.pad_sequence(
            [seq[1:] for seq in caption_seqs], batch_first=True, padding_value=ignore_idx
        ).to(device)

        outputs = model(img_embeds, caption_batch)

        # Flatten (batch, time) so each position is one classification example.
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean batch loss of the epoch; skipped when there is no data.
    if batches:
        print(f'Loss: {epoch_loss / len(batches)}')

torch.save(model.state_dict(), 'model.pth')