In [1]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import requests
import re
from torchvision import models
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

In [2]:
# Dataset class for loading image and text data
class ImageTextDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.transform = transform
        self.label_encoder = LabelEncoder()
        self.data_frame['encoded_units'] = self.label_encoder.fit_transform(self.data_frame['entity_value'].apply(self.extract_unit))
    
    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_url = self.data_frame.iloc[idx, 0]
        entity_value = self.data_frame.iloc[idx, 3]
        image = Image.open(requests.get(img_url, stream=True).raw)

        if self.transform:
            image = self.transform(image)

        # Use regex to extract the numeric part of entity_value
        match = re.search(r'\d+(\.\d+)?', entity_value)  # Match float or integer numbers
        if match:
            value = float(match.group())  # Extract the numeric part as a float
        else:
            raise ValueError(f"No numeric value found in entity_value: {entity_value}")

        unit = self.extract_unit(entity_value)
        encoded_unit = self.label_encoder.transform([unit])[0]

        # Return image, numeric value, and encoded unit
        return image, torch.tensor(value, dtype=torch.float32), torch.tensor(encoded_unit, dtype=torch.long)

    @staticmethod
    def extract_unit(entity_value):
        # Extract unit from entity_value (improve as needed)
        match = re.search(r'([a-zA-Z]+)', entity_value)
        return match.group() if match else ''

In [3]:
# Define image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [4]:
# Load dataset
train_dataset = ImageTextDataset(csv_file='training_data.csv', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [5]:
# Create a simple ResNet model with separate branches for value and unit prediction
class ResNetWithUnits(torch.nn.Module):
    def __init__(self):
        super(ResNetWithUnits, self).__init__()
        self.resnet = models.resnet18(pretrained=True)
        self.resnet.fc = torch.nn.Linear(self.resnet.fc.in_features, 512)  # Intermediate layer
        
        self.value_head = torch.nn.Linear(512, 1)
        self.unit_head = torch.nn.Linear(512, len(train_dataset.label_encoder.classes_))

    def forward(self, x):
        features = self.resnet(x)
        value = self.value_head(features)
        unit = self.unit_head(features)
        return value, unit

model = ResNetWithUnits()



In [6]:
# Training the model (simplified)
criterion_value = torch.nn.MSELoss()  # Mean Squared Error for regression
criterion_unit = torch.nn.CrossEntropyLoss()  # Cross-Entropy Loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [7]:
# Training loop
for epoch in range(10):  # Example: Train for 10 epochs
    for images, labels_value, labels_unit in train_loader:
        optimizer.zero_grad()

        outputs_value, outputs_unit = model(images)
        loss_value = criterion_value(outputs_value.squeeze(), labels_value)
        loss_unit = criterion_unit(outputs_unit, labels_unit)
        loss = loss_value + loss_unit
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 36930.13671875
Epoch 2, Loss: 7418.09912109375
Epoch 3, Loss: 640889.75
Epoch 4, Loss: 7154.5400390625
Epoch 5, Loss: 120255.4765625
Epoch 6, Loss: 67426.6484375
Epoch 7, Loss: 75930.3359375
Epoch 8, Loss: 32710.63671875
Epoch 9, Loss: 61815.08984375
Epoch 10, Loss: 14663.91015625


In [12]:
torch.save(model.state_dict(), 'trained_model.pth')

In [14]:
model.eval()  # Set model to evaluation mode
image = Image.open('images/10.jpg')
image = transform(image).unsqueeze(0)  # Apply transformations and add batch dimension
with torch.no_grad():
    predicted_value, predicted_unit = model(image)
    unit = train_dataset.label_encoder.inverse_transform([predicted_unit.argmax(dim=1).item()])[0]
    print(f'Predicted Value: {predicted_value.item()}, Unit: {unit}')

Predicted Value: 111.89685821533203, Unit: gram
