In [1]:
import os
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F


In [47]:
train_source = ['2004', '2006', '2008', '2010', '2012', '2014']
train_source = ['2004']
test_source = ['2015', '2017', '2018']

image_folder = '../spectrograms/2006'
matrix_folder = '../midi-processed-values/2006'


In [156]:
image_list = []
for filename in sorted(os.listdir(image_folder)):
  if filename.endswith('.png'):
    image = Image.open(os.path.join(image_folder, filename)).convert('RGB')

    image = np.array(image) / 255.0
    image = np.transpose(image, (2, 0, 1))
    image_list.append(image)
images = np.array(image_list)
print(images.shape)

(115, 3, 500, 1400)


In [149]:
matrices = []
for filename in sorted(os.listdir(matrix_folder)):
  if filename.endswith('.npy'):
    matrix = np.load(os.path.join(matrix_folder, filename))
    print(matrix.shape)
    matrices.append(matrix)

(66776, 17)
(13312, 17)
(30746, 17)
(16225, 17)
(15936, 17)
(24620, 17)
(10186, 17)
(33688, 17)
(26961, 17)
(37508, 17)
(26769, 17)
(33357, 17)
(40172, 17)
(20053, 17)
(32254, 17)
(33127, 17)
(54311, 17)
(18405, 17)
(26059, 17)
(45673, 17)
(39918, 17)
(17423, 17)
(40226, 17)
(29467, 17)
(19603, 17)
(66648, 17)
(51879, 17)
(25576, 17)
(21057, 17)
(47320, 17)
(43979, 17)
(23441, 17)
(13470, 17)
(9675, 17)
(26129, 17)
(23468, 17)
(17429, 17)
(56193, 17)
(27997, 17)
(12352, 17)
(38157, 17)
(15934, 17)
(32702, 17)
(30592, 17)
(32716, 17)
(28887, 17)
(10428, 17)
(28188, 17)
(49863, 17)
(63988, 17)
(36844, 17)
(10978, 17)
(40465, 17)
(20371, 17)
(68524, 17)
(23315, 17)
(3535, 17)
(10146, 17)
(5500, 17)
(16231, 17)
(12534, 17)
(8396, 17)
(16567, 17)
(37227, 17)
(7566, 17)
(25329, 17)
(17932, 17)
(71282, 17)
(27858, 17)
(15241, 17)
(22619, 17)
(24817, 17)
(15258, 17)
(35043, 17)
(12778, 17)
(58672, 17)
(33069, 17)
(15094, 17)
(34176, 17)
(17036, 17)
(27930, 17)
(14222, 17)
(24252, 17)
(9033, 17

In [169]:
for i, matrix in enumerate(matrices[:5]):  # Check first 5 matrices
    print(f"Matrix {i} shape: {matrix.shape}")


Matrix 0 shape: torch.Size([32254, 17])
Matrix 1 shape: torch.Size([32254, 17])
Matrix 2 shape: torch.Size([32254, 17])
Matrix 3 shape: torch.Size([32254, 17])
Matrix 4 shape: torch.Size([32254, 17])


In [158]:
assert len(images) == len(matrices)

In [159]:
X_train, X_temp, y_train, y_temp = train_test_split(images, matrices, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [83]:
# Defining the data loader
class CustomDataset(Dataset):
    def __init__(self, images, matrices, max_rows=None):
        self.images = images
        self.matrices = matrices
        self.max_rows = max_rows

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        matrix = self.matrices[idx]

        if self.max_rows:
            padded_matrix = F.pad(torch.tensor(matrix), (0, 0, 0, self.max_rows - matrix.shape[0]))
        else:
            padded_matrix = matrix

        image_tensor = torch.tensor(image).float()

        return image_tensor, padded_matrix

In [136]:
def custom_collate_fn(batch):
    images, matrices = zip(*batch)

    # Pad the matrices to have consistent row lengths
    max_rows = max([matrix.shape[0] for matrix in matrices])  # Find the max rows in the batch
    padded_matrices = []

    for matrix in matrices:
        # Pad each matrix (if necessary) to the max row length
        padded_matrix = F.pad(torch.tensor(matrix), (0, 0, 0, max_rows - matrix.shape[0]), value=0)
        padded_matrices.append(padded_matrix)

    # Stack the padded matrices
    padded_matrices = torch.stack(padded_matrices)

    # Stack images (assuming they are already in the right format)
    images = torch.stack([torch.tensor(image).float() for image in images])

    return images, padded_matrices


In [164]:
# Creating Datasets and Data Loaders
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)


In [170]:
for images, matrices in train_loader:
  print(f"Images shape: {images.shape}, Matrices shape: {matrices[0].shape}")
  print(matrices[0][0])

  images = torch.stack([torch.tensor(image).float() for image in images])


Images shape: torch.Size([32, 3, 500, 1400]), Matrices shape: torch.Size([68524, 17])
tensor([  1, 384, 938,   2,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
         -1,  -1,  -1])
Images shape: torch.Size([32, 3, 500, 1400]), Matrices shape: torch.Size([62688, 17])
tensor([  1, 384, 684,   2,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
         -1,  -1,  -1])
Images shape: torch.Size([5, 3, 500, 1400]), Matrices shape: torch.Size([32254, 17])
tensor([  1, 384, 658,   2,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
         -1,  -1,  -1])


In [171]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 8 * 8, 512)  # Adjust based on your image size
        self.fc2 = nn.Linear(512, 8500)  # 100 * 17

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.view(-1, 47913600)  # Adjust based on your image size
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.view(-1, 100, 17)
        return x


In [142]:
print(X_train.shape)

torch.Size([1, 3, 500, 1400])


In [148]:
# Example usage
model = CNNModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Assuming X_train is your image tensor and Y_train is your 100x17 matrix
for epoch in range(10):  # Number of epochs
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/10], Loss: {loss.item():.4f}')

RuntimeError: shape '[-1, 47913600]' is invalid for input of size 694400

In [13]:
# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, loss function, optimizer
model = CNNModel(input_channels=3, output_dim=10)  # Modify according to your use case
criterion = nn.CrossEntropyLoss()  # Assuming classification task, modify if necessary
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode

    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for images, labels in train_loader:  # Assuming 'train_loader' is your DataLoader
        images, labels = images.to(device), labels.to(device)  # Move data to GPU if available

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Track loss and accuracy
        running_loss += loss.item()

        # Get the predicted class
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    # Print statistics for the current epoch
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct_predictions / total_predictions

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")


  images = torch.stack([torch.tensor(image).float() for image in images])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported