In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import os
import seaborn as sns 
import numpy.typing
from glob import glob
import librosa
import librosa.display
import IPython.display as ipd
from itertools import cycle
import torch.nn as nn
from torchsummary import summary
import commons
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split


sns.set_theme(style="white", palette=None)
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(plt.rcParams["axes.prop_cycle"].by_key()["color"])

In [29]:
# Assuming you're in mtg-jamendo-dataset
input_file = r"C:\Users\User\Desktop\Computational Data Science\CDS Project\mtg-jamendo-dataset\data\autotagging_moodtheme.tsv"
tracks, tags, extra = commons.read_file(input_file)

tracks

Reading: 18486 tracks, 4506 albums, 1533 artists


{948: {'artist_id': 87,
  'album_id': 149,
  'path': '48/948.mp3',
  'duration': 212.7,
  'tags': ['mood/theme---background'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'background'}},
 950: {'artist_id': 87,
  'album_id': 149,
  'path': '50/950.mp3',
  'duration': 248.0,
  'tags': ['mood/theme---background'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'background'}},
 951: {'artist_id': 87,
  'album_id': 149,
  'path': '51/951.mp3',
  'duration': 199.7,
  'tags': ['mood/theme---background'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'background'}},
 2165: {'artist_id': 326,
  'album_id': 347,
  'path': '65/2165.mp3',
  'duration': 229.0,
  'tags': ['mood/theme---film'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'film'}},
 2263: {'artist_id': 320,
  'album_id': 366,
  'path': '63/2263.mp3',
  'duration': 494.7,
  'tags': ['mood/theme---melancholic'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'melancholic'}},

In [73]:
import librosa
import numpy as np
import os

audio_folder = r"C:/Users/User/Desktop/Computational Data Science/CDS Project/Extracted Data/"

def split_audio(y, sr, duration=30):
    """
    Split audio into segments of specified duration and pad if necessary.
    
    Args:
    - y (np.ndarray): Audio waveform.
    - sr (int): Sampling rate.
    - duration (int): Duration of each segment in seconds.
    
    Returns:
    - segments (list): List of audio segments.
    """
    segment_length = duration * sr
    num_segments = len(y) // segment_length
    remainder = len(y) % segment_length
    segments = []
    
    for i in range(num_segments):
        segment = y[i * segment_length : (i + 1) * segment_length]
        segments.append(segment)
    
    # Pad the last segment if necessary
    if remainder > 0:
        padding = np.zeros(segment_length - remainder)
        padded_segment = np.concatenate([y[-remainder:], padding])
        segments.append(padded_segment)
    
    return segments

def get_features_and_labels(data, max_len=None, segment_duration=30):
    features = []
    labels = []
    max_len = 0

    for track_id, track_data in data.items():
        if track_data['path'].startswith('00/') or track_data['path'].startswith('01/') or track_data['path'].startswith('02/') or track_data['path'].startswith('03/'):
            if all(tag in ['energetic', 'relaxing', 'emotional', 'dark', 'love', 'sad'] for tag in track_data['mood/theme']):
                try:
                    # Load the audio file
                    filename = track_data['path']
                    filepath = os.path.join(audio_folder, filename)
                    y, sr = librosa.load(filepath, sr=10000)

                    # Split audio into 30-second segments
                    audio_segments = split_audio(y, sr, duration=segment_duration)
                    num_segments = len(audio_segments)  # Number of segments generated

                    for segment in audio_segments:
                        # Extract Mel spectrograms
                        S = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=15)
                        S_dB = librosa.power_to_db(S, ref=np.max)

                        # Update max_len if necessary
                        max_len = max(max_len, S_dB.shape[1])

                        # Append features
                        features.append(S_dB)
                        
                    # Replicate label for each segment
                    labels.extend([track_data['mood/theme']] * num_segments)

                except Exception as e:
                    print(f"Error processing track {track_id}: {e}")

    # Pad or truncate MFCCs arrays to ensure they all have the same length
    padded_features = []
    for S_dB in features:
        padded_mel = np.pad(S_dB, ((0, 0), (0, max_len - S_dB.shape[1])), mode='constant')
        padded_features.append(padded_mel)

    # Convert lists to numpy arrays
    features_array = np.array(padded_features)

    return features_array, labels

# Assuming 'tracks' is the dictionary containing track data
X, y = get_features_and_labels(tracks)


In [74]:
print("Features shape:", X.shape)
# print(X)
print("Labels:", y)
print("Labels Length:", len(y))

Features shape: (918, 15, 586)
Labels: [{'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'dark'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {'emotional', 'energetic'}, {'emotional', 'energetic'}, {'emotional', 'energetic'}, {'emotional', 'energetic'}, {'emotional', 'energetic'}, {'emotional', 'energetic'}, {'emotional', 'energetic'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'emotional'}, {'relaxing'}, {'relaxing'}, {'relaxing'}, {

In [75]:


mlb = MultiLabelBinarizer()

# Fit the MultiLabelBinarizer and transform labels into binary arrays
labels_array = mlb.fit_transform(y)

print(labels_array)

[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 ...
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]]


In [76]:
# Inverse transform binary arrays back into sets of labels
decoded_labels = mlb.inverse_transform(labels_array)

print(len(decoded_labels))
len(labels_array)

918


918

In [77]:
X_normalised = np.array((X-np.min(X))/(np.max(X)-np.min(X)))
# X_normalised = X_normalised/np.std(X_normalised)

In [78]:
X_normalised

array([[[0.03728857, 0.11141443, 0.11922512, ..., 0.92255341,
         0.92416434, 0.92641158],
        [0.        , 0.04015694, 0.05751486, ..., 0.80655234,
         0.80175324, 0.80926977],
        [0.        , 0.11932096, 0.14628524, ..., 0.756614  ,
         0.74907704, 0.74342883],
        ...,
        [0.        , 0.        , 0.        , ..., 0.66856218,
         0.66405609, 0.53373688],
        [0.        , 0.        , 0.        , ..., 0.65871366,
         0.6499806 , 0.52221221],
        [0.        , 0.        , 0.        , ..., 0.66811488,
         0.65493008, 0.51915473]],

       [[0.88871038, 0.92552793, 0.93085505, ..., 0.9073221 ,
         0.90836784, 0.90322893],
        [0.7509276 , 0.77809604, 0.8005659 , ..., 0.80948897,
         0.79890771, 0.78935383],
        [0.72721298, 0.75437039, 0.74622872, ..., 0.75198994,
         0.72892044, 0.70528696],
        ...,
        [0.43952292, 0.50415219, 0.5241412 , ..., 0.71647211,
         0.69305869, 0.66921779],
        [0.4

In [79]:
# Split twice to get the validation set
X_train, X_test, y_train, y_test = train_test_split(X_normalised, labels_array, test_size = 0.25, random_state = 123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 123)

# Print the shapes
X_train.shape, X_test.shape, X_val.shape

((516, 15, 586), (230, 15, 586), (172, 15, 586))

In [80]:
len(y_train), len(y_test), len(y_val)

(516, 230, 172)

NameError: name 'j' is not defined

In [81]:
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    def __init__(self, num_class=6):
        super(SimpleCNN, self).__init__()

        # Layer 1
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Layer 2
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Layer 3
        self.conv3 = nn.Conv2d(64, 8, kernel_size=3, padding=1)
        #self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layers
        self.fc1 = nn.Linear(3504, 512)
        self.fc2 = nn.Linear(512, num_class)
        self.flatten = nn.Flatten()

        # Dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Layer 1
        
        x = self.pool1(F.relu(self.conv1(x)))
        

        # # Layer 2
        
        x = self.pool2(F.relu(self.conv2(x)))
        

        # # Layer 3
        
        x = F.relu(self.conv3(x))
        
        
        # Flatten
        
        x = self.flatten(x)
        

        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x


In [82]:
#inputs.unsqueeze(1).shape
model = SimpleCNN()
model.to(device)
out = model(inputs.unsqueeze(1).to(device))

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 1, 1, 15, 586]

In [22]:
sum(y_train<0)

array([0, 0, 0, 0, 0, 0])

In [53]:
# class CNN(nn.Module):
#     def __init__(self, num_class=28):
#         super(CNN, self).__init__()

#         # init bn
#         self.bn_init = nn.BatchNorm2d(1)

#         # layer 1
#         self.conv_1 = nn.Conv2d(1, 64, 3, padding=1)
#         self.bn_1 = nn.BatchNorm2d(64)
#         self.mp_1 = nn.MaxPool2d((2, 4), stride=(1, 2))  

#         # layer 2
#         self.conv_2 = nn.Conv2d(64, 128, 3, padding=1)
#         self.bn_2 = nn.BatchNorm2d(128)
#         self.mp_2 = nn.MaxPool2d((2, 4), stride=(1, 2))  

#         # layer 3
#         self.conv_3 = nn.Conv2d(128, 128, 3, padding=1)
#         self.bn_3 = nn.BatchNorm2d(128)
#         self.mp_3 = nn.MaxPool2d((2, 4), stride=(1, 2))  

#         # layer 4
#         self.conv_4 = nn.Conv2d(128, 128, 3, padding=1)
#         self.bn_4 = nn.BatchNorm2d(128)
#         self.mp_4 = nn.MaxPool2d((2, 4), stride=(2, 2))  

#         # layer 5
#         self.conv_5 = nn.Conv2d(128, 64, 3, padding=1)
#         self.bn_5 = nn.BatchNorm2d(64)
#         self.mp_5 = nn.MaxPool2d((2, 4), stride=(2, 2))  

#         # Calculate input size for linear layer
#         self.flatten_size = self._calculate_flatten_size()

#         # classifier
#         self.dense = nn.Linear(self.flatten_size, num_class)
#         self.dropout = nn.Dropout(0.5)

#     def _calculate_flatten_size(self):
#         # Sample input to calculate the size after convolutional layers
#         input_tensor = torch.randn(1, 1, 22, 22)
#         x = self.bn_init(input_tensor.unsqueeze(1))
#         x = self.mp_1(nn.ELU()(self.bn_1(self.conv_1(x))))
#         x = self.mp_2(nn.ELU()(self.bn_2(self.conv_2(x))))
#         x = self.mp_3(nn.ELU()(self.bn_3(self.conv_3(x))))
#         x = self.mp_4(nn.ELU()(self.bn_4(self.conv_4(x))))
#         x = self.mp_5(nn.ELU()(self.bn_5(self.conv_5(x))))
#         return x.view(1, -1).size(1)

#     def forward(self, x):
#         x = x.unsqueeze(1)

#         # init bn
#         x = self.bn_init(x)

#         # layer 1
#         x = self.mp_1(nn.ELU()(self.bn_1(self.conv_1(x))))

#         # layer 2
#         x = self.mp_2(nn.ELU()(self.bn_2(self.conv_2(x))))

#         # layer 3
#         x = self.mp_3(nn.ELU()(self.bn_3(self.conv_3(x))))

#         # layer 4
#         x = self.mp_4(nn.ELU()(self.bn_4(self.conv_4(x))))

#         # layer 5
#         x = self.mp_5(nn.ELU()(self.bn_5(self.conv_5(x))))

#         # Flatten
#         x = x.view(x.size(0), -1)
#         x = self.dropout(x)

#         # classifier
#         logit = self.dense(x)

#         return logit


In [83]:
from torch.utils.data import DataLoader,Dataset
from torchvision import datasets, transforms
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms
from torchsummary import summary
import pandas as pd
import os

In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm

In [85]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
model = SimpleCNN()

# Step 1: Convert NumPy arrays to PyTorch tensors

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_train_tensor = X_train_tensor.unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)  # Assuming you're using labels_array as binary labels
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_val_tensor = X_val_tensor.unsqueeze(1)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)  # Assuming you're using labels_array as binary labels

# Step 2: Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Assuming 'model' is an instance of your CNN class

# Step 3: Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

torch.Size([142, 1, 15, 586])

In [86]:


# Step 4: Training loop
num_epochs = 10
device = torch.device('cpu')
model.to(device)

for epoch in tqdm(range(num_epochs)):
    model.train()  # Set the model to training mode
    train_loss = 0.0
    
    for inputs, labels in train_loader:
        
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()  # Clear gradients
        
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the model's parameters
        
        train_loss += loss.item() * inputs.size(0)
    
    # Calculate average loss for the epoch
    train_loss /= len(train_loader.dataset)
    
    # Validate the model
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute the loss
            
            val_loss += loss.item() * inputs.size(0)
    
    # Calculate average loss for validation set
    val_loss /= len(val_loader.dataset)
    
    # Print training progress
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# Step 5: Evaluate the model on the test set if needed
# Note: You may need to convert X_test to a PyTorch tensor and run it through the model to get predictions.



 10%|█         | 1/10 [00:07<01:10,  7.79s/it]

Epoch 1/10, Train Loss: 0.4865, Val Loss: 0.4703


 20%|██        | 2/10 [00:15<01:00,  7.61s/it]

Epoch 2/10, Train Loss: 0.4756, Val Loss: 0.4575


 30%|███       | 3/10 [00:22<00:53,  7.64s/it]

Epoch 3/10, Train Loss: 0.4728, Val Loss: 0.4631


 40%|████      | 4/10 [00:30<00:45,  7.57s/it]

Epoch 4/10, Train Loss: 0.4749, Val Loss: 0.4581


 50%|█████     | 5/10 [00:37<00:37,  7.54s/it]

Epoch 5/10, Train Loss: 0.4699, Val Loss: 0.4553


 60%|██████    | 6/10 [00:45<00:30,  7.57s/it]

Epoch 6/10, Train Loss: 0.4729, Val Loss: 0.4610


 70%|███████   | 7/10 [00:53<00:22,  7.54s/it]

Epoch 7/10, Train Loss: 0.4716, Val Loss: 0.4562


 80%|████████  | 8/10 [01:00<00:15,  7.52s/it]

Epoch 8/10, Train Loss: 0.4695, Val Loss: 0.4600


 90%|█████████ | 9/10 [01:08<00:07,  7.55s/it]

Epoch 9/10, Train Loss: 0.4691, Val Loss: 0.4505


100%|██████████| 10/10 [01:15<00:00,  7.56s/it]

Epoch 10/10, Train Loss: 0.4632, Val Loss: 0.4366





In [99]:
# Step 5: Evaluate the model on the test set

# Assuming X_test is a NumPy array containing your test data

# Convert X_test to a PyTorch tensor
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
X_test_tensor = X_test_tensor.unsqueeze(1)  # Add a channel dimension if necessary

y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
y_true = np.argmax(y_test_tensor, axis=1) 
# Move the tensor to the same device as the model
X_test_tensor = X_test_tensor.to(device)

# Pass the test data through the model to get predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    predictions = model(X_test_tensor)

# Convert predictions to probabilities using sigmoid function
probabilities = torch.sigmoid(predictions)

# Optionally, you can convert probabilities to binary predictions
binary_predictions = np.argmax(probabilities, axis=1)

# Now you can use binary_predictions for further evaluation
# For example, you can calculate accuracy:
# accuracy = (binary_predictions == y_test_tensor).float().mean()

# Or any other evaluation metric you are interested in
def F1Measure(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        if (sum(y_true[i]) == 0) and (sum(y_pred[i]) == 0):
            continue
        temp+= (2*sum(np.logical_and(y_true[i], y_pred[i])))/ (sum(y_true[i])+sum(y_pred[i]))
    return temp/ y_true.shape[0]




In [102]:
(y_true == binary_predictions).float().mean()

tensor(0.2478)

In [96]:
print(F1Measure(y_true, binary))

tensor([2, 4, 2, 1, 2, 4, 2, 1, 4, 2, 4, 2, 2, 2, 4, 4, 4, 2, 2, 1, 2, 2, 4, 2,
        2, 4, 5, 4, 4, 4, 2, 2, 2, 2, 2, 4, 2, 1, 2, 2, 2, 2, 4, 2, 1, 1, 2, 4,
        2, 2, 1, 2, 2, 4, 1, 2, 2, 1, 1, 2, 4, 2, 1, 4, 2, 2, 1, 4, 1, 2, 1, 2,
        2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4,
        2, 1, 2, 2, 4, 4, 1, 4, 4, 2, 2, 2, 2, 2, 4, 2, 4, 1, 2, 2, 1, 4, 2, 2,
        2, 1, 1, 2, 2, 1, 2, 1, 4, 2, 1, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 1, 1, 4,
        2, 2, 4, 2, 2, 1, 2, 1, 2, 2, 1, 4, 2, 4, 4, 2, 2, 4, 1, 2, 2, 4, 2, 2,
        1, 4, 2, 4, 2, 4, 1, 4, 5, 2, 2, 4, 4, 1, 2, 1, 4, 2, 2, 2, 4, 1, 4, 2,
        4, 2, 4, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 4, 1, 2, 2, 1,
        1, 2, 2, 4, 2, 2, 2, 2, 2, 4, 1, 2, 2, 2])


In [98]:
np.argmax(y_test_tensor, axis=1) 

tensor([1, 5, 0, 4, 2, 2, 5, 4, 3, 5, 1, 2, 0, 5, 0, 4, 4, 2, 2, 3, 1, 2, 5, 2,
        0, 5, 5, 3, 1, 2, 4, 4, 1, 4, 0, 5, 2, 3, 3, 5, 0, 0, 5, 2, 3, 2, 2, 0,
        0, 5, 5, 4, 1, 5, 4, 2, 1, 2, 0, 0, 4, 3, 0, 2, 2, 4, 3, 4, 2, 1, 2, 5,
        5, 0, 3, 1, 1, 1, 3, 1, 2, 1, 0, 2, 2, 0, 0, 2, 5, 1, 2, 2, 2, 2, 0, 2,
        2, 3, 0, 1, 5, 3, 5, 1, 1, 2, 2, 1, 5, 2, 1, 2, 5, 5, 2, 5, 3, 1, 0, 4,
        1, 1, 0, 2, 2, 5, 0, 3, 2, 1, 5, 1, 1, 1, 2, 5, 2, 2, 1, 2, 1, 2, 2, 1,
        2, 4, 4, 0, 3, 4, 5, 1, 4, 2, 0, 3, 3, 1, 1, 1, 4, 2, 3, 3, 1, 5, 3, 1,
        5, 0, 0, 1, 5, 4, 3, 5, 3, 5, 4, 4, 1, 1, 2, 1, 5, 2, 4, 2, 1, 4, 0, 2,
        3, 2, 5, 4, 1, 5, 0, 0, 0, 1, 1, 4, 3, 0, 1, 3, 3, 2, 0, 1, 3, 2, 1, 2,
        2, 2, 1, 2, 1, 5, 2, 2, 2, 4, 4, 5, 4, 0])

In [195]:
labels.shape

torch.Size([16, 6])

In [165]:
# Initial input size
input_length = 16  # Set your input size here

# Track spatial dimensions through layers
for layer in [model.conv1, model.pool1, model.conv2, model.pool2, model.conv3]:
    if isinstance(layer, nn.Conv1d):
        # Calculate output size after convolution
        output_length = ((input_length + 2 * layer.padding[0] - layer.kernel_size[0]) // layer.stride[0]) + 1
        input_length = output_length
    elif isinstance(layer, nn.MaxPool1d):
        # Ensure kernel_size and stride are tuples
        kernel_size = (layer.kernel_size,) if isinstance(layer.kernel_size, int) else layer.kernel_size
        stride = (layer.stride,) if isinstance(layer.stride, int) else layer.stride
        
        # Calculate output size after pooling
        output_length = ((input_length - kernel_size[0]) // stride[0]) + 1
        input_length = output_length


# After all layers, calculate flattened size
flattened_size = input_length * 128  # Assuming 128 channels in the last conv layer
print("Flattened size:", flattened_size)


Flattened size: 512


In [32]:
import commons

# input_file = '../data/autotagging.tsv'
tracks, tags, extra = commons.read_file(input_file)

print(tags)

Reading: 18486 tracks, 4506 albums, 1533 artists
defaultdict(<class 'dict'>, {'mood/theme': {'background': {1185792, 282626, 1376259, 1189893, 1193989, 1193990, 1193992, 1193993, 1193994, 1067024, 1189904, 1341456, 1189907, 1341457, 1341459, 1079318, 1341460, 1341463, 1189914, 1189915, 1189916, 1189917, 1189918, 1189920, 1189924, 1189925, 1189926, 1189927, 1189928, 1189929, 1189930, 1189931, 1310761, 1310763, 1189934, 1189935, 1189936, 1189937, 1189938, 1189939, 1189940, 159797, 1189941, 1189942, 1189944, 159801, 1189945, 1189947, 1189948, 1189950, 159807, 1189951, 1189953, 1189954, 1237057, 1189956, 1087557, 1189957, 159815, 1087562, 1087564, 1087565, 1087566, 1087567, 1087568, 1087569, 1257554, 1265747, 1335380, 1359955, 1333334, 1359956, 1359960, 1359961, 159834, 974948, 1314921, 1314924, 1198190, 1335407, 1065111, 1288403, 1186020, 1186023, 1186024, 1186027, 1274099, 1157371, 1157372, 1306909, 1052961, 1362212, 1313062, 1313063, 1288489, 1288492, 1087794, 1392971, 1392972, 1413464,