In [1]:
import pandas as pd
import numpy as np

In [7]:
df = pd.DataFrame({'file_name':[1], 'lyrics':[1] , 'vocal':[1] })
df

Unnamed: 0,file_name,lyrics,vocal
0,1,1,1


In [8]:
df.loc[0, 'lyrics']

1

In [10]:
%cd SongToLyrics/

d:\CUHK\IERG4320\Project\SongToLyrics


In [None]:
# Setting the vocal by hand

In [18]:
# The first 5 songs are from Chris Martin - Coldplay
for x in range(1, 6):
    with open(f'./dataset/song_{x}.txt', 'r') as f:
        df.loc[x - 1, 'lyrics'] = f.read()
        df.loc[x - 1, 'file_name'] = f'song_{x}.mp3'
        df.loc[x - 1, 'vocal'] = 'Chris Martin - Coldplay'

In [19]:
df

Unnamed: 0,file_name,lyrics,vocal
0,song_1.mp3,"Look at the stars, look how they shine for you...",Chris Martin - Coldplay
1,song_2.mp3,"'Cause you're a sky, 'cause you're a sky full ...",Chris Martin - Coldplay
2,song_3.mp3,I used to rule the world\nSeas would rise when...,Chris Martin - Coldplay
3,song_4.mp3,"Ooh-ooh-ooh, ooh-ooh-ooh, ooh-ooh-ooh\nOoh-ooh...",Chris Martin - Coldplay
4,song_5.mp3,"When you try your best, but you don't succeed\...",Chris Martin - Coldplay
5,song_6.mp3,"Uh-huh, life's like this\nUh-huh, uh-huh\nThat...",Avril Lavigne
6,song_7.mp3,I always needed time on my own\nI never though...,Avril Lavigne
7,song_8.mp3,"I can be tough, I can be strong\nBut with you,...",Avril Lavigne
8,song_9.mp3,"Hey, hey, you, you, I don't like your girlfrie...",Avril Lavigne
9,song_10.mp3,You say that I'm messing with your head\n(Yeah...,Avril Lavigne


In [15]:
# The first 5 songs are from Avril Lavigne
for x in range(6, 11):
    with open(f'./dataset/song_{x}.txt', 'r') as f:
        df.loc[x - 1, 'lyrics'] = f.read()
        df.loc[x - 1, 'file_name'] = f'song_{x}.mp3'
        df.loc[x - 1, 'vocal'] = 'Avril Lavigne'

In [178]:
df

Unnamed: 0,file_name,lyrics,vocal
0,song_1.mp3,"Look at the stars, look how they shine for you...",Chris Martin - Coldplay
1,song_2.mp3,"'Cause you're a sky, 'cause you're a sky full ...",Chris Martin - Coldplay
2,song_3.mp3,I used to rule the world\nSeas would rise when...,Chris Martin - Coldplay
3,song_4.mp3,"Ooh-ooh-ooh, ooh-ooh-ooh, ooh-ooh-ooh\nOoh-ooh...",Chris Martin - Coldplay
4,song_5.mp3,"When you try your best, but you don't succeed\...",Chris Martin - Coldplay
5,song_6.mp3,"Uh-huh, life's like this\nUh-huh, uh-huh\nThat...",Avril Lavigne
6,song_7.mp3,I always needed time on my own\nI never though...,Avril Lavigne
7,song_8.mp3,"I can be tough, I can be strong\nBut with you,...",Avril Lavigne
8,song_9.mp3,"Hey, hey, you, you, I don't like your girlfrie...",Avril Lavigne
9,song_10.mp3,You say that I'm messing with your head\n(Yeah...,Avril Lavigne


In [181]:
df['label'] = df['vocal'].apply(lambda x: 0 if x == 'Chris Martin - Coldplay' else 1)

In [182]:
df

Unnamed: 0,file_name,lyrics,vocal,label
0,song_1.mp3,"Look at the stars, look how they shine for you...",Chris Martin - Coldplay,0
1,song_2.mp3,"'Cause you're a sky, 'cause you're a sky full ...",Chris Martin - Coldplay,0
2,song_3.mp3,I used to rule the world\nSeas would rise when...,Chris Martin - Coldplay,0
3,song_4.mp3,"Ooh-ooh-ooh, ooh-ooh-ooh, ooh-ooh-ooh\nOoh-ooh...",Chris Martin - Coldplay,0
4,song_5.mp3,"When you try your best, but you don't succeed\...",Chris Martin - Coldplay,0
5,song_6.mp3,"Uh-huh, life's like this\nUh-huh, uh-huh\nThat...",Avril Lavigne,1
6,song_7.mp3,I always needed time on my own\nI never though...,Avril Lavigne,1
7,song_8.mp3,"I can be tough, I can be strong\nBut with you,...",Avril Lavigne,1
8,song_9.mp3,"Hey, hey, you, you, I don't like your girlfrie...",Avril Lavigne,1
9,song_10.mp3,You say that I'm messing with your head\n(Yeah...,Avril Lavigne,1


In [183]:
df.to_csv('./dataset/songsdata.csv')

In [2]:
import torch
from torch import nn
import torchaudio
from torch.utils.data import Dataset 
import os


class SongToLyricsDataset(Dataset):

    def __init__(self, annotation_file, audio_dir, transformation, target_sr, num_samples):
        self.annotations = pd.read_csv(annotation_file)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sr = target_sr
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations)
    

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        sig, sr = torchaudio.load(audio_sample_path)
        sig = self._resample_sr(sig, sr)
        sig = self._mix_down(sig)
        sig = self._add_pad(sig)
        sig = self.transformation(sig)
        return sig, label
    
    def _add_pad(self, sig):
        if sig.shape[1] < self.num_samples:
            num_missing_sample = self.num_samples - sig.shape[1]
            last_dim_padding = (0, num_missing_sample)
            sig = nn.functional.pad(sig, last_dim_padding)
        return sig

    def _resample_sr(self, sig, sr):
        if sr != self.target_sr:
            resample = torchaudio.transforms.Resample(sr, self.target_sr)
            sig = resample(sig)
        return sig

    def _mix_down(self, sig):
        if sig.shape[0] != 1:
            sig = torch.mean(sig, dim = 0, keepdim = True)

        return sig

    def _get_audio_sample_path(self, index):
        return os.path.join(self.audio_dir, self.annotations.loc[index, 'file_name'])
    
    def _get_audio_sample_label(self, index):
        return self.annotations.loc[index, 'label']



In [4]:
SAMPLE_RATE = 22050
NUM_SAMPLES = 6615000 # 5 mins
melS_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate= SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,

)

In [32]:
class CNN(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride = 1, padding = 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride = 1, padding = 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride = 1, padding = 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride = 1, padding = 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.flatten = nn.Flatten()
        self.linear = nn.Linear(517760, 2)
        self.softmax = nn.Softmax(dim = 1)
    
    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [34]:
def train_one_epoch(model, data_loader, loss_fn, optimiser, device):
    for inputs, targets in data_loader:
        inputs= inputs.to(device)
        targets = targets.to(device)

        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f'Loss = {loss.item()}')

def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"----- Epoch {i + 1} -----")
        train_one_epoch(model, data_loader, loss_fn, optimiser, device)
        print('------------------------------------')

    print("Finish training")


In [80]:
BATCH_SIZE = 1
EPOCHS = 5
LEARNING_RATE = 1e-4

In [45]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device 

'cpu'

In [46]:
cnn = CNN()
cnn.to(device)

CNN(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=517760, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

In [47]:
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(), lr = LEARNING_RATE)

In [82]:
from torch.utils.data import DataLoader

def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

In [49]:
%cd SongToLyrics

[WinError 2] The system cannot find the file specified: 'SongToLyrics'
d:\CUHK\IERG4320\Project\SongToLyrics


In [83]:
folder = './dataset'
csv = '/songsdata.csv'
STL = SongToLyricsDataset(folder + csv, folder, melS_spectrogram, SAMPLE_RATE, NUM_SAMPLES)
train_data_loader = create_data_loader(STL, BATCH_SIZE)

In [84]:
len(train_data_loader)

10

In [85]:
train(cnn, train_data_loader, loss_fn, optimiser, device, EPOCHS)

----- Epoch 1 -----
Loss = 1.31326162815094
------------------------------------
----- Epoch 2 -----
Loss = 1.31326162815094
------------------------------------
----- Epoch 3 -----
Loss = 1.31326162815094
------------------------------------
----- Epoch 4 -----
Loss = 1.31326162815094
------------------------------------
----- Epoch 5 -----
Loss = 1.31326162815094
------------------------------------
Finish training


In [62]:
torch.save(cnn.state_dict(), 'vocalClassifier.pt')

In [66]:
model = CNN()
state_dict = torch.load("vocalClassifier.pt")
model.load_state_dict(state_dict)

  state_dict = torch.load("vocalClassifier.pt")


<All keys matched successfully>

In [78]:
songs = SongToLyricsDataset(folder + csv, folder, melS_spectrogram, SAMPLE_RATE, NUM_SAMPLES)
data = create_data_loader(songs, 1)

In [79]:
i = 0
for input, taret in data:
    print(i)
    print(input)
    i += 1

0
tensor([[[[3.2421e-20, 3.8307e-18, 1.7395e-11,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [3.0134e-19, 9.8860e-18, 2.2331e-11,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.2204e-19, 3.4007e-17, 1.9567e-11,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [3.0472e-17, 1.4021e-13, 2.2473e-10,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [5.0080e-16, 6.1886e-13, 1.1961e-10,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [3.3226e-16, 3.6075e-13, 9.8837e-11,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]]]])
1
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])
2
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0

In [70]:

model.eval()
with torch.no_grad():
    predictions = model(input)
    # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
    predicted_index = predictions[0].argmax(0)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x4045 and 517760x2)

AttributeError: 'NoneType' object has no attribute 'state_dict'