<a href="https://colab.research.google.com/github/cloughurd/deep-piano/blob/master/Wav2Mid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
## Pulling ideas from https://github.com/jsleep/wav2mid
## Data from http://www.tsi.telecom-paristech.fr/aao/en/2010/07/08/maps-database-a-piano-database-for-multipitch-estimation-and-automatic-transcription-of-music/

In [1]:
# Download the MAPS dataset archive. wget names the saved file `download`
# (the last URL path component); the unpacking cell relies on that name.
!wget https://amubox.univ-amu.fr/s/iNG0xc5Td1Nv4rR/download

--2020-02-07 03:24:09--  https://amubox.univ-amu.fr/s/iNG0xc5Td1Nv4rR/download
Resolving amubox.univ-amu.fr (amubox.univ-amu.fr)... 139.124.245.127
Connecting to amubox.univ-amu.fr (amubox.univ-amu.fr)|139.124.245.127|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘download’

download                [     <=>            ]  15.01G  19.2MB/s    in 13m 9s  

2020-02-07 03:37:20 (19.5 MB/s) - ‘download’ saved [16119980528]



In [0]:
!unzip -q download
!rm download
!mkdir data

import os
from zipfile import ZipFile

for filename in os.listdir('MAPS/'):
  if 'zip' in filename:
    with ZipFile('MAPS/' + filename, 'r') as z:
      z.extractall('data/' + filename.split('.')[0])

!rm -r MAPS/

In [3]:
!pip3 install torch 
!pip3 install torchvision
!pip3 install tqdm
!pip3 install pysoundfile

Collecting pysoundfile
  Downloading https://files.pythonhosted.org/packages/2a/b3/0b871e5fd31b9a8e54b4ee359384e705a1ca1e2870706d2f081dc7cc1693/PySoundFile-0.9.0.post1-py2.py3-none-any.whl
Installing collected packages: pysoundfile
Successfully installed pysoundfile-0.9.0.post1


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import pandas as pd
import soundfile as sf
import pretty_midi
import numpy as np
import librosa
import glob
import random

ModuleNotFoundError: ignored

In [0]:
# MIDI pitch range kept in the piano roll (inclusive on both ends -> 88 pitches).
lowest_key = 21
highest_key = 108
# Semitones per octave; multiplied by bin_multiple to get CQT bins per octave.
octave_size = 12
# Sample rate (Hz) that all audio is resampled to before the CQT.
desired_sr = 22050
# Number of consecutive CQT frames stacked into one model input.
window_size = 7
# Overrides pretty_midi's MAX_TICK limit.
# NOTE(review): presumably so long MIDI files are not rejected on load -- confirm.
pretty_midi.pretty_midi.MAX_TICK = 1e10

def wav_to_input(audio, sr, bin_multiple=3):
  """Turn raw audio samples into overlapping windows of CQT magnitudes.

  Args:
    audio: samples as returned by `soundfile.read`: shape (frames,) for mono
      or (frames, channels) for multi-channel files.
    sr: sample rate of `audio` in Hz.
    bin_multiple: CQT bins per semitone.

  Returns:
    np.ndarray of shape (num_frames, window_size, num_bins), where
    num_bins = (highest_key + 1 - lowest_key) * bin_multiple. Entry t holds
    the window_size CQT frames centred on frame t.
  """
  bins_per_octave = bin_multiple * octave_size
  num_bins = (highest_key+1 - lowest_key) * bin_multiple

  # soundfile puts channels on the LAST axis, which is also the axis
  # librosa.resample works on. Resampling a (frames, channels) array directly
  # therefore resamples across channels and leaves the time axis at the
  # original rate. Downmix to mono first so time is the only axis.
  audio = np.asarray(audio)
  if audio.ndim > 1:
    audio = audio.mean(axis=1)

  # Keyword arguments: newer librosa releases reject positional sample rates.
  audio = librosa.resample(audio, orig_sr=sr, target_sr=desired_sr)
  cqt = librosa.cqt(audio, sr=desired_sr, fmin=librosa.midi_to_hz(lowest_key), bins_per_octave=bins_per_octave, n_bins=num_bins)
  cqt = np.abs(cqt).T # Magnitudes, time dim first

  # Pad both ends in time with the smallest observed magnitude so that the
  # first and last frames also get a full window centred on them.
  half_window = window_size // 2
  cqt = np.pad(cqt, ((half_window, half_window), (0, 0)), 'constant', constant_values=np.min(cqt))

  # One window_size-frame snippet per time step from which to pull piano pitches.
  num_windows = len(cqt) - window_size + 1
  return np.array([cqt[i:i+window_size, :] for i in range(num_windows)])

def midi_to_output(midi, x):
  """Build the frame-aligned note-activity target for a windowed CQT input.

  Args:
    midi: object exposing `get_piano_roll(fs=..., times=...)`; the dataset
      passes a `pretty_midi.PrettyMIDI`.
    x: model input whose first dimension is time (one entry per CQT frame).

  Returns:
    np.ndarray of shape (len(x), highest_key - lowest_key + 1) holding 1.0
    where a pitch is sounding at that frame and 0.0 elsewhere.
  """
  # Keyword sr: newer librosa releases reject a positional sample rate.
  times = librosa.frames_to_time(np.arange(len(x)), sr=desired_sr)
  roll = midi.get_piano_roll(fs=desired_sr, times=times)
  roll = roll[lowest_key: highest_key+1]
  roll = roll.T # Puts time dim first
  # The piano roll stores note velocities; the binary cross-entropy objective
  # used for training needs targets in [0, 1], so reduce to note on/off.
  roll = (roll > 0).astype(roll.dtype)
  return roll

In [0]:
class MapsDataset(Dataset):
  """Random fixed-length training chunks from MAPS `MUS` recordings.

  Each item is computed on the fly from one .wav file and the .mid file that
  sits next to it with the same base name.

  Args:
    root: directory prefix that is concatenated with the glob pattern
      '*/*/MUS/MAPS_MUS*.wav' (so it must end with a path separator).
    chunk_size: maximum number of consecutive frames returned per item.
  """
  def __init__(self, root, chunk_size=300):
    # glob returns files in arbitrary order; sort so an index always maps to
    # the same recording.
    self.wav_files = sorted(glob.glob(root + '*/*/MUS/MAPS_MUS*.wav'))
    self.chunk_size = chunk_size
  def __getitem__(self, i):
    """Return (x, y): a random chunk of input windows and the matching targets.

    Both arrays have the same first dimension: `chunk_size` frames, or the
    whole recording when it is shorter than `chunk_size`.
    """
    wav_path = self.wav_files[i]
    x, sr = sf.read(wav_path)
    x = wav_to_input(x, sr)
    # Strip only the final extension; splitting on the first '.' would cut
    # the path short whenever a directory or file name contains a dot.
    midi_filename = wav_path.rsplit('.', 1)[0] + '.mid'
    y = pretty_midi.PrettyMIDI(midi_filename)
    y = midi_to_output(y, x)
    # Clamp at 0 so a recording shorter than chunk_size yields the whole clip
    # instead of making randint raise on an empty range.
    last_start = max(0, len(y) - self.chunk_size)
    start = random.randint(0, last_start)
    x = x[start:start+self.chunk_size, :]
    y = y[start:start+self.chunk_size, :]
    return x, y
  def __len__(self):
    """Number of recordings found under `root`."""
    return len(self.wav_files)

In [0]:
# One recording chunk per batch, served in file order: these are the
# DataLoader defaults, written out because later cells squeeze the batch dim.
dataset = MapsDataset(root='data/')
loader = DataLoader(dataset, batch_size=1, shuffle=False)

In [6]:
# Pull a single batch to check the tensor shapes the loader produces, then
# stop; x and y stay bound to that batch for the cells that follow.
for x, y in loader:
  print(x.shape, y.shape)
  break

torch.Size([1, 300, 7, 264]) torch.Size([1, 300, 88])


In [8]:
# Drop the DataLoader batch dim and insert a channel dim:
# (1, T, window, bins) -> (T, 1, window, bins).
# A single reshape gives the same result when this cell is run again, whereas
# squeeze(0) followed by unsqueeze(1) would add another axis on every re-run.
x = x.reshape(-1, 1, *x.shape[-2:])
x.shape

torch.Size([300, 1, 7, 264])

In [0]:
class ConvBlock(nn.Module):
  """Residual block: three conv + batch-norm stages added to a skip path.

  The first two stages end in ReLU and the third in Dropout2d. The skip path
  is a 1x1 convolution when the channel count changes and the identity
  otherwise. A final ReLU is applied to the sum.

  Args:
    in_c: number of input channels.
    out_c: number of output channels.
    kernel_size: kernel size shared by the three main-path convolutions.
    padding: padding shared by the three main-path convolutions.
  """
  def __init__(self, in_c, out_c, kernel_size=3, padding=1):
    super().__init__()
    conv_args = dict(kernel_size=kernel_size, padding=padding)
    stages = []
    channels = in_c
    for stage_idx in range(3):
      stages.append(nn.Conv2d(channels, out_c, **conv_args))
      stages.append(nn.BatchNorm2d(out_c))
      # Only the last stage swaps the activation for dropout; its ReLU is
      # applied after the residual sum instead.
      stages.append(nn.ReLU() if stage_idx < 2 else nn.Dropout2d())
      channels = out_c
    self.net = nn.Sequential(*stages)
    self.skip = nn.Identity() if in_c == out_c else nn.Conv2d(in_c, out_c, kernel_size=1)
    self.final = nn.ReLU()
  def forward(self, x):
    """Return relu(skip(x) + net(x))."""
    residual = self.net(x)
    shortcut = self.skip(x)
    return self.final(shortcut + residual)

class Net(nn.Module):
  """CNN mapping a window of CQT frames to one logit per piano key.

  Input:  (N, 1, window, bins) float tensor.
  Output: (N, 88) raw logits (no sigmoid applied).

  Seven ConvBlocks widen the channels from 1 to 128. Three (1, 2) max-pools
  halve the frequency axis and leave the time axis untouched.
  """
  def __init__(self):
    super(Net, self).__init__()
    self.net = nn.Sequential(
        ConvBlock(1, 8),
        ConvBlock(8, 16),
        nn.MaxPool2d((1, 2)),
        ConvBlock(16, 32),
        ConvBlock(32, 64),
        nn.MaxPool2d((1, 2)),
        ConvBlock(64, 64),
        ConvBlock(64, 128),
        nn.MaxPool2d((1, 2)),
        ConvBlock(128, 128),
        # Global average over the remaining (time, frequency) map. For the
        # 7 x 264 input used in this notebook the map is 7 x 33, so this
        # equals AvgPool2d((7, 33)); unlike a fixed kernel it also works
        # when the window size or bin count changes.
        nn.AdaptiveAvgPool2d(1)
    )
    self.final = nn.Linear(128, 88)
  def forward(self, x):
    y = self.net(x)
    # (N, 128, 1, 1) -> (N, 128)
    y = y.flatten(1)
    return self.final(y)

In [0]:
# Build the model with freshly initialised weights; no device is specified here.
net = Net()

In [0]:
# Forward pass on the sample batch; .float() casts x to float32 first.
y_hat = net(x.float())

In [35]:
# Create the CUDA model FIRST. The optimizer captures the parameter tensors
# it is given, so building it before `net` is rebound to a new model would
# leave it updating the old model's weights while the new one never learns.
net = Net().cuda()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
# Takes raw logits; one independent binary decision per key.
objective = nn.BCEWithLogitsLoss()

RuntimeError: ignored

In [0]:
def train(num_epochs=40):
  """Train the global `net` on the global `loader`.

  Uses the global `optimizer` and `objective`. Each batch from `loader` is
  expected to hold a single chunk (leading batch dim of size 1).

  Args:
    num_epochs: number of full passes over `loader`.

  Returns:
    list of float: mean training loss of each epoch, in order.
  """
  # Make sure BatchNorm and Dropout are in training mode.
  net.train()
  epoch_losses = []
  for i in range(num_epochs):
    loss_sum, num_steps = 0.0, 0
    for x, y in loader:
      # (1, T, window, bins) -> (T, 1, window, bins): frames become the batch.
      x = x.squeeze(0).unsqueeze(1).float().cuda()
      # BCEWithLogitsLoss needs float32 targets in [0, 1] with the same dtype
      # as the logits; reduce the piano roll to note on/off and cast.
      y = (y.squeeze(0) > 0).float().cuda()

      optimizer.zero_grad()
      y_hat = net(x)
      loss = objective(y_hat, y)
      loss.backward()
      optimizer.step()

      loss_sum += loss.item()
      num_steps += 1
    # max(..., 1) guards against an empty loader.
    epoch_losses.append(loss_sum / max(num_steps, 1))
  return epoch_losses