# Lofi Gan

## Setup

In [1]:
import scipy.io.wavfile as wavfile
import tensorflow as tf
import numpy as np
import keras
import keras.layers as layers
import keras.models as models
import keras.losses as losses
import keras.optimizers as optimizers

import os
import pickle

from tqdm import tqdm
import time
from IPython import display


In [2]:
DEVICE = "GPU"
DIRECTML_PLUGIN = False


## Load Data

In [3]:
file = "C:/Users/Eliot/Documents/Audacity/lofi-part1.wav"
sample_rate, data = wavfile.read(file)


In [4]:
total_seconds = len(data) / sample_rate
minutes = total_seconds // 60
seconds = total_seconds % 60
print(
    f"wave file of length {minutes} minutes and {seconds} seconds, which is {total_seconds} total seconds at sample rate {sample_rate}"
)


wave file of length 105.0 minutes and 44.35983333333297 seconds, which is 6344.359833333333 total seconds at sample rate 48000


In [5]:
# print(f"max: {max(data)}, min: {min(data)}")


In [6]:
# because we are using signed 16 bit PCM (ints), let's normalize our data to be between [-1, 1)
float_data = np.array(data, dtype=np.float16)
print(
    f"max: {np.max(float_data)}, min: {np.min(float_data)}"
)  # currently, still following int patterns of [-32768, +32767]
float_data /= 32768
print(
    f"max: {np.max(float_data)}, min: {np.min(float_data)}"
)  # now it falls between [-1, 1)


max: 32768.0, min: -32768.0
max: 1.0, min: -1.0


In [7]:
"""class SongSeparator:
    class Iterator:
        def __init__(self, ref) -> None:
            self.idx = 0
            self.ref = ref
            self.zeros = np.zeros((sample_rate,))

        def __next__(self):
            if self.idx >= len(self.ref):
                raise StopIteration

            if (
                self.idx < len(self.ref) - self.ref.sample_rate
                and (
                    self.ref.data[self.idx : self.idx + self.ref.sample_rate]
                    == self.zeros
                ).all()
            ):
                self.ref.num_songs += 1

                times_between = 0
                while self.ref.data[i] == 0:
                    times_between += 1
                    self.idx += 1
                self.ref.max_times_between = max(
                    self.ref.max_times_between, times_between
                )
            else:
                self.idx += 1

            return self.idx

    def __init__(self, data: np.ndarray, sample_rate: int) -> None:
        self.length = len(data)
        self.num_songs = 0
        self.data = data
        self.sample_rate = sample_rate
        self.max_times_between = 0

    def __len__(self):
        return self.length

    def __iter__(self):
        return SongSeparator.Iterator(self)

a = SongSeparator(data, sample_rate)
for i in tqdm(a):
  pass"""


'class SongSeparator:\n    class Iterator:\n        def __init__(self, ref) -> None:\n            self.idx = 0\n            self.ref = ref\n            self.zeros = np.zeros((sample_rate,))\n\n        def __next__(self):\n            if self.idx >= len(self.ref):\n                raise StopIteration\n\n            if (\n                self.idx < len(self.ref) - self.ref.sample_rate\n                and (\n                    self.ref.data[self.idx : self.idx + self.ref.sample_rate]\n                    == self.zeros\n                ).all()\n            ):\n                self.ref.num_songs += 1\n\n                times_between = 0\n                while self.ref.data[i] == 0:\n                    times_between += 1\n                    self.idx += 1\n                self.ref.max_times_between = max(\n                    self.ref.max_times_between, times_between\n                )\n            else:\n                self.idx += 1\n\n            return self.idx\n\n    def __init__(sel

## Prepare Dataset

In [8]:
BATCH_SIZE = 8
SHUFFLE_SIZE = 200
SONG_LEN_IN_SECONDS = 15
EPOCH_SIZE = 100
START_SECTION = 0


In [9]:
songs = []
i = 0
while i < len(float_data) - SONG_LEN_IN_SECONDS * sample_rate:
    songs.append(float_data[i : i + SONG_LEN_IN_SECONDS * sample_rate])
    i += SONG_LEN_IN_SECONDS * sample_rate

songs = np.array(songs)
len(songs)


422

In [10]:
songs = songs[START_SECTION : START_SECTION + EPOCH_SIZE]


In [11]:
with tf.device(DEVICE):
    dataset = (
        tf.data.Dataset.from_tensors(tf.constant(songs, dtype=tf.float16))
        .batch(BATCH_SIZE)
        .shuffle(SHUFFLE_SIZE)
    )


## Make Model

In [12]:
GENERATOR_INPUT_SIZE = (400, 2)


In [13]:
def make_discriminator():
    model = models.Sequential()
    model.add(layers.Input((SONG_LEN_IN_SECONDS * sample_rate,)))
    model.add(layers.Reshape((SONG_LEN_IN_SECONDS * sample_rate, 1)))
    if DIRECTML_PLUGIN:
        model.add(layers.LSTM(2048, dropout=0.2, recurrent_dropout=0.2))
    else:
        model.add(layers.LSTM(2048))
    model.add(layers.ReLU())
    model.add(layers.Dense(512))
    model.add(layers.Dropout(0.3))
    model.add(layers.ReLU())
    model.add(layers.Dense(1))  # output neuron
    return model


In [14]:
with tf.device(DEVICE):
    discriminator = make_discriminator()




In [15]:
discriminator.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 720000, 1)         0         
                                                                 
 lstm (LSTM)                 (None, 2048)              16793600  
                                                                 
 re_lu (ReLU)                (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 512)               1049088   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 re_lu_1 (ReLU)              (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 5

In [16]:
def make_generator():
    model = models.Sequential()
    model.add(layers.Input(GENERATOR_INPUT_SIZE))
    print(model.output_shape)

    if DIRECTML_PLUGIN:
        model.add(
            layers.Bidirectional(
                layers.LSTM(400 * 2, dropout=0.1, recurrent_dropout=0.1)
            )
        )
    else:
        model.add(layers.Bidirectional(layers.LSTM(400 * 2)))
    model.add(layers.Dense(400 * 16))
    model.add(layers.Dropout(0.2))
    model.add(layers.ReLU())

    model.add(layers.Reshape((400, 16)))

    model.add(layers.Conv1DTranspose(14, 4, strides=8, padding="same"))
    model.add(layers.Dropout(0.3))
    model.add(layers.SyncBatchNormalization())
    model.add(layers.ReLU())
    print(model.output_shape)

    model.add(
        layers.Conv1DTranspose(
            14 * SONG_LEN_IN_SECONDS, 4, strides=1, padding="same", activation="tanh"
        )
    )
    print(model.output_shape)

    model.add(layers.Flatten())
    print(model.output_shape)
    return model


In [17]:
with tf.device(DEVICE):
    generator = make_generator()


(None, 400, 2)
(None, 3200, 14)
(None, 3200, 210)
(None, 672000)


In [18]:
generator.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 1600)             5139200   
 l)                                                              
                                                                 
 dense_2 (Dense)             (None, 6400)              10246400  
                                                                 
 dropout_1 (Dropout)         (None, 6400)              0         
                                                                 
 re_lu_2 (ReLU)              (None, 6400)              0         
                                                                 
 reshape_1 (Reshape)         (None, 400, 16)           0         
                                                                 
 conv1d_transpose (Conv1DTra  (None, 3200, 14)         910       
 nspose)                                              

## loss and optimizers

In [19]:
with tf.device(DEVICE):
    loss = losses.BinaryCrossentropy(from_logits=True)
    generator_optimizer = optimizers.Adam(0.0001)
    discriminator_optimizer = optimizers.Adadelta(0.0001)


In [20]:
# loss functions
@tf.function
def generator_loss(fake_pred):
    return loss(tf.ones_like(fake_pred), fake_pred)


@tf.function
def discriminator_loss(true_pred, fake_pred):
    true_loss = loss(tf.ones_like(true_pred), true_pred)
    fake_loss = loss(tf.zeros_like(fake_pred), fake_pred)
    return true_loss + fake_loss


## Training Loop

In [21]:
# get a seed
SAMPLES_TO_GENERATE = 8
valid = False
if os.path.exists("./seed.b"):
    with open("./seed.b", "rb") as file:
        seed = pickle.load(file)["seed"]
        if seed.shape == (SAMPLES_TO_GENERATE, *GENERATOR_INPUT_SIZE):
            valid = True

if not valid:
    with open("./seed.b", "wb") as file:
        seed = tf.random.normal(
            (SAMPLES_TO_GENERATE, *GENERATOR_INPUT_SIZE), dtype=tf.float16
        )
        pickle.dump({"seed": seed}, file)

seed.shape


TensorShape([8, 400, 2])

In [22]:
def generate_and_save_audio(gen, epoch_num, _seed):
    inferences = generator(_seed, training=False)
    inferences = inferences.numpy()
    for i in range(len(inferences)):
        wavfile.write(
            f"./generatedAudio/epoch_{epoch_num}_v{i}", sample_rate, inferences[i]
        )
        display.Audio(inferences[i])


In [23]:
@tf.function
def train_step(audio):
    noise = tf.random.normal([BATCH_SIZE, *GENERATOR_INPUT_SIZE])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_audio = generator(noise, training=True)

        true_output = discriminator(audio, training=True)
        fake_output = discriminator(generated_audio, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(true_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(
        disc_loss, discriminator.trainable_variables
    )

    generator_optimizer.apply_gradients(
        zip(gradients_of_generator, generator.trainable_variables)
    )
    discriminator_optimizer.apply_gradients(
        zip(gradients_of_discriminator, discriminator.trainable_variables)
    )


In [24]:
def train(dataset, epochs):
    # get epoch
    if os.path.exists("epoch.b"):
        with open("epoch.b", "rb") as file:
            epoch = pickle.load(file)["epoch"]
    else:
        epoch = 0
        with open("epoch.b", "wb") as file:
            pickle.dump({"epoch": epoch}, file)

    for epoch in range(epochs):
        start = time.time()

        for image_batch in dataset:
            train_step(image_batch)

        # Produce images for the GIF as you go
        display.clear_output(wait=True)
        generate_and_save_audio(generator, epoch, seed)

        # Save the model every 3 epochs
        if (epoch + 1) % 3 == 0:
            generator.save("./generatorContinued")
            discriminator.save("./discriminatorContinued")

        print("Time for epoch {} is {} sec".format(epoch + 1786, time.time() - start))

    # Generate after the final epoch
    display.clear_output(wait=True)
    generate_and_save_audio(generator, epochs, seed)

    with open("epoch.b", "wb") as file:
        pickle.dump({"epoch": epoch}, file)


# Train!!

In [25]:
with tf.device(DEVICE):
    train(dataset, 3)


ResourceExhaustedError: Exception encountered when calling layer "lstm_cell_2" (type LSTMCell).

OOM when allocating tensor with shape[8,800] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator PluggableDevice_0_bfc [Op:MatMul]

Call arguments received by layer "lstm_cell_2" (type LSTMCell):
  • inputs=tf.Tensor(shape=(8, 2), dtype=float32)
  • states=('tf.Tensor(shape=(8, 800), dtype=float32)', 'tf.Tensor(shape=(8, 800), dtype=float32)')
  • training=True