# Lofi Gan

## Setup

In [1]:
import scipy.io.wavfile as wavfile
import tensorflow as tf
import numpy as np
import keras
import keras.layers as layers
import keras.models as models
import keras.losses as losses
import keras.optimizers as optimizers

import os
import pickle

from tqdm import tqdm
import time
from IPython import display


In [2]:
DEVICE = "GPU"

physical_devices = tf.config.list_physical_devices(DEVICE)
print(physical_devices)

print(tf.config.list_physical_devices())


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Load Data

In [3]:
file = "C:/Users/Eliot/Documents/Audacity/lofi-part1.wav"
sample_rate, data = wavfile.read(file)


In [4]:
total_seconds = len(data) / sample_rate
minutes = total_seconds // 60
seconds = total_seconds % 60
print(
    f"wave file of length {minutes} minutes and {seconds} seconds, which is {total_seconds} total seconds at sample rate {sample_rate}"
)


wave file of length 105.0 minutes and 44.35983333333297 seconds, which is 6344.359833333333 total seconds at sample rate 48000


In [5]:
# print(f"max: {max(data)}, min: {min(data)}")


In [6]:
# because we are using signed 16 bit PCM (ints), let's normalize our data to be between [-1, 1)
float_data = np.array(data, dtype=np.float16)
print(
    f"max: {np.max(float_data)}, min: {np.min(float_data)}"
)  # currently, still following int patterns of [-32768, +32767]
float_data /= 32768
print(
    f"max: {np.max(float_data)}, min: {np.min(float_data)}"
)  # now it falls between [-1, 1)


max: 32768.0, min: -32768.0
max: 1.0, min: -1.0


## Prepare Dataset

In [7]:
BATCH_SIZE = 8
SHUFFLE_SIZE = 200
SONG_LEN_IN_SECONDS = 15
EPOCH_SIZE = 40
START_SECTION = 0


In [8]:
songs = []
i = 0
while i < len(float_data) - SONG_LEN_IN_SECONDS * sample_rate:
    songs.append(float_data[i : i + SONG_LEN_IN_SECONDS * sample_rate])
    i += SONG_LEN_IN_SECONDS * sample_rate

songs = np.array(songs)
songs.shape


(422, 720000)

In [9]:
songs = songs[START_SECTION : START_SECTION + EPOCH_SIZE]


In [10]:
dataset = (
        tf.data.Dataset.from_tensor_slices(tf.constant(songs, dtype=tf.float16))
        .shuffle(SHUFFLE_SIZE)
        .batch(BATCH_SIZE)
    )

print(dataset)

<BatchDataset element_spec=TensorSpec(shape=(None, 720000), dtype=tf.float16, name=None)>


## Make Model

In [11]:
GENERATOR_INPUT_SIZE = (400, 2)
MULTIPLIER = 2

In [12]:
def make_discriminator():
    model = models.Sequential()
    model.add(layers.Input((SONG_LEN_IN_SECONDS * sample_rate, 1)))
    model.add(layers.Reshape((SONG_LEN_IN_SECONDS * MULTIPLIER, sample_rate // MULTIPLIER)))
    model.add(layers.LSTM(128))
    model.add(layers.Dense(64))
    model.add(layers.ReLU())
    model.add(layers.Dense(1))
    return model

In [13]:
discriminator = make_discriminator()


In [14]:
discriminator.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 30, 24000)         0         
                                                                 
 lstm (LSTM)                 (None, 128)               12354048  
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 re_lu (ReLU)                (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 12,362,369
Trainable params: 12,362,369
Non-trainable params: 0
_________________________________________________________________


In [15]:
discriminator(tf.random.normal((1, 720000)), training=False)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.2100232]], dtype=float32)>

In [16]:
def make_generator():
    model = models.Sequential()
    model.add(layers.Input(GENERATOR_INPUT_SIZE))
    
    model.add(layers.Bidirectional(layers.LSTM(400 * 2)))
    model.add(layers.Dense(400 * 16))
    model.add(layers.Dropout(0.2))
    model.add(layers.ReLU())

    model.add(layers.Reshape((400, 16)))

    model.add(layers.Conv1DTranspose(14, 4, strides=8, padding="same"))
    model.add(layers.Dropout(0.3))
    model.add(layers.LeakyReLU())
    model.add(layers.BatchNormalization())
    print(model.output_shape)

    model.add(
        layers.Conv1DTranspose(
            15 * SONG_LEN_IN_SECONDS, 4, strides=1, padding="same", activation="tanh"
        )
    )
    print(model.output_shape)

    model.add(layers.Flatten())
    print(model.output_shape)
    return model

In [17]:
with tf.device(DEVICE):
    generator = make_generator()


(None, 3200, 14)
(None, 3200, 225)
(None, 720000)


In [18]:
generator.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 1600)             5139200   
 l)                                                              
                                                                 
 dense_2 (Dense)             (None, 6400)              10246400  
                                                                 
 dropout (Dropout)           (None, 6400)              0         
                                                                 
 re_lu_1 (ReLU)              (None, 6400)              0         
                                                                 
 reshape_1 (Reshape)         (None, 400, 16)           0         
                                                                 
 conv1d_transpose (Conv1DTra  (None, 3200, 14)         910       
 nspose)                                              

In [19]:
output = generator(tf.random.normal((1, *GENERATOR_INPUT_SIZE)), training=False).numpy()
output = np.array(output, dtype=np.float32)
assert np.max(output) <= 1 and np.min(output) >= -1
display.Audio(output, rate=sample_rate)

## loss and optimizers

In [20]:
with tf.device(DEVICE):
    loss = losses.BinaryCrossentropy(from_logits=True)
    generator_optimizer = optimizers.Adam(0.0001)
    discriminator_optimizer = optimizers.Adam(0.0001)


In [21]:
# loss functions
@tf.function
def generator_loss(fake_pred):
    return loss(tf.ones_like(fake_pred), fake_pred)


@tf.function
def discriminator_loss(true_pred, fake_pred):
    true_loss = loss(tf.ones_like(true_pred), true_pred)
    fake_loss = loss(tf.zeros_like(fake_pred), fake_pred)
    return true_loss + fake_loss


## Training Loop

In [22]:
# get a seed
SAMPLES_TO_GENERATE = 1
valid = False
if os.path.exists("./seed.b"):
    with open("./seed.b", "rb") as file:
        seed = pickle.load(file)["seed"]
        if seed.shape == (SAMPLES_TO_GENERATE, *GENERATOR_INPUT_SIZE):
            valid = True

if not valid:
    with open("./seed.b", "wb") as file:
        seed = tf.random.normal(
            (SAMPLES_TO_GENERATE, *GENERATOR_INPUT_SIZE), dtype=tf.float16
        )
        pickle.dump({"seed": seed}, file)

seed.shape


TensorShape([1, 400, 2])

In [23]:
def generate_and_save_audio(gen, epoch_num, _seed, display_output=True):
    inferences = generator(_seed, training=False)
    inferences = np.array(inferences.numpy(), dtype=np.float32)
    for i in range(len(inferences)):
        wavfile.write(
            f"./generatedAudio/epoch_{epoch_num}_v{i}.wav", sample_rate, inferences[i]
        )
        if display_output:
            display.display(display.Audio(f"./generatedAudio/epoch_{epoch_num}_v{i}.wav"))


In [24]:
generate_and_save_audio(generator, 0, seed)

In [25]:
@tf.function
def train_step(audio):
    noise = tf.random.normal([BATCH_SIZE, *GENERATOR_INPUT_SIZE])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_audio = generator(noise, training=True)
        
        true_output = discriminator(audio, training=True)
        fake_output = discriminator(generated_audio, training=True)
        
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(true_output, fake_output)
        
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(
        disc_loss, discriminator.trainable_variables
    )

    generator_optimizer.apply_gradients(
        zip(gradients_of_generator, generator.trainable_variables)
    )
    discriminator_optimizer.apply_gradients(
        zip(gradients_of_discriminator, discriminator.trainable_variables)
    )


In [26]:
def train(dataset, epochs:int, save_every:int=3, display_output=True):
    # get epoch
    if os.path.exists("epoch.b"):
        with open("epoch.b", "rb") as file:
            epoch = pickle.load(file)["epoch"]
    else:
        epoch = 0
        with open("epoch.b", "wb") as file:
            pickle.dump({"epoch": epoch}, file)

    for i in range(epochs):
        start = time.time()

        for image_batch in dataset:
            train_step(image_batch)

        # Produce images for the GIF as you go
        display.clear_output(wait=True)
        generate_and_save_audio(generator, epoch, seed, display_output)

        # Save the model every `save_every` epochs
        if (i + 1) % save_every == 0:
            generator.save("./generatorContinued")
            discriminator.save("./discriminatorContinued")

        print("Time for epoch {} is {} sec".format(epoch, time.time() - start))
        epoch += 1

    # Generate after the final epoch
    display.clear_output(wait=True)
    generate_and_save_audio(generator, epochs, seed)

    with open("epoch.b", "wb") as file:
        pickle.dump({"epoch": epoch}, file)


# Train!!

In [27]:
with tf.device(DEVICE):
    train(dataset, 100, 50)


## Save

In [28]:
generator.save("./generator")
discriminator.save("./discriminator")





INFO:tensorflow:Assets written to: ./generator\assets


INFO:tensorflow:Assets written to: ./generator\assets






INFO:tensorflow:Assets written to: ./discriminator\assets


INFO:tensorflow:Assets written to: ./discriminator\assets


## Load

In [29]:
generator.load_weights("./generator")
discriminator.load_weights("./discriminator") 

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1c723deca90>