# Lofi GAN

## Setup

In [1]:
import dotenv
import os

# pull the run settings from ./config.env into the process environment
dotenv.load_dotenv(dotenv_path="./config.env")


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        KeyError: if the variable is not set. The message names the missing
            key, so a configuration mistake is reported here instead of as an
            opaque TypeError from `int(None)` or from a later string
            concatenation.
    """
    value = os.getenv(name)
    if value is None:
        raise KeyError(f"{name} is not set; define it in ./config.env")
    return value


FILE = _require_env("FILE")  # name of the WAV file to load
BATCH_SIZE = int(_require_env("BATCH_SIZE"))  # clips per training batch
START_POS = int(_require_env("START_POS"))  # index of the first clip to train on
EPOCH_SIZE = int(_require_env("EPOCH_SIZE"))  # number of clips to train on

In [2]:
# echo the loaded settings so the configuration of this run is visible in the output
FILE, BATCH_SIZE, START_POS, EPOCH_SIZE

('lofi-part6.wav', 100, 80, 200)

In [3]:
import scipy.io.wavfile as wavfile
import tensorflow as tf
import numpy as np
import keras
import keras.layers as layers
import keras.models as models
import keras.losses as losses
import keras.optimizers as optimizers

import os
import pickle
import random

import time
from IPython import display


In [4]:
# device type used to query TensorFlow here and as the tf.device() target in later cells
DEVICE = "GPU"

# devices of the requested type that TensorFlow can see
physical_devices = tf.config.list_physical_devices(DEVICE)
print(physical_devices)

# for comparison: every device TensorFlow can see
print(tf.config.list_physical_devices())


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Load Data

In [5]:
# directory holding the training audio; set DATA_PREFIX in the environment
# (or in config.env) to override this machine-specific default
DATA_PREFIX = os.getenv("DATA_PREFIX", "D:/LofiData/")
file = os.path.join(DATA_PREFIX, FILE)

# sample_rate is in samples per second; data holds the raw PCM samples
sample_rate, data = wavfile.read(file)


In [6]:
# report the recording length both as minutes + seconds and as total seconds
total_seconds = len(data) / sample_rate
minutes, seconds = divmod(total_seconds, 60)
print(f"wave file of length {minutes} minutes and {seconds} seconds, which is {total_seconds} total seconds at sample rate {sample_rate}")


wave file of length 70.0 minutes and 33.61983333333319 seconds, which is 4233.619833333333 total seconds at sample rate 48000


In [7]:
# the samples are signed 16 bit PCM (ints), so rescale them to lie in [-1, 1)
def _print_range(samples):
    """Print the largest and smallest value found in `samples`."""
    print(f"max: {np.max(samples)}, min: {np.min(samples)}")


float_data = np.array(data, dtype=np.float16)
_print_range(float_data)  # still on the integer scale of [-32768, +32767]

# divide in place so no second full-size copy of the recording is allocated
np.divide(float_data, 32768, out=float_data)
_print_range(float_data)  # now it falls between [-1, 1)


max: 32336.0, min: -32400.0
max: 0.98681640625, min: -0.98876953125


## Prepare Dataset

In [8]:
# buffer size handed to Dataset.shuffle() when the training dataset is built
SHUFFLE_SIZE = 200
# length, in seconds, of each clip the recording is cut into
SONG_LEN_IN_SECONDS = 15


In [9]:
# number of samples in one clip
samples_per_song = SONG_LEN_IN_SECONDS * sample_rate

# simulate diversity in the dataset by starting the clip grid at a random point,
# since that basically makes every clip a "new" observation.
# The offset is drawn in *samples* (anywhere within one clip length); drawing
# it from 0..SONG_LEN_IN_SECONDS-1 and using it as a sample index would shift
# the grid by less than a millisecond.
start_offset = random.randint(0, samples_per_song - 1)

# cut the recording into back-to-back, non-overlapping clips; the range stop
# keeps every clip that fits completely, including one that ends exactly on
# the last sample
songs = [
    float_data[begin : begin + samples_per_song]
    for begin in range(
        start_offset, len(float_data) - samples_per_song + 1, samples_per_song
    )
]

# convert to numpy array and check the shape - should be (xxx, 720000)
songs = np.array(songs)
songs.shape


(282, 720000)

In [10]:
# only use the selected portion: EPOCH_SIZE clips starting at clip index START_POS
# NOTE(review): this rebinds `songs`, so re-running this cell slices the already-sliced array
songs = songs[START_POS : START_POS + EPOCH_SIZE]


In [11]:
# check to make sure that it is still lofi: play back the first selected clip
display.Audio(songs[0], rate=sample_rate)

In [12]:
# make the dataset that the model will be trained on:
# one clip per element, shuffled, then grouped into batches
song_slices = tf.data.Dataset.from_tensor_slices(tf.constant(songs, dtype=tf.float16))
dataset = song_slices.shuffle(SHUFFLE_SIZE).batch(BATCH_SIZE)

# make sure that the shape is still (None, 720000)
dataset

<BatchDataset element_spec=TensorSpec(shape=(None, 720000), dtype=tf.float16, name=None)>

## Make Model

In [13]:
# the shape of one noise sample (batch dimension excluded) to be transformed into LOFI
GENERATOR_INPUT_SIZE = (400, 2)

# number of chunks each second of audio is split into for the discriminator's LSTM
# ex: MULTIPLIER=2 means that the LSTM listens to sections 1/2 second long
MULTIPLIER = 4

In [14]:
def make_discriminator():
    """Build the discriminator network.

    A clip is reshaped into SONG_LEN_IN_SECONDS * MULTIPLIER timesteps of
    sample_rate // MULTIPLIER samples each, summarized by an LSTM, and passed
    through two dense blocks.

    Returns:
        A Sequential model that emits one score per clip. The final layer has
        no activation, so the score is a raw logit.
    """
    timesteps = SONG_LEN_IN_SECONDS * MULTIPLIER
    samples_per_timestep = sample_rate // MULTIPLIER

    model = models.Sequential()
    stack = [
        layers.Input((SONG_LEN_IN_SECONDS * sample_rate, 1)),
        layers.Reshape((timesteps, samples_per_timestep)),
        # recurrent summary of the whole clip
        layers.LSTM(256, dropout=0.15),
        # first dense block
        layers.Dense(256),
        layers.Dropout(0.3),
        layers.ReLU(),
        # second dense block
        layers.Dense(128),
        layers.Dropout(0.2),
        layers.ReLU(),
        # single real/fake score
        layers.Dense(1),
    ]
    for layer in stack:
        model.add(layer)
    return model

In [15]:
# instantiate the discriminator used by the checks and the training loop below
discriminator = make_discriminator()


In [16]:
# print the layer-by-layer output shapes and parameter counts
discriminator.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 60, 12000)         0         
                                                                 
 lstm (LSTM)                 (None, 256)               12551168  
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 re_lu (ReLU)                (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0

In [17]:
# check to make sure that it takes in the right input shape and has the right output shape:
# one random clip-length input should come back as a single score of shape (1, 1)
discriminator(tf.random.normal((1, 720000)), training=False)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.04500834]], dtype=float32)>

In [18]:
def make_generator():
    """Build the generator network that turns noise into one flat audio clip.

    A bidirectional LSTM and a dense layer expand the noise into a (400, 16)
    feature map, transposed convolutions upsample it along the time axis, and
    a tanh-activated transposed convolution followed by Flatten yields samples
    in [-1, 1]. The intermediate output shapes are printed while building.

    Returns:
        A Sequential model mapping noise of shape GENERATOR_INPUT_SIZE to a
        flat vector of audio samples.
    """
    model = models.Sequential()

    # noise -> (400, 16) feature map
    front_end = (
        layers.Input(GENERATOR_INPUT_SIZE),
        layers.Bidirectional(layers.LSTM(400 * 2, dropout=0.15)),
        layers.Dense(400 * 16),
        layers.Dropout(0.2),
        layers.ReLU(),
        layers.Reshape((400, 16)),
    )
    # upsample along time: x4, then x2
    upsampling = (
        layers.Conv1DTranspose(64, 4, strides=4, padding="same"),
        layers.Dropout(0.3),
        layers.LeakyReLU(),
        layers.BatchNormalization(),
        layers.Conv1DTranspose(128, 4, strides=2, padding="same"),
        layers.Dropout(0.3),
        layers.LeakyReLU(),
    )
    for layer in front_end + upsampling:
        model.add(layer)
    print(model.output_shape)

    # tanh keeps every generated sample inside [-1, 1]
    to_audio = layers.Conv1DTranspose(
        15 * SONG_LEN_IN_SECONDS, 4, strides=1, padding="same", activation="tanh"
    )
    model.add(to_audio)
    print(model.output_shape)

    # collapse (time, channels) into one flat run of samples
    model.add(layers.Flatten())
    print(model.output_shape)
    return model

In [19]:
# build the generator under the selected device; make_generator prints its output shapes
with tf.device(DEVICE):
    generator = make_generator()


(None, 3200, 128)
(None, 3200, 225)
(None, 720000)


In [20]:
# print the layer-by-layer output shapes and parameter counts
generator.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 1600)             5139200   
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 6400)              10246400  
                                                                 
 dropout_2 (Dropout)         (None, 6400)              0         
                                                                 
 re_lu_2 (ReLU)              (None, 6400)              0         
                                                                 
 reshape_1 (Reshape)         (None, 400, 16)           0         
                                                                 
 conv1d_transpose (Conv1DTra  (None, 1600, 64)         4160      
 nspose)                                              

In [21]:
# check to make sure that we generate valid audio: push one random noise
# sample through the generator and confirm every value lies in [-1, 1]
output = np.array(
    generator(tf.random.normal((1, *GENERATOR_INPUT_SIZE)), training=False).numpy(),
    dtype=np.float32,
)
highest, lowest = np.max(output), np.min(output)
assert highest <= 1 and lowest >= -1
display.Audio(output, rate=sample_rate)

## Load

In [22]:
# load previously saved weights from the paths the Save section writes to
generator.load_weights("./generator")
discriminator.load_weights("./discriminator")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2c123afb610>

## Loss and Optimizers

In [23]:
with tf.device(DEVICE):
    # the discriminator's final Dense(1) layer has no activation, so the loss
    # is told to expect raw logits
    loss = losses.BinaryCrossentropy(from_logits=True)
    # one Adam optimizer per network, both with learning rate 0.0001
    generator_optimizer = optimizers.Adam(0.0001)
    discriminator_optimizer = optimizers.Adam(0.0001)


In [24]:
# loss functions
@tf.function
def generator_loss(fake_pred):
    """Loss for the generator, given the discriminator's scores on generated audio.

    The generator wants every one of its clips to be classified as "real",
    so each prediction is compared against a target of 1.
    """
    real_targets = tf.ones_like(fake_pred)
    return loss(real_targets, fake_pred)


@tf.function
def discriminator_loss(true_pred, fake_pred):
    """Loss for the discriminator: the sum of its real-clip and fake-clip losses.

    Args:
        true_pred: discriminator scores for real clips (target 1).
        fake_pred: discriminator scores for generated clips (target 0).
    """
    real_targets = tf.ones_like(true_pred)
    fake_targets = tf.zeros_like(fake_pred)
    return loss(real_targets, true_pred) + loss(fake_targets, fake_pred)


## Training Loop

In [25]:
# get a seed for generating samples while training, so that clips produced at
# different epochs come from the same noise and can be compared
SAMPLES_TO_GENERATE = 1
SEED_PATH = "./seed.b"
expected_seed_shape = (SAMPLES_TO_GENERATE, *GENERATOR_INPUT_SIZE)

# reuse the stored seed only if its shape still matches the generator input
valid = False
if os.path.exists(SEED_PATH):
    # NOTE: pickle.load can execute arbitrary code; only load a seed.b that
    # this notebook wrote itself
    with open(SEED_PATH, "rb") as seed_file:
        seed = pickle.load(seed_file)["seed"]
    valid = seed.shape == expected_seed_shape

if not valid:
    # draw the noise before opening the file for writing, so a failure here
    # cannot leave behind an empty seed.b that breaks the next run
    seed = tf.random.normal(expected_seed_shape, dtype=tf.float16)
    with open(SEED_PATH, "wb") as seed_file:
        pickle.dump({"seed": seed}, seed_file)

seed.shape


TensorShape([1, 400, 2])

In [26]:
def generate_and_save_audio(gen, epoch_num, _seed, display_output=True):
    """Generate clips from `_seed` with `gen` and write each one as a WAV file.

    Files are written to ./generatedAudio/epoch_{epoch_num}_v{i}.wav, where
    `i` is the index of the clip within the generated batch. The directory is
    created if it does not exist.

    Args:
        gen: model called as `gen(_seed, training=False)`; its result must
            provide `.numpy()` returning one row of samples per clip.
        epoch_num: epoch number embedded in the file names.
        _seed: noise batch passed to `gen`.
        display_output: when True, also show an audio player for each file.
    """
    out_dir = "./generatedAudio"
    os.makedirs(out_dir, exist_ok=True)

    # use the model that was passed in rather than the notebook-level `generator`
    inferences = gen(_seed, training=False)
    inferences = np.array(inferences.numpy(), dtype=np.float32)
    for i, clip in enumerate(inferences):
        path = f"{out_dir}/epoch_{epoch_num}_v{i}.wav"
        wavfile.write(path, sample_rate, clip)
        if display_output:
            display.display(display.Audio(path))


In [27]:
@tf.function
def train_step(audio):
    """Run one adversarial update of the generator and the discriminator.

    Args:
        audio: a batch of real clips, scored by the discriminator alongside a
            freshly generated batch of BATCH_SIZE clips.
    """
    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables

    noise = tf.random.normal((BATCH_SIZE, *GENERATOR_INPUT_SIZE))

    # one tape per network so each loss is differentiated separately
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_audio = generator(noise, training=True)

        true_output = discriminator(audio, training=True)
        fake_output = discriminator(generated_audio, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(true_output, fake_output)

    # compute both gradients before either optimizer changes any weights
    gen_grads = gen_tape.gradient(gen_loss, gen_vars)
    disc_grads = disc_tape.gradient(disc_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))


In [28]:
def _save_epoch(epoch, path="epoch.b"):
    """Persist the running epoch counter to `path`."""
    with open(path, "wb") as handle:
        pickle.dump({"epoch": epoch}, handle)


def _load_epoch(path="epoch.b"):
    """Return the epoch counter stored at `path`, creating it with 0 if missing."""
    if os.path.exists(path):
        # NOTE: pickle.load can execute arbitrary code; only load an epoch.b
        # that this notebook wrote itself
        with open(path, "rb") as handle:
            return pickle.load(handle)["epoch"]
    _save_epoch(0, path)
    return 0


def train(dataset, epochs:int, save_model_every:int=3, save_output_every:int=1, display_output:bool=True):
    """Train the GAN for `epochs` epochs, continuing the stored epoch count.

    Args:
        dataset: iterable of batches; each batch is passed to `train_step`.
        epochs: number of epochs to run in this session.
        save_model_every: save both models every this many session epochs.
        save_output_every: write sample audio every this many session epochs.
        display_output: whether generated audio is also shown in the notebook;
            applies to the final sample as well as the periodic ones.
    """
    # get epoch
    epoch = _load_epoch()

    # train for the amount of epochs
    for i in range(epochs):
        epoch += 1
        start = time.time()

        for audio_batch in dataset:
            train_step(audio_batch)

        display.clear_output(wait=True)

        # Produce audio every `save_output_every` epochs
        if (i + 1) % save_output_every == 0:
            generate_and_save_audio(generator, epoch, seed, display_output)

        # Save the model every `save_model_every` epochs
        if (i + 1) % save_model_every == 0:
            generator.save("./generatorContinued")
            discriminator.save("./discriminatorContinued")
            # keep the stored counter in step with the checkpoint, so an
            # interrupted session does not reuse epoch numbers (and overwrite
            # earlier audio files) on the next run
            _save_epoch(epoch)

        print("Time for epoch {} (# {} of this session) is {} sec".format(epoch, i+1, time.time() - start))

    # Generate after the final epoch
    display.clear_output(wait=True)
    generate_and_save_audio(generator, epoch, seed, display_output)

    # save our epoch number
    _save_epoch(epoch)


## Train!!

In [29]:
# run 400 more epochs on the selected device, saving the models and a sample
# clip every 100 epochs
with tf.device(DEVICE):
    train(dataset, 400, save_model_every=100, save_output_every=100)


## Save

In [30]:
# save the models to the same paths the Load section reads weights from
generator.save("./generator")
discriminator.save("./discriminator")





INFO:tensorflow:Assets written to: ./generator\assets


INFO:tensorflow:Assets written to: ./generator\assets






INFO:tensorflow:Assets written to: ./discriminator\assets


INFO:tensorflow:Assets written to: ./discriminator\assets


## Try with random seeds

In [33]:
# listen to a clip generated from fresh random noise instead of the fixed training seed
display.Audio(generator(tf.random.normal((1, 400, 2)), training=False).numpy(), rate=sample_rate)