The loss function is MSE & The optimizer is Adam

In [5]:
from __future__ import print_function
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np 
import librosa
import copy
import soundfile as sf

#define a class named CNNModel,ref:https://github.com/alishdipani/Neural-Style-Transfer-Audio/blob/master/NeuralStyleTransfer.py
class CNNModel(nn.Module):
    #initialization method of the class, takes self parameter 
    def __init__(self):
        #call the initialization method of the parent class (nn.Module)
        super(CNNModel, self).__init__()
        #create a 1D convolutional layer with input channels 1025, output channels 4096, kernel size 3, stride 1, padding 1
        self.cnn1 = nn.Conv1d(in_channels=1025, out_channels=4096, kernel_size=3, stride=1, padding=1)
        
    #define the forward method, takes input x
    def forward(self, x):
        #pass the input x to the convolutional layer self.cnn1
        out = self.cnn1(x)
        #flatten the output into a one-dimensional vector
        out = out.view(out.size(0), -1)
        #return the output
        return out

#define a class named GramMatrix, inheriting from nn.Module class
class GramMatrix(nn.Module):
    #define the forward method, takes input
    def forward(self, input):
        #get the shape (dimensions) of the input
        a, b, c = input.size()
        #reshape the input into a two-dimensional tensor
        features = input.view(a * b, c)
        #compute the Gram matrix
        G = torch.mm(features, features.t())
        #return the normalized Gram matrix
        return G.div(a * b * c)

#define a class named StyleLoss, inheriting from nn.Module class
class StyleLoss(nn.Module):
    #initialization method of the class, takes target and weight parameters
    def __init__(self, target, weight):
        #call the initialization method of the parent class (nn.Module)
        super(StyleLoss, self).__init__()
        #convert the target value to Variable and multiply by weight
        self.target = target.detach() * weight
        #create an instance of GramMatrix to compute the Gram matrix
        self.gram = GramMatrix()
        #create a mean squared error loss function
        self.criterion = nn.MSELoss()

    #define the forward method, takes input input
    def forward(self, input):
        #compute the Gram matrix of the input
        self.G = self.gram(input)
        #compute the loss value
        self.loss = self.criterion(self.G, self.target)
        #return the input, but actually make no change
        return input

#define a function to read the audio spectrum, takes filename parameter
def read_audio_spectrum(filename):
    #load the audio file using the librosa library, specifying duration as 30s
    x, fs = librosa.load(filename, duration=30)
    #compute the Short-Time Fourier Transform (STFT) of the audio
    S = librosa.stft(x, n_fft=2048)
    #take the logarithm of the magnitude spectrum of STFT
    S = np.log1p(np.abs(S))
    #return the log magnitude spectrum and sample rate
    return S, fs

#define a loss function
def get_style_model_and_losses(cnn, style_float, style_weight=2500):
    #make a copy of the original CNN model to ensure no modification to the original model
    cnn = copy.deepcopy(cnn)
    #create an empty list to store style losses
    style_losses = []
    #create a Sequential model to combine convolutional layers and loss functions
    model = nn.Sequential()
    #create an instance of GramMatrix to compute the Gram matrix
    gram = GramMatrix()
    #add the convolutional layers of the original model to the new Sequential model. This convolutional layer is named 'conv_1'
    model.add_module('conv_1', cnn.cnn1)
    #pass the style image through the model to get the feature representation of the input image
    target_feature = model(style_float).clone()
    #compute the Gram matrix of the target feature
    target_feature_gram = gram(target_feature)
    #create a StyleLoss instance, passing the Gram matrix of the target feature and the style weight
    style_loss = StyleLoss(target_feature_gram, style_weight)
    #add the style loss function to the Sequential model for optimization
    model.add_module('style_loss_1', style_loss)
    #add the style loss to the style loss list
    style_losses.append(style_loss)
    return model, style_losses

#defines an optimization function that converts inputs to trainable parameters and creates an Adam optimizer
def get_input_param_optimizer(input_float):
    #convert the input parameters to trainable parameters
    input_param = nn.Parameter(input_float.data)
    #use Adam optimizer to optimize the parameters
    optimizer = optim.Adam([input_param], lr=0.1, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    return input_param, optimizer

#defines a style transfer function that optimizes an input image to match the style of a target image using a specified CNN
def run_style_transfer(cnn, style_float, input_float, num_steps=2500, style_weight=2500):
    #get the model and style losses
    model, style_losses = get_style_model_and_losses(cnn, style_float, style_weight)
    #get the input parameters and optimizer
    input_param, optimizer = get_input_param_optimizer(input_float)
    run = [0]

    while run[0] <= num_steps:
        def closure():
            #zero the gradients
            input_param.data.clamp_(0, 1)
            optimizer.zero_grad()
            #forward pass
            style_score = model(input_param)
            #compute the style loss
            style_score = sum([sl.loss for sl in style_losses])
            style_score.backward()
            run[0] += 1
            #output iteration count and style loss
            if run[0] % 100 == 0:
                print("run {}: Style Loss : {:8f}".format(run[0], style_score.item()))

            return style_score

        #perform optimization step
        optimizer.step(closure)
    #clamp the values of input parameters within a reasonable range
    input_param.data.clamp_(0, 1)
    return input_param.data

if __name__ == '__main__':
    #specify the file paths for content audio and style audio
    content_audio_name = 'generated/jazz.wav'
    style_audio_name = 'generated/generated_audio.wav'

    #read the spectrum and sample rate of the style audio and content audio
    style_audio, style_sr = read_audio_spectrum(style_audio_name)
    content_audio, content_sr = read_audio_spectrum(content_audio_name)

   #check if the sample rates of content audio and style audio are the same
    if content_sr != style_sr:
        exit()

     #compute the minimum of sample points of style audio and content audio and crop,ref:https://stackoverflow.com/questions/43204441/how-to-split-the-audio-file-in-python
    num_samples = min(style_audio.shape[1], content_audio.shape[1])  
    style_audio = style_audio[:, :num_samples]
    content_audio = content_audio[:, :num_samples]

    #add a dimension to the spectrum arrays of style audio and content audio,ref:https://numpy.org/doc/stable/reference/generated/numpy.expand_dims.html
    style_audio = np.expand_dims(style_audio, axis=0)
    content_audio = np.expand_dims(content_audio, axis=0)

    #convert the spectrum arrays of style audio and content audio to PyTorch Variable objects
    style_float = Variable(torch.from_numpy(style_audio))
    content_float = Variable(torch.from_numpy(content_audio))

    #instantiate the CNNModel() class to create a convolutional neural network model cnn
    cnn = CNNModel()
    #call the run_style_transfer function to run the style transfer algorithm
    output = run_style_transfer(cnn, style_float, content_float)
    #process the output by removing the batch dimension and then convert to an array
    output = output.squeeze(0).numpy()
    #create an array of zeros with the same shape as the output
    a = np.zeros_like(output)
    #Exponentiate the output array, then subtract 1 to get the new array a
    a = np.exp(output) - 1

    #use an iterative approach by multiplying the exponentiated output array a with random phase p for inverse Short-Time Fourier Transform
    p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
    for _ in range(500):
        S = a * np.exp(1j*p)
        x = librosa.istft(S)
        #use librosa.stft function to get the new frequency domain representation of audio data x and update the random phase p for the next iteration
        p = np.angle(librosa.stft(x, n_fft=2048))

    #output file name
    OUTPUT_FILENAME = 'generated/output_audio_1.wav'
    #write the generated audio to file, use style_sr as the sampling rate. Assume content and style audio have the same sampling rate
    sf.write(OUTPUT_FILENAME, x, style_sr) 
    print('DONE...')

run 100: Style Loss : 0.011750
run 200: Style Loss : 0.011750
run 300: Style Loss : 0.011750
run 400: Style Loss : 0.011750
run 500: Style Loss : 0.011750
run 600: Style Loss : 0.011750
run 700: Style Loss : 0.011750
run 800: Style Loss : 0.011750
run 900: Style Loss : 0.011750
run 1000: Style Loss : 0.011750
run 1100: Style Loss : 0.011750
run 1200: Style Loss : 0.011750
run 1300: Style Loss : 0.011750
run 1400: Style Loss : 0.011750
run 1500: Style Loss : 0.011750
run 1600: Style Loss : 0.011750
run 1700: Style Loss : 0.011750
run 1800: Style Loss : 0.011750
run 1900: Style Loss : 0.011749
run 2000: Style Loss : 0.011749
run 2100: Style Loss : 0.011749
run 2200: Style Loss : 0.011749
run 2300: Style Loss : 0.011749
run 2400: Style Loss : 0.011749
run 2500: Style Loss : 0.011749
DONE...


The loss function is MAE & The optimizer is SGD

In [6]:
from __future__ import print_function
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np 
import librosa
import copy
import soundfile as sf

#define a class named CNNModel,ref:https://github.com/alishdipani/Neural-Style-Transfer-Audio/blob/master/NeuralStyleTransfer.py
class CNNModel(nn.Module):
    #initialization method
    def __init__(self):
        #call the initialization method
        super(CNNModel, self).__init__()
        #create a 1D convolutional layer with input channels 1025, output channels 4096, kernel size 3, stride 1, padding 1
        self.cnn1 = nn.Conv1d(in_channels=1025, out_channels=4096, kernel_size=3, stride=1, padding=1)
        
    #define the forward method, takes input x
    def forward(self, x):
        #pass the input x to the convolutional layer self.cnn1
        out = self.cnn1(x)
        #flatten the output into a one-dimensional vector
        out = out.view(out.size(0), -1)
        #return the output
        return out

#define a class named GramMatrix, inheriting from nn.Module class
class GramMatrix(nn.Module):
    #define the forward method, takes input
    def forward(self, input):
        #get the shape (dimensions) of the input
        a, b, c = input.size()
        #reshape the input into a two-dimensional tensor
        features = input.view(a * b, c)
        #compute the Gram matrix
        G = torch.mm(features, features.t())
        #return the normalized Gram matrix
        return G.div(a * b * c)

#define a class named StyleLoss, inheriting from nn.Module class
class StyleLoss(nn.Module):
    def __init__(self, target, weight):
        super(StyleLoss, self).__init__()
        self.target = target.detach() * weight
        self.gram = GramMatrix()
        #use Mean Absolute Error (MAE) loss function,ref:https://neptune.ai/blog/pytorch-loss-functions#Mean-Absolute-Error
        self.criterion = nn.L1Loss()

    #define the forward method, takes input input
    def forward(self, input):
        #compute the Gram matrix of the input
        self.G = self.gram(input)
        #compute the loss value
        self.loss = self.criterion(self.G, self.target)
        #return the input, but actually make no change
        return input

#define a function to read the audio spectrum, takes filename parameter
def read_audio_spectrum(filename):
    #load the audio file using the librosa library, specifying duration as 30s
    x, fs = librosa.load(filename, duration=30)
    #compute the Short-Time Fourier Transform (STFT) of the audio
    S = librosa.stft(x, n_fft=2048)
    #take the logarithm of the magnitude spectrum of STFT
    S = np.log1p(np.abs(S))
    #return the log magnitude spectrum and sample rate
    return S, fs

#define a loss function
def get_style_model_and_losses(cnn, style_float, style_weight=2500):
    #make a copy of the original CNN model to ensure no modification to the original model
    cnn = copy.deepcopy(cnn)
    #create an empty list to store style losses
    style_losses = []
    #create a Sequential model to combine convolutional layers and loss functions
    model = nn.Sequential()
    #create an instance of GramMatrix to compute the Gram matrix
    gram = GramMatrix()
    #add the convolutional layers of the original model to the new Sequential model. This convolutional layer is named 'conv_1'
    model.add_module('conv_1', cnn.cnn1)
    #pass the style image through the model to get the feature representation of the input image
    target_feature = model(style_float).clone()
    #compute the Gram matrix of the target feature
    target_feature_gram = gram(target_feature)
    #create a StyleLoss instance, passing the Gram matrix of the target feature and the style weight
    style_loss = StyleLoss(target_feature_gram, style_weight)
    #add the style loss function to the Sequential model for optimization
    model.add_module('style_loss_1', style_loss)
    #add the style loss to the style loss list
    style_losses.append(style_loss)
    return model, style_losses

#Defines an optimization function that converts inputs to trainable parameters and creates an Adam optimizer
def get_input_param_optimizer(input_float):
    #convert the input parameter to trainable parameter
    input_param = nn.Parameter(input_float.data)
    #use SGD optimizer to optimize the parameters,ref:https://pytorch.org/docs/stable/optim.html
    optimizer = optim.SGD([input_param], lr=0.05, momentum=0.9)  
    return input_param, optimizer

#defines a style transfer function that optimizes an input image to match the style of a target image using a specified CNN
def run_style_transfer(cnn, style_float, input_float, num_steps=2500, style_weight=2500):
    #get the model and style losses
    model, style_losses = get_style_model_and_losses(cnn, style_float, style_weight)
    #get the input parameters and optimizer
    input_param, optimizer = get_input_param_optimizer(input_float)
    run = [0]

    while run[0] <= num_steps:
        def closure():
            #zero the gradients
            input_param.data.clamp_(0, 1)
            optimizer.zero_grad()
            #forward pass
            style_score = model(input_param)
            #compute the style loss
            style_score = sum([sl.loss for sl in style_losses])
            style_score.backward()
            run[0] += 1
            #output iteration count and style loss
            if run[0] % 100 == 0:
                print("run {}: Style Loss : {:8f}".format(run[0], style_score.item()))

            return style_score

        #perform optimization step
        optimizer.step(closure)
    #clamp the values of input parameters within a reasonable range
    input_param.data.clamp_(0, 1)
    return input_param.data

if __name__ == '__main__':
    #specify the file paths for content audio and style audio
    content_audio_name = 'generated/jazz.wav'
    style_audio_name = 'generated/generated_audio.wav'

    #read the spectrum and sample rate of the style audio and content audio
    style_audio, style_sr = read_audio_spectrum(style_audio_name)
    content_audio, content_sr = read_audio_spectrum(content_audio_name)

   #check if the sample rates of content audio and style audio are the same
    if content_sr != style_sr:
        exit()

    #compute the minimum of sample points of style audio and content audio and crop,ref:https://stackoverflow.com/questions/43204441/how-to-split-the-audio-file-in-python
    num_samples = min(style_audio.shape[1], content_audio.shape[1])  
    style_audio = style_audio[:, :num_samples]
    content_audio = content_audio[:, :num_samples]

    #add a dimension to the spectrum arrays of style audio and content audio,ref:https://numpy.org/doc/stable/reference/generated/numpy.expand_dims.html
    style_audio = np.expand_dims(style_audio, axis=0)
    content_audio = np.expand_dims(content_audio, axis=0)

    #convert the spectrum arrays of style audio and content audio to PyTorch Variable objects
    style_float = Variable(torch.from_numpy(style_audio))
    content_float = Variable(torch.from_numpy(content_audio))

    #instantiate the CNNModel() class to create a convolutional neural network model cnn
    cnn = CNNModel()
    #call the run_style_transfer function to run the style transfer algorithm
    output = run_style_transfer(cnn, style_float, content_float)
    #process the output by removing the batch dimension and then convert to an array
    output = output.squeeze(0).numpy()
    #create an array of zeros with the same shape as the output
    a = np.zeros_like(output)
    #Exponentiate the output array, then subtract 1 to get the new array a
    a = np.exp(output) - 1

    #use an iterative approach by multiplying the exponentiated output array a with random phase p for inverse Short-Time Fourier Transform
    p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
    for _ in range(500):
        S = a * np.exp(1j*p)
        x = librosa.istft(S)
        #use librosa.stft function to get the new frequency domain representation of audio data x and update the random phase p for the next iteration
        p = np.angle(librosa.stft(x, n_fft=2048))

    #output file name
    OUTPUT_FILENAME = 'generated/output_audio_2.wav'
    #write the generated audio to file, use style_sr as the sampling rate. Assume content and style audio have the same sampling rate
    sf.write(OUTPUT_FILENAME, x, style_sr) 
    print('DONE...')

run 100: Style Loss : 0.068326
run 200: Style Loss : 0.068326
run 300: Style Loss : 0.068326
run 400: Style Loss : 0.068326
run 500: Style Loss : 0.068326
run 600: Style Loss : 0.068326
run 700: Style Loss : 0.068326
run 800: Style Loss : 0.068326
run 900: Style Loss : 0.068326
run 1000: Style Loss : 0.068326
run 1100: Style Loss : 0.068326
run 1200: Style Loss : 0.068326
run 1300: Style Loss : 0.068326
run 1400: Style Loss : 0.068326
run 1500: Style Loss : 0.068326
run 1600: Style Loss : 0.068326
run 1700: Style Loss : 0.068326
run 1800: Style Loss : 0.068326
run 1900: Style Loss : 0.068326
run 2000: Style Loss : 0.068326
run 2100: Style Loss : 0.068326
run 2200: Style Loss : 0.068326
run 2300: Style Loss : 0.068326
run 2400: Style Loss : 0.068326
run 2500: Style Loss : 0.068326
DONE...
