In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np

import librosa
from IPython.display import Audio, display
from PIL import Image
import matplotlib.pyplot as plt
import scipy.stats as stats

from torch.nn.modules.module import _addindent

import copy
import os
import math

from scipy.stats import ortho_group

import soundfile as sf
from matplotlib.pyplot import figure

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances as cos
from sklearn.metrics import mean_squared_error as mse

In [2]:
def log_scale(img):
    """Compress magnitudes element-wise with log(1 + img)."""
    return np.log1p(img)

def inv_log(img):
    """Element-wise exp(img) - 1, i.e. the inverse of log(1 + x)."""
    exponentiated = np.exp(img)
    return exponentiated - 1.

In [3]:
# Run configuration.
# NOTE(review): num_steps and learning_Rate are not referenced in the visible
# part of this notebook -- presumably consumed by later cells; confirm.
num_steps=1
# Number of parallel networks ("streams"); it sizes hor_filters and cnnlist.
numStreams=6
learning_Rate= 1

In [4]:
# --- STFT / network dimensions ---
runs = 1
N_FFT = 512        # n_fft and win_length handed to librosa.stft
K_HOP = 128        # hop_length handed to librosa.stft
N_FREQ = 257       # numerically N_FFT // 2 + 1
N_FILTERS = 512    # output channels of each stream's conv layer

# Candidate horizontal kernel widths; stream j takes the j-th entry.
# NOTE(review): 32 is absent from this otherwise power-of-two list -- confirm intentional.
possible_kernels = [2, 4, 8, 16, 64, 128, 256, 512, 1024, 2048]
hor_filters = []
for j in range(numStreams):
    hor_filters.append(possible_kernels[j])

In [5]:
# Aliases for the spectrogram scaling pair: log_scale (log1p) and its
# inverse inv_log (exp(x) - 1).
log_spectrogram=log_scale
inv_log_spectrogram=inv_log

In [6]:
# Decide once whether tensors are created as CUDA or CPU float tensors.
use_cuda = torch.cuda.is_available()
print('GPU available =',use_cuda)
if use_cuda:
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor

GPU available = False


In [7]:
def read_audio_spectum(filename):
    """Load an audio file and return its STFT magnitude and sample rate.

    Parameters
    ----------
    filename : str or path-like
        Audio file readable by ``soundfile.read``.

    Returns
    -------
    R : np.ndarray
        Absolute value of ``librosa.stft`` computed with ``n_fft=N_FFT``,
        ``hop_length=K_HOP``, ``win_length=N_FFT`` and ``center=False``.
    fs : int
        Sample rate reported by ``soundfile.read``.
    """
    x, fs = sf.read(filename)
    # soundfile returns multi-channel audio as (frames, channels); librosa.stft
    # transforms along the last axis, so average the channels to a mono signal
    # first. Mono files are already 1-D and pass through unchanged.
    if x.ndim > 1:
        x = x.mean(axis=1)
    R = np.abs(librosa.stft(x, n_fft=N_FFT, hop_length=K_HOP, win_length=N_FFT, center=False))
    return R, fs


def findMinMax(img):
    """Return ``(floor(min(img)), ceil(max(img)))`` as a pair of Python ints."""
    lowest = math.floor(np.amin(img))
    highest = math.ceil(np.amax(img))
    return int(lowest), int(highest)

def img_scale(img,datasetMin,datasetMax,scaleMin,scaleMax):
    """Linearly map values from [datasetMin, datasetMax] onto [scaleMin, scaleMax]."""
    gain = (scaleMax - scaleMin) / (datasetMax - datasetMin)
    return gain * (img - datasetMin) + scaleMin

def img_invscale(img,datasetMin,datasetMax,scaleMin,scaleMax):
    """Linearly map values from [scaleMin, scaleMax] back onto [datasetMin, datasetMax]."""
    shift = (datasetMax-datasetMin) / (scaleMax-scaleMin)
    scaled_values = shift * (img-scaleMin) + datasetMin
    return scaled_values


# db_scale / inv_db used to be indented underneath the ``return`` above, which
# made them unreachable nested functions; they are module-level helpers now.
def db_scale(img,scale=80):
    """Convert amplitudes to a clipped, peak-normalised dB scale.

    The input goes through ``librosa.amplitude_to_db``; the maximum dB value is
    subtracted (so the peak sits at 0), the result is divided by ``scale``,
    offset by +1 and clamped below at 0.

    Returns
    -------
    (img, shift) : tuple
        The scaled array and the peak dB value that was subtracted (needed by
        ``inv_db``).
    """
    img = librosa.amplitude_to_db(img)
    shift = float(np.amax(img))
    img = img - shift
    img = img/scale
    img = img + 1
    img = np.maximum(img, 0)
    return img, shift


def inv_db(img,shift,scale=80):
    """Reverse the arithmetic of ``db_scale`` and convert dB back to amplitude.

    Values that ``db_scale`` clamped to 0 cannot be recovered; they map to the
    amplitude corresponding to ``shift - scale`` dB.
    """
    img = img - 1
    img = img * scale
    img = img + shift
    img = librosa.db_to_amplitude(img)
    return img

In [8]:
def prepare_input(FILENAME):
    """Read an audio file and return its log-scaled spectrogram as a 4-D tensor.

    The 2-D magnitude array gets two leading singleton axes, then axes 1 and 2
    are swapped so the STFT row axis lands on dim 1. The result is cast to the
    notebook-wide ``dtype``. The sample rate is discarded.
    """
    magnitude, _fs = read_audio_spectum(FILENAME)
    spec = np.ascontiguousarray(log_scale(magnitude)[None, None, :, :])
    spec_tensor = torch.from_numpy(spec).permute(0, 2, 1, 3)
    return Variable(spec_tensor).type(dtype)

In [9]:
"Here we create the custom network"
import collections as c

# Number of input channels given to each stream's Conv2d; set to N_FREQ.
IN_CHANNELS = N_FREQ
def weights_init(m,hor_filter):
    """Fill ``m.weight`` with zero-mean normal noise when ``m`` is a conv module.

    The standard deviation is
    ``sqrt(2) * sqrt(2 / ((N_FREQ + N_FILTERS) * hor_filter))``.
    Modules whose class name does not contain ``'Conv'`` are left untouched.
    """
    sigma = np.sqrt(2) * np.sqrt(2.0 / ((N_FREQ + N_FILTERS) * hor_filter))
    if 'Conv' not in m.__class__.__name__:
        return
    m.weight.data.normal_(0.0, sigma)

class style_net(nn.Module):
    """One bias-free Conv2d with a (1, hor_filter) kernel followed by a ReLU.

    Change the architecture by adding or removing entries in the ordered
    layer mapping built in ``__init__``.
    """
    def __init__(self,hor_filter):
        super(style_net, self).__init__()
        named_layers = c.OrderedDict()
        named_layers['conv1'] = nn.Conv2d(IN_CHANNELS, N_FILTERS,
                                          kernel_size=(1, hor_filter), bias=False)
        named_layers['relu1'] = nn.ReLU()
        self.layers = nn.Sequential(named_layers)

    def forward(self,input):
        """Apply the layer stack to ``input`` and return the activations."""
        return self.layers(input)
    
# Build one randomly initialised, frozen network per stream.
cnnlist = []
for j in range(numStreams):
    cnn = style_net(hor_filters[j])
    # Bind this stream's kernel width at definition time; apply() visits every submodule.
    cnn.apply(lambda module, width=hor_filters[j]: weights_init(module, width))
    for param in cnn.parameters():
        param.requires_grad = False
    cnnlist.append(cnn.cuda() if use_cuda else cnn)


# Default layer-name lists used as keyword defaults by get_gram / get_gram_v2.
# 'relu_1' matches the "relu_" + str(i) names those functions generate (i starts at 1).
# content_layers is accepted by those functions but not read in their bodies.
content_layers_default = [] 
style_layers_default = ['relu_1']

In [11]:
class GramMatrix(nn.Module):
    """Channel-by-channel Gram matrix of a 4-D feature map."""

    def forward(self, input):
        """Return the normalised Gram matrix of ``input``.

        Parameters
        ----------
        input : torch.Tensor
            4-D tensor laid out as (batch, channels, height, width).

        Returns
        -------
        torch.Tensor
            Tensor of shape (1, channels, channels): inner products between
            channels, divided by ``batch * height * width``.
        """
        batch, channels, height, width = input.size()
        per_channel = batch * height * width
        # Move the channel axis to the front before flattening so every row
        # holds exactly one channel's activations. A plain view(b, a*c*d) mixes
        # batch and channel entries when batch > 1 and fails on non-contiguous
        # input; for batch == 1 the result is identical.
        features = input.transpose(0, 1).reshape(channels, per_channel).unsqueeze(0)
        G = torch.matmul(features, torch.transpose(features, 1, 2))
        return G.div(per_channel)

In [12]:
def _standardised_gram(model, style_img, gram):
    """Run ``style_img`` through ``model`` and return its Gram matrix as a 2-D numpy array passed through ``StandardScaler``."""
    target_feature = model(style_img).clone()
    target_feature_gram = gram(target_feature)
    # GramMatrix yields (1, C, C); read C from the tensor instead of assuming 512.
    side = target_feature_gram.size(-1)
    # detach() + cpu() keep the numpy conversion valid for CUDA tensors and
    # for tensors that track gradients.
    as_array = target_feature_gram.detach().cpu().numpy().reshape(side, side)
    return StandardScaler().fit_transform(as_array)


def get_gram(cnn,result,style_img, content_img=None,
                               style_weight=1, content_weight=0,
                               content_layers=content_layers_default,
                               style_layers=style_layers_default, style_img2=None):
    """Append standardised Gram matrices of the selected layers to ``result``.

    A deep copy of ``cnn.layers`` is replayed layer by layer into a fresh
    ``nn.Sequential``. Conv2d / ReLU / MaxPool2d layers are named
    ``conv_<i>`` / ``relu_<i>`` / ``pool_<i>``, where ``i`` starts at 1 and is
    incremented after every ReLU; other layer types are skipped. Whenever a
    name is listed in ``style_layers``, the Gram matrix of the partial model's
    output for ``style_img`` is standardised and appended.

    Parameters
    ----------
    cnn : nn.Module
        Network exposing an iterable ``layers`` attribute.
    result : list
        List that receives the matrices (mutated in place).
    style_img : torch.Tensor
        Input fed through the partial model.
    content_img, style_weight, content_weight, content_layers, style_img2
        Accepted for signature compatibility; not used here.
    style_layers : list of str
        Names of the layers whose Gram matrix is collected.

    Returns
    -------
    list
        The same ``result`` list.
    """
    cnn = copy.deepcopy(cnn)

    model = nn.Sequential()
    gram = GramMatrix()

    i = 1
    for layer in list(cnn.layers):
        if isinstance(layer, nn.Conv2d):
            name = "conv_" + str(i)
        elif isinstance(layer, nn.ReLU):
            name = "relu_" + str(i)
        elif isinstance(layer, nn.MaxPool2d):
            name = "pool_" + str(i)
        else:
            continue

        model.add_module(name, layer)
        if name in style_layers:
            result.append(_standardised_gram(model, style_img, gram))

        # The block counter advances only once a ReLU has been added.
        if isinstance(layer, nn.ReLU):
            i += 1

    return result

In [13]:
def get_gram_v2(cnn,result,style_img, content_img=None,
                               style_weight=1, content_weight=0,
                               content_layers=content_layers_default,
                               style_layers=style_layers_default, style_img2=None):
    """Append flattened (unscaled) Gram tensors of the selected layers to ``result``.

    A deep copy of ``cnn.layers`` is replayed into a fresh ``nn.Sequential``;
    Conv2d / ReLU / MaxPool2d layers are named ``conv_<i>`` / ``relu_<i>`` /
    ``pool_<i>`` with ``i`` starting at 1 and incremented after each ReLU.
    For every name found in ``style_layers`` the Gram matrix of the partial
    model's output for ``style_img`` is flattened to 1-D and appended.
    Returns the same ``result`` list.
    """
    cnn = copy.deepcopy(cnn)
    model = nn.Sequential()
    gram = GramMatrix()

    # (layer type, name prefix) pairs checked in this order
    prefixes = ((nn.Conv2d, "conv_"), (nn.ReLU, "relu_"), (nn.MaxPool2d, "pool_"))

    i = 1
    for layer in list(cnn.layers):
        prefix = next((p for kind, p in prefixes if isinstance(layer, kind)), None)
        if prefix is None:
            continue

        name = prefix + str(i)
        model.add_module(name, layer)

        if name in style_layers:
            activations = model(style_img).clone()
            result.append(torch.flatten(gram(activations)))

        if isinstance(layer, nn.ReLU):
            i += 1

    return result

#### Calculate L2 Distance

In [14]:
def euclidean_distance(img_1, img_2):
    """Return the L2 norm of the element-wise difference ``img_1 - img_2``."""
    diff = np.asarray(img_1 - img_2)
    squared_total = np.sum(diff ** 2)
    return np.sqrt(squared_total)

#### Calculate GM Cosine Distance (1 − mean cosine similarity)

In [15]:
def compute_cos_distance(gram1,gram2,num_streams=None):
    """Return 1 minus the mean per-stream cosine similarity of two Gram lists.

    Parameters
    ----------
    gram1, gram2 : sequence of 1-D torch.Tensor
        One flattened Gram tensor per stream; entry ``i`` of each is compared.
    num_streams : int, optional
        How many leading entries to compare. Defaults to ``len(gram1)``
        instead of a fixed 6, so the function follows the configured number
        of streams.

    Returns
    -------
    numpy.float64
        ``1 - mean(cosine_similarity(gram1[i], gram2[i]))``.

    Raises
    ------
    ValueError
        If there is no stream to compare.
    """
    if num_streams is None:
        num_streams = len(gram1)
    if num_streams == 0:
        raise ValueError("compute_cos_distance needs at least one stream")

    # Named 'similarity' so it does not shadow the sklearn alias 'cos'
    # imported at the top of the notebook.
    similarity = nn.CosineSimilarity(dim=0, eps=1e-6)
    per_stream = np.array([float(similarity(gram1[i], gram2[i]))
                           for i in range(num_streams)])
    return 1 - per_stream.mean()

#### Calculate GM Loss

In [16]:
def compute_mse_distance(gram1,gram2,num_streams=None):
    """Return the sum over streams of the mean squared error between Gram matrices.

    Parameters
    ----------
    gram1, gram2 : sequence of array-like
        One Gram matrix per stream; entry ``i`` of each is compared with
        ``sklearn.metrics.mean_squared_error`` (imported as ``mse``).
    num_streams : int, optional
        How many leading entries to compare. Defaults to ``len(gram1)``
        instead of a fixed 6, so the function follows the configured number
        of streams.

    Returns
    -------
    numpy.float64
        Sum of the per-stream MSE values.
    """
    if num_streams is None:
        num_streams = len(gram1)

    per_stream = np.zeros(num_streams)
    for i in range(num_streams):
        per_stream[i] = mse(gram1[i], gram2[i])
    return per_stream.sum()