In [1]:
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter04/Voice%20Impersonation/voice_impersonation_input/Eleanor_Roosevelt.wav
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter04/Voice%20Impersonation/voice_impersonation_input/male_voice.wav

--2021-05-11 08:27:48--  https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter04/Voice%20Impersonation/voice_impersonation_input/Eleanor_Roosevelt.wav
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/master/Chapter04/Voice%20Impersonation/voice_impersonation_input/Eleanor_Roosevelt.wav [following]
--2021-05-11 08:27:48--  https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/master/Chapter04/Voice%20Impersonation/voice_impersonation_input/Eleanor_Roosevelt.wav
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
H

Taken from ch4 of Machine learning for cyber security cookbook.

### voice_impersonation_model.py

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
cuda = True if torch.cuda.is_available() else False

In [3]:
N_FFT = 512
N_CHANNELS = round(1 + N_FFT/2)
OUT_CHANNELS = 32

In [4]:
class RandomCNN(nn.Module):
    def __init__(self):
        super(RandomCNN, self).__init__()
        
        self.conv1 = nn.Conv2d(1, OUT_CHANNELS, kernel_size=(3,1), stride=1,padding=0)
        self.LeakyReLU = nn.LeakyReLU(0.2)
        
        # set the random parameters to be constant
        weight = torch.randn(self.conv1.weight.data.shape)
        self.conv1.weight = torch.nn.Parameter(weight, requires_grad=False)
        bias = torch.zeros(self.conv1.bias.data.shape)
        self.conv1.bias = torch.nn.Parameter(bias,requires_grad=False)
    
    def forward(self, x_delta):
        out = self.LeakyReLU(self.conv1(x_delta))
        return out
    
    

In [5]:
# testing

a_random = Variable(torch.randn(1,1,257,430)).float()
model = RandomCNN()
a_O = model(a_random)
print(a_O.shape)

torch.Size([1, 32, 255, 430])


### voice_impersonation_utils.py

In [6]:
import librosa
import numpy as np


import soundfile as sf


In [18]:
def wav2spectrum(filename):
    x, sr = librosa.load(filename)
    S = librosa.stft(x, N_FFT)
    p = np.angle(5)
    
    S = np.log1p(np.abs(S))
    return S, sr

def spectrum2wav(spectrum, sr, outfile):
    a = np.exp(spectrum) - 1
    p = 2 * np.pi * np.random.random_sample(spectrum.shape) - np.pi
    for i in range(50):
        S = a * np.exp(1j * p)
        x = librosa.istft(S)
        p = np.angle(librosa.stft(x, N_FFT))
#     librosa.output.write_wav(outfile, x, sr)
    sf.write(outfile,x,sr)

def wav2spectrum_keep_phase(filename):
    x, sr = librosa.load(filename)
    S = librosa.stft(x, N_FFT)
    p = np.angle(S)
    
    S = np.log1p(np.abs(S))
    return S, p, sr

def spectrum2wav_keep_phase(spectrum, p , sr, outfile):
    a = np.exp(spectrum) - 1
    for i in range(50):
        S = a * np.exp(1j * p)
        x = librosa.istft(S)
        p = np.angle(librosa.stft(x, N_FFT))
    sf.write(outfile,x,sr)

def compute_content_loss(a_C, a_G):
    m, n_C, n_H, n_W = a_G.shape
    
    a_C_unrolled = a_C.view(m * n_C, n_H * n_W)
    a_G_unrolled = a_G.view(m * n_C, n_H * n_W)
    
    J_content = 1.0/ (4 * m * n_C * n_H * n_W) * torch.sum((a_C_unrolled - a_G_unrolled) ** 2) 
    
    return J_content

def gram(A):
    GA = torch.matmul(A, A.t())
    return GA

def gram_over_time_axis(A):
    m, n_C, n_H, n_W = A.shape
    
    A_unrolled = A.view(m * n_C * n_H ,n_W)
    GA = torch.matmul(A_unrolled, A_unrolled.t())
    
    return GA

def compute_layer_style_loss(a_S, a_G):
    m, n_C, n_H, n_W = a_G.shape
    GS = gram_over_time_axis(a_S)
    GG = gram_over_time_axis(a_G)
    
    J_style_layer = 1.0/ (4 * (n_C **2) * (n_H * n_W)) * torch.sum((GS-GG) ** 2)
    
    return J_style_layer


In [8]:
# testing

test_S = torch.randn(1,6,2,2)
test_G = torch.randn(1,6,2,2)

print(test_S)
print(test_G)
print(compute_layer_style_loss(test_S, test_G))

print(compute_content_loss(test_S, test_G))

tensor([[[[ 0.9696,  0.6287],
          [-0.0295,  0.4547]],

         [[ 0.2861,  1.4515],
          [-1.2532, -0.9099]],

         [[-0.3781, -1.2907],
          [-1.0533,  0.1849]],

         [[-0.7200, -0.5681],
          [ 1.6335, -0.5072]],

         [[-0.3220,  0.2964],
          [-0.0226,  0.9850]],

         [[ 0.6676,  0.6030],
          [-0.2105,  0.9193]]]])
tensor([[[[ 1.0952, -0.4757],
          [-1.0673, -0.3312]],

         [[ 2.6644,  0.8887],
          [ 0.2217, -0.9720]],

         [[-0.1809, -0.0598],
          [ 0.0731, -0.4985]],

         [[ 1.7319, -1.4133],
          [ 0.3396,  0.9307]],

         [[ 0.6116, -0.1380],
          [ 0.6377,  0.1399]],

         [[ 0.6768, -0.7000],
          [-1.0808,  0.1378]]]])
tensor(0.5827)
tensor(0.3137)


### utilising the function for voice impersonation

In [9]:
import math

from torch.autograd import Variable

In [10]:
input_files = "./"
content_file  = "./male_voice.wav"
style_file = "./Eleanor_Roosevelt.wav"

In [11]:
audio_content, sampling_rate = wav2spectrum(content_file)
audio_style, sampling_rate = wav2spectrum(style_file)
audio_content_torch = torch.from_numpy(audio_content)[None, None, :,:]
audio_style_torch = torch.from_numpy(audio_style)[None, None, :,:]
voice_impersonation_model = RandomCNN()
voice_impersonation_model.eval()

RandomCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 1), stride=(1, 1))
  (LeakyReLU): LeakyReLU(negative_slope=0.2)
)

In [12]:
audio_content_variable = Variable(audio_content_torch, requires_grad=False).float()
audio_style_variable = Variable(audio_style_torch, requires_grad=False)
audio_content = voice_impersonation_model(audio_content_variable)
audio_style = voice_impersonation_model(audio_style_variable)

learning_rate = 0.003
audio_G_var = Variable(torch.randn(audio_content_torch.shape) * 1e-3, requires_grad=True)
opt = torch.optim.Adam([audio_G_var])

style_param = 1
content_param = 5e2

num_epochs = 500
print_frequency = 50
                                                                                

In [13]:
for epoch in range(1, num_epochs +1):
    opt.zero_grad()
    audio_G = voice_impersonation_model(audio_G_var)
    
    content_loss = content_param * compute_content_loss(audio_content, audio_G)
    style_loss = style_param *compute_layer_style_loss(audio_style, audio_G)
    
    loss = content_loss + style_loss
    loss.backward()
    opt.step()
    
    if epoch % print_frequency == 0:
        print("epoch: ", str(epoch))
        print("content loss: ", str(content_loss.item()) )
        print("style loss: ", str(style_loss.item()))
        print("loss: ", str(loss.item()))

epoch:  50
content loss:  37.094032287597656
style loss:  342.8563537597656
loss:  379.95037841796875
epoch:  100
content loss:  33.55370330810547
style loss:  328.39300537109375
loss:  361.94671630859375
epoch:  150
content loss:  30.514820098876953
style loss:  306.8638610839844
loss:  337.3786926269531
epoch:  200
content loss:  28.111671447753906
style loss:  282.5039978027344
loss:  310.61566162109375
epoch:  250
content loss:  26.16810417175293
style loss:  258.0396423339844
loss:  284.2077331542969
epoch:  300
content loss:  24.498979568481445
style loss:  234.2759552001953
loss:  258.7749328613281
epoch:  350
content loss:  23.024330139160156
style loss:  211.25001525878906
loss:  234.27435302734375
epoch:  400
content loss:  21.73453140258789
style loss:  189.07858276367188
loss:  210.8131103515625
epoch:  450
content loss:  20.624061584472656
style loss:  168.003173828125
loss:  188.62722778320312
epoch:  500
content loss:  19.677778244018555
style loss:  148.2705535888672
lo

In [19]:
gen_spectrum = audio_G_var.cpu().data.numpy().squeeze()
output_audio_name = "Eleanors_voice_but_speech_changed.wav"
spectrum2wav(gen_spectrum, sampling_rate, output_audio_name)

### lets listen

In [20]:
!pip install pydub



In [22]:
from pydub import AudioSegment
import IPython


audio_file = "./male_voice.wav"

IPython.display.Audio(audio_file)
#content

In [23]:
IPython.display.Audio("./Eleanor_Roosevelt.wav")
#style

combining the syle and content


In [25]:
IPython.display.Audio("./Eleanors_voice_but_speech_changed.wav")