In [6]:
%cd /content/drive/MyDrive/voiceConversionExp/

/content/drive/MyDrive/voiceConversionExp


In [None]:
! pip install resemblyzer

In [77]:
from sklearn.metrics.pairwise import cosine_similarity
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np
import os
from collections import defaultdict
from tqdm import tqdm

In [96]:
def loadEncoder():
  return VoiceEncoder()

def getSpeakerEmbeddings(encoder, audio_file):

  # get embeddings
  fpath = Path(audio_file)
  wav = preprocess_wav(audio_file)
  embed = encoder.embed_utterance(wav)
  return embed

def getSpeakerSimilarity(encoder, audio_file_1, audio_file_2):
  # get speaker similarity
  embed_1 = getSpeakerEmbeddings(encoder, audio_file_1).reshape(1, -1)
  embed_2 = getSpeakerEmbeddings(encoder, audio_file_2).reshape(1, -1)

  sim = cosine_similarity(embed_1, embed_2)
  score = sim[0][0]
  return score

def getSpeakerSimilarityBatch(encoder, generated_audio_folder, target_speaker_folder, lang_list):
  """get score over folders"""
  target_speaker_lists = os.listdir(target_speaker_folder)
  score = defaultdict(list)
  cnt = 0
  for root, dirs, files in os.walk(generated_audio_folder):
    path = root.split(os.sep)
    speakerID = path[-1]
    # print(root)
    for file in files:
        cnt+=1
        gen_audio_path = root+ "/" + file
        lang_code = gen_audio_path[-7:-4]
        target_aud_path = target_speaker_folder + "/" + speakerID + ".wav"
        sim_score = getSpeakerSimilarity(encoder, gen_audio_path, target_aud_path)
        score[lang_code].append(sim_score)
        if cnt%1000==0:
          print("processed", cnt)
    # break

  return score
  

In [14]:
# load voice encoder
encoder = loadEncoder()

Loaded the voice encoder model on cuda in 8.74 seconds.


In [15]:
file_1 = "./QuickVC-VoiceConversion/test_data/vishal_eng.wav"
file_2 = "./QuickVC-VoiceConversion/test_data/common_voice_ru_18909996.wav"

sim_score = getSpeakerSimilarity(encoder, file_1, file_2)
sim_score

0.6763104

In [97]:
lang_list = ['deu', 'eng', 'hin', 'hun', 'ind', 'por', 'rus', 'spa', 'swe', 'tur']

batch_score = getSpeakerSimilarityBatch(encoder, "./QuickVC-VoiceConversion/quicVC_Output/", "./QuickVC-VoiceConversion/targets/", lang_list)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [98]:
print(batch_score)

defaultdict(<class 'list'>, {'spa': [0.742207, 0.81341314, 0.82764024, 0.83785546, 0.8156769, 0.789538, 0.81858927, 0.7978593, 0.87382954, 0.8230442, 0.79067564, 0.8648246, 0.8706619, 0.81861234, 0.78244364, 0.8132623, 0.7774744, 0.7941965, 0.85684013, 0.86300015, 0.756436, 0.8264663, 0.85732794, 0.7768911, 0.78370756, 0.7520943, 0.74785066, 0.7972324, 0.8421229, 0.8497447, 0.7181699, 0.8485024, 0.85368115, 0.78579354, 0.7968881, 0.7899798, 0.76827365, 0.7381111, 0.8462962, 0.7867098, 0.76997817, 0.8828129, 0.87381476, 0.82659364, 0.8507854, 0.7733475, 0.7732029, 0.7851356, 0.8720391, 0.8608017, 0.74923784, 0.8357696, 0.8695541, 0.80670637, 0.8096854, 0.75742364, 0.78800917, 0.788775, 0.8685017, 0.85523444, 0.80143756, 0.89403546, 0.8841122, 0.84559405, 0.8461121, 0.7998531, 0.8186476, 0.81425095, 0.8940322, 0.880778, 0.755352, 0.8363686, 0.83960235, 0.79523516, 0.8193561, 0.7821125, 0.7943256, 0.7842937, 0.8679588, 0.8249314, 0.8019464, 0.8435187, 0.89361775, 0.81941485, 0.80300856, 0

In [101]:
# calculate average score
final_scores = {}
for key in batch_score.keys():
  final_scores[key] = sum(batch_score[key])/len(batch_score[key])  

In [103]:
final_scores

{'spa': 0.8085532957613468,
 'por': 0.8247997328639031,
 'ind': 0.8091015091538429,
 'tur': 0.8256763606667519,
 'rus': 0.8222587457895278,
 'deu': 0.8192623006105423,
 'swe': 0.8296867590546608,
 'eng': 0.8426341117322445,
 'hin': 0.8141634964346885,
 'hun': 0.8096788687109947}

In [104]:
# write scores to a file
with open('similarityScores.csv', 'w') as f:  # You will need 'wb' mode in Python 2.x
    w = csv.DictWriter(f, final_scores.keys())
    w.writeheader()
    w.writerow(final_scores)