# Visualize Embeddings with TensorBoard Projector
#### Task: Foreign Accent English classification
#### Corpus: Common Voice

In [31]:
import warnings
# Suppress FutureWarning messages. The filter is installed before the remaining
# imports run — presumably so that warnings raised while importing them are hidden too.
warnings.simplefilter(action='ignore', category=FutureWarning)
import datetime
import os
import json
from collections import OrderedDict
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
from omegaconf import OmegaConf
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import tensorflow as tf
import tensorboard as tb
# Replace TensorFlow's gfile module with the stub implementation shipped with TensorBoard.
# NOTE(review): presumably the usual workaround for SummaryWriter.add_embedding failing
# when a full TensorFlow install is present — confirm it is still required.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

## Constants Definition
* Model
* Data manifests
* Output file

In [32]:
# Root of all experiment artefacts; every other directory is derived from it.
BASE_DIR = '/data1/nemo_experiments'

# Experiment directory holding the `eval` manifest.
EXP_DIR = '/'.join((BASE_DIR, '220831-Finetune+Eval-CV-validated'))

# Experiment directory holding the `eval0` manifest.
EXP_DIR0 = '/'.join((BASE_DIR, '220823-Finetune-CV-validated/220825-Finetune-CV-eval'))

# Parent directory for the TensorBoard log directories.
LOG_DIR = '/'.join((BASE_DIR, 'tb'))

In [33]:
# Model: the .nemo checkpoint restored in Step 1.
model_nemo_path = BASE_DIR + '/tb/Finetune-CV/2022-08-27_02-58-59/checkpoints/Finetune-CV.nemo'

# Data manifests: one per evaluation set.
eval_manifest_filepath = EXP_DIR + '/cv-self-manifest.json'
eval0_manifest_filepath = EXP_DIR0 + '/antonio_validated-reduced-resampled-eval_manifest.json'

# The WildcatDiapix manifests below are kept disabled because their label
# space is different.
#wcat_seg_manifest_filepath = f'{BASE_DIR}/WildcatDiapix/filelist_manifest-2.json'
#wcat_all_manifest_filepath = f'{BASE_DIR}/WildcatDiapix/filelist_manifest.json'

In [34]:
# Define In/Out filepaths (select below as needed).
#
# Every entry under `conf.data` carries:
#   manifest_path - the manifest file to read
#   title         - two lines: the model path relative to BASE_DIR,
#                   then the manifest file name
conf = OmegaConf.create({'data': {}})

# IN: `eval` manifest
conf.data['eval'] = {'manifest_path': eval_manifest_filepath}
conf.data.eval['title'] = "%s\n%s" % (
    model_nemo_path.replace(BASE_DIR + "/", ""),
    os.path.basename(conf.data.eval.manifest_path),
)

# IN: `eval0` manifest
conf.data['eval0'] = {'manifest_path': eval0_manifest_filepath}
conf.data.eval0['title'] = "%s\n%s" % (
    model_nemo_path.replace(BASE_DIR + "/", ""),
    os.path.basename(conf.data.eval0.manifest_path),
)

-----
### Step 1: Load model

In [35]:
# Step 1: Restore the network from the .nemo checkpoint selected above.
speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=model_nemo_path)

[NeMo W 2022-08-31 12:37:59 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: nemo_experiments/220823-Finetune-CV-validated/220825-Finetune-CV-eval/antonio_validated-reduced-resampled-train_manifest.json
    sample_rate: 16000
    labels:
    - african
    - australian
    - canada
    - england
    - hongkong
    - india
    - ireland
    - newzealand
    - philippines
    - scotland
    - us
    batch_size: 32
    shuffle: true
    augmentor:
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    
[NeMo W 2022-08-31 12:37:59 modelPT:156] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the valid

[NeMo I 2022-08-31 12:37:59 features:200] PADDING: 16
[NeMo I 2022-08-31 12:38:00 label_models:98] loss is Angular Softmax


                    not been set for this class (TopKClassificationAccuracy). The property determines if `update` by
                    default needs access to the full metric state. If this is not the case, significant speedups can be
                    achieved and we recommend setting this to `False`.
                    We provide an checking function
                    `from torchmetrics.utilities import check_forward_no_full_state`
                    that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                    default for now) or if `full_state_update=False` can be used safely.
                    
    


[NeMo I 2022-08-31 12:38:00 save_restore_connector:243] Model EncDecSpeakerLabelModel was successfully restored from /data1/nemo_experiments/tb/Finetune-CV/2022-08-27_02-58-59/checkpoints/Finetune-CV.nemo.


-----
### Step 2: Extract Embeddings
#### Evaluation Set

In [36]:
# STEP 2.1: Extract embeddings, logits and reference labels for the `eval` manifest.
MANIFEST_FILEPATH = conf.data.eval.manifest_path

(eval_embs,
 eval_logits,
 eval_ref_labels,
 eval_idx2labD) = EncDecSpeakerLabelModel.get_batch_embeddings(
    speaker_model=speaker_model,
    manifest_filepath=MANIFEST_FILEPATH,
    batch_size=32,
    sample_rate=16000,
    device='cuda',
)

# Scale every embedding to unit L2 norm (norm taken along the last axis).
eval_embs = eval_embs / np.linalg.norm(eval_embs, ord=2, axis=-1, keepdims=True)

[NeMo I 2022-08-31 12:38:13 collections:290] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-08-31 12:38:13 collections:294] # 24 files loaded accounting to # 2 labels


100%|██████████| 1/1 [00:00<00:00, 14.92it/s]


In [37]:
# STEP 2.2: Extract embeddings, logits and reference labels for the `eval0` manifest.
MANIFEST_FILEPATH = conf.data.eval0.manifest_path

(eval0_embs,
 eval0_logits,
 eval0_ref_labels,
 eval0_idx2labD) = EncDecSpeakerLabelModel.get_batch_embeddings(
    speaker_model=speaker_model,
    manifest_filepath=MANIFEST_FILEPATH,
    batch_size=32,
    sample_rate=16000,
    device='cuda',
)

# Scale every embedding to unit L2 norm (norm taken along the last axis).
eval0_embs = eval0_embs / np.linalg.norm(eval0_embs, ord=2, axis=-1, keepdims=True)

[NeMo I 2022-08-31 12:39:03 collections:290] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-08-31 12:39:03 collections:294] # 2563 files loaded accounting to # 11 labels


100%|██████████| 81/81 [00:05<00:00, 15.46it/s]


---------
### Step 4: Visualization of the Embedding Space
#### TensorBoard Projector

In [38]:
# Switch this to True if you really want to create a new entry in TB.
# The cell that writes the eval embeddings checks this flag before it creates
# a SummaryWriter.
WRITE2TB = True

##### - Eval Data Points

In [9]:
# Data for the projector: the `eval` embeddings plus one label name per
# embedding, looked up from the reference label ids.
vectors = eval_embs
metadata = [eval_idx2labD[label_id] for label_id in eval_ref_labels]

In [10]:
# Create the SummaryWriter and log the eval embeddings for the TensorBoard Projector.
# The ISO timestamp gives every run its own log directory; ':' is replaced by '-'
# in the resulting path.
logdir = f'{LOG_DIR}/SelfEmbsEvalCV_{datetime.datetime.now().isoformat()}'.replace(':','-')
if WRITE2TB:
    # Exactly one writer is created (the original instantiated it twice and
    # never closed the first one). The context manager closes it even if
    # add_embedding raises.
    with SummaryWriter(logdir) as writer:
        writer.add_embedding(vectors, metadata)
    print(f'Wrote {vectors.shape[0]} embeddings in a {vectors.shape[1]}-dim space.')
else:
    # Do not claim that anything was written when the guard flag is off.
    print(f'WRITE2TB is False: skipped writing {vectors.shape[0]} embeddings.')

Wrote 24 embeddings in a 192-dim space.


----------
### Experimental (skip the part below).
###### Trying to add more meta-data.

In [70]:
# Read the `eval` manifest (one JSON object per line) and add a `path` column
# holding the last '/'-separated component of each audio file path.
MANIFEST_FILEPATH = conf.data.eval.manifest_path
manifestDF = pd.read_json(MANIFEST_FILEPATH, lines=True)
manifestDF['path'] = manifestDF['audio_filepath'].apply(
    lambda filepath: filepath.rsplit("/", 1)[-1]
)

In [77]:
# Build one metadata row per embedding for both collections.
# NOTE(review): the rows are produced in manifest order; this assumes the
# embeddings were extracted in that same order — confirm.
METADATA_COLUMNS = ['label', 'gender', 'age', 'speaker', 'sentence', 'collection']

# --- Collection 'iact' (manifest loaded into manifestDF in the previous cell) ---
# NOTE(review): the file is named data.tsv but is parsed as comma-separated — confirm.
origMetaDF = pd.read_csv(f"{BASE_DIR}/220831-Finetune+Eval-CV-validated/CV-self-recordings/data.tsv", sep=',')

# A left join keeps every manifest row. A descriptive name is used instead of
# `_`, which IPython reserves for the last cell output.
mergedDF = pd.merge(left=manifestDF, right=origMetaDF, how='left', on=['path', 'label'])
# Duplicate (path, label) keys in the metadata would add rows and silently
# shift the metadata relative to the embeddings, so fail early instead.
assert len(mergedDF) == len(manifestDF), (
    f'merge changed the row count: {len(manifestDF)} manifest rows -> {len(mergedDF)} merged rows'
)
mergedDF['speaker'] = mergedDF['client_id']
mergedDF['collection'] = 'iact'
metadataLOL = mergedDF[METADATA_COLUMNS].values.tolist()

# --- Collection 'mozilla' ---
MANIFEST_FILEPATH = conf.data.eval0.manifest_path
manifest0DF = pd.read_json(MANIFEST_FILEPATH, lines=True)
# Join key: file name of the recording, with '.wav' swapped for '.mp3' to match
# the names used in the original metadata table.
manifest0DF['path'] = manifest0DF['audio_filepath'].apply(
    lambda filepath: filepath.split("/")[-1].replace('.wav', '.mp3')
)

# The metadata table has no header row, so the column names are assigned here.
origMeta0DF = pd.read_csv(f"{BASE_DIR}/220823-Finetune-CV-validated/validated-label-dur.tsv", sep='\t', header=None)
origMeta0DF.columns = ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'locale', 'segment', 'label', 'duration']

merged0DF = pd.merge(left=manifest0DF, right=origMeta0DF, how='left', on=['path', 'label'])
assert len(merged0DF) == len(manifest0DF), (
    f'merge changed the row count: {len(manifest0DF)} manifest rows -> {len(merged0DF)} merged rows'
)
# Keep only the last four characters of the client id as a short speaker tag.
# `.str` leaves missing ids (manifest rows without a metadata match) as NaN
# instead of raising.
merged0DF['speaker'] = merged0DF['client_id'].str[-4:]
merged0DF['collection'] = 'mozilla'
metadata0LOL = merged0DF[METADATA_COLUMNS].values.tolist()

# Same order as the concatenated embeddings: `eval` rows first, then `eval0`.
metadataLOL = metadataLOL + metadata0LOL

In [67]:
# Stack both embedding sets along the first axis: `eval` first, then `eval0`.
embs = np.concatenate([eval_embs, eval0_embs])

In [81]:
# Gather vector data: the concatenated embeddings (`embs`) are what the next
# cell passes to add_embedding.
vectors = embs

In [82]:
# Create the SummaryWriter and log the combined embeddings with multi-column metadata.
# The ISO timestamp gives every run its own log directory; ':' is replaced by '-'
# in the resulting path.
logdir = f'{LOG_DIR}/SelfEmbsEvalCV_{datetime.datetime.now().isoformat()}'.replace(':','-')
# WRITE2TB is intentionally NOT re-assigned here: the flag set in the
# configuration cell decides whether a new TensorBoard entry is created.
if WRITE2TB:
    # Check alignment before the log directory is created, so a mismatch does
    # not leave an empty run behind.
    assert len(metadataLOL) == vectors.shape[0], (
        f'{len(metadataLOL)} metadata rows for {vectors.shape[0]} embeddings'
    )
    # The context manager closes the writer even if add_embedding raises.
    with SummaryWriter(logdir) as writer:
        writer.add_embedding(
            vectors,
            metadataLOL,
            metadata_header=['label', 'gender','age','speaker', 'sentence', 'collection'],
            tag='SelfEmbsEvalCV',
        )
    print(f'{logdir}\nWrote {vectors.shape[0]} embeddings in a {vectors.shape[1]}-dim space.')
else:
    # Do not claim that anything was written when the guard flag is off.
    print(f'WRITE2TB is False: skipped writing {vectors.shape[0]} embeddings.')

/data1/nemo_experiments/tb/SelfEmbsEvalCV_2022-08-31T13-09-03.226288
Wrote 2587 embeddings in a 192-dim space.
