# Data Exploration and Visualisation

### Part One: Using the How2 dataset for YouTube video IDs, transcriptions and description summaries

* Construct tables of video transcriptions and human-made descriptions
* Also, download the audio of 400 videos

In [1]:
import pandas as pd

In [2]:
# Load the description summaries into a pandas data frame (one raw line per row, column "a").
# NOTE(review): read_csv splits on commas by default, so this assumes the tokenised
# descriptions contain no commas — confirm against desc.tok.txt.
desctable = pd.read_csv('speech_data/text/sum_cv/desc.tok.txt', header=None, skipinitialspace=True, names=["a"])

# Load the transcriptions into a pandas data frame, again one raw line per row.
# The file is read line by line rather than being pushed through
# pd.compat.StringIO (removed in pandas 0.25) with a "%%%" pseudo-separator;
# this also keeps the final transcript when the file lacks a trailing newline.
filename = 'speech_data/text/sum_cv/tran.tok.txt'
with open(filename, 'r', encoding='utf-8') as f:
    # Iterating the file splits on '\n' only; blank lines carry no record.
    tran_lines = [line.rstrip('\n') for line in f if line.strip()]
trantable = pd.DataFrame({'a': tran_lines})

  if __name__ == '__main__':


In [3]:
# Split id and description: the id is the first space-separated token of each
# raw line, the description is everything after that first space.
# str.partition leaves the remainder untouched, which matches re-joining split(" ")[1:].
_desc_parts = desctable['a'].astype(str).str.partition(" ")
desctable['id'] = _desc_parts[0]
desctable['desc'] = _desc_parts[2]
# drop() no longer accepts `axis` positionally (pandas 2.0), so name the column explicitly.
desctable.drop(columns='a', inplace=True)

In [4]:
# Split id and transcription: the id is the first space-separated token of each
# raw line, the transcription is everything after that first space.
# str.partition leaves the remainder untouched, which matches re-joining split(" ")[1:].
_tran_parts = trantable['a'].str.partition(" ")
trantable['id'] = _tran_parts[0]
trantable['tran'] = _tran_parts[2]
# drop() no longer accepts `axis` positionally (pandas 2.0), so name the column explicitly.
trantable.drop(columns='a', inplace=True)

In [5]:
# Combine descriptions and transcriptions into one table, matching rows by video id
conctable = desctable.merge(trantable, on="id")
conctable.head()

Unnamed: 0,id,desc,tran
0,-xd1aAlPXqs,learn the sivananda yoga single right leg rais...,after you 've done at least six to twelve roun...
1,KtMjOT6fDrw,learn how to apply hanger hooks for your woodc...,"on behalf of expert village , my name is husai..."
2,ehbFyYlcEhc,learn about how hand washing can help prevent ...,hi ! this is david jackel on behalf of expert ...
3,lcw8f2od6z8,how to julienne cucumbers to make kimchi for k...,the other way we can do cucumbers which is als...
4,G-VRHmkiqtc,in order to put photographic emulsion on water...,my name is anthony maddaloni and i 'm going to...


In [None]:
# # Install dependencies to get audio from YouTube
# !pip -q install wget youtube-dl

In [None]:
# # Take the first 400 YouTube video IDs
# youtube_ids = conctable['id'].tolist()
# shortlist = youtube_ids[:400]

In [None]:
# # Loop over the 400 YouTube videos
# # Save each video's audio as 8000Hz wav
# for YOUTUBE_ID in shortlist:
#     !youtube-dl --extract-audio --audio-format wav --quiet --output "{YOUTUBE_ID}_FULL.%(ext)s" https://www.youtube.com/watch\?v\={YOUTUBE_ID}
#     !ffmpeg -loglevel panic -y -i {YOUTUBE_ID}_FULL.wav -acodec pcm_s16le -ac 1 -ar 8000 {YOUTUBE_ID}.wav
#     !rm {YOUTUBE_ID}_FULL.wav

## Data exploration and statistics

In [None]:
# ROUGE scorer instance, used further down to score each description against its transcription
from rouge import Rouge 
rouge = Rouge()

In [6]:
# Work on a single-row sample: the row at position 1 of the joined table.
# .copy() makes it an independent frame, so the columns added to it later do not
# raise SettingWithCopyWarning or depend on whether iloc returned a view.
exploretable = conctable.iloc[1:2].copy()

In [None]:
def _desc_tokens_by_overlap(row, shared):
    """Return the description tokens of ``row`` filtered by presence in the transcription.

    Tokens are obtained by splitting on single spaces. Order and duplicates of the
    description tokens are preserved.

    Args:
        row: a table row exposing ``desc`` and ``tran`` strings.
        shared: True keeps tokens that also occur in the transcription,
            False keeps tokens that do not.
    """
    # Split the transcription once per row; a set gives constant-time membership checks.
    tran_tokens = set(row.tran.split(" "))
    return [token for token in row.desc.split(" ") if (token in tran_tokens) == shared]


# Description tokens that also appear in the transcription
exploretable['intersection'] = exploretable.apply(_desc_tokens_by_overlap, axis=1, shared=True)
# Description tokens that never appear in the transcription
exploretable['descnottran'] = exploretable.apply(_desc_tokens_by_overlap, axis=1, shared=False)

In [None]:
# Score every description against its transcription once, then fan the result out
# into one column per (metric, statistic) pair instead of re-running ROUGE nine times.
_rouge_scores = [
    rouge.get_scores(desc, tran)[0]
    for desc, tran in zip(exploretable['desc'], exploretable['tran'])
]

# Column order matches the original cell: rouge1-f/r/p, rouge2-f/r/p, rougel-f/r/p.
for _metric, _prefix in (('rouge-1', 'rouge1'), ('rouge-2', 'rouge2'), ('rouge-l', 'rougel')):
    for _stat in ('f', 'r', 'p'):
        exploretable[f'{_prefix}-{_stat}'] = [score[_metric][_stat] for score in _rouge_scores]

In [7]:
# Display the working sample; head(75) caps the output at 75 rows
exploretable.head(75)

Unnamed: 0,id,desc,tran
1,KtMjOT6fDrw,learn how to apply hanger hooks for your woodc...,"on behalf of expert village , my name is husai..."


In [8]:
# Alignment of transcription to audio

# Define imports for Kaldi Alignment
from kaldi.alignment import NnetAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader
import string

In [9]:
def aspire_alignment():
    """Align every transcript in ``data/test/text`` to its audio and print the result.

    An ``NnetAligner`` is built from the model files under
    ``exp/tdnn_7b_chain_online`` and the language files under ``data/lang``.
    Features and i-vectors are produced by the Kaldi command pipelines defined
    below and are read in lock-step with the lines of ``data/test/text``.
    For each utterance the punctuation-stripped text and its word alignment
    are printed.

    Returns:
        None. Results are only printed.

    Raises:
        AssertionError: if the feature, i-vector and transcript keys of an
            utterance do not all match.
    """
    # Construct aligner
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frames_per_chunk = 150
    aligner = NnetAligner.from_files(
        "exp/tdnn_7b_chain_online/final.mdl",
        "exp/tdnn_7b_chain_online/tree",
        "data/lang/L.fst",
        "data/lang/words.txt",
        "data/lang/phones/disambig.int",
        decodable_opts=decodable_opts)
    wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                         "data/lang/phones/word_boundary.int")

    # Define feature pipelines as Kaldi rspecifiers
    feats_rspec = (
        "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
    )
    ivectors_rspec = (
        "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
        "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |"
    )

    # Translation table that deletes all ASCII punctuation; built once, reused per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Align wav files
    with SequentialMatrixReader(feats_rspec) as f, \
         SequentialMatrixReader(ivectors_rspec) as i, \
         open("data/test/text", "r") as t:
        for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
            # Each transcript line is "<utterance key> <text>".
            tkey, text = line.strip().split(None, 1)
            text = text.translate(strip_punctuation)
            assert fkey == ikey == tkey, (
                f"utterance keys differ: feats={fkey} ivectors={ikey} text={tkey}")
            out = aligner.align((feats, ivectors), text)
            word_alignment = aligner.to_word_alignment(out["best_path"], wb_info)

            print(f"The Input Text: {text}")
            # Printed inside the loop so every utterance is reported, not just the
            # last one, and nothing is referenced when there are no utterances.
            print(f"The word alignment: {word_alignment}")

### File structure within the /data folder is as follows :
-    /lang
-    /test
-        spk2utt - maps each speaker to its utterances; here every utterance is treated as its own speaker, so each line repeats the unique ID twice, e.g. `utt1 utt1`
-        text - transcription for each utterance on each line 
-        utt1.wav
-        wav.scp

In [12]:
# Ensure that you have run alignment_dependencies/path.sh in order to add Kaldi to the PATH


# HACKY FIX IMPLEMENTED
# In the PyKaldi API, if a word is not found in the symbol table (out of vocabulary) it is set to <unk> or index 16.
# The effect of this upon results needs to be discussed.
# NOTE(review): in the recorded output 'husain' and 'alim' come back as "a's" - presumably the symbol at index 16 of words.txt; confirm.
aspire_alignment();

The Input Text: on behalf of expert village  my name is husain abdul alim and i am here to tell you all about wood carving  hooks  all kinds of hooks  and we have got a hook spotter cause some of this wood is kind of hard  kind of hard  kind of hard  no hook on my potential piece but we are going to get one on there sooner than later  we have to think about how that is going to hang  right about there  okay  okay and it needs another wire on it and we need another little wire to put up  you can hang on your wall  make a nice little circle inside my screw eye  okay  now it is ready to be hung  now it is ready to be hung 
The word alignment: [('<eps>', 0, 74), ('on', 74, 19), ('behalf', 93, 40), ('of', 133, 10), ('<eps>', 143, 18), ('expert', 161, 58), ('<eps>', 219, 22), ('village', 241, 69), ('<eps>', 310, 46), ('my', 356, 20), ('name', 376, 20), ('is', 396, 14), ('<eps>', 410, 37), ("a's", 447, 16), ('abdul', 463, 51), ("a's", 514, 37), ('<eps>', 551, 11), ('and', 562, 17), ('i', 579,

In [None]:
def generateAlignmentDeps(table, output_dir="."):
    """Write the alignment input files for the rows of ``table``.

    Three files are created inside ``output_dir``:

    * ``spk2utt`` - one ``<id> <id>`` line per row
    * ``text``    - one ``<id> <transcription>`` line per row
    * ``wav.scp`` - one ``<id>  ../../speech_audios/<id>.wav`` line per row

    Args:
        table: DataFrame with an ``id`` and a ``tran`` column.
        output_dir: directory to write into; defaults to the current working
            directory, where the files were always written before.
    """
    import os

    idlist = table['id'].tolist()
    tranlist = table['tran'].tolist()

    # generate spk2utt
    with open(os.path.join(output_dir, "spk2utt"), 'w', encoding='utf-8') as f:
        for utt_id in idlist:
            print(f"{utt_id} {utt_id}", file=f)

    # generate text
    # Written as 'text' because that is the name aspire_alignment opens
    # (data/test/text); it used to be written as 'test' and renamed by hand.
    with open(os.path.join(output_dir, "text"), 'w', encoding='utf-8') as f:
        for utt_id, tran in zip(idlist, tranlist):
            print(f"{utt_id} {tran}", file=f)

    # generate wav.scp
    with open(os.path.join(output_dir, "wav.scp"), 'w', encoding='utf-8') as f:
        for utt_id in idlist:
            print(f"{utt_id}  ../../speech_audios/{utt_id}.wav", file=f)

In [None]:
# Write the spk2utt / transcript / wav.scp files for the rows of exploretable into the current working directory
generateAlignmentDeps(exploretable)