# Towards abstractive audio-to-audio summarization

## Thread 0: Trimming pauses and non-vocal activity

### Alignment and playback using PyKaldi

In [1]:
# `!source ./path.sh` executes in a throwaway subshell, so any variables that
# path.sh exports vanish as soon as the line finishes and never reach this
# kernel (or the command pipelines it spawns later). Instead, source the script
# in bash, dump the resulting environment, and copy it into os.environ.
import os
import subprocess

_sourced_env = subprocess.run(
    # stdout of path.sh itself is discarded so it cannot corrupt the dump;
    # `env -0` separates entries with NUL so values containing newlines survive.
    ["bash", "-c", "source ./path.sh >/dev/null && env -0"],
    check=True,
    stdout=subprocess.PIPE,
).stdout
for _entry in _sourced_env.split(b"\0"):
    _name, _sep, _value = _entry.partition(b"=")
    if _sep and _name:
        os.environ[os.fsdecode(_name)] = os.fsdecode(_value)

In [14]:
# Runs the project's models.sh script in a subshell. In the output recorded
# below it downloads kaldi-models-0.03.zip (~154 MB) from lowerquality.com.
# NOTE(review): the recorded run was interrupted (^C) at 13% — confirm the
# models were fully fetched by another run before relying on the cells below.
!./models.sh

--2020-04-16 17:57:51--  https://lowerquality.com/gentle/kaldi-models-0.03.zip
Resolving lowerquality.com (lowerquality.com)... 82.221.106.101
Connecting to lowerquality.com (lowerquality.com)|82.221.106.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 161246499 (154M) [application/zip]
Saving to: ‘kaldi-models-0.03.zip’

kaldi-models-0.03.z  13%[=>                  ]  20.18M  3.52MB/s    eta 41s    ^C


In [1]:
# Define imports for Kaldi Alignment
from kaldi.alignment import NnetAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader
from pydub import AudioSegment
from pydub.playback import play

In [6]:
def aspire_alignment(show_phones=False):
    """Force-align the utterances listed under data/test with their transcripts.

    Builds an ``NnetAligner`` from the model files under
    ``exp/tdnn_7b_chain_online`` and ``data/lang``, streams features and
    i-vectors through Kaldi command pipelines driven by
    ``data/test/wav.scp``, and aligns each utterance against the matching
    line of ``data/test/text`` (``<utt-id> <transcript>``).

    For every utterance the transcript and its word alignment are printed.

    Args:
        show_phones: When true, also compute and print the phone-level
            alignment (requires ``data/lang/phones.txt``). Off by default.

    Returns:
        ``(text, word_alignment)`` for the last utterance processed, or
        ``None`` when the readers yielded no utterances. In the recorded
        notebook output ``word_alignment`` is a list of
        ``(word, start_frame, num_frames)`` tuples.

    Raises:
        AssertionError: if the feature, i-vector and transcript streams
            disagree on the utterance key.
    """
    # Construct aligner
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    # NOTE(review): a fractional subsampling factor is unusual (chain-model
    # recipes normally use a small integer). Left untouched because downstream
    # code treats the resulting frame indices as 10 ms steps — confirm.
    decodable_opts.frame_subsampling_factor = 0.1
    decodable_opts.frames_per_chunk = 150
    aligner = NnetAligner.from_files(
        "exp/tdnn_7b_chain_online/final.mdl",
        "exp/tdnn_7b_chain_online/tree",
        "data/lang/L.fst",
        "data/lang/words.txt",
        "data/lang/phones/disambig.int",
        decodable_opts=decodable_opts)
    wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                         "data/lang/phones/word_boundary.int")
    # The phone symbol table is only needed for the optional phone alignment.
    phones = SymbolTable.read_text("data/lang/phones.txt") if show_phones else None

    # Define feature pipelines as Kaldi rspecifiers
    feats_rspec = (
        "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
    )
    ivectors_rspec = (
        "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
        "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |"
    )

    # Stays None when there is nothing to align, instead of failing on an
    # unbound local after the loop.
    result = None

    # Align wav files
    with SequentialMatrixReader(feats_rspec) as f, \
         SequentialMatrixReader(ivectors_rspec) as i, \
         open("data/test/text", "r") as t:
        for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
            tkey, text = line.strip().split(None, 1)
            # The three streams are consumed in lock-step; a key mismatch means
            # they are ordered differently and the alignment would be garbage.
            assert fkey == ikey == tkey, (
                f"utterance keys out of sync: {fkey!r}, {ikey!r}, {tkey!r}")
            out = aligner.align((feats, ivectors), text)
            word_alignment = aligner.to_word_alignment(out["best_path"], wb_info)

            print(f"The Input Text: {text}")
            if show_phones:
                phone_alignment = aligner.to_phone_alignment(out["alignment"], phones)
                print(f"The phoneme alignment: {phone_alignment}")
            # Printed per utterance so every transcript is paired with its own
            # alignment, not just the final one.
            print(f"The word alignment: {word_alignment}")
            result = (text, word_alignment)

    return result


In [7]:
# Run the alignment; the function prints the transcript and the word alignment.
# The call's result is not bound to any name here (the trailing `;` also keeps
# the cell from echoing an Out[] value).
# NOTE(review): the playback cell further down iterates a notebook-level
# `word_alignment`, which this cell does not create — confirm where that
# variable is meant to come from on a fresh kernel.
aspire_alignment();

The Input Text: one two three four five six seven eight
The word alignment: [('<eps>', 0, 121), ('one', 121, 41), ('<eps>', 162, 4), ('two', 166, 30), ('<eps>', 196, 4), ('three', 200, 39), ('<eps>', 239, 1), ('four', 240, 52), ('five', 292, 41), ('six', 333, 33), ('seven', 366, 47), ('<eps>', 413, 16), ('eight', 429, 28), ('<eps>', 457, 155)]


In [19]:
def playback(play_original, word_spacing, buffer, alignment=None,
             wav_path="./data/test/utt1.wav", frame_shift_ms=10):
    """Play an utterance word by word, skipping the silent ``<eps>`` segments.

    Args:
        play_original: If true, play the whole clip once before the words.
        word_spacing: Milliseconds of silence placed before and after each
            word; half of it is used as the crossfade into/out of the word.
        buffer: Extra milliseconds of audio kept on *both* sides of each word.
        alignment: Iterable of ``(word, start_frame, num_frames)`` tuples.
            When omitted, the notebook-level ``word_alignment`` is used, which
            preserves the historical call ``playback(True, 50, 50)``.
        wav_path: Path of the wav file the alignment refers to.
        frame_shift_ms: Milliseconds per alignment frame.
            NOTE(review): 10 ms matches the recorded output (612 frames for a
            6144 ms clip) — confirm if the aligner options change.

    Each spoken word is printed just before it is played.
    """
    if alignment is None:
        # Legacy path: fall back to the global the notebook used to rely on.
        alignment = word_alignment

    originalutterance = AudioSegment.from_wav(wav_path)
    silence = AudioSegment.silent(duration=word_spacing)
    if play_original:
        play(originalutterance)

    clip_ms = len(originalutterance)
    for label, start_frame, num_frames in alignment:
        if label == '<eps>':
            continue
        # Clamp at 0: a negative slice start makes pydub index from the END of
        # the clip, which silently produced the wrong audio for early words.
        start_time = max(0, start_frame * frame_shift_ms - buffer)
        # Pad the tail from the word's true end; previously the +buffer merely
        # cancelled the -buffer already folded into start_time.
        end_time = min(clip_ms, (start_frame + num_frames) * frame_shift_ms + buffer)
        print(label)
        snippet = originalutterance[start_time:end_time]
        # pydub rejects a crossfade longer than either segment, so never ask
        # for more than the snippet can provide.
        crossfade = min(word_spacing / 2, len(snippet))
        padded_snippet = silence.append(
            snippet.append(silence, crossfade=crossfade), crossfade=crossfade)
        play(padded_snippet)



In [21]:
# Play the full clip first, then each word with 50 ms of surrounding silence
# and a 50 ms buffer.
playback(play_original=True, word_spacing=50, buffer=50)

one
two
three
four
five
six
seven
eight


In [98]:
# `originalutterance` exists only as a local inside playback(), so referring to
# it here fails on a fresh kernel. Load the clip explicitly instead; len() of a
# pydub AudioSegment is its duration in milliseconds.
len(AudioSegment.from_wav("./data/test/utt1.wav"))

6144