# Speech to Speech Summarisation : Execution Notebook

In [None]:
# Load Requirements and Libraries
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import spacy
import scipy.io
import math
import pickle 
import Levenshtein
import string  
import re
import requests
import json
from sklearn import preprocessing
from pydub import AudioSegment
from pydub.playback import play
from gensim.summarization.summarizer import summarize as extractive_sum
from rouge import Rouge 
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
from gtts import gTTS
from IPython.display import Audio
# Shared ROUGE scorer instance (not referenced by the other visible cells).
rouge = Rouge()
# spaCy "en_core_web_sm" pipeline (not referenced by the other visible cells).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load Acoustic Prominence Scorer
# Deserialise the pre-trained model whose ``predict`` method is called by
# acoustic_prominence_scorer (presumably an SVM, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# 'best_svm.sav' must come from a trusted source.
with open('best_svm.sav', 'rb') as pickle_file:
    clf = pickle.load(pickle_file)

def log_softmax(x):
    """Return the log-softmax of ``x``, normalised over all of its elements.

    Computed as ``x - max(x) - log(sum(exp(x - max(x))))``. Staying in log
    space avoids the ``log(0) = -inf`` (and the accompanying NumPy warning)
    that ``log(exp(...) / sum)`` produces for entries far below the maximum.

    Args:
        x: Array-like of real numbers (a list or NumPy array).

    Returns:
        A float NumPy array with the same shape as ``x``.

    Raises:
        ValueError: If ``x`` is empty (raised by ``np.max``).
    """
    values = np.asarray(x, dtype=float)
    # Subtracting the maximum keeps exp() from overflowing.
    shifted = values - np.max(values)
    return shifted - np.log(np.exp(shifted).sum())
        
def acoustic_prominence_scorer(pre_aligned_acoustic_features, alignment):
    """Score each aligned token by the mean classifier output over its span.

    The module-level ``clf`` predicts one score per feature row. For every
    alignment entry the scores in the token's window are averaged, and tokens
    that occur several times receive the running average of their
    per-occurrence means.

    Args:
        pre_aligned_acoustic_features: Feature rows passed unchanged to
            ``clf.predict``.
        alignment: Iterable of entries indexed as ``(token, start, duration)``.
            Entries whose token is ``'<eps>'`` are ignored.

    Returns:
        dict mapping token -> averaged score. The scores are returned
        un-normalised (``log_softmax`` is not applied).
    """
    raw_scores = clf.predict(pre_aligned_acoustic_features)
    scored_sequence = {}
    appearances = {}
    for entry in alignment:
        token, onset, duration = entry[0], entry[1], entry[2]
        if token == '<eps>':
            continue
        # Alignment times are divided by 10 to index into raw_scores
        # (assumes alignment units are 10x finer than feature frames - TODO confirm).
        # Clamp at 0: an onset of 0 used to give start == -1, which wraps
        # around to the end of raw_scores and usually yields an empty window.
        start = max(math.ceil(onset / 10) - 1, 0)
        gap = math.floor(duration / 10 - (duration % 1)) + 1
        relevant_scores = raw_scores[start:start + gap]
        if len(relevant_scores) == 0:
            # Window lies outside the available scores; skip the token
            # instead of dividing by zero.
            continue
        mean_score = sum(relevant_scores) / len(relevant_scores)
        seen = appearances.get(token, 0)
        if seen:
            # Incremental mean: weight the previous *mean* (not the previous
            # count) by the number of earlier occurrences.
            scored_sequence[token] = (seen * scored_sequence[token] + mean_score) / (seen + 1)
        else:
            scored_sequence[token] = mean_score
        appearances[token] = seen + 1
    return scored_sequence

In [None]:
# Load alignment and acoustic features from the disk
# Prefix of the per-recording alignment files, read as f'{ALIGNMENTPATH}{id}.txt'.
ALIGNMENTPATH = "alignments/"
# Prefix of the per-recording acoustic feature files loaded with scipy.io.loadmat.
FEATUREPATH = "./acoustic_feats_170520/"
# Table of recordings; the cells below read its 'id', 'tran' and 'desc' columns.
exploretable = pd.read_pickle("./exploretable16052020.pkl")

In [None]:
# Import summarisation model

import nlpete.training.metrics
import nlpete.data.dataset_readers
from nlpete.models.copynet import CopyNet
from allennlp.data.fields.text_field import TextFieldTensors
from overrides import overrides
from allennlp.models.archival import load_archive
from allennlp.models.model import Model
from allennlp.data import DatasetReader
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from nlpete.data.dataset_readers import (
    CopyNetDatasetReader,
) 
from allennlp.predictors import Predictor
import warnings
class CopyNetPredictor(Predictor):
    """
    Predictor wrapper that feeds a source text plus acoustic data to the model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        # Emitted on every construction.
        deprecation_notice = (
            "The 'copynet' predictor has been deprecated in favor of "
            "the 'seq2seq' predictor."
        )
        warnings.warn(deprecation_notice, DeprecationWarning)

    def predict(self, source: str,acoustic_data: str) -> JsonDict:
        """Run the model on ``source`` together with ``acoustic_data``.

        NOTE(review): callers in this notebook pass a list or a dict as
        ``acoustic_data`` even though it is annotated ``str``.
        """
        payload = {"source_string": source, "acoustic_data": acoustic_data}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Build an ``Instance`` from the two keys written by ``predict``."""
        return self._dataset_reader.text_to_instance(
            json_dict["source_string"],
            json_dict["acoustic_data"],
        )

# Load the trained model archive and wrap it in the predictor that
# mode_2_summary and mode_3_summary call.
archive = load_archive('./absummodel3.tar.gz')
predictor = CopyNetPredictor.from_archive(archive)

In [None]:
# Choose which recording to summarise
chosen_id = None
while chosen_id is None:
    input_value = input("Which ID would you like to summarise? ")
    try:
        # Convert inside the try block so non-numeric text is reported to the
        # user instead of escaping as an uncaught ValueError.
        candidate = int(input_value)
    except ValueError:
        # tell the user off
        print("{input} is not a number, please enter a number only".format(input=input_value))
        continue
    # NOTE(review): the upper bound 500 is hard-coded; exploretable.iloc below
    # raises IndexError if the table holds fewer rows.
    if 0 <= candidate < 500:
        chosen_id = candidate
    else:
        print("Please enter a number between 0 and 499")

# chosen_id is a row position; the recording's own identifier is its 'id' value.
youtube_id = str(exploretable.iloc[chosen_id]['id'])

In [None]:
def find_start_indexes(alignment,summary):
    """Return every position in ``alignment`` whose token equals the first summary word.

    Args:
        alignment: Sequence of entries whose element 0 is the token.
        summary: Sequence of summary words; only ``summary[0]`` is consulted.

    Returns:
        List of matching positions in ascending order (empty when none match).
    """
    return [
        position
        for position, entry in enumerate(alignment)
        if summary[0] == entry[0]
    ]

def find_end_indexes(alignment,summary):
    """Return the alignment positions that may mark the end of ``summary``.

    Walks backwards from the last summary word, skipping punctuation, and
    returns the positions of the first word that occurs in ``alignment``.

    Changes from the previous recursive version:
      * returns ``[]`` instead of raising IndexError when the summary is
        empty or none of its words appear in the alignment;
      * tokens are compared directly (as find_start_indexes does) rather than
        through an unescaped regex over ``str(entry)``, so words containing
        regex metacharacters or apostrophes are handled correctly;
      * the fallback drops the *last* word, matching the punctuation branch
        (the old ``reversed(...)[:-1]`` dropped the first word and flipped
        the order on every step).

    Args:
        alignment: Sequence of entries whose element 0 is the token.
        summary: Sequence of summary words.

    Returns:
        List of positions in ascending order, or ``[]`` when nothing matches.
    """
    remaining = list(summary)
    while remaining:
        lastword = remaining.pop()
        # Substring test kept from the original: it also skips empty strings
        # and runs of punctuation such as "!\"".
        if lastword in string.punctuation:
            continue
        indexes = [index for index, m in enumerate(alignment) if m[0] == lastword]
        if indexes:
            return indexes
    return []

def mode_1_summary(youtube_id):
    """Play the stretch of the original audio that best matches an extractive summary.

    An extractive text summary of the transcript is produced, then every
    (start, end) pair of alignment positions that could delimit it is scored
    by Levenshtein distance between the spanned tokens and the summary. The
    audio for the closest span is exported and played.

    Args:
        youtube_id: Value of the 'id' column identifying the recording.

    Returns:
        None on success, or a short message string when no summary or no
        matching audio span could be produced.
    """
    row = exploretable.loc[exploretable['id'] == youtube_id]
    original_transcription = row['tran'].values[0]
    # NOTE(review): eval() executes whatever the alignment file contains; only
    # use trusted files (ast.literal_eval would be safer if the files hold
    # plain literals).
    with open(f'{ALIGNMENTPATH}{youtube_id}.txt', "r") as alignment_file:
        alignment_data = eval(alignment_file.read())
    ground_truth_summary = row['desc'].values[0]
    print(ground_truth_summary)
    try:
        summary = extractive_sum(original_transcription, word_count=40)
    except Exception:
        # Best effort: the summariser can fail on unsuitable transcripts.
        return "Extractive Summarisation Failed"
    if not summary.strip():
        # An empty summary gives the span search nothing to anchor on.
        return "Extractive Summarisation Failed"

    summary_words = summary.split(" ")
    start_indexes = find_start_indexes(alignment_data, summary_words)
    end_indexes = find_end_indexes(alignment_data, summary_words)

    editdistance = float("inf")
    mini = None
    minj = None
    for i in start_indexes:
        for j in end_indexes:
            if j < i:
                # A span cannot end before it starts; such pairs used to be
                # scored as an empty candidate and could win the search.
                continue
            candidate = " ".join(
                x[0] for x in alignment_data[i:j + 1] if x[0] != '<eps>'
            )
            candidatedistance = Levenshtein.distance(candidate, summary)
            if candidatedistance < editdistance:
                editdistance = candidatedistance
                mini = i
                minj = j
    if mini is None:
        # Previously this fell through to alignment_data[None] and crashed.
        return "No matching audio segment found"

    # Alignment times are scaled by 10 before slicing the audio (pydub slices
    # in milliseconds); the extra 100 pads the end of the clip.
    audiostart = alignment_data[mini][1] * 10
    audioend = (alignment_data[minj][1] + alignment_data[minj][2]) * 10 + 100
    print(summary)
    originalutterance = AudioSegment.from_wav(f'./speech_audios/{youtube_id}.wav')
    # NOTE(review): both files carry an .mp3 name but are written as WAV.
    originalutterance.export("4.mp3", format="wav")
    sumaudio = originalutterance[audiostart:audioend]
    sumaudio.export("4a.mp3", format="wav")
    play(sumaudio)

In [None]:
# Mode 2: abstractive summary from the transcript only (no acoustic features)
def mode_2_summary(youtube_id):
    """Summarise the transcript with the predictor, passing no acoustic data.

    Prints the ground-truth summary for comparison.

    Args:
        youtube_id: Value of the 'id' column identifying the recording.

    Returns:
        One string per predicted token sequence.
    """
    row = exploretable.loc[exploretable['id'] == youtube_id]
    transcript = row['tran'].values[0]
    reference = row['desc'].values[0]
    # An empty list stands in for the acoustic data in this mode.
    results = predictor.predict(transcript, [])
    print(f"The ground truth summary is: {reference}")
    # NOTE(review): tokens are concatenated with no separator - confirm this is intended.
    return ["".join(tokens) for tokens in results['predicted_tokens']]

In [None]:
# Mode 3: abstractive summary from the transcript plus acoustic prominence scores
def mode_3_summary(youtube_id):
    """Summarise the transcript with the predictor, using acoustic prominence scores.

    The recording's acoustic features are standardised with
    ``preprocessing.scale``, converted to per-token scores by
    ``acoustic_prominence_scorer`` and passed to the predictor alongside the
    transcript. Prints the ground-truth summary for comparison.

    Args:
        youtube_id: Value of the 'id' column identifying the recording.

    Returns:
        One string per predicted token sequence.
    """
    row = exploretable.loc[exploretable['id'] == youtube_id]
    original_transcription = row['tran'].values[0]
    ground_truth_summary = row['desc'].values[0]
    # NOTE(review): eval() executes whatever the alignment file contains; only
    # use trusted files. The context manager closes the handle that was
    # previously leaked.
    with open(f'{ALIGNMENTPATH}{youtube_id}.txt', "r") as alignment_file:
        alignment_data = eval(alignment_file.read())
    # The feature matrix is stored under the 'ret' variable of the .mat file.
    raw_features = scipy.io.loadmat(FEATUREPATH + youtube_id)['ret']
    pre_aligned_acoustic_features = preprocessing.scale(raw_features)
    aligned_acoustic_data = acoustic_prominence_scorer(pre_aligned_acoustic_features, alignment_data)
    results = predictor.predict(original_transcription, aligned_acoustic_data)
    print(f"The ground truth summary is: {ground_truth_summary}")
    # NOTE(review): tokens are concatenated with no separator - confirm this is intended.
    return ["".join(predictions) for predictions in results['predicted_tokens']]

In [None]:
# Mode 1: extractive summary, played back as a clip of the original audio.
mode_1_summary(youtube_id)


In [None]:
# Mode 2: predictor run on the transcript with empty acoustic data.
mode_2_summary(youtube_id)

In [None]:
# Mode 3: predictor run on the transcript plus acoustic prominence scores.
mode_3_summary(youtube_id)

In [None]:
# Take the prediction at index 1 of the mode-3 output
# (NOTE(review): raises IndexError if only one prediction is returned).
text_sum_mode_3 = mode_3_summary(youtube_id)[1]
tts = gTTS(text_sum_mode_3)
# Write the synthesised speech to the file played below; without this call
# '4c.mp3' was never produced by this cell.
tts.save('4c.mp3')

display(Audio('4c.mp3', rate=8000, autoplay=True))

In [None]:
# For the evaluation: upload the recording and a test summary to the remote API
url = 'http://56a58cf2ee29.ngrok.io/file-upload'
myobj = {
    'summary': 'This is a test of the API, hopefully it will work better',
}

# Open the recording only for the duration of the request so the handle is
# closed afterwards (it was previously left open).
with open(f'./speech_audios/{youtube_id}.wav','rb') as audio_file:
    files = {'file': audio_file}
    # `x` holds the response and is read by the next cell.
    x = requests.post(url, data = myobj,files = files)

In [None]:
# Decode the JSON body of the API response held in `x`.
api_return = json.loads(x.text)
# The 'ret_wav' field is itself JSON-encoded text, hence the second decode.
returned_wav = json.loads(api_return['ret_wav'])

In [None]:

# Play the decoded 'ret_wav' payload at a sample rate of 8000.
display(Audio(returned_wav, rate=8000, autoplay=True))

In [None]:

# `ground_truth_summary` is only ever a local inside the mode_* functions, so
# look it up here to avoid a NameError at notebook level.
ground_truth_summary = exploretable.loc[exploretable['id'] == youtube_id]['desc'].values[0]
tts = gTTS(ground_truth_summary)
tts.save('temp.mp3')
from IPython.display import Audio
display(Audio('temp.mp3', rate=8000, autoplay=True))