# String Matching Evaluation

## Downloads and imports

In [None]:
!pip install -U python-dotenv pydub mutagen

In [None]:
!cp -R '/content/drive/My Drive/TCC_data/corpus/' .
!unzip 'corpus/*.zip'

In [None]:
import io
import json
import os
import glob
import time
import wave
import contextlib
import warnings
from tempfile import TemporaryFile
from collections import OrderedDict
from itertools import chain
from multiprocessing import Process
import re

import numpy as np
import pandas as pd
import requests
from IPython import display
from tqdm import tqdm
from mutagen.mp3 import MP3

from dotenv import load_dotenv
from pydub import AudioSegment

In [None]:
# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Register tqdm's pandas integration (progress bars for pandas operations).
tqdm.pandas()

# Load environment variables from the .env file on the mounted Drive.
# NOTE(review): presumably this is where the WIT_KEY_<n> tokens read by
# transcribe_audio_wit come from -- confirm against the .env contents.
load_dotenv(dotenv_path='/content/drive/My Drive/Colab Notebooks/.env')

## Auxiliary functions

In [None]:
def split_into_chunks(
    segment, length=20000 / 1001, split_on_silence=False, noise_threshold=-36
):
    """Split ``segment`` into pieces no longer than ``length`` seconds.

    Args:
        segment: A sliceable audio object whose ``len()`` and slice indices
            are expressed in milliseconds (e.g. a pydub ``AudioSegment``).
        length: Maximum chunk duration in seconds.
        split_on_silence: When false (the default) cut at fixed intervals.
            When true, cut at silent stretches using pydub and fall back to
            fixed intervals only where silence detection cannot help.
        noise_threshold: Silence threshold in dBFS used for silence
            detection; it is raised by 4 dB for every oversized chunk that
            has to be split again.

    Returns:
        A list of chunks in their original order.

    Raises:
        ValueError: If ``length`` is shorter than one millisecond.
    """
    max_len = int(length * 1000)
    if max_len <= 0:
        raise ValueError('length must be at least 0.001 seconds')

    def fixed_slices(audio):
        return [audio[i:i + max_len] for i in range(0, len(audio), max_len)]

    if not split_on_silence:
        return fixed_slices(segment)

    # Above 0 dBFS nothing can register as non-silent any more, so further
    # silence detection is pointless; this also bounds the recursion below.
    if noise_threshold > 0:
        return fixed_slices(segment)

    # The boolean parameter shadows pydub's function of the same name, so
    # the function is imported locally under a different name.
    from pydub.silence import split_on_silence as detect_chunks

    # The threshold must be passed by keyword: the second positional
    # argument of pydub's split_on_silence is min_silence_len.
    pieces = detect_chunks(segment, silence_thresh=noise_threshold)
    if not pieces:
        # The whole segment was classified as silence. A higher threshold
        # only widens what counts as silence, so retrying cannot succeed.
        return fixed_slices(segment)

    chunks = []
    for piece in pieces:
        if len(piece) > max_len:
            # Still too long: retry on this piece with a more permissive
            # threshold so that quieter passages also count as silence.
            chunks.extend(
                split_into_chunks(
                    piece, length, split_on_silence, noise_threshold + 4
                )
            )
        else:
            chunks.append(piece)
    return chunks


def preprocess_audio(audio):
    """Return ``audio`` normalised for upload.

    The conversions are applied in order: sample width of 2 bytes, a single
    channel, and a frame rate of 48000 Hz.
    """
    two_byte_samples = audio.set_sample_width(2)
    mono = two_byte_samples.set_channels(1)
    resampled = mono.set_frame_rate(48000)
    return resampled


def read_audio_into_chunks(file_path):
    """Load an audio file and return it as a list of upload-ready chunks.

    The audio is decoded, round-tripped through a temporary WAV file,
    normalised with ``preprocess_audio`` and finally cut up with
    ``split_into_chunks`` using that function's defaults.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The list of chunks produced by ``split_into_chunks``.
    """
    audio = AudioSegment.from_file(file_path)
    # Only the TemporaryFile name is imported by this file (not the tempfile
    # module), so it has to be called directly.
    with TemporaryFile() as fp:
        audio.export(fp, format='wav')
        # Rewind explicitly so the data just written is what gets read back.
        fp.seek(0)
        # The temporary file holds WAV data, so it must be decoded as WAV.
        audio = AudioSegment.from_file_using_temporary_files(fp, format='wav')
    return split_into_chunks(preprocess_audio(audio))


def transcribe_audio_wit(file_path, app_id):
    """Transcribe an audio file with the Wit.ai speech endpoint.

    The file is split into chunks with ``read_audio_into_chunks``; each
    chunk is posted as raw 16-bit little-endian PCM at 48 kHz and the
    ``text`` fields of the JSON replies are joined with single spaces.

    Args:
        file_path: Path of the audio file to transcribe.
        app_id: Suffix selecting the access token; the token is read from
            the environment variable ``WIT_KEY_<app_id>``.

    Returns:
        The concatenated transcription. Chunks whose reply is not JSON or
        carries no ``text`` field contribute nothing.

    Raises:
        KeyError: If the ``WIT_KEY_<app_id>`` environment variable is unset.
    """
    url = 'https://api.wit.ai/speech'

    key_name = 'WIT_KEY_{}'.format(app_id)
    token = os.environ.get(key_name)
    if token is None:
        # Fail with a clear message instead of the opaque TypeError that
        # concatenating 'Bearer ' with None would produce.
        raise KeyError(
            'environment variable {} is not set'.format(key_name)
        )

    authorization = 'Bearer ' + token
    content_type = 'audio/raw;' \
        'encoding=signed-integer;' \
        'bits=16;' \
        'rate=48000;' \
        'endian=little'

    # defining headers for HTTP request
    headers = {
        'authorization': authorization,
        'content-type': content_type
    }

    chunks = read_audio_into_chunks(file_path)

    text = []
    for audio in chunks:
        response = requests.post(
            url,
            headers=headers,
            data=io.BufferedReader(io.BytesIO(audio.raw_data))
        )

        try:
            data = json.loads(response.content)
        except ValueError:
            # Not JSON (or not decodable): skip this chunk, but let
            # unrelated errors such as KeyboardInterrupt propagate.
            continue

        if isinstance(data, dict) and 'text' in data:
            text.append(data['text'])

    return ' '.join(text)

def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dict into one level, joining key paths with ``sep``.

    A value that is not a dict is returned wrapped as ``{parent_key: d}``.
    Nested keys are prefixed with ``parent_key`` when it is non-empty.
    """
    if not isinstance(d, dict):
        return {parent_key: d}

    flat = {}
    for name, value in d.items():
        if parent_key:
            full_key = parent_key + sep + name
        else:
            full_key = name

        if isinstance(value, dict):
            # Descend, carrying the accumulated key path as the new prefix.
            flat.update(flatten_dict(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat


def flatten_columns(df, columns):
    """Expand dict-valued columns into one scalar column per nested key.

    Every cell of each listed column is flattened with ``flatten_dict``;
    each distinct flattened key becomes a new column named
    ``'<col>_<key>'`` in lower case. Rows lacking a key get NaN.

    Args:
        df: Source DataFrame. It is not modified.
        columns: Names of the columns to expand.

    Returns:
        A new DataFrame without the listed columns, followed by the expanded
        columns in the order their keys are first encountered.
    """
    columns = list(columns)
    # drop() returns a new frame, so adding columns below leaves the
    # caller's DataFrame untouched.
    result = df.drop(columns=columns)

    for col in columns:
        # Kept as a local Series rather than a temporary '<col>_' column,
        # which could collide with a real column or a flattened '' key.
        flattened = df[col].apply(flatten_dict)
        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # makes the resulting column order reproducible.
        keys = dict.fromkeys(chain.from_iterable(flattened))
        for key in keys:
            column_name = f'{col}_{key}'.lower()
            # np.nan rather than np.NaN: the latter alias no longer exists
            # in NumPy 2.0.
            result[column_name] = flattened.apply(
                lambda cell, key=key: cell.get(key, np.nan)
            )
    return result

def clean_str(x):
    """Return ``x`` lower-cased with each non-word character made a space.

    Each non-word character is replaced individually, so runs of
    punctuation or whitespace yield runs of spaces.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    return re.sub(r'\W', ' ', x).lower()

def get_audio_length(file_path):
    """Return the duration of a WAV or MP3 file in seconds.

    The format is chosen from the file extension, ignoring case.

    Args:
        file_path: Path of the audio file.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If the extension is neither ``.wav`` nor ``.mp3``.
    """
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith('.wav'):
        with wave.open(path, 'rb') as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())
    if lowered.endswith('.mp3'):
        return MP3(path).info.length
    # ValueError is a subclass of Exception, so existing handlers still work.
    raise ValueError('Unsupported file format. File must be wav or mp3')

## Speech-to-text (STT)

In [None]:
def transcribe_on_process(corpus, df, p):
    """Transcribe every row of ``df`` and write the results to a TSV file.

    Output goes to ``evaluate_metrics_<corpus>/<p>.tsv`` with a header line
    and one line per row holding the file path, the audio length, the
    reference sentence and the text returned by ``transcribe_audio_wit``.
    ``p`` also selects the Wit.ai key and labels the progress bar. The loop
    sleeps for half a second after each row.
    """
    out_path = 'evaluate_metrics_{}/{}.tsv'.format(corpus, p)
    with open(out_path, 'w') as out:
        print('file\tlength\tsentence\ttranslation', file=out)
        rows = tqdm(df.iterrows(), total=len(df), desc='Process {}'.format(p))
        for _, record in rows:
            audio_path = record['filepath']
            fields = (
                audio_path,
                get_audio_length(audio_path),
                record['sentence'],
                transcribe_audio_wit(audio_path, p),
            )
            print('{}\t{}\t{}\t{}'.format(*fields), file=out)
            time.sleep(0.5)

In [None]:
corpus = 'vorforge'
n_processes = 4

# exist_ok lets this cell be re-run without failing on the existing folder.
os.makedirs('evaluate_metrics_{}'.format(corpus), exist_ok=True)
final_df = pd.read_csv('{}/sentences.tsv'.format(corpus), sep='\t')

sz = len(final_df)
# Ceiling division: with floor division the last sz % n_processes rows fell
# outside every slice and were never transcribed. iloc slices that run past
# the end are simply shorter (or empty).
bs = -(-sz // n_processes)

# One worker per slice; the worker index is also passed on as the suffix of
# the WIT_KEY_<i> environment variable used for that worker's requests.
all_processes = [
    Process(
        target=transcribe_on_process,
        args=(corpus, final_df.iloc[i * bs: (i + 1) * bs], i),
    ) for i in range(n_processes)
]

for p in all_processes:
    p.start()

# Block until every worker has finished writing its TSV file.
for p in all_processes:
    p.join()

In [None]:
# Collect the per-process TSV files written for this corpus.
files = glob.glob('evaluate_metrics_{}/*.tsv'.format(corpus))

# Stack them into one frame with a fresh 0..n-1 index.
transcribed_df = pd.concat(
    (pd.read_csv(tsv_path, sep='\t') for tsv_path in files),
    ignore_index=True,
)

print(transcribed_df.shape)

# Persist the merged table to Drive as a single TSV without the index.
output_path = (
    '/content/drive/My Drive/TCC_data/metrics/data/evaluate_metrics_{}.tsv'.format(corpus)
)
transcribed_df.to_csv(output_path, sep='\t', index=False)