In [1]:
import os
import sys
import pickle
import time
import librosa
import torch
import numpy as np
import pandas as pd
import scipy
from tqdm import tqdm

sys.path.append("./utils/")
import laugh_segmenter
import models, configs
import dataset_utils, audio_utils, data_loaders, torch_utils
from torch import optim, nn
from functools import partial
from distutils.util import strtobool

# Sampling rate handed to the inference dataset as its `sr` argument in
# segment_laughter (presumably in Hz - confirm against data_loaders).
sample_rate = 8000

def segment_laughter(
    input_audio_file,
    model_path="checkpoints/in_use/resnet_with_augmentation",
    config="resnet_with_augmentation",
    threshold=0.5,
    min_length=0.2,
):
    """Run the laughter-detection model over an audio file and return laugh spans.

    The model described by ``configs.CONFIG_MAP[config]`` is built, its weights
    are restored from ``<model_path>/best.pth.tar``, and per-frame predictions
    are computed on CPU. The predictions are smoothed with
    ``laugh_segmenter.lowpass`` and turned into instances by
    ``laugh_segmenter.get_laughter_instances``.

    Args:
        input_audio_file: Path of the audio file to analyse.
        model_path: Directory that contains the ``best.pth.tar`` checkpoint.
        config: Key into ``configs.CONFIG_MAP`` selecting the model and
            feature function.
        threshold: Forwarded to ``laugh_segmenter.get_laughter_instances``.
        min_length: Forwarded to ``laugh_segmenter.get_laughter_instances``
            (presumably a minimum duration in seconds - confirm).

    Returns:
        A list of ``{"start": ..., "end": ...}`` dicts, one per detected
        instance, holding the first two elements of each instance (presumably
        times in seconds - confirm against laugh_segmenter). The list is empty
        when nothing is detected or when the audio yields no frames.

    Raises:
        Exception: If the checkpoint file cannot be found under ``model_path``.
    """
    device = torch.device("cpu")
    print(f"Using device {device}")

    ##### Load the Model
    config_data = configs.CONFIG_MAP[config]
    model = config_data["model"](
        dropout_rate=0.0,
        linear_layer_size=config_data["linear_layer_size"],
        filter_sizes=config_data["filter_sizes"],
    )
    feature_fn = config_data["feature_fn"]
    model.set_device(device)

    # Check the file that is actually loaded, not just its parent directory.
    checkpoint_file = model_path + "/best.pth.tar"
    if not os.path.exists(checkpoint_file):
        raise Exception(f"Model checkpoint not found at {model_path}")
    torch_utils.load_checkpoint(checkpoint_file, model)
    model.eval()

    ##### Load the audio file and features
    inference_dataset = data_loaders.SwitchBoardLaughterInferenceDataset(
        audio_path=input_audio_file, feature_fn=feature_fn, sr=sample_rate
    )

    collate_fn = partial(
        audio_utils.pad_sequences_with_labels,
        expand_channel_dim=config_data["expand_channel_dim"],
    )

    inference_generator = torch.utils.data.DataLoader(
        inference_dataset,
        num_workers=4,
        batch_size=8,
        shuffle=False,
        collate_fn=collate_fn,
    )

    ##### Make Predictions
    probs = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for model_inputs, _ in tqdm(inference_generator):
            x = torch.from_numpy(model_inputs).float().to(device)
            preds = model(x).cpu().detach().numpy().squeeze()
            # squeeze() collapses a single-item batch to a 0-d array.
            if len(preds.shape) == 0:
                preds = [float(preds)]
            else:
                preds = list(preds)
            probs += preds
    probs = np.array(probs)

    file_length = audio_utils.get_audio_length(input_audio_file)

    # Nothing to segment (and fps would divide by zero) for empty audio.
    if len(probs) == 0 or float(file_length) <= 0:
        return []

    fps = len(probs) / float(file_length)

    probs = laugh_segmenter.lowpass(probs)
    instances = laugh_segmenter.get_laughter_instances(
        probs, threshold=threshold, min_length=min_length, fps=fps
    )

    # Only the span boundaries are reported. The previous full-resolution
    # reload of the audio and per-instance clip extraction were discarded
    # without being used, so they are no longer computed.
    return [{"start": instance[0], "end": instance[1]} for instance in instances]


In [2]:
from google.oauth2.service_account import Credentials
from google.cloud import speech_v1p1beta1 as speech

# Service-account key used by transcribe_text. Prefer the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable so the notebook is not
# tied to one machine; the original local path remains the fallback.
GOOGLE_KEY_PATH = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or '/Users/ernest.chow/Downloads/what-eat-391801-fd884f1ac9ea.json'
)

def transcribe_text(filepath, key_path=GOOGLE_KEY_PATH):
    """Transcribe an audio file with the Google Cloud Speech client.

    Args:
        filepath: Path of the audio file whose raw bytes are sent for
            recognition.
        key_path: Path of the service-account key file used to build the
            client credentials.

    Returns:
        The ``results`` attribute of the recognition response.
    """
    # Build an authenticated client from the service-account key.
    speech_client = speech.SpeechClient(
        credentials=Credentials.from_service_account_file(key_path)
    )

    with open(filepath, "rb") as source:
        audio_bytes = source.read()

    request_audio = speech.RecognitionAudio(content=audio_bytes)

    recognition_options = {
        "encoding": speech.RecognitionConfig.AudioEncoding.MP3,
        "sample_rate_hertz": 16000,
        "language_code": "en-US",
        "use_enhanced": True,
        # A model must be specified to use enhanced model.
        "model": "video",
        "enable_word_time_offsets": True,
    }
    request_config = speech.RecognitionConfig(**recognition_options)

    recognition = speech_client.recognize(config=request_config, audio=request_audio)
    return recognition.results


def run_analysis(video_files, file_dir):
    """Transcribe speech and detect laughter for each recording.

    For every entry in ``video_files`` the matching audio is sent to
    ``transcribe_text`` and the sibling ``.wav`` file is passed to
    ``segment_laughter``.

    Args:
        video_files: Iterable of file names (e.g. ``"clip.mp4"``) located in
            ``file_dir``.
        file_dir: Directory that holds the recordings and their audio files.

    Returns:
        A dict keyed by file name without extension. Each value has a
        ``"text"`` list of phrase dicts (``phrase``, ``confidence``,
        ``phrase_begin_timing``, ``phrase_end_timing``) and a ``"laughs"``
        entry holding the return value of ``segment_laughter``.
        ``phrase_begin_timing`` is ``None`` when the service returned no word
        offsets for a phrase.
    """
    results = dict()

    for video_file in video_files:
        # splitext copes with extensions of any length, unlike slicing [:-4].
        file_prefix = os.path.splitext(video_file)[0]
        results[file_prefix] = {"text": []}

        # transcribe_text declares MP3 encoding, so prefer a sibling .mp3 when
        # one exists; otherwise fall back to the .mp4 path used previously.
        mp3_filename = f"{file_dir}/{file_prefix}.mp3"
        if not os.path.exists(mp3_filename):
            mp3_filename = f"{file_dir}/{file_prefix}.mp4"

        raw_text_result = transcribe_text(mp3_filename)

        for detected_phrase in raw_text_result:
            # Guard against results that carry no alternatives at all.
            if not detected_phrase.alternatives:
                continue
            best_alternative = detected_phrase.alternatives[0]
            text = best_alternative.transcript
            if not text:
                continue

            # Word offsets may be missing; avoid an IndexError on words[0].
            words = best_alternative.words
            phrase_begin_timing = str(words[0].start_time) if words else None
            phrase_end_timing = str(detected_phrase.result_end_time)

            results[file_prefix]["text"].append({
                "phrase": text,
                "confidence": best_alternative.confidence,
                "phrase_begin_timing": phrase_begin_timing,
                "phrase_end_timing": phrase_end_timing,
            })

        wav_filename = f"{file_dir}/{file_prefix}.wav"
        results[file_prefix]["laughs"] = segment_laughter(wav_filename)

    return results


In [3]:
import os


# Directory scanned for the recordings to analyse.
FILE_DIR = "../work_dir"

# Names of every entry in FILE_DIR ending in ".mp4", in sorted order.
video_files = sorted(
    filter(lambda entry: entry.endswith(".mp4"), os.listdir(FILE_DIR))
)

# Show which files were picked up.
for x in video_files:
    print(x)


joke1.1_5dcc826c-e70a-4c28-bd4e-dca0e9023fa8_1560890971288_431.mp4
joke5.3_f4e620b4-8cf1-485a-961b-3a0828415ec5_1577919787295_189.mp4


In [4]:
# Transcribe and laugh-segment every listed recording.
results = run_analysis(video_files=video_files, file_dir=FILE_DIR)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Using device cpu
training with dropout=0.0
Loading checkpoint at: checkpoints/in_use/resnet_with_augmentation/best.pth.tar
Loading checkpoint at step:  60600


100%|███████████████████████████████████████████| 67/67 [00:39<00:00,  1.69it/s]


Using device cpu
training with dropout=0.0
Loading checkpoint at: checkpoints/in_use/resnet_with_augmentation/best.pth.tar
Loading checkpoint at step:  60600


100%|███████████████████████████████████████████| 89/89 [00:53<00:00,  1.68it/s]


In [5]:
import json

# Persist the combined transcription and laughter results as JSON.
with open(f"{FILE_DIR}/output.json", "w") as output_file:
    output_file.write(json.dumps(results))

Selling Points
- Bargain! Cheap and quick to run (runs on a laptop CPU in a matter of minutes)
- Aids explainability (each detected laugh can be associated with the transcribed text)
- Can already be implemented
- Output design is modular!!! Can keep adding layers as we go