In [None]:
# Note: restart the runtime after running this install cell and before
# running the augmentations
!pip install -U augly[av]
!sudo apt-get install python3-magic
!pip install pydub
!pip install audiomentations
!pip install torchaudio
!sudo pip install git+https://github.com/okankop/vidaug
!pip install moviepy
!pip install av
!pip install pytorchvideo
!pip install -U albumentations
!pip install -U torchvision
!pip install -U imgaug
!pip install torch==1.10.0+cu111 -f https://download.pytorch.org/whl/cu111/torch_stable.html
!pip install -U imagecorruptions
!pip install numpy requests nlpaug
!pip install textattack[tensorflow]
!pip install textflint

In [24]:
import cProfile
import inspect
import os
import pstats
import subprocess
import tempfile
import time
import torch
import numpy as np
import pandas as pd
from collections import defaultdict
from copy import deepcopy
from PIL import Image

import augly.audio as audaugs
import augly.audio.utils as aud_utils
import augly.image as imaugs
import augly.text as textaugs
import augly.video as vidaugs
import augly.video.helpers as vid_helpers

from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from textattack.shared import AttackedText
from textflint.input.component.sample.ut_sample import UTSample
from torchvision.io import video as vd

In [34]:
# Modality to benchmark. Change this to one of "audio", "image", "text" or
# "video" (the keys of the per-modality tables in this notebook) in order to
# benchmark that modality's augmentations.
modality = "audio"

In [35]:
# AugLy sub-package that provides the augmentations for each modality
modules = dict(
    audio=audaugs,
    image=imaugs,
    text=textaugs,
    video=vidaugs,
)

# Libraries compared for each modality; "AugLy" is always listed first,
# followed by the third-party libraries it is benchmarked against
lib_names = {
    modality_name: ["AugLy", *other_libs]
    for modality_name, other_libs in (
        ("audio", ("pydub", "torchaudio", "audiomentations")),
        ("image", ("imgaug", "torchvision", "albumentations")),
        ("text", ("nlpaug", "textattack", "textflint")),
        ("video", ("moviepy", "pytorchvideo", "vidaug")),
    )
}

In [36]:
# Build the synthetic datapoints that every benchmark below runs on
num_dp = 10
tmpdir = "/tmp/"
data = defaultdict(list)

# Audio: multi-channel Gaussian noise clips. Each clip is saved as a wav file
# and also stored in the in-memory form the different libraries are fed
# (numpy array, pydub AudioSegment, torch tensor).
sample_rate = 44100
duration_s = 10
channels = 2
num_samples = int(sample_rate * duration_s)
for clip_idx in range(num_dp):
    noise = np.random.standard_normal((channels, num_samples))
    wav_path = os.path.join(tmpdir, f"audio_{clip_idx}.wav")
    aud_utils.ret_and_save_audio(noise, wav_path, sample_rate)
    segment = AudioSegment.from_file(wav_path)
    noise_tensor = torch.Tensor(noise)
    data["audio"].append([noise, sample_rate])
    data["audio_pydub"].append([segment])
    data["audio_torchaudio"].append([noise_tensor, sample_rate])

# Create images
# Image arrays are laid out as (rows, columns, channels), i.e.
# (height, width, channels), so height has to come first for the generated
# images to really be `width` x `height` pixels.
width, height, channels = 1920, 1080, 3
for i in range(num_dp):
    # Uniform random noise scaled to the full 0-255 uint8 range
    dp_np = (
        np.random.rand(height, width, channels) * 255
    ).astype("uint8")
    im = Image.fromarray(dp_np).convert("RGBA")
    # AugLy (and any library without its own entry) gets the PIL image;
    # imgaug and albumentations are fed the raw numpy array
    data["image"].append([im])
    data["image_imgaug"].append([dp_np])
    data["image_albumentations"].append([dp_np])

# Text: a fixed set of short English sentences
sentences = [
    "Hello! How are you today?",
    (
        "The decision to move the photocopier business was "
        "done for privacy reasons."
    ),
    "I am twenty years old and have two brothers",
    (
        "The National Weather Service is calling for a string "
        "of cold, wet storms coming to Northern California for "
        "the rest of the week."
    ),
    "I can not believe he said that!",
    "The victim was less than a quarter century old",
    "She likes to eat a croissant and coffee for breakfast",
    "knock knock who's there",
    "Adam lives with his mother, Amanda, and his brother, Lee",
    (
        "It almost never rains here in California. I do not even "
        "remember the last time I saw snow!"
    ),
]
# Every datapoint is a one-element argument list
data["text"].extend([sentence] for sentence in sentences)

# textattack and textflint take their own wrapper types instead of raw strings
for text_args in data["text"]:
    raw_text = text_args[0]
    data["text_textattack"].append([AttackedText(raw_text)])
    data["text_textflint"].append([UTSample({"x": raw_text})])

# used for reading videos using torchvision since we need to supply
# length as an arg for read_video
def get_length(filename):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filename: path of the file to probe.

    Returns:
        The duration printed by ffprobe, converted to a float.

    Raises:
        ValueError: if ffprobe's output cannot be parsed as a number (for
            example because the file is missing or unreadable). The message
            includes ffprobe's own diagnostics.
    """
    result = subprocess.run(
        [
            "ffprobe", "-v", "error", "-show_entries",
            "format=duration", "-of",
            "default=noprint_wrappers=1:nokey=1", filename
        ],
        stdout=subprocess.PIPE,
        # Keep diagnostics on their own stream so they can never be mixed
        # into the number parsed from stdout
        stderr=subprocess.PIPE,
    )
    try:
        return float(result.stdout)
    except ValueError:
        details = result.stderr or result.stdout or b""
        if isinstance(details, bytes):
            details = details.decode(errors="replace")
        raise ValueError(
            f"ffprobe could not determine the duration of {filename!r}: "
            f"{details.strip()}"
        ) from None

# Video: solid-colour clips whose audio track is replaced by the matching
# synthetic wav file created above
for vid_idx in range(num_dp):
    video_path = os.path.join(tmpdir, f"video_{vid_idx}.mp4")
    # Three draws, one per colour component
    rgb = tuple(np.random.randint(0, 255) for _ in range(3))
    vid_helpers.create_color_video(
        video_path, duration_s, height, width, color=rgb
    )
    wav_path = os.path.join(tmpdir, f"audio_{vid_idx}.wav")
    vidaugs.audio_swap(video_path, audio_path=wav_path)

    # AugLy works on the file path; moviepy on an opened clip
    data["video"].append([video_path])
    data["video_moviepy"].append([VideoFileClip(video_path)])

    # read_video is given an explicit end point, so probe the duration first
    clip_length_s = get_length(video_path)
    frames, *_ = vd.read_video(video_path, 0, clip_length_s, "sec")
    # pytorchvideo gets the frames with axes reordered by permute(0, 3, 1, 2);
    # vidaug gets the same frames as a numpy array
    data["video_pytorchvideo"].append([frames.permute(0, 3, 1, 2)])
    data["video_vidaug"].append([frames.detach().cpu().numpy()])

In [37]:
# For many augmentations we will use the default kwargs when calling
# them, but for some we want to override the defaults, so let's define
# those here
def _random_rgb_image():
    """Return a random-noise RGB PIL image that is `width` x `height` pixels.

    The noise is scaled to 0-255 *before* the uint8 cast: casting first
    truncates every value in [0, 1) to 0 and yields a solid black image.
    The array is built as (height, width, channels), which is the axis order
    image arrays use.
    """
    pixels = (np.random.rand(height, width, channels) * 255).astype("uint8")
    return Image.fromarray(pixels).convert("RGB")


# Maps modality -> AugLy transform class name -> constructor kwargs
transforms_nondefault_kwargs = {
    "audio": {
        "ChangeVolume": {"volume_db": 5.0},
        "Clip": {"offset_factor": 0.2, "duration_factor": 0.5},
        "InsertInBackground": {"offset_factor": 0.6},
    },
    "image": {
        "Brightness": {"factor": 2.0},
        "ColorJitter": {
            "brightness_factor": 0.5,
            "contrast_factor": 2.0,
            "saturation_factor": 0.3,
        },
        "Contrast": {"factor": 0.6},
        "ConvertColor": {"mode": "P"},
        "MaskedComposite": {
            "transform_function": imaugs.Brightness(factor=0.2),
            "mask": None,
        },
        "Opacity": {"level": 0.5},
        "OverlayImage": {
            "overlay": _random_rgb_image(),
            "opacity": 0.7
        },
        "OverlayOntoBackgroundImage": {
            "background_image": _random_rgb_image(),
            "overlay_size": 0.4,
        },
        "Pixelization": {"ratio": 0.3},
        "Resize": {"width": 1300, "height": 1200},
        "Saturation": {"factor": 3.0},
        "Sharpen": {"factor": 4.0},
        "ShufflePixels": {"factor": 0.5},
    },
    "text": {
        "ChangeCase": {"cadence": 3.0},
        "Contractions": {"aug_p": 1.0},
        "InsertPunctuationChars": {"cadence": 2.5, "granularity": "word"},
        "InsertWhitespaceChars": {"cadence": 2.5},
        "InsertZeroWidthChars": {"cadence": 2.5},
        "ReplaceBidirectional": {"granularity": "word"},
        "ReplaceFunFonts": {"granularity": "word", "vary_fonts": True},
        "SwapGenderedWords": {"aug_word_p": 1.0},
    },
    "video": {
        # Transforms that need a second input reuse the generated datapoints
        "AudioSwap": {
            "audio_path": os.path.join(tmpdir, "audio_0.wav"),
            "offset": 0.5,
        },
        "AugmentAudio": {
            "audio_aug_function": audaugs.normalize,
        },
        "BlendVideos": {"overlay_path": data["video"][-1][0]},
        "ChangeVideoSpeed": {"factor": 2.0},
        "ColorJitter": {
            "brightness_factor": 0.5,
            "contrast_factor": -10.0,
            "saturation_factor": 2.0,
        },
        "Concat": {
            "other_video_paths": [d[0] for d in data["video"][-2:]],
        },
        "HStack": {"second_video_path": data["video"][-1][0]},
        "InsertInBackground": {"offset_factor": 0.3},
        "Loop": {"num_loops": 2},
        "Overlay": {"overlay_path": data["video"][2][0]},
        "OverlayOntoBackgroundVideo": {
            "background_path": data["video"][1][0],
        },
        "OverlayShapes": {"num_shapes": 3},
        "Pixelization": {"ratio": 0.4},
        "ReplaceWithColorFrames": {
            "offset_factor": 0.2, "duration_factor": 0.4
        },
        "Resize": {"height": 1300, "width": 1200},
        "Shift": {"x_factor": 0.1, "y_factor": 0.6},
        "TimeCrop": {"offset_factor": 0.2, "duration_factor": 0.7},
        "Trim": {"start": 2.0, "end": 7.3},
        "VStack": {"second_video_path": data["video"][0][0]},
    },
}

In [38]:
# Map the AugLy augmentation names to analogues in other libraries for
# the modality we're benchmarking.
#
# other_lib_funcs maps AugLy transform name -> library name -> callable.
# The third-party libraries are imported inside the matching branch so only
# the libraries of the selected modality have to be importable.
other_lib_funcs = {}
if modality == "audio":
    import audiomentations
    import torchaudio.backend
    from torchaudio.sox_effects import apply_effects_tensor

    # The pydub entries must *call* the effect: returning the bound method
    # (e.g. `audio_seg.apply_gain`) would time an attribute lookup instead of
    # the augmentation. Parameters mirror the torchaudio/AugLy settings.
    other_lib_funcs = {
        "AddBackgroundNoise": {
            "audiomentations": audiomentations.AddGaussianNoise(),
        },
        "ChangeVolume": {
            "audiomentations": audiomentations.Gain(),
            "pydub": lambda audio_seg: audio_seg.apply_gain(5.0),
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["vol", "5.0"]]
            ),
        },
        "Clip": {
            "audiomentations": audiomentations.Clip(),
        },
        "HighPassFilter": {
            "pydub": lambda audio_seg: audio_seg.high_pass_filter(3000),
            "audiomentations": audiomentations.HighPassFilter(),
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["highpass", "3000"]]
            ),
        },
        "LowPassFilter": {
            "pydub": lambda audio_seg: audio_seg.low_pass_filter(500),
            "audiomentations": audiomentations.LowPassFilter(),
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["lowpass", "500"]]
            ),
        },
        "Normalize": {
            "pydub": lambda audio_seg: audio_seg.normalize(),
            "audiomentations": audiomentations.Normalize(),
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["norm", "-n"]]
            ),
        },
        "PitchShift": {
            "audiomentations": audiomentations.PitchShift(),
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["pitch", "1.0"]]
            ),
        },
        "Reverb": {
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["reverb", "50.0"]]
            ),
        },
        "Speed": {
            "pydub": lambda audio_seg: audio_seg.speedup(
                playback_speed=2.0
            ),
        },
        "TimeStretch": {
            "audiomentations": audiomentations.TimeStretch(),
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["stretch", "1.5"]]
            ),
        },
        "ToMono": {
            "torchaudio": lambda t, sr: apply_effects_tensor(
                t, sr, [["channels", "1"]]
            ),
        },
    }
elif modality == "image":
    from albumentations.augmentations import transforms as alb
    from albumentations.augmentations.crops import transforms as alb_crops
    from albumentations.augmentations.geometric import (
        resize as alb_resize,
        rotate as alb_rotate,
        transforms as alb_geo,
    )
    from albumentations.core import transforms_interface as alb_core
    from imgaug import augmenters as imgaug
    from torchvision import transforms as torchvision

    # albumentations transforms are given p=1.0 (as Blur and RandomBrightness
    # already were) so they are applied on every call; otherwise a transform
    # whose default probability is below 1 would be skipped on part of the
    # calls and look faster than it is.
    other_lib_funcs = {
        "ApplyLambda": {
            "albumentations": alb_core.NoOp(),
        },
        "Blur": {
            "imgaug": imgaug.blur.GaussianBlur(sigma=2),
            "torchvision": torchvision.GaussianBlur(kernel_size=1),
            "albumentations": alb.Blur(p=1.0),
        },
        "Brightness": {
            "torchvision": torchvision.ColorJitter(brightness=2.0),
            "albumentations": alb.RandomBrightness(p=1.0),
        },
        "ColorJitter": {
            "imgaug": imgaug.color.AddToHueAndSaturation(value=20),
            "torchvision": torchvision.ColorJitter(0.5, 0.2, 0.3),
            "albumentations": alb.ColorJitter(0.5, 0.2, 0.3, p=1.0),
        },
        "Contrast": {
            "imgaug": imgaug.contrast.LinearContrast(),
            "torchvision": torchvision.ColorJitter(contrast=2.0),
        },
        "Crop": {
            "imgaug": imgaug.size.Crop(percent=0.25),
            "torchvision": torchvision.CenterCrop((960, 540)),
            "albumentations": alb_crops.Crop(480, 270),
        },
        "EncodingQuality": {
            "imgaug": imgaug.arithmetic.JpegCompression(compression=50),
            "albumentations": alb.Downscale(p=1.0),
        },
        "Grayscale": {
            "imgaug": imgaug.color.Grayscale(),
            "torchvision": torchvision.Grayscale(),
            "albumentations": alb.ToGray(p=1.0),
        },
        "HFlip": {
            "imgaug": imgaug.flip.HorizontalFlip(),
            "torchvision": torchvision.RandomHorizontalFlip(p=1.0),
            "albumentations": alb.HorizontalFlip(p=1.0),
        },
        "Pad": {
            "imgaug": imgaug.size.Pad(percent=0.25),
            "torchvision": torchvision.Pad((480, 270)),
            "albumentations": alb.PadIfNeeded(
                min_height=1350, min_width=2400
            ),
        },
        "PerspectiveTransform": {
            "imgaug": imgaug.geometric.PerspectiveTransform(scale=0.05),
            "torchvision": torchvision.RandomPerspective(p=1.0),
            "albumentations": alb_geo.Perspective(p=1.0),
        },
        "Pixelization": {
            "imgaug": imgaug.imgcorruptlike.Pixelate(severity=3),
        },
        "Resize": {
            "imgaug": imgaug.size.Resize(size=(1300, 1200)),
            "torchvision": torchvision.Resize((1300, 1200)),
            "albumentations": alb_resize.Resize(1200, 1300),
        },
        "Rotate": {
            "imgaug": imgaug.geometric.Rotate(),
            "torchvision": torchvision.RandomRotation(degrees=40),
            "albumentations": alb_rotate.Rotate(p=1.0),
        },
        "Saturation": {
            "imgaug": imgaug.imgcorruptlike.Saturate(severity=3),
            "torchvision": torchvision.ColorJitter(saturation=2.0),
            "albumentations": alb.HueSaturationValue(p=1.0),
        },
        "Sharpen": {
            "imgaug": imgaug.convolutional.Sharpen(
                alpha=1.0, lightness=1.0
            ),
            "torchvision": torchvision.RandomAdjustSharpness(2, p=1.0),
            "albumentations": alb.Sharpen(p=1.0),
        },
        "VFlip": {
            "imgaug": imgaug.flip.VerticalFlip(),
            "torchvision": torchvision.RandomVerticalFlip(p=1.0),
            "albumentations": alb.VerticalFlip(p=1.0),
        },
    }
elif modality == "text":
    from textattack import augmentation as textattack_aug
    from textattack.transformations import (
        word_swaps as textattack_ws,
        word_merges as textattack_wm,
    )
    from nlpaug.augmenter import char as nlpaug_c, word as nlpaug_w
    from textflint.generation.transformation import UT as textflint

    other_lib_funcs = {
        "ChangeCase": {
            "textflint": textflint.WordCase().transform,
        },
        "Contractions": {
            "textattack": textattack_ws.WordSwapContract(),
            "textflint": textflint.Contraction().transform,
        },
        "InsertPunctuationChars": {
            "textattack": (
                textattack_ws.WordSwapRandomCharacterInsertion()
            ),
            "textflint": textflint.Punctuation().transform,
        },
        "ReplaceSimilarChars": {
            "nlpaug": nlpaug_c.ocr.OcrAug().augment,
            "textattack": textattack_ws.WordSwapHomoglyphSwap(),
            "textflint": textflint.Ocr().transform,
        },
        "SimulateTypos": {
            "nlpaug": nlpaug_w.spelling.SpellingAug().augment,
            "textattack": textattack_aug.CharSwapAugmenter().augment,
            "textflint": textflint.SpellingError().transform,
        },
        "SplitWords": {
            "nlpaug": nlpaug_w.split.SplitAug().augment,
        },
        "SwapGenderedWords": {
            "textflint": (
                textflint.Prejudice(change_type="Name").transform
            ),
        },
    }
elif modality == "video":
    import moviepy.audio.fx.all as moviepy_audio
    import moviepy.video.fx.all as moviepy
    from pytorchvideo.transforms import augmentations as pytorchvideo
    from vidaug import augmentors as vidaug

    other_lib_funcs = {
        "AddNoise": {
            "vidaug": vidaug.Add(value=10),
        },
        "AugmentAudio": {
            "moviepy": moviepy_audio.audio_normalize,
        },
        "Blur": {
            "pytorchvideo": lambda vid: pytorchvideo._adjust_sharpness(
                vid, factor=0
            ),
            "vidaug": vidaug.GaussianBlur(sigma=1),
        },
        "Brightness": {
            "pytorchvideo": lambda vid: pytorchvideo._adjust_brightness(
                vid, factor=0.15
            ),
        },
        "ColorJitter": {
            "moviepy": lambda vid: moviepy.colorx(vid, factor=2.0),
            "pytorchvideo": lambda vid: pytorchvideo._adjust_saturation(
                vid, factor=10.0
            ),
        },
        "Crop": {
            "moviepy": lambda vid: moviepy.crop(
                vid, 0.25, 0.25, 0.75, 0.75
            ),
            "vidaug": vidaug.CenterCrop(size=(960, 540)),
        },
        "ChangeVideoSpeed": {
            "moviepy": lambda vid: moviepy.speedx(vid, factor=2.0),
            "vidaug": vidaug.Upsample(2.0),
        },
        "Grayscale": {
            "moviepy": moviepy.blackwhite,
            "pytorchvideo": pytorchvideo._autocontrast,
        },
        "HFlip": {
            "moviepy": moviepy.mirror_x,
            "pytorchvideo": lambda vid: pytorchvideo._translate_x(
                vid, factor=1, fill=1
            ),
            "vidaug": vidaug.HorizontalFlip(),
        },
        "Loop": {
            "moviepy": lambda vid: moviepy.loop(vid, n=2),
        },
        "Pad": {
            "moviepy": lambda vid: moviepy.margin(vid, mar=20),
        },
        "Pixelization": {
            "vidaug": vidaug.Superpixel(p_replace=0.5, n_segments=10),
        },
        "Resize": {
            "moviepy": lambda vid: moviepy.resize(
                vid, width=1300, height=1200
            ),
        },
        "Rotate": {
            "moviepy": lambda vid: moviepy.rotate(vid, angle=90),
            "pytorchvideo": lambda vid: pytorchvideo._rotate(
                vid, factor=90, fill=1
            ),
            "vidaug": vidaug.RandomRotate(degrees=90),
        },
        "Shift": {
            "vidaug": vidaug.RandomTranslate(),
        },
        "TimeCrop": {
            "vidaug": vidaug.TemporalRandomCrop(size=1),
        },
        "VFlip": {
            "moviepy": moviepy.mirror_y,
            "pytorchvideo": lambda vid: pytorchvideo._translate_y(
                vid, factor=1, fill=1
            ),
            "vidaug": vidaug.VerticalFlip(),
        },
    }
else:
    # Fail here with a clear message instead of letting the next cell die
    # with a bare KeyError on the per-modality lookup tables
    raise ValueError(f"Modality {modality} is not supported")

In [39]:
# Pick the AugLy module, the kwarg overrides and the library list that belong
# to the modality being benchmarked
module, transforms_kwargs, libs = (
    modules[modality],
    transforms_nondefault_kwargs[modality],
    lib_names[modality],
)

In [None]:
def _is_benchmarked(member_name):
    """Tell whether a member of the AugLy module should be benchmarked.

    Kept are names starting with an upper-case letter, minus the
    `*_intensity` helpers, the `Random*` variants and the `Compose`/`OneOf`
    containers.
    """
    if member_name in {"Compose", "OneOf"}:
        return False
    if member_name.endswith("_intensity") or member_name.startswith("Random"):
        return False
    return member_name[0].isupper()


# Maps transform name -> library name -> callable. The AugLy entry is the
# member instantiated with its kwarg overrides (if any); the other entries are
# the analogues registered for that transform in other_lib_funcs.
transforms_name_to_callables = {}
for member_name, member in inspect.getmembers(module):
    if not _is_benchmarked(member_name):
        continue
    transforms_name_to_callables[member_name] = {
        "AugLy": member(**transforms_kwargs.get(member_name, {})),
        **other_lib_funcs.get(member_name, {}),
    }

# Total number of (transform, library) analogues across all other libraries
num_other_lib_augs = sum(
    len(per_lib_funcs) for per_lib_funcs in other_lib_funcs.values()
)

num_augly_augs = len(transforms_name_to_callables)
num_datapoints = len(data[modality])
print(
    f"Starting benchmarking on {modality} modality: will run "
    f"{num_augly_augs} AugLy augmentations & "
    f"{num_other_lib_augs} other libraries' augmentations on "
    f"{num_datapoints} data points"
)

In [None]:
# Audio augmentations that are benchmarked on a single channel
mono_only = ["HighPassFilter", "LowPassFilter"]


def _build_call_args(tname, lib, index):
    """Return the positional arguments for one benchmarked call.

    Args:
        tname: AugLy name of the augmentation being benchmarked.
        lib: name of the library whose implementation will be called.
        index: position of the datapoint in the per-modality data lists.

    Returns:
        A fresh list of arguments, so the stored datapoints are never
        modified.
    """
    lib_key = f"{modality}_{lib}"
    # Use the library-specific representation when one was prepared. The
    # textattack analogue of SimulateTypos is the exception and receives the
    # default datapoint. Strings are compared with ==, not `is`: identity of
    # equal strings is an interpreter detail.
    use_lib_data = lib_key in data and not (
        tname == "SimulateTypos" and lib == "textattack"
    )
    source = data[lib_key] if use_lib_data else data[modality]
    args = list(source[index])

    if modality == "audio" and tname in mono_only:
        if lib == "pydub":
            # Indexing an AudioSegment selects a millisecond, not a channel,
            # so the first channel has to be split off explicitly
            args[0] = args[0].split_to_mono()[0]
        else:
            args[0] = args[0][0]
            if lib == "torchaudio":
                # Restore the leading channel dimension that indexing removed
                args[0] = args[0].reshape((1, -1))
    if modality == "video" and lib == "AugLy":
        # Extra trailing argument for AugLy video transforms: an output file
        args.append(os.path.join(tmpdir, "video_out.mp4"))
    return args


# One row per augmentation: (name, avg runtime per library in `libs` order);
# a library without an analogue for the augmentation gets None
metrics = []
for tname, transforms in transforms_name_to_callables.items():
    print(tname)
    num_dp = len(data[modality])
    avg_runtimes_s = []
    for lib in libs:
        transform = transforms.get(lib)
        if transform is None:
            avg_runtimes_s.append(None)
            continue
        # Prepare every input before starting the clock so that only the
        # augmentation calls themselves are timed
        prepared_args = [
            _build_call_args(tname, lib, i) for i in range(num_dp)
        ]
        # imgaug and albumentations take the image as a keyword argument
        takes_image_kwarg = lib in ["imgaug", "albumentations"]
        t0 = time.time()
        for args in prepared_args:
            if takes_image_kwarg:
                transform(image=args[0])
            else:
                transform(*args)
        avg_runtimes_s.append((time.time() - t0) / num_dp)
    metrics.append((tname, *avg_runtimes_s))
    print(
        f"Average runtime on {num_dp} data points: "
        f"{list(zip(libs, avg_runtimes_s))}\n"
    )
    print("----------\n")

In [None]:
def _augly_avg_runtime(row):
    """Sort key: the first runtime column of a metrics row (index 1)."""
    return row[1]


# Order the rows from slowest to fastest on that column and show them as a
# table with one runtime column per library
metrics.sort(key=_augly_avg_runtime, reverse=True)
pd.DataFrame(metrics, columns=["Augmentation"] + list(libs))