Machine perceptual quality evaluation

* Images
  * Dataset: [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k)
  * Model: [Distilled data-efficient Image Transformer (DeiT)](https://huggingface.co/facebook/deit-small-distilled-patch16-224)
  * Metric: Image classification accuracy
  * Compression:
    * JPEG Q=5/100
    * HIFIC
    * TFCI
* Audio
  * Dataset: [Common Voice Corpus 11.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)
  * Model: [Whisper](https://huggingface.co/openai/whisper-small)
  * Metric: Speech recognition word error rate
  * Compression:
    * MP3 8 kbps (torchaudio `compression=8`)
    * Descript
    * EnCodec (48 kHz model, 3 kbps target bandwidth)

In [139]:
import warnings
from io import BytesIO

import encodec
import PIL
import torch
import torchaudio
from compressai.zoo import bmshj2018_factorized
from datasets import load_dataset
from evaluate import evaluator
from torchvision import transforms
from transformers import pipeline

In [2]:
def jpeg_compress_quality_5(sample):
    """Re-encode ``sample['image']`` as a quality-5 JPEG.

    Args:
        sample: Mapping whose ``'image'`` entry is a PIL-style image
            (exposes ``save`` and, for mode handling, ``mode``/``convert``).

    Returns:
        The same ``sample`` object, with ``sample['image']`` replaced by the
        encoded JPEG byte string.
    """
    img = sample['image']

    # Pillow's JPEG writer only accepts these modes and raises OSError for the
    # rest (RGBA, LA, P, ...). Fall back to RGB for anything it cannot encode
    # so one odd image does not abort the whole dataset map.
    jpeg_writable_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    if getattr(img, 'mode', 'RGB') not in jpeg_writable_modes:
        img = img.convert('RGB')

    with BytesIO() as buffer:
        img.save(buffer, format='JPEG', quality=5)
        encoded = buffer.getvalue()

    sample['image'] = encoded
    return sample

In [82]:
def mp3_compress_quality_8(sample):
    """Round-trip ``sample['audio']['array']`` through an in-memory MP3 codec.

    The waveform is written to a ``BytesIO`` buffer with
    ``torchaudio.save(..., format="mp3", compression=8)``, read back with
    ``torchaudio.load``, and the first decoded channel replaces the original
    array.

    Args:
        sample: Mapping with ``sample['audio']['array']`` (a tensor exposing
            ``unsqueeze``) and ``sample['audio']['sampling_rate']``.

    Returns:
        The same ``sample`` object. If the round-trip fails the sample is
        returned unmodified and a ``RuntimeWarning`` is emitted, so an
        uncompressed item cannot slip into the evaluation unnoticed.
    """
    try:
        waveform = sample['audio']['array'].unsqueeze(0)
        fs = sample['audio']['sampling_rate']
        with BytesIO() as f:
            torchaudio.save(f, waveform, sample_rate=fs, format="mp3", compression=8)
            f.seek(0)
            # torchaudio.load returns (waveform, sample_rate); only the waveform is used.
            decoded, _ = torchaudio.load(f, format="mp3")
        sample['audio']['array'] = decoded[0]
    except Exception as err:
        # Stay best-effort, but do not hide the failure and do not swallow
        # KeyboardInterrupt/SystemExit as the former bare ``except`` did.
        warnings.warn(
            f"MP3 round-trip failed ({err!r}); sample left uncompressed",
            RuntimeWarning,
            stacklevel=2,
        )
    return sample

In [None]:
# Pretrained 48 kHz EnCodec model, used below as a lossy audio codec.
model = encodec.EncodecModel.encodec_model_48khz()
model.set_target_bandwidth(3)  # target bandwidth in kbps


def encodec_compress(sample):
    """Round-trip ``sample['audio']`` through the module-level EnCodec ``model``.

    The waveform is converted to the model's sample rate and channel count,
    encoded and decoded without gradients, then averaged over the batch and
    channel dimensions to give a 1-D signal.

    Args:
        sample: Mapping with ``sample['audio']['array']`` (a 1-D tensor) and
            ``sample['audio']['sampling_rate']``.

    Returns:
        The same ``sample`` object with ``sample['audio']['array']`` replaced
        by the decoded signal and ``sample['audio']['sampling_rate']`` set to
        ``model.sample_rate``.
    """
    waveform = sample['audio']['array'].unsqueeze(0)
    fs = sample['audio']['sampling_rate']
    waveform = encodec.utils.convert_audio(waveform, fs, model.sample_rate, model.channels)
    with torch.no_grad():
        encoded_frames = model.encode(waveform.unsqueeze(0))
        decoded = model.decode(encoded_frames)
    sample['audio']['array'] = decoded.mean(dim=[0, 1])
    # The decoded signal lives at the model's rate, not necessarily the input
    # rate; keep the metadata in sync so downstream resampling is correct.
    sample['audio']['sampling_rate'] = model.sample_rate
    return sample

In [141]:
data = load_dataset("imagenet-1k", split="validation[:100]")
jpeg_q5 = data.map(jpeg_compress_quality_5)
rdae = data.map(rdae_compress)

In [145]:
net = bmshj2018_factorized(quality=2, pretrained=True).eval()
def rdae_compress(sample):
    img = sample['image']

    if (img.mode == 'L') | (img.mode == 'CMYK') | (img.mode == 'RGBA'):
        rgbimg = PIL.Image.new("RGB", img.size)
        rgbimg.paste(img)
        img = rgbimg

    x = transforms.ToTensor()(img).unsqueeze(0)
    with torch.no_grad():
        out_net = net.forward(x)
    out_net['x_hat'].clamp_(0, 1)
    sample['image'] = transforms.ToPILImage()(out_net['x_hat'].squeeze())

    return sample

In [147]:
# Image-classification pipeline built on the distilled DeiT-small checkpoint.
# NOTE: `pipe` and `task_evaluator` are rebound to the speech-recognition
# pipeline/evaluator further down, so the image cells must run before that point.
pipe = pipeline(
    task="image-classification",
    model="facebook/deit-small-distilled-patch16-224"
)

# Evaluator whose `compute(...)` method the accuracy cells below call.
task_evaluator = evaluator("image-classification")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [5]:
# Baseline: accuracy of `pipe` on `data` (the ImageNet subset as loaded, no codec applied).
# `label_mapping` passes the model's label2id table so predictions and dataset labels can be compared.
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    metric="accuracy",
    label_mapping=pipe.model.config.label2id
)
eval_results

{'accuracy': 0.81,
 'total_time_in_seconds': 7.377623004023917,
 'samples_per_second': 13.554501218815032,
 'latency_in_seconds': 0.07377623004023917}

In [6]:
# Accuracy of `pipe` on `jpeg_q5`, the subset mapped through jpeg_compress_quality_5.
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=jpeg_q5,
    metric="accuracy",
    label_mapping=pipe.model.config.label2id
)
eval_results

{'accuracy': 0.7,
 'total_time_in_seconds': 7.054108065029141,
 'samples_per_second': 14.1761366679016,
 'latency_in_seconds': 0.0705410806502914}

In [148]:
# Accuracy of `pipe` on `rdae`, the subset mapped through rdae_compress.
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=rdae,
    metric="accuracy",
    label_mapping=pipe.model.config.label2id
)
eval_results

{'accuracy': 0.78,
 'total_time_in_seconds': 8.429108447977342,
 'samples_per_second': 11.863650897028869,
 'latency_in_seconds': 0.08429108447977342}

In [89]:
# First 40 items of the English Common Voice 11.0 validation split.
data = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="validation[:40]")

# Both codec functions call tensor methods on the audio array, so they are
# mapped over a torch-formatted view; results are handed back in numpy format.
_data_torch = data.with_format("torch")
mp3_q8 = _data_torch.map(mp3_compress_quality_8).with_format('numpy')
encodec_q48 = _data_torch.map(encodec_compress).with_format("numpy")

In [87]:
# Rebinds `pipe` (previously the image classifier) to a Whisper-small
# speech-recognition pipeline.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# Rebinds `task_evaluator` to the speech-recognition evaluator.
task_evaluator = evaluator("automatic-speech-recognition")
# Remove the 'truncation' entry from the evaluator's pipeline kwargs (no-op if
# absent) — presumably because the ASR pipeline call does not accept it; TODO confirm.
task_evaluator.PIPELINE_KWARGS.pop('truncation', None)

In [90]:
# Baseline: word error rate of `pipe` on `data` (the Common Voice subset as
# loaded, no codec applied), scoring against the `sentence` column.
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    input_column="audio",
    label_column="sentence",
    metric="wer",
)
eval_results



{'wer': 0.24324324324324326,
 'total_time_in_seconds': 129.5941981250071,
 'samples_per_second': 0.3086557930735127,
 'latency_in_seconds': 3.2398549531251777}

In [91]:
# Word error rate of `pipe` on `mp3_q8`, the subset mapped through mp3_compress_quality_8.
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=mp3_q8,
    input_column="audio",
    label_column="sentence",
    metric="wer",
)
eval_results

{'wer': 0.327027027027027,
 'total_time_in_seconds': 130.1211449749535,
 'samples_per_second': 0.3074058409776477,
 'latency_in_seconds': 3.2530286243738376}

In [135]:
# Word error rate of `pipe` on `encodec_q48`, the subset mapped through encodec_compress.
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=encodec_q48,
    input_column="audio",
    label_column="sentence",
    metric="wer",
)
eval_results



{'wer': 0.5054054054054054,
 'total_time_in_seconds': 135.95577442500507,
 'samples_per_second': 0.2942133217156179,
 'latency_in_seconds': 3.398894360625127}