In [9]:
# Copyright (c) Meta Platforms, Inc. and affiliates
# All rights reserved.
#
# This source code is licensed under the license found in the
# MIT_LICENSE file in the root directory of this source tree.

# MUTOX toxicity classification

Mutox enables toxicity scoring for speech and text using SONAR embeddings and a classifier trained with a _Binary Cross Entropy loss with logits_ objective. The classifier therefore outputs raw logits; to obtain probabilities, apply a sigmoid function to its output. This notebook demonstrates encoding speech and text into SONAR embeddings and classifying their toxicity.

In [1]:
from pathlib import Path

import torch

# Use the first CUDA GPU with half precision when one is available;
# otherwise fall back to the CPU with single precision.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
dtype = torch.float16 if use_cuda else torch.float32

# Speech Scoring

1. download some demo audio segments
2. create a tsv file to feed to the speech scoring pipeline
3. load the model and build the pipeline
4. go through the batches in the pipeline

In [2]:
# get demo file
import tempfile
import urllib.request

# (remote URL, local file name) pairs for the demo audio clips.
files = [
    (
        "https://dl.fbaipublicfiles.com/seamless/tests/commonvoice_example_en_clocks.wav",
        "commonvoice_example_en_clocks.wav",
    ),
    (
        "https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav",
        "LJ037-0171_sr16k.wav",
    ),
]

# Everything lives in a fresh scratch directory: the downloaded clips and
# the manifest that lists them.
tmpdir = Path(tempfile.mkdtemp())
tsv_file = tmpdir / "data.tsv"

with tsv_file.open("w") as manifest:
    # Header row: a single column named "path".
    manifest.write("path\n")
    for url, filename in files:
        local_path = tmpdir / filename
        urllib.request.urlretrieve(url, local_path)
        # One row per downloaded clip.
        manifest.write(f"{local_path}\n")

In [None]:
from sonar.inference_pipelines.mutox_speech import MutoxSpeechClassifierPipeline
from sonar.inference_pipelines.speech import SpeechInferenceParams

# Load the classifier and speech encoder identified by these names onto
# the selected device. The returned object is used to build the scoring
# pipeline.
pipeline_builder = MutoxSpeechClassifierPipeline.load_model_from_name(
    encoder_name="sonar_speech_encoder_eng",
    mutox_classifier_name="sonar_mutox",
    device=device,
)

In [6]:
# Inference settings: read audio paths from column 0 of the TSV manifest
# and process them in batches of 4.
speech_params = SpeechInferenceParams(
    data_file=tsv_file,
    audio_root_dir=None,
    audio_path_index=0,
    target_lang="eng",
    batch_size=4,
    pad_idx=0,
    device=device,
    fbank_dtype=torch.float32,
    n_parallel=4,
)

pipeline = pipeline_builder.build_pipeline(speech_params)

**Note:** This model was trained using a "Binary Cross Entropy loss with logits" objective (as described in the paper), so the scores printed below are raw logits, not probabilities. To convert the model's output into probabilities, apply a sigmoid function (e.g. `torch.sigmoid`) to the output.


In [7]:
# Print one tab-separated "<audio path>\t<classifier output>" line per clip.
for batch in pipeline:
    audio = batch["audio"]
    paths, scores = audio["path"], audio["data"]
    for row in range(len(paths)):
        print(f"{paths[row]!s}\t{scores[row].item()!s}")

/tmp/tmpqasvhgx6/commonvoice_example_en_clocks.wav	-42.40079116821289
/tmp/tmpqasvhgx6/LJ037-0171_sr16k.wav	-47.90427780151367


In [8]:
# Clean up the scratch directory holding the downloaded clips and manifest.
import shutil

# Guard on existence so re-running this cell after a successful cleanup is
# a no-op instead of raising FileNotFoundError.
if tmpdir.exists():
    shutil.rmtree(tmpdir)

# Text Scoring

1. load the sonar text encoder
2. load the mutox classifier model
3. compute embedding for a sentence
4. score this embedding

In [None]:
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
from sonar.models.mutox.loader import load_mutox_model

# Text-to-embedding pipeline; the encoder and tokenizer share one name.
t2vec_model = TextToEmbeddingModelPipeline(
    tokenizer="text_sonar_basic_encoder",
    encoder="text_sonar_basic_encoder",
    device=device,
)

# NOTE(review): not referenced by any cell visible in this notebook.
text_column = "lang_txt"

# Load the classifier on the selected device/dtype, then call `.eval()`.
# The result is re-assigned so the cell does not display the model repr.
classifier = load_mutox_model("sonar_mutox", device=device, dtype=dtype)
classifier = classifier.eval()

Using the cached checkpoint of mutox. Set `force` to `True` to download again.


In [10]:
# Embed one French sentence with the text encoder and score the embedding
# with the classifier, without tracking gradients.
with torch.inference_mode():
    emb = t2vec_model.predict(
        ["De peur que le pays ne se prostitue et ne se remplisse de crimes."],
        source_lang="fra_Latn",
    )
    # Cast the embedding to the dtype the classifier was loaded with rather
    # than forcing float16: on the CPU path `dtype` is float32, and a
    # hard-coded `.half()` would not match the classifier's dtype.
    x = classifier(emb.to(device=device, dtype=dtype))

# Display the classifier output for the sentence.
x

tensor([[-19.7812]], device='cuda:0', dtype=torch.float16)