In [1]:
%load_ext autoreload
%autoreload 2

%load_ext dotenv
%dotenv

In [2]:
import json
import os
from collections import Counter, defaultdict
from pathlib import Path
from typing import List, Optional

In [3]:
import numpy as np
import torch
import tqdm
from pydantic import parse_file_as

In [4]:
from arch.edenai_model import GoogleNSFWModel, ProviderResponse, ResponseItem, ResponseLabel, GoogleResponseLabel
from dataset import load_imagenet_nsfw_test_data

2022-12-02 13:45:38.698687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-02 13:45:38.877332: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-02 13:45:39.417907: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/local/home/edebenedetti/miniconda3/envs/pt/lib/
2022-12-02 13:45:39.417983: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libn

In [5]:
OUT_DIR = Path("nsfw_filters_results")
dl = load_imagenet_nsfw_test_data()
device = torch.device("cuda:0")

Using custom data configuration dedeswim--imagenet-nsfw-acd0b4b4851f04c2
Found cached dataset parquet (/data/huggingface/datasets/dedeswim___parquet/dedeswim--imagenet-nsfw-acd0b4b4851f04c2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [6]:
def classify_dl(dl, model, out_file):
    overall_results = []
    for batch in tqdm.tqdm(dl):
        result = model.make_request(batch["image"].squeeze())
        overall_results.append(result)
    results_json = [result.json() for result in overall_results]
    with out_file.open("w") as f:
        json.dump(results_json, f)
    return overall_results

def restore_results(label_type, out_file):
    with out_file.open("r") as f:
        restored_json_results = json.load(f)
    return [
        ProviderResponse[label_type](**json.loads(restored_result)) for restored_result in restored_json_results
    ]

In [7]:
google_test_model = GoogleNSFWModel(device)
test_dl = list(map(lambda it: it[0], zip(dl, range(10))))
google_test_overall_results = classify_dl(test_dl, google_test_model, OUT_DIR / "google_test.json")
restored_google_test_overall_results = restore_results(GoogleResponseLabel, OUT_DIR / "google_test.json")
google_test_overall_results == restored_google_test_overall_results

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.89it/s]


True

In [8]:
# Running this cell costs ~1.5$

# google_model = GoogleNSFWModel(device, api_key=os.environ["EDENAI_API_KEY"])
# google_overall_results = classify_dl(dl, google_model, OUT_DIR / "google.json")

In [9]:
google_overall_results = restore_results(GoogleResponseLabel, OUT_DIR / "google.json")

In [10]:
def remove_status(provider_responses: List[ProviderResponse]) -> List[ResponseItem]:
    return [response.items for response in provider_responses]

def filter_label(items_list: List[List[ResponseItem]], label: ResponseLabel) -> List[ResponseItem]:
    return [item for items in items_list for item in items if item.label == label]

def filter_items_with_score(items: List[ResponseItem], score: int) -> List[ResponseItem]:
    return [item for item in items if item[1].likelihood == score]

def get_indices_with_score_for_label(items: List[ProviderResponse], label: ResponseLabel, score: int, out_file: Optional[Path]) -> List[int]:
    overall_items = remove_status(items)
    items_with_label = filter_label(overall_items, label)
    items_with_label_and_score = filter_items_with_score(enumerate(items_with_label), score)
    indices = list(map(lambda x: x[0], items_with_label_and_score))
    if out_file is not None:
        np.save(out_file, np.array(indices))
    return indices

racy_items_5_indices = get_indices_with_score_for_label(google_overall_results, GoogleResponseLabel.Racy, 5, OUT_DIR / "google_racy_five_indices.npy")
len(racy_items_5_indices)

758

In [11]:
len(np.load("nsfw_filters_results/google_racy_five_indices.npy"))

758

In [12]:
from arch.edenai_model import API4AINSFWModel, API4AIResponseLabel

In [13]:
api4ai_test_model = API4AINSFWModel(device)
test_dl = list(map(lambda it: it[0], zip(dl, range(10))))
api4ai_test_overall_results = classify_dl(test_dl, api4ai_test_model, OUT_DIR / "api4ai_test.json")
restored_api4ai_test_overall_results = restore_results(API4AIResponseLabel, OUT_DIR / "api4ai_test.json")
api4ai_test_overall_results == restored_api4ai_test_overall_results

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.45it/s]


True

In [14]:
# api4ai_model = API4AINSFWModel(device, api_key=os.environ["EDENAI_API_KEY"])
# api4ai_overall_results = classify_dl(dl, api4ai_model, OUT_DIR / "api4ai.json")

In [15]:
api4ai_overall_results = restore_results(API4AIResponseLabel, OUT_DIR / "api4ai.json")

In [16]:
nsfw_items_5_indices = get_indices_with_score_for_label(api4ai_overall_results, API4AIResponseLabel.nsfw, 5, OUT_DIR / "api4ai_nsfw_five_indices.npy")
len(nsfw_items_5_indices)

829

In [17]:
from arch.edenai_model import AmazonNSFWModel, AmazonResponseLabel

In [18]:
amazon_test_model = AmazonNSFWModel(device)
test_dl = list(map(lambda it: it[0], zip(dl, range(10))))
amazon_test_overall_results = classify_dl(test_dl, amazon_test_model, OUT_DIR / "amazon_test.json")
restored_amazon_test_overall_results = restore_results(AmazonResponseLabel, OUT_DIR / "amazon_test.json")
amazon_test_overall_results == restored_amazon_test_overall_results

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.50it/s]


True

In [19]:
# amazon_model = AmazonNSFWModel(device, api_key=os.environ["EDENAI_API_KEY"])
# amazon_overall_results = classify_dl(dl, amazon_model, OUT_DIR / "amazon.json")

In [20]:
amazon_overall_results = restore_results(AmazonResponseLabel, OUT_DIR / "amazon.json")

In [28]:
suggestive_items_5_images = get_indices_with_score_for_label(amazon_overall_results, AmazonResponseLabel.Suggestive, 5, OUT_DIR / "amazon_suggestive_five_indices.npy")
len(suggestive_items_5_images)

607

In [21]:
amazon_model = AmazonNSFWModel(device, api_key=os.environ["EDENAI_API_KEY"])
amazon_overall_results_replay = classify_dl(dl, amazon_model, OUT_DIR / "amazon_replay.json")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [09:10<00:00,  1.82it/s]


In [22]:
amazon_overall_results_replay == amazon_overall_results

True

In [23]:
len(get_indices_with_score_for_label(amazon_overall_results, AmazonResponseLabel.Suggestive, 5, OUT_DIR / "amazon_suggestive_five_indices.npy"))

607

In [34]:
amazon_overall_results_replay[suggestive_items_5_images[18]]

ProviderResponse[AmazonResponseLabel](status=<ResponseStatus.success: 'success'>, items=[])