In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the notebook configuration. The context manager closes the file handle
# right after parsing; JSON is read explicitly as UTF-8 rather than with the
# platform's locale encoding.
with open("settings.json", encoding="utf-8") as settings_file:
    SETTINGS = json.load(settings_file)

# Set to True to recompute the toxicity scores even when a cached parquet
# file already exists.
FORCE_RECOMPUTE = False

In [None]:
# Take the first chat listed under SETTINGS["download"]["from_chats"] and load
# its CSV export, parsing the "date" column as datetimes.
chat0 = SETTINGS["download"]["from_chats"][0]
CHAT0 = pd.read_csv(f"{chat0}.csv", parse_dates=["date"])
CHAT0  # show the loaded frame as the cell output

In [None]:
import torch
from detoxify import Detoxify

# Multilingual Detoxify checkpoint. Use the GPU when one is available and fall
# back to the CPU otherwise, so the cell does not crash on CUDA-less machines.
model1 = Detoxify(
    "multilingual", device="cuda" if torch.cuda.is_available() else "cpu"
)

In [None]:
import torch
from transformers import pipeline

# Russian toxicity classifier served through a transformers pipeline. Use the
# GPU when one is available and fall back to the CPU otherwise, so the cell
# does not crash on CUDA-less machines.
model2 = pipeline(
    model="SkolkovoInstitute/russian_toxicity_classifier",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Sanity-check both classifiers on two harmless greetings.
a = model1.predict(["приветик", "как делишки?"])
# `top_k=None` asks the pipeline for the score of every label. It replaces the
# deprecated `return_all_scores=True` and is the same argument that
# model2_predict passes.
b = model2(["приветик", "как делишки?"], top_k=None)
a, b

In [None]:
def model1_predict(x: pd.Series, batch_size: int = 32) -> np.ndarray:
    """Score messages with ``model1`` in fixed-size batches.

    Args:
        x: Messages to score.
        batch_size: Number of messages passed to ``model1.predict`` per call.

    Returns:
        A 1-D float array with one score per message, in the order of ``x``.
        Each score is the mean, across all entries of the dict returned by
        ``model1.predict``, of that message's values.
    """
    scores = np.zeros((len(x),))
    for start in range(0, len(x), batch_size):
        batch = list(x[start : start + batch_size])
        # Run the model exactly once per batch; the previous version predicted
        # every batch twice and discarded the first result.
        pred = model1.predict(batch)
        # Rows are the model's output categories, columns are messages, so the
        # mean over axis 0 yields one value per message.
        scores[start : start + batch_size] = np.mean(list(pred.values()), axis=0)
    return scores


# Smoke-test the batched scorer on two short messages.
model1_predict(pd.Series(["приветик", "как делишки?"]))

In [None]:
def model2_predict(x: pd.Series):
    """Return the "toxic" score that ``model2`` assigns to each message in ``x``.

    All messages are sent to the pipeline in a single call that requests the
    score of every label (``top_k=None``) with truncation enabled. The result
    is a plain list with one score per message, in input order.
    """
    verdicts = model2(list(x), top_k=None, truncation=True)  # type: ignore
    toxic_scores = []
    for verdict in verdicts:
        # Pick the first entry labelled "toxic" among this message's labels.
        toxic_rows = (row for row in verdict if row["label"] == "toxic")
        toxic_scores.append(next(toxic_rows)["score"])  # type: ignore
    return toxic_scores


# Smoke-test the "toxic"-score extractor on two short messages.
model2_predict(pd.Series(["приветик", "как делишки?"]))

In [None]:
import os

# Reuse the cached scores from "<chat0>.parquet" when the file exists and no
# recompute was requested. Otherwise score every message with both models,
# add the results as columns to CHAT0 and write the cache file.
if os.path.exists(f"{chat0}.parquet") and not FORCE_RECOMPUTE:
    CHAT0 = pd.read_parquet(f"{chat0}.parquet")
else:
    CHAT0["toxicity_1"] = model1_predict(CHAT0["message"])
    CHAT0["toxicity_2"] = model2_predict(CHAT0["message"])
    CHAT0.to_parquet(f"{chat0}.parquet")

In [None]:
# Display the frame after the scoring/caching cell has run.
CHAT0

In [None]:
# Toxicity per calendar day: the daily maximum for both models, plus the daily
# 90th percentile for model 1.

# Imported in this cell because `partial` is used here before any other cell
# imports it; without this the cell fails on a fresh kernel (Restart & Run All).
from functools import partial

q = partial(np.quantile, q=0.9)

by_day = CHAT0.groupby(CHAT0["date"].dt.date)
by_day.agg({"toxicity_1": "max", "toxicity_2": "max"}).plot()
by_day.agg({"toxicity_1": q}).plot()

In [None]:
# Toxicity per sender (from_id): 99th percentile of each model's scores,
# most toxic sender (by model 1) first.
from functools import partial

# NOTE(review): this helper used to be called `q90` although it computes the
# 99th percentile. The value is unchanged and the name now matches it; confirm
# that 99 (and not 90) is what was intended.
q99 = partial(np.percentile, q=99)
q90 = q99  # old name kept so any cell that still refers to `q90` keeps working

by_from_id = CHAT0.groupby("from_id")
ax = (
    by_from_id.agg({"toxicity_1": q99, "toxicity_2": q99})
    .sort_values("toxicity_1", ascending=False)
    .plot.bar()
)

# Name the three leftmost bars and mask every other sender. The label list is
# trimmed to the tick count so the cell also works with fewer than three
# senders.
n_ticks = len(ax.get_xticks())
tick_labels = ["King", "Queen", "Joker"] + ["███"] * max(n_ticks - 3, 0)
# The trailing semicolon suppresses the list-of-Text repr in the cell output.
ax.set_xticklabels(tick_labels[:n_ticks]);

In [None]:
# Mean toxicity per sender for both models, sorted ascending by model 1.
(
    by_from_id[["toxicity_1", "toxicity_2"]]
    .mean()
    .sort_values("toxicity_1")
)