In [1]:
import numpy as np
from sklearn. model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.metrics import precision_score, recall_score, precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt')

#https://www.kaggle.com/datasets/blackmoon/russian-language-toxic-comments?resource=download -- dataset link

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [2]:
df = pd.read_csv("labeled.csv", sep=",")
train_df, test_df = train_test_split(df, test_size=1000)
df["toxic"] = df["toxic"].apply(int)
test_df["toxic"].value_counts()
train_df["toxic"].value_counts()

toxic
0.0    8914
1.0    4498
Name: count, dtype: int64

In [3]:
snowball = SnowballStemmer(language="russian")
russian_stop_words = stopwords.words("russian")


def tokenize_sentence(sentence: str, remove_stop_words: bool = True):
    tokens = word_tokenize(sentence, language="russian")
    tokens = [i for i in tokens if i not in string.punctuation]
    if remove_stop_words:
        tokens = [i for i in tokens if i not in russian_stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [4]:
# creating model, learning logistic regression
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))
features = vectorizer.fit_transform(train_df["comment"])
model = LogisticRegression(random_state=0)
model.fit(features, train_df["toxic"])



In [5]:
model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(
        tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", LogisticRegression(random_state=0))
])

In [6]:
model_pipeline.fit(train_df["comment"], train_df["toxic"])



In [7]:
# checking work pipeline
model_pipeline.predict(["Привет, у меня все нормально"])

array([0.])

In [8]:

model_pipeline.predict(["Слушай, не пойти ли тебе нафиг отсюда?"])

array([1.])

In [9]:
# measure metrics
precision_score(y_true=test_df["toxic"],
                y_pred=model_pipeline.predict(test_df["comment"]))
recall_score(y_true=test_df["toxic"],
             y_pred=model_pipeline.predict(test_df["comment"]))

0.6310975609756098

In [10]:
# we want to 0.95 precision
prec, rec, thresholds = precision_recall_curve(y_true=test_df["toxic"], probas_pred=model_pipeline.predict_proba(test_df["comment"])[:, 1])

In [11]:
np.where(prec > 0.95)

(array([948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960,
        961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973,
        974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986,
        987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998]),)

In [12]:
thresholds[429]

0.20452275899967695

In [13]:
precision_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict_proba(
    test_df["comment"])[:, 1] > thresholds[429])

0.5588752196836555

In [14]:
recall_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict_proba(
    test_df["comment"])[:, 1] > thresholds[429])

0.9695121951219512

In [15]:
# upgrading model
grid_pipeline = Pipeline([
    ("vectorizer"), TfidfVectorizer(
        tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True)),
    ("model", GridSearchCV(
        LogisticRegression(random_state=0),
        param_grid={'C': {0.1, 1, 10.}},
        cv=3,
        verbose=4
    ))
])

In [16]:
model_pipeline_c_10 = Pipeline([
    ("vectorizer", TfidfVectorizer(
        tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", LogisticRegression(random_state=0, C=10. ))
])

In [17]:
model_pipeline_c_10.fit(train_df["comment"], train_df["toxic"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:

prec_c_10, rec_c_10, thresholds_c_10 = precision_recall_curve(y_true=test_df["toxic"], probas_pred=model_pipeline_c_10.predict_proba(test_df["comment"])[:, 1])

In [19]:
np.where(prec_c_10 > 0.95)

(array([925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937,
        938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950,
        951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963,
        964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976,
        977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989,
        990, 991, 992, 993, 994, 995, 996, 997, 998]),)

In [20]:
precision_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict_proba(
    test_df["comment"])[:, 1] > thresholds_c_10[824])

0.9583333333333334

In [21]:
recall_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict_proba(
    test_df["comment"])[:, 1] > thresholds_c_10[824])

0.1402439024390244

In [22]:
model_pipeline.predict([""])

array([0.])

In [56]:
from tkinter import *

root = Tk()
root.title('OPD Project: Filter toxic comments')
root.geometry('640x360')
root['bg'] = 'black'

In [57]:
def check():
    input = e.get()
    if model_pipeline.predict([input]) != 1.:
        label1.config(bg='black', fg='green')
        label1['text'] = "Not toxic comment"
    else:
        label1.config(bg='black', fg='red')
        label1['text'] = "Toxic comment"

label2 = Label(root, font='Arial 20', bg='black', fg='white')
label2['text'] = "Enter a comment to check"
label2.pack(pady= 14)

e = Entry(root, font='Arial 20', width=35)
e.pack(pady=16)

btn3 = Button (root, bg='white', font='Arial 20', text='check', width=20, command=check)
btn3.pack(pady=18)

label1 = Label(root, font='Arial 30', bg='black', fg='white')
label1.pack(pady=30)

root.mainloop()