In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('train_data.csv', usecols=['comment'])
# df_test = pd.read_csv('test_data.csv', usecols=['comment'])
labels = pd.read_csv('train_data.csv', usecols=['toxic'])

In [3]:
df['label'] = labels
df

Unnamed: 0,comment,label
0,Преступление и наказание\n,0.0
1,"И именно эти неработающие весы показывают, что...",0.0
2,"В Японии такие панельки, ебанько.\n",0.0
3,Еще у нас выявляют трещины с помощью белой кра...,0.0
4,"Дочитал до поезда в Норильск , дальше не стал\n",0.0
...,...,...
10804,А у мамы в группе до самого выпуска из сада та...,0.0
10805,Сука тупой дегенарт. Вот на эти видео. Съеби у...,1.0
10806,"В племенах украинцев, особенно западных, с дет...",1.0
10807,"Пост про жадность, о том, как человек оплативш...",0.0


In [4]:
import re
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
from nltk.corpus import stopwords

m = MorphAnalyzer()
regex = re.compile("[А-Яа-яA-z]+")

def words_only(text, regex=regex):
    try:
        return regex.findall(text.lower())
    except:
        return []

In [5]:
@lru_cache(maxsize=128)
def lemmatize_word(token, pymorphy=m):
    return pymorphy.parse(token)[0].normal_form

def lemmatize_text(text):
    return [lemmatize_word(w) for w in text]


mystopwords = stopwords.words('russian') 
def remove_stopwords(lemmas, stopwords = mystopwords):
    return [w for w in lemmas if not w in stopwords and len(w) > 3]

def clean_text(text):
    tokens = words_only(text)
    lemmas = lemmatize_text(tokens)
    
    return ' '.join(remove_stopwords(lemmas))

In [6]:
lemmas = list(map(clean_text, df['comment']))
    
df['lemmas'] = lemmas
df.sample(5)

Unnamed: 0,comment,label,lemmas
7804,Ну эти работодатель платит. Если завтра их вне...,0.0,работодатель платить завтра внезапно отменить ...
4876,Привет от соседей-зомби\n,1.0,привет сосед зомби
2278,Нарушение ст 10. Закона о защите прав потребит...,0.0,нарушение закон защита право потребитель изгот...
4190,Твою женушку за это могут набутылить петух\n,1.0,твой жёнушка мочь набутылить петух
3113,"ребят, тапками не кидайтесь) у меня вопрос: си...",0.0,ребята тапка кидаться вопрос симфонический мет...


In [7]:
x_train, x_test, y_train, y_test = train_test_split(df.lemmas, df.label)

# Мешок слов (BoW)

In [99]:
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer

In [106]:
vec = CountVectorizer(ngram_range=(1, 1)) # строим BoW для слов
bow = vec.fit_transform(x_train)

In [107]:
list(vec.vocabulary_.items())[:10]

[('глянуть', 3866),
 ('тысяча', 20432),
 ('рубль', 17401),
 ('недовольный', 10727),
 ('секунда', 17950),
 ('округляться', 11972),
 ('минута', 9613),
 ('факт', 21138),
 ('вместо', 2685),
 ('наговорить', 10120)]

In [24]:
clf = LogisticRegression(random_state=42, max_iter=500)
clf.fit(bow, y_train)

LogisticRegression(max_iter=500, random_state=42)

In [25]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

         0.0       0.96      0.83      0.89      2076
         1.0       0.61      0.89      0.73       627

    accuracy                           0.84      2703
   macro avg       0.79      0.86      0.81      2703
weighted avg       0.88      0.84      0.85      2703



In [26]:
pred

array([1., 0., 0., ..., 0., 0., 0.])

# TF-IDF векторизация

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vec = TfidfVectorizer(ngram_range=(1, 3))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42, max_iter = 500)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

         0.0       0.99      0.77      0.86      2313
         1.0       0.41      0.94      0.57       390

    accuracy                           0.79      2703
   macro avg       0.70      0.86      0.72      2703
weighted avg       0.90      0.79      0.82      2703



# Решающие деревья

In [16]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(bow, y_train)

DecisionTreeClassifier()

In [18]:
y_pred = classifier.predict(vec.transform(x_test))
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.84      0.81      0.82      1859
         1.0       0.61      0.65      0.63       844

    accuracy                           0.76      2703
   macro avg       0.72      0.73      0.73      2703
weighted avg       0.76      0.76      0.76      2703



# SVC

In [37]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(bow, y_train)

SVC(kernel='linear')

In [38]:
y_pred_SVC = svclassifier.predict(vec.transform(x_test))
print(classification_report(y_pred_SVC, y_test))

              precision    recall  f1-score   support

         0.0       0.94      0.84      0.89      2004
         1.0       0.65      0.84      0.74       699

    accuracy                           0.84      2703
   macro avg       0.80      0.84      0.81      2703
weighted avg       0.86      0.84      0.85      2703



# SVM

In [40]:
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(bow, y_train)

SVC(kernel='sigmoid')

In [41]:
y_pred_SVM = svclassifier.predict(vec.transform(x_test))
print(classification_report(y_pred_SVM, y_test))

              precision    recall  f1-score   support

         0.0       0.99      0.74      0.85      2393
         1.0       0.32      0.92      0.47       310

    accuracy                           0.76      2703
   macro avg       0.65      0.83      0.66      2703
weighted avg       0.91      0.76      0.80      2703



# Gaussian Kernel

In [42]:
svclassifier = SVC(kernel='rbf')
svclassifier.fit(bow, y_train)

SVC()

In [43]:
y_pred_Gaus = svclassifier.predict(vec.transform(x_test))
print(classification_report(y_pred_Gaus, y_test))

              precision    recall  f1-score   support

         0.0       0.98      0.76      0.86      2330
         1.0       0.38      0.92      0.53       373

    accuracy                           0.78      2703
   macro avg       0.68      0.84      0.70      2703
weighted avg       0.90      0.78      0.81      2703



# Наивный Байес

In [61]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(bow, y_train)

In [62]:
predicted = model.predict(vec.transform(x_test))
print(classification_report(predicted, y_test))

              precision    recall  f1-score   support

         0.0       0.97      0.82      0.89      2131
         1.0       0.58      0.91      0.71       572

    accuracy                           0.84      2703
   macro avg       0.77      0.87      0.80      2703
weighted avg       0.89      0.84      0.85      2703



# Валидация

In [27]:
df_test = pd.read_csv('test_data.csv', usecols=['comment'])
test_lemmas = list(map(clean_text, df_test.comment))

In [28]:
df_test['lemmas'] = test_lemmas
x_valid = df_test.lemmas

In [29]:
pred = clf.predict(vec.transform(x_valid))

In [30]:
df_test['pred_label'] = pred

In [31]:
df_test.pred_label.value_counts()

0.0    2761
1.0     842
Name: pred_label, dtype: int64

In [32]:
answer = df_test.copy()

In [33]:
answer.drop(['comment', 'lemmas'], axis=1, inplace=True)

In [34]:
answer.rename(columns={'pred_label': 'toxic'}, inplace=True)



answer['toxic'] = answer.toxic.astype(int)

answer.index.names = ['comment_id']

In [35]:
answer

Unnamed: 0_level_0,toxic
comment_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
3598,1
3599,0
3600,1
3601,0


In [36]:
answer.to_csv('answer.csv')

# Validation the Best (SVC)

In [44]:
df_test = pd.read_csv('test_data.csv', usecols=['comment'])
test_lemmas = list(map(clean_text, df_test.comment))
df_test['lemmas'] = test_lemmas
x_valid = df_test.lemmas

In [47]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(bow, y_train)

SVC(kernel='linear')

In [48]:
y_pred_SVC = svclassifier.predict(vec.transform(x_test))
print(classification_report(y_pred_SVC, y_test))

              precision    recall  f1-score   support

         0.0       0.94      0.84      0.89      2004
         1.0       0.65      0.84      0.74       699

    accuracy                           0.84      2703
   macro avg       0.80      0.84      0.81      2703
weighted avg       0.86      0.84      0.85      2703



In [51]:
pred_valid = svclassifier.predict(vec.transform(x_valid))

In [52]:
df_test['pred_label'] = pred_valid

In [54]:
df_test.pred_label.value_counts()

0.0    2693
1.0     910
Name: pred_label, dtype: int64

In [55]:
final_answer = df_test.copy()

In [56]:
final_answer.drop(['comment', 'lemmas'], axis=1, inplace=True)

In [57]:
final_answer.rename(columns={'pred_label': 'toxic'}, inplace=True)



final_answer['toxic'] = final_answer.toxic.astype(int)

final_answer.index.names = ['comment_id']

In [58]:
final_answer

Unnamed: 0_level_0,toxic
comment_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
3598,1
3599,0
3600,1
3601,0


In [59]:
final_answer.to_csv('final_answer.csv')

# Валидация Байес

In [64]:
pred_valid_ba = model.predict(vec.transform(x_valid))

In [71]:
pred_valid_ba

array([0., 0., 0., ..., 1., 0., 0.])

In [72]:
df_test['pred_label'] = pred_valid_ba

In [73]:
df_test['pred_label'].value_counts()

0.0    2852
1.0     751
Name: pred_label, dtype: int64

In [74]:
ba_answer = df_test.copy()

In [75]:
ba_answer.drop(['comment', 'lemmas'], axis=1, inplace=True)
ba_answer.rename(columns={'pred_label': 'toxic'}, inplace=True)
ba_answer['toxic'] = ba_answer.toxic.astype(int)
ba_answer.index.names = ['comment_id']

In [76]:
ba_answer

Unnamed: 0_level_0,toxic
comment_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
3598,0
3599,0
3600,1
3601,0


In [77]:
ba_answer.to_csv('ba_answer.csv')

In [92]:
def prediction():
    text = str(input('Напиши предложение:'))
    word = list(map(clean_text, text))
    pred_melk = model.predict(vec.transform(word))
    if pred_melk[0] == 0:
        return 'Хорошо!'
    else:
        return 'Хреново!'

In [98]:
prediction()

Напиши предложение:люблю тебя


'Хорошо!'