In [1]:
from time import time
import numpy as np 
import pandas as pd 

import re
import nltk
from nltk.corpus import stopwords
from gensim.models import word2vec

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("./data/labeled_rutoxic.csv", delimiter=',', header=0, names=['sentence', 'label'])

print('В наборе предложений: \n',df.shape[0])

В наборе предложений: 
 14412


In [3]:
print('toxic:', df[df['label'] > 0]['label'].count())
print('not toxic:', df[df['label'] < 1]['label'].count())

toxic: 4826
not toxic: 9586


## Разбиение на тестовые и обучающие

In [4]:
X = df.iloc[:,0]# 
y = df.iloc[:,1]# 

train , test , y_train, y_test = train_test_split(X, y, test_size=0.3) # и отдаем 30% на тест, остальное на обучен

## Обучение модели Word2Vec

In [5]:
# Метод преобразования текста в слова
def text_to_words(raw_text, remove_stopwords=False):
   
    letters_only = re.sub("[^0-9а-яА-Я]", " ", raw_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        meaningful_words = [w for w in words if not w in stops]
        words = meaningful_words
    return words 

sentences_train = train.apply(text_to_words, remove_stopwords=False)
sentences_test = test.apply(text_to_words, remove_stopwords=False)
print(sentences_train[:3])

1931     [18, смотрел, их, прям, пиздец, мелким, они, в...
2066     [пыня, тоже, геноцидил, чеченцев, во, вторую, ...
13775                    [санкции, полезные, так, полезны]
Name: sentence, dtype: object


In [6]:

num_features = 300                
min_word_count = 40                     
num_workers = 4      
context = 20                                                                                        
downsampling = 1e-3  
model = word2vec.Word2Vec(sentences_train, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)

t = time()
model.init_sims(replace=True) 
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))


Time to build vocab: 0.0 mins


  model.init_sims(replace=True)


In [7]:
# получение векторного представления
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    index2word_set = set(model.wv.index_to_key)
   
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1
            featureVec = np.add(featureVec, model.wv[word])

    if nwords == 0:
        nwords = 1
    featureVec = np.divide(featureVec, nwords)
    return featureVec

# получение среднего векторного простнраства для предложения
def getAvgFeatureVecs(reviews, model, num_features):
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    counter = 0
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1
    return reviewFeatureVecs

f_matrix_train = getAvgFeatureVecs(sentences_train, model, num_features)
f_matrix_test = getAvgFeatureVecs(sentences_test, model, num_features)

## Создание обучение  MLPClassifier

In [8]:
model = []

m = MLPClassifier(solver='adam', hidden_layer_sizes=(300,40,30), random_state=1)
model.append(m)

print(model)

[MLPClassifier(hidden_layer_sizes=(300, 40, 30), random_state=1)]


In [9]:
batch_size = 200
total_rows = f_matrix_train.shape[0]
duration = 0
start_train = time()
pos = 0
classes = [0.0, 1.0]
while duration < 10 and pos < total_rows:
    if pos+batch_size > total_rows:
        batch_size = total_rows-pos
    X_p = f_matrix_train[pos:pos+batch_size]
    y_p = y_train.values[pos:pos+batch_size]
    model[0].partial_fit(X_p, y_p, classes)
    pos = pos + batch_size
    duration = time() - start_train
    if pos == total_rows:
        pos = 0
        batch_size = 10000
print('done')

done


In [10]:
## Сохранение результатов и расчет ошибки

In [11]:
y_test_values=y_test.values
predicted_results = model[0].predict_proba(f_matrix_test)
predicted_results = np.where(predicted_results[:,0]>predicted_results[:,1], 0.0,1.0)

sum_errors =sum(y_test_values - predicted_results)
print('count test values', len(y_test_values))
print('sum_errors', sum_errors)


count test values 4324
sum_errors 733.0


In [12]:
saved_result = pd.DataFrame({'text':test.values,
 'expected':  y_test_values,
 'predicted': predicted_results})

In [13]:
saved_result.to_csv('result.csv', encoding='utf-8', index=False)