In [25]:
import re
import csv
import random
import urllib.request
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import Comment
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [None]:
# ячейка для загрузки данных с Lurkmore в csv-файл

def get_page(url):
    if not re.search('%D0%9A%D0%BE%D0%BF%D0%B8%D0%BF%D0%B0%D1%81%D1%82%D0%B0:', url):
        req = requests.get(url)

    if req.status_code == 200:
        return BeautifulSoup(req.text, 'html.parser')
    return None 

lurk_dict = {}
lurk_data = []
site_num = 150
site_link = 'https://lurkmore.net/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Random'

for i in range(site_num):
    soup = get_page(site_link)
    s = str(soup)
    s = s[s.find('<!-- start content -->'):s.find('<!-- end content -->') + len('<!-- end content -->')]
    soup = BeautifulSoup(s)
    z = soup
    lurk_data.append(z.get_text(strip=True))
    for sent in lurk_data:
        tokens = sent_tokenize(sent.strip())
        for token in tokens:
            if re.search(r'\s',token):
                lurk_dict[token] = 0

                
with open('lurk_data.csv','w',encoding='utf8',newline='') as csvfile:
    w = csv.writer(csvfile)
    w.writerows(lurk_dict.items())

In [11]:
# ячейка для загрузки csv-файла с данными Лурка

lurk_dict = pd.read_csv('lurk_data.csv')
lurk_dict = dict(lurk_dict.values)

In [15]:
# ячейка для загрузки данных из работы
# по анализу токсичности твитов
# и её предобработки

toxic = pd.read_csv('labeled.csv')
toxic = toxic.loc[toxic['toxic'] == 0.0]
sentim_dict = dict(toxic.values)
final_sent_dict = {}
for key, value in sentim_dict.items():
    tokens = sent_tokenize(key)
    for token in tokens:
        if re.search(r'\s',token):
            final_sent_dict[token] = int(value)

In [16]:
# ячейка для загрузки ЖЖ постов и комментов
# из корпуса Тайга

soc_dict = {}
with open('LiveJournalPostsandcommentsGICR.txt', 'r', encoding='utf-8') as f:
    taiga_jj = f.readlines()
for sent in taiga_jj:
    tokens = sent_tokenize(sent.strip())
    for token in tokens:
        if re.search(r'\s',token):
            token = re.sub(r'\<[^>]*\>', '', token)
            soc_dict[token] = 0

# ограничение размера корпуса

small_soc_dict = {}
for key, value in soc_dict.items():
    if value == 0:
        small_soc_dict[key] = value
    if len(small_soc_dict) == 15000:
        break

In [17]:
# ячейка для загрузки новостей Ленты
# из корпуса Тайги

import os

curr_dir = os.getcwd()
dirpath = os.path.join(curr_dir, 'texts')
fnames = []
taiga_news = []
news_dict = {}

for root, dirs, files in os.walk(dirpath):
    for name in files:
        fnames.append(os.path.join(root, name))

for fpath in fnames:
    with open(fpath, 'r', encoding='utf-8') as f:
        text = ''.join([str(item) for item in f.readlines()])
        taiga_news.append(text)
for sent in taiga_news:
    tokens = sent_tokenize(sent)
    for token in tokens:
        if re.search(r'\s',token):
            news_dict[token.strip()] = 1

# ограничение размера корпуса

small_news_dict = {}
for key, value in news_dict.items():
    if value == 1:
        small_news_dict[key] = value
    if len(small_news_dict) == 49000:
        break

In [18]:
# ячейка для загрузки данных из Википедии

wiki_dict = {}
with open('wiki_data.txt', encoding='utf-8') as f:
    for line in f.readlines():
        tokens = sent_tokenize(line)
        for token in tokens:
            if re.search(r'\.',token) and re.search(r'\s',token):
                wiki_dict[token.strip()] = 1

In [19]:
# ячейка для загрузки отобранных данных из Лурка
# с их формальными аналогами

lurk_test = pd.read_csv('lurk_test.csv')
lurk_test_dict = dict(lurk_test.values)

In [12]:
# ячейка для создания тестовой выборки

def get_test(d, n, is_formal):
    test_dict = {}
    for k in random.sample(list(d), n):
        test_dict[k] = is_formal
        del d[k]
    return test_dict

test_lurk_dict = get_test(lurk_dict, 100, 0)
test_dict_sent = get_test(final_sent_dict, 100, 0)
test_dict_soc = get_test(small_soc_dict, 100, 0)
test_dict_news = get_test(small_news_dict, 100, 1)
test_dict_wiki = get_test(wiki_dict, 100, 1)


test_dict = {**lurk_test_dict, **test_dict_sent}
test_dict = {**test_dict, **test_dict_soc}
test_dict = {**test_dict, **test_dict_news}
test_dict = {**test_dict, **test_dict_wiki}
test_dict = {**test_dict, **test_lurk_dict}

In [20]:
# ячейка для загрузки чистой тестовой выборки
# из csv-файла размером 500 предложений

test_dict = pd.read_csv('cleared_test_data.csv')
test_dict = dict(test_dict.values)

In [21]:
# ячейка для создания единого трэйна

train_dict = {**final_sent_dict, **small_soc_dict}
train_dict = {**train_dict, **small_news_dict}
train_dict = {**train_dict, **wiki_dict}
train_dict = {**train_dict, **lurk_dict}

In [22]:
len(train_dict)

118667

In [23]:
# подготовка данных для векторизации с помощью sklearn

data_train = list(train_dict.keys())
data_test = list(test_dict.keys())

train_target = list(train_dict.values())
test_target = list(test_dict.values())

In [26]:
# векторизация данных с помощью CountVectorizer

count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(data_train)
X_test_counts = count_vect.transform(data_test)

In [27]:
# проверка качества классификатора с помощью функции логистической регрессии

clf = LogisticRegression().fit(X_train_counts, train_target)
pred = clf.predict(X_test_counts)
print(classification_report(test_target, pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       318
           1       0.92      0.87      0.90       223

    accuracy                           0.92       541
   macro avg       0.92      0.91      0.92       541
weighted avg       0.92      0.92      0.92       541



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# проверка качества классификатора с помощью метода опорных векторов

clf = LinearSVC().fit(X_train_counts, train_target)
pred = clf.predict(X_test_counts)
print(classification_report(test_target, pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       318
           1       0.92      0.87      0.90       223

    accuracy                           0.92       541
   macro avg       0.92      0.91      0.91       541
weighted avg       0.92      0.92      0.92       541



In [29]:
# векторизация данных с помощью TfidfVectorizer

tfidf_vect = TfidfVectorizer()

X_train_counts = tfidf_vect.fit_transform(data_train)
X_test_counts = tfidf_vect.transform(data_test)

In [30]:
# проверка качества классификатора с помощью функции логистической регрессии

clf = LogisticRegression().fit(X_train_counts, train_target)
pred = clf.predict(X_test_counts)
print(classification_report(test_target, pred))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90       318
           1       0.90      0.80      0.85       223

    accuracy                           0.88       541
   macro avg       0.88      0.87      0.87       541
weighted avg       0.88      0.88      0.88       541



In [31]:
# проверка качества классификатора с помощью метода опорных векторов

clf = LinearSVC().fit(X_train_counts, train_target)
pred = clf.predict(X_test_counts)
print(classification_report(test_target, pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       318
           1       0.93      0.90      0.91       223

    accuracy                           0.93       541
   macro avg       0.93      0.92      0.93       541
weighted avg       0.93      0.93      0.93       541

