In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from joblib import dump, load

In [2]:
# Read the data from CSV files.
# Both files are ';'-separated; column names are supplied explicitly via
# `names`, and only the `text` column is actually loaded.
n = ['id', 'date','name','text','typr','rep','rtw','faw','stcount','foll','frien','listcount']
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are dropped
# and a warning is emitted for each of them.
data_positive = pd.read_csv('positive.csv', sep=';', on_bad_lines='warn', names=n, usecols=['text'])
data_negative = pd.read_csv('negative.csv', sep=';', on_bad_lines='warn', names=n, usecols=['text'])

# Create balanced dataset: truncate both classes to the size of the smaller one.
sample_size = min(data_positive.shape[0], data_negative.shape[0])
raw_data = np.concatenate((data_positive['text'].values[:sample_size],
                           data_negative['text'].values[:sample_size]), axis=0)
# Labels follow the concatenation order: positive texts first (1), then negative (0).
labels = [1]*sample_size + [0]*sample_size

def preprocess_text(text):
    """Normalise one raw text into a lowercase, space-separated string.

    Steps, in order: links are replaced by the placeholder ``URL`` and
    @-mentions by ``USER`` (both end up lowercased by the next step), the
    text is lowercased and "ё" is folded into "е", every run of characters
    other than Latin/Cyrillic letters and digits becomes a single space,
    and leading/trailing whitespace is stripped.

    Args:
        text: The raw text as ``str``.

    Returns:
        The cleaned text as ``str``; may be empty.
    """
    # Raw-string patterns: "\." and "\s" inside ordinary string literals are
    # invalid escape sequences and trigger a SyntaxWarning on recent Pythons.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', 'USER', text)
    text = text.lower().replace("ё", "е")
    # The digit range must include 0; with "1-9" a zero counted as a
    # separator and numbers such as "2020" were split into "2 2".
    # NOTE(review): a model fitted on text cleaned with the "1-9" range never
    # saw tokens containing "0" — refit it if exact consistency matters.
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9]+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

# Clean every raw text with preprocess_text; the order stays aligned with `labels`.
data = list(map(preprocess_text, raw_data))

# Hold out 33% of the samples for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.33,
    random_state=42,
)

In [3]:
# Restore a previously saved classifier from disk.
# NOTE(review): joblib files are pickle-based — only load model files from a trusted source.
model_path = 'model.joblib'
clf = load(model_path)

# Tuning hyper-parameters for f1_macro
# NOTE(review): no tuning code follows this heading; presumably it produced the model loaded above — confirm.



In [19]:
# Score the loaded classifier on the held-out split and print per-class metrics to 4 decimals.
y_pred = clf.predict(x_test)
report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.7397    0.7941    0.7659     37078
           1     0.7759    0.7183    0.7460     36792

    accuracy                         0.7564     73870
   macro avg     0.7578    0.7562    0.7559     73870
weighted avg     0.7577    0.7564    0.7560     73870



In [15]:
# Spot check on a single word: predict takes a list of texts, [0] picks the only result.
# In the dataset built in this notebook, label 1 marks positive texts and 0 negative ones
# (assuming the loaded model was fitted with the same labels — TODO confirm).
clf.predict(['россия'])[0]

1

In [18]:
# Spot check on a single word: predict takes a list of texts, [0] picks the only result.
# In the dataset built in this notebook, label 0 marks negative texts and 1 positive ones
# (assuming the loaded model was fitted with the same labels — TODO confirm).
clf.predict(['говно'])[0]

0

In [22]:
# Read all input records (each keeps its trailing newline). The context manager
# closes the file handle instead of leaving it open until garbage collection.
with open('input_data.txt', 'r') as input_file:
    lines = input_file.readlines()
print(len(lines))

13940


In [31]:
# Re-label every input record with the classifier's verdict on its second field.
# Records are split on single spaces and must have at least four fields; the
# original third field is discarded and replaced by the predicted sign.
# NOTE(review): only the first four fields are kept. The newline survives only
# when it is part of the fourth field, so a record with extra fields would be
# fused with the next one in the output — confirm the input never has more than four.
split_lines = [line.split(' ') for line in lines]
target_words = [words[1] for words in split_lines]

# Classify all words in one call instead of one call per record; the guard
# keeps an empty input from reaching predict.
predictions = clf.predict(target_words) if target_words else []

result_lines = []
for words, prediction in zip(split_lines, predictions):
    # Class 0 is written as "-", any other class as "=".
    sign = "-" if prediction == 0 else "="
    result_lines.append(f"{words[0]} {words[1]} {sign} {words[3]}")
    # Progress output kept so the cell prints the same counters as before.
    if len(result_lines) % 1000 == 0:
        print(len(result_lines))

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


In [32]:
# Write the re-labelled records. writelines adds no separators, so the records
# rely on the newlines they already carry. The context manager guarantees the
# file is flushed and closed once the block exits.
with open('output_data.txt', 'w') as output_file:
    output_file.writelines(result_lines)