In [None]:
import pandas as pd
import os
# Load the aclImdb reviews into one frame: one row per review file, labelled
# 1 for files under pos/ and 0 for files under neg/.
labels = {'pos': 1, 'neg': 0}
# Rows are collected in a plain list and the frame is built once at the end.
# DataFrame.append was removed in pandas 2.0, and while it existed it copied
# the whole frame on every call, making this loop quadratic.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = f'aclImdb/{s}/{l}'
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf8') as infile:
                records.append((infile.read(), labels[l]))
df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [1]:
import pandas as pd
# Read the comment-summary CSV, discard rows containing missing values, and
# print the shape before and after so the number of dropped rows is visible.
ru_texts = pd.read_csv(
    './dict_2016/collection (docs&words)_2016_all_labels/doc_comment_summary.csv'
)
print(ru_texts.shape)
ru_texts = ru_texts.dropna(axis=0, how='any')
print(ru_texts.shape)
# The index keeps its original labels, so it has gaps where rows were dropped.
print(ru_texts.index)

(26863, 2)
(26770, 2)
Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            26853, 26854, 26855, 26856, 26857, 26858, 26859, 26860, 26861,
            26862],
           dtype='int64', length=26770)


In [7]:
def _parse_sentiment(value):
    """Return ``int(value)``, or ``None`` when ``int`` raises ``ValueError``."""
    try:
        return int(value)
    except ValueError:
        return None


# Coerce every sentiment label to int in a single pass. Labels that int()
# rejects are printed and their rows removed. This replaces a loop that wrote
# each cell through .loc and called drop() once per bad row, copying the frame
# every time; here the frame is filtered once.
parsed_labels = [_parse_sentiment(value) for value in ru_texts['sentiment']]
is_valid = [label is not None for label in parsed_labels]
for raw_label, ok in zip(ru_texts['sentiment'], is_valid):
    if not ok:
        print(raw_label)
ru_texts = ru_texts.loc[is_valid].copy()
# Assigning a list of Python ints gives the column an integer dtype.
ru_texts['sentiment'] = [label for label in parsed_labels if label is not None]
ru_texts.shape

(26756, 2)

In [1]:
import pandas as pd
# Read ./ru_texts_clear.csv and display the number of rows per sentiment value.
ru_texts = pd.read_csv('./ru_texts_clear.csv')
ru_texts.loc[:, 'sentiment'].value_counts()

 0    17555
-1     9201
Name: sentiment, dtype: int64

In [2]:
import numpy as np

# Shuffle the rows reproducibly: fix NumPy's global seed, then reorder the
# frame by a random permutation of its own index labels.
np.random.seed(0)
ru_texts = ru_texts.reindex(index=np.random.permutation(ru_texts.index))


In [3]:
import re
def preprocessor(text):
    """Normalise a raw text for whitespace tokenisation.

    The text is lower-cased and every run of non-word characters is collapsed
    to a single space. Emoticons such as ``:)``, ``;-(`` or ``=D`` are collected
    from the original (not lower-cased) text and appended after the cleaned
    text with any ``-`` nose removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. When emoticons were found they follow the cleaned
        text, separated from it by a single space.
    """
    # Raw strings keep the backslashes literal instead of relying on Python
    # passing unknown escapes such as '\W' through unchanged.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon fuses with the last word
    # ("... really" + ":)" -> "really:)"), and a whitespace tokenizer then
    # sees one token instead of two.
    if faces and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + faces

In [None]:
# Clean every review with preprocessor, then write the frame to CSV without
# the row index.
df['review'] = df['review'].map(preprocessor)
df.to_csv(path_or_buf='./movie_data.csv', index=False)

In [4]:
# Run preprocessor over every text and store the result back in the column.
ru_texts['text'] = ru_texts['text'].map(preprocessor)

In [8]:
# Collapse the labels to two classes: -1 stays -1, every other value becomes 0.
# The column is rebuilt in one assignment instead of one .loc read and one .loc
# write per row.
ru_texts['sentiment'] = [
    -1 if label == -1 else 0 for label in ru_texts['sentiment']
]
ru_texts.shape

(26756, 2)

In [9]:
# Write the frame to ./ru_texts_clear.csv; the row index is not saved.
ru_texts.to_csv(path_or_buf='./ru_texts_clear.csv', index=False)

In [5]:
from nltk.stem.porter import PorterStemmer

# Module-level stemmer instance; tokenizer_porter calls porter.stem on each token.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the stem of every token.

    Stemming is delegated to the module-level ``porter`` instance.
    """
    # NOTE(review): the Porter algorithm is designed for English; this notebook
    # also vectorises texts with Russian stop words, where nltk's
    # SnowballStemmer('russian') may be the better fit -- confirm before relying
    # on the stems.
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [10]:
import nltk

# Fetch the NLTK 'stopwords' package so that stopwords.words(...) can be loaded.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/dima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords

# Russian stop-word list from the NLTK stopwords corpus; passed to
# TfidfVectorizer(stop_words=...) when the vectorizer is built.
stop = stopwords.words('russian')

In [12]:
# Hold-out split by row position: the first 20000 rows train, the rest test.
# .iloc is required here. .loc slices by index *label*, so once the frame has
# been shuffled `.loc[:20000]` ends wherever label 20000 happens to sit, and
# because label slices include both endpoints that row landed in both halves.
n_train = 20000
X_train = ru_texts['text'].iloc[:n_train].values
y_train = [int(label) for label in ru_texts['sentiment'].iloc[:n_train].values]
X_test = ru_texts['text'].iloc[n_train:].values
y_test = [int(label) for label in ru_texts['sentiment'].iloc[n_train:].values]

In [7]:
from sklearn.model_selection import train_test_split

# Work on the first 7500 rows only and hold out 40% of them for testing.
x = ru_texts['text'].iloc[:7500]
y = ru_texts['sentiment'].iloc[:7500]

# Note that `y` is rebound by this unpacking: afterwards it holds the training
# labels, not the full 7500-row label column.
x_train, x_test, y, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=24,
)

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# Vectorizer for the grid search. Lower-casing and preprocessing are switched
# off here, so the texts are vectorised exactly as they are passed in.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two grids over the same classifier settings: the first leaves IDF weighting
# and normalisation at the vectorizer defaults, the second switches both off
# (use_idf=False, norm=None).
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [tokenizer],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [tokenizer],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# The grid tunes `penalty` and `C`, which are LogisticRegression parameters, so
# that estimator has to sit in the searched pipeline. liblinear is used because
# it supports both the 'l1' and 'l2' penalties listed in the grid.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])

# Kept only so the name `gnb` stays defined. It is no longer searched: GaussianNB
# has no `penalty`/`C` parameters, so pairing it with param_grid fails on fit.
gnb = Pipeline([('vect', tfidf),
                ('clf', GaussianNB())])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the training texts and train multinomial naive Bayes on the
# result. `y` is the training-label vector left behind by the train_test_split
# unpacking.
# The matrix stays sparse: MultinomialNB accepts sparse input, and .toarray()
# would allocate a dense (documents x vocabulary) array that is mostly zeros.
tfidf = TfidfVectorizer(stop_words=stop, tokenizer=tokenizer_porter)
x = tfidf.fit_transform(x_train)
model = MultinomialNB(alpha=1)
model.fit(x, y)


In [27]:
# Vectorise the held-out texts with the already fitted vocabulary, predict, and
# report how many predictions differ from the true labels.
# No .toarray(): predict() takes the sparse matrix directly, so the dense
# (documents x vocabulary) copy is never allocated.
X_test = tfidf.transform(x_test)
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
print(y_pred)
print(y_test)

Number of mislabeled points out of a total 3000 points : 1052
[0 0 0 ... 0 0 0]
9837    -1
6033     0
4792     0
9355     0
2625     0
        ..
4655     0
6477     0
19749   -1
15332   -1
26533    0
Name: sentiment, Length: 3000, dtype: int64


In [11]:
# Spot check: vectorise one hand-written text and print its predicted label.
sample_features = tfidf.transform([' путин'])
print(model.predict(sample_features))

[0]


In [None]:
# Print the whole fitted vocabulary on one line, terms separated by spaces.
print(' '.join(tfidf.get_feature_names_out()))

In [2]:
import pickle
import os

# Serialise the tokenizer, stop words, classifier and vectorizer into
# ./bot/pkl_objects, one pickle file per object.
dest = './bot/pkl_objects'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(dest, exist_ok=True)

# NOTE: tokenizer_porter is a plain function, which pickle stores by reference
# (module and qualified name), so whatever loads tokenizer.pkl must be able to
# import or define a function with that name.
artifacts = {
    'tokenizer.pkl': tokenizer_porter,
    'stopwords.pkl': stop,
    'classifier.pkl': model,
    'vectorizer.pkl': tfidf,
}
for filename, obj in artifacts.items():
    # `with` closes each file even if dump() raises; the bare open() calls it
    # replaces never closed their handles. Every file now uses protocol 4
    # (vectorizer.pkl previously fell back to the interpreter default).
    with open(os.path.join(dest, filename), 'wb') as outfile:
        pickle.dump(obj, outfile, protocol=4)

NameError: name 'stop' is not defined

In [1]:
# Human-readable name for each numeric sentiment label.
translate = {
    -1: 'Negative',
    0: 'Neutral',
    1: 'Positive',
}
translate[1]

'Positive'