In [181]:
pip install nltk numpy pymorphy2

Note: you may need to restart the kernel to use updated packages.


In [182]:
import json
import re
import nltk
import pymorphy2
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

nltk.download('punkt');
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [111]:
PATH_DATA = './cyberleninka_0.jsonlines'
LANGUAGE = 'russian'

morph = pymorphy2.MorphAnalyzer()
stopwords = nltk.corpus.stopwords.words(LANGUAGE)
stemmer = nltk.stem.snowball.SnowballStemmer(LANGUAGE)

In [130]:
%%time

def removeSpecialChar(text):
    teg_re = re.compile(r'[^А-Яа-я]+')
    return teg_re.sub(' ', text)

def isMeaningWord(string):
    p = morph.parse(string)[0]
    if p.tag.POS == 'INTJ' or p.tag.POS == 'PRCL' or p.tag.POS == 'CONJ' or p.tag.POS == 'PREP' or p.tag.POS == 'PRED' or p.tag.POS == 'NPRO':
        return False
    return True

def tokenizeText(text):
    morph = pymorphy2.MorphAnalyzer()
    text = removeSpecialChar(text)
    sentences = nltk.word_tokenize(text, language=LANGUAGE)
    tokens = []
    for val in sentences:
        val = val.lower()
        if val in stopwords:
            continue
        if isMeaningWord(val) == False:
            continue
        val = stemmer.stem(val)
        tokens.append(val)
    return ' '.join(tokens)

def removeEn(data):
    res = []
    lower = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    for val in data:
        val = val.lower()
        if lower.intersection(val.lower()) != set():
             res.append(val)
    return res


dataContent = []
dataKeywords = []
file = open(PATH_DATA, 'r')
line = file.readline()
count = 300
i = 0
while line:
    append = {}
    loaded_json = json.loads(line)
    description = loaded_json['content']
    description = tokenizeText(description)
    dataContent.append(description)
    
    keywords = loaded_json['keywords']
    keywords = removeEn(keywords)
    dataKeywords.append(keywords)
    line = file.readline()
    i = i + 1
    if i >= count:
        break
file.close()

CPU times: user 2min 58s, sys: 2.06 s, total: 3min
Wall time: 3min 1s


In [131]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataContent)

In [132]:
print(vectorizer.get_feature_names()[:10])

['аб', 'абагурск', 'абад', 'абазат', 'абазатов', 'абалкин', 'абат', 'абашевск', 'абашевц', 'аббат']


In [162]:
data_x = dataContent
data_y = keywords

In [176]:
TRAIN_PERCENT = 70
delimiter = int(len(data_y) * TRAIN_PERCENT / 100)

train_x = data_x[:delimiter]
test_x = data_x[delimiter:]
train_y = data_y[:delimiter]
test_y = data_y[delimiter:]

In [205]:
model_vectorizer = TfidfVectorizer()
vectorizer = model_vectorizer.fit(train_x)
feature_names = model_vectorizer.get_feature_names()

In [183]:
train_x = pd.DataFrame(data=vectorizer.transform(train_x).toarray(), columns=feature_names)
train_y = pd.DataFrame(data=vectorizer.transform(train_y).toarray(), columns=feature_names)

test_x = pd.DataFrame(data=vectorizer.transform(test_x).toarray(), columns=feature_names)
test_y = pd.DataFrame(data=vectorizer.transform(test_y).toarray(), columns=feature_names)

In [184]:
test_x.shape

(3047, 3047)

In [186]:
%%time
logit =  OneVsRestClassifier(LinearSVC(random_state=17))
logit.fit(train_x, train_y)


CPU times: user 2min 55s, sys: 83.6 ms, total: 2min 55s
Wall time: 2min 55s


OneVsRestClassifier(estimator=LinearSVC(random_state=17))

In [187]:
round(logit.score(train_x, train_y), 3), round(logit.score(test_x, test_y), 3),

(1.0, 1.0)

In [216]:
testArr = []
testArr.append(data_x[1])
text = pd.DataFrame(data=vectorizer.transform(testArr).toarray(), columns=feature_names)

In [218]:
predictArr = logit.predict(text)

In [223]:
predictArr.shape, feature_names.count

((1, 3047), <function list.count(value, /)>)

In [265]:
intex = 0
for item in predictArr[0]:
    if item > 0:
        break
    intex = intex + 1
intex

3047

In [266]:
len(feature_names), len(predictArr[0])

(3047, 3047)

In [267]:
feature_names[intex-1]

'ячейк'

0