In [1]:
import pandas as pd
import numpy as np

In [2]:
def prepare_Xy():
    df = pd.read_csv('dataset.csv')
    df = df.query('CLASS_ID != 9000')
    X = df['uu_usl_name']
    y = df['CLASS_ID']
    return X,y
def texts_to_cv(X):
    from joblib import load
    cv = load('simple_vectorized_cv.joblib')
    sparce = cv.transform(X)
    return pd.DataFrame.sparse.from_spmatrix(sparce)
def normalize_texts(X):
    from pymorphy2 import MorphAnalyzer
    morph = MorphAnalyzer(lang='ru')
    return X.apply(lambda x: ' '.join([morph.parse(word.lower().strip('(.,!?:;\'"«»—-–)'))[0].normal_form for word in x.split()]))
def texts_to_vecs(X):
    from gensim.models import KeyedVectors
    model = KeyedVectors.load_word2vec_format('word_to_vec.bin', binary=True)
    def get_average_vector(words):
        vectors = []
        for word in words:
            if word in model:
                vectors.append(model[word])
            else:
                vectors.append(np.array([0.0] * 99 + [1.0]))
        return np.mean(vectors, axis=0)
    pre_X = [get_average_vector(text.split()) for text in X]
    return pd.DataFrame(pre_X, dtype=float)

In [3]:
X, y = prepare_Xy()

In [4]:
normalized_X = normalize_texts(X)

In [5]:
vecs_X = texts_to_vecs(normalized_X)
cv_X = texts_to_cv(normalized_X)

In [6]:
from sklearn.model_selection import train_test_split
vecs_X_train, vecs_X_test, y_train, y_test = train_test_split(vecs_X, y, test_size=0.2, random_state=42)
cv_X_train, cv_X_test, _, _ = train_test_split(cv_X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.ensemble import RandomForestClassifier
vecs_clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=200,
    random_state=0,
    n_jobs=-1,
    min_samples_split=10,
    min_samples_leaf=10,
)

vecs_clf.fit(vecs_X_train, y_train)

In [24]:
from sklearn.linear_model import LogisticRegression
cv_clf = LogisticRegression(random_state=42)
cv_clf.fit(cv_X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
from sklearn.metrics import classification_report, accuracy_score
vecs_y_proba = vecs_clf.predict_proba(vecs_X_test)
cv_y_proba = cv_clf.predict_proba(cv_X_test)

In [26]:
ind_to_class = {ind: cl for ind, cl in enumerate(vecs_clf.classes_)}

In [27]:
vecs_y_pred = np.argmax(vecs_y_proba, axis=1)
vecs_y_pred = [ind_to_class[ind] for ind in vecs_y_pred]
cv_y_pred = np.argmax(cv_y_proba, axis=1)
cv_y_pred = [ind_to_class[ind] for ind in cv_y_pred]
total_y_pred = np.argmax(vecs_y_proba + cv_y_proba, axis=1)
total_y_pred = [ind_to_class[ind] for ind in total_y_pred]
print(accuracy_score(y_test, vecs_y_pred))
print(accuracy_score(y_test, cv_y_pred))
print(accuracy_score(y_test, total_y_pred))

0.38859212869159454
0.5180819058144532
0.5108368637563702


In [28]:
print(accuracy_score(y_test, cv_clf.predict(cv_X_test)))

0.5180819058144532


In [29]:
cv_clf.predict(cv_X_test)

array([1000, 2400, 1311, ..., 1000, 3100, 1241], dtype=int64)

In [30]:
cv_clf.predict_proba(cv_X_test)

array([[3.50660452e-01, 2.67515907e-03, 1.36320652e-03, ...,
        7.03320703e-04, 1.19402948e-03, 5.38962277e-04],
       [3.38787421e-01, 6.28230225e-05, 2.82873224e-04, ...,
        7.78103922e-06, 5.98294749e-05, 5.72559504e-06],
       [1.70402329e-01, 1.58967546e-02, 2.51530647e-05, ...,
        3.03666085e-04, 7.39841915e-05, 2.26146282e-05],
       ...,
       [2.97371714e-01, 4.20106711e-04, 2.22334357e-03, ...,
        1.86452450e-03, 1.38968133e-03, 2.55138327e-04],
       [9.96247529e-03, 2.63999348e-04, 1.45460574e-04, ...,
        3.64175818e-04, 1.62076494e-04, 4.21555754e-05],
       [1.92900518e-01, 4.59247232e-03, 8.67420889e-05, ...,
        3.36313194e-03, 6.75652223e-05, 4.27028167e-05]])