In [1]:
import pandas as pd
import numpy as np

In [2]:
def prepare_Xy(path='dataset.csv', excluded_class=9000):
    """Load the labelled texts and split them into features and target.

    Parameters
    ----------
    path : str or path-like, default 'dataset.csv'
        CSV file that contains at least the columns ``uu_usl_name`` (text)
        and ``CLASS_ID`` (label).
    excluded_class : scalar, default 9000
        Rows whose ``CLASS_ID`` equals this value are dropped before the split.

    Returns
    -------
    X : pandas.Series
        The ``uu_usl_name`` column of the remaining rows.
    y : pandas.Series
        The ``CLASS_ID`` column of the remaining rows.

    Both series keep the original row index of the CSV, so it is not
    contiguous once rows have been dropped.
    """
    df = pd.read_csv(path)
    # A boolean mask instead of a query string, so the excluded id can be passed in.
    kept = df[df['CLASS_ID'] != excluded_class]
    return kept['uu_usl_name'], kept['CLASS_ID']
def texts_to_cv(X):
    """Transform texts with the vectorizer object stored on disk.

    Loads whatever object was saved in ``simple_vectorized_cv.joblib`` and
    returns the result of calling its ``transform`` method on ``X``.
    Presumably this is an already fitted count vectorizer; this function
    relies only on ``transform`` and never fits it.
    """
    import joblib

    vectorizer = joblib.load('simple_vectorized_cv.joblib')
    return vectorizer.transform(X)
def normalize_texts(X):
    """Lemmatise every text in ``X`` with pymorphy2.

    Each text is split on whitespace; every token is lower-cased, stripped of
    leading/trailing punctuation and replaced by the ``normal_form`` of the
    first parse returned by ``MorphAnalyzer.parse``. The lemmas are joined
    back with single spaces.

    Parameters
    ----------
    X : pandas.Series of str

    Returns
    -------
    pandas.Series
        Same index as ``X``, one normalised string per input text.
    """
    from pymorphy2 import MorphAnalyzer
    morph = MorphAnalyzer(lang='ru')
    strip_chars = '(.,!?:;\'"«»—-–)'

    # The morphological parse is the expensive step, so each distinct token
    # is analysed only once and its lemma is reused afterwards.
    lemma_cache = {}

    def lemma_of(token):
        try:
            return lemma_cache[token]
        except KeyError:
            lemma = morph.parse(token)[0].normal_form
            lemma_cache[token] = lemma
            return lemma

    def normalize(text):
        return ' '.join(lemma_of(word.lower().strip(strip_chars)) for word in text.split())

    return X.apply(normalize)
def texts_to_vecs(X):
    """Embed every text as the mean of its word vectors.

    The word2vec model is loaded from ``word_to_vec.bin`` (binary format).
    Each text is split on whitespace; a word known to the model contributes
    its vector, an unknown word contributes a placeholder vector that is all
    zeros except for a 1.0 in the last component. The placeholder takes its
    length from ``model.vector_size``, so it is not tied to 100-dimensional
    embeddings.

    A text without any tokens (empty or whitespace only) is mapped to the
    zero vector instead of the NaN that averaging an empty list would give.

    Parameters
    ----------
    X : iterable of str

    Returns
    -------
    pandas.DataFrame
        One float row per text, one column per embedding dimension, with a
        fresh positional index.
    """
    from gensim.models import KeyedVectors
    model = KeyedVectors.load_word2vec_format('word_to_vec.bin', binary=True)

    dim = model.vector_size
    # Placeholder for out-of-vocabulary words, built once and reused.
    oov_vector = np.zeros(dim)
    oov_vector[-1] = 1.0

    def get_average_vector(words):
        if not words:
            # Nothing to average; avoid np.mean([]) -> NaN.
            return np.zeros(dim)
        vectors = [model[word] if word in model else oov_vector for word in words]
        return np.mean(vectors, axis=0)

    pre_X = [get_average_vector(text.split()) for text in X]
    return pd.DataFrame(pre_X, dtype=float)

In [3]:
# Raw texts (X) and their class labels (y) as returned by prepare_Xy().
X, y = prepare_Xy()

In [4]:
# Lemmatise every text.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt, so
# this step appears to be slow on the full data set — confirm, and consider
# caching the result on disk.
normalized_X = normalize_texts(X)

KeyboardInterrupt: 

In [None]:
# Two feature representations built from the same normalised texts:
# averaged word vectors (vecs_X) and the stored vectorizer's output (cv_X).
vecs_X = texts_to_vecs(normalized_X)
cv_X = texts_to_cv(normalized_X)

In [None]:
from sklearn.model_selection import train_test_split

# Split both feature representations and the labels in ONE call, so every
# row ends up on the same side of the split in all three outputs. Two separate
# calls only stay aligned while their seed and input length happen to match.
(vecs_X_train, vecs_X_test,
 cv_X_train, cv_X_test,
 y_train, y_test) = train_test_split(vecs_X, cv_X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the forest trained on the averaged word vectors.
rf_params = {
    'n_estimators': 20,
    'max_depth': 200,
    'min_samples_split': 10,
    'min_samples_leaf': 10,
    'random_state': 0,  # fixed seed for the forest
    'n_jobs': -1,
}
vecs_clf = RandomForestClassifier(**rf_params)

# Kept as the last expression so the cell still displays the fitted estimator.
vecs_clf.fit(vecs_X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from joblib import load
# Load the estimator object stored on disk.
# NOTE(review): presumably a LogisticRegression (see the import above) — confirm.
cv_clf = load('simple_vectorized_model.joblib')
# The loaded object is fitted again on the current training split.
# NOTE(review): any fitted state stored in the file is presumably replaced here — confirm this is intended.
cv_clf.fit(cv_X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Class-probability outputs of both models on their respective test features.
vecs_y_proba = vecs_clf.predict_proba(vecs_X_test)
cv_y_proba = cv_clf.predict_proba(cv_X_test)

In [None]:
# Position in vecs_clf.classes_ -> class label at that position.
ind_to_class = dict(enumerate(vecs_clf.classes_))

In [None]:
# Both probability matrices are decoded (and summed) column by column, so the
# two classifiers must list their classes in the same order. Check it rather
# than assume it.
classes = np.asarray(vecs_clf.classes_)
assert np.array_equal(classes, cv_clf.classes_), 'classifiers disagree on class order'


def proba_to_labels(proba):
    """Return, for each row of ``proba``, the class label with the highest score."""
    return list(classes[np.argmax(proba, axis=1)])


vecs_y_pred = proba_to_labels(vecs_y_proba)
cv_y_pred = proba_to_labels(cv_y_proba)
# Combined prediction: unweighted sum of both models' class probabilities.
total_y_pred = proba_to_labels(vecs_y_proba + cv_y_proba)

# Accuracy of: word-vector model, count-vector model, combined model.
print(accuracy_score(y_test, vecs_y_pred))
print(accuracy_score(y_test, cv_y_pred))
print(accuracy_score(y_test, total_y_pred))

In [None]:
# Accuracy of cv_clf using its own predict() instead of argmax over predict_proba().
print(accuracy_score(y_test, cv_clf.predict(cv_X_test)))

In [None]:
# Inspection cell: displays the labels predicted by cv_clf for the test features.
cv_clf.predict(cv_X_test)

In [None]:
# Inspection cell: displays the class-probability output of cv_clf for the test features.
cv_clf.predict_proba(cv_X_test)