In [None]:
import pandas as pd
import spacy
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sn
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Russian spaCy pipeline; its document vectors are used as features below.
nlp = spacy.load(name="ru2_combined_400ks_96")

# Labelled text snippets to classify.
df = pd.read_csv(filepath_or_buffer="description11 - description3-8.csv")
# Tip: df['Label'].value_counts() shows the class balance.

In [None]:
# Encode the three classes (Name, Author, Other) as integers.
# Any label that is not a key of this mapping becomes NaN in 'Label_num'.
label_to_id = {'Name': 0, 'Author': 1, 'Other': 2}
df['Label_num'] = df['Label'].map(label_to_id)


def _embed(text):
    """Return the spaCy document vector for a single text."""
    return nlp(text).vector


# The 'Text2' column holds the texts being classified; embed each one.
df['Vector'] = df['Text2'].apply(_embed)

In [None]:
# Sanity check: list the distinct encoded labels (a NaN here means an unmapped label).
df['Label_num'].unique()

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
features = df.Vector.values
labels = df.Label_num
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, random_state=6
)

# Each element is one vector; stack them into 2-D (n_samples, n_features) arrays.
X_train_2d, X_test_2d = np.stack(X_train), np.stack(X_test)

In [None]:
# Display the held-out labels returned by train_test_split.
y_test

# TREE

In [None]:
# Learn per-feature min/max on the training split only, then rescale both splits
# with those same statistics.
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit(X_train_2d).transform(X_train_2d)
scaled_test_embed_tree = scaler.transform(X_test_2d)

# Decision tree trained on the scaled embeddings; seed fixed for reproducibility.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X=scaled_train_embed, y=y_train)

In [None]:
# Scale the held-out vectors with the already-fitted scaler and predict with the tree.
scaled_test_embed = scaler.transform(X_test_2d)
y_pred = tree.predict(X=scaled_test_embed)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for the decision tree: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.figure(figsize=(10, 7))
sn.heatmap(data=cm, annot=True, fmt='d')  # integer counts written in each cell
plt.xlabel('Prediction')
plt.ylabel('Truth')

In [None]:
# Persist the fitted decision tree to disk.
# NOTE(review): only the tree is saved; the MinMaxScaler it was trained behind is not,
# so anything that loads this file must reproduce the same scaling — confirm.
joblib.dump(value=tree, filename='model_tree3.pkl')

# Naive Bayes (GaussianNB)

In [None]:
# Fresh scaler fitted on the training split; the test split reuses its statistics.
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit(X_train_2d).transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

# Gaussian Naive Bayes trained on the scaled embeddings.
nb = GaussianNB()
nb.fit(X=scaled_train_embed, y=y_train)

In [None]:
# Scale the held-out vectors with the fitted scaler and predict with Naive Bayes.
scaled_test_embed = scaler.transform(X_test_2d)
y_pred = nb.predict(X=scaled_test_embed)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for Naive Bayes: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.figure(figsize=(10, 7))
sn.heatmap(data=cm, annot=True, fmt='d')  # integer counts written in each cell
plt.xlabel('Prediction')
plt.ylabel('Truth')

In [None]:
# Persist the fitted Naive Bayes model to disk.
# The classifier trained in this section is bound to `nb`; the previous code dumped
# `clf`, a name that is never defined in this notebook (NameError on a fresh kernel).
# NOTE(review): the MinMaxScaler used in training is not saved with the model — confirm
# that consumers of this file apply the same scaling.
joblib.dump(nb, 'model_mnb.pkl')

# KNN

In [None]:
# k-nearest neighbours (k = 10, Euclidean distance) on the raw, unscaled embeddings.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=10)
knn.fit(X=X_train_2d, y=y_train)

In [None]:
# Predict the held-out split with KNN and report per-class metrics.
y_pred = knn.predict(X=X_test_2d)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for KNN: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.figure(figsize=(10, 7))
sn.heatmap(data=cm, annot=True, fmt='d')  # integer counts written in each cell
plt.xlabel('Prediction')
plt.ylabel('Truth')

In [None]:
# Persist the fitted KNN model to disk.
# The classifier trained in this section is bound to `knn`; the previous code dumped
# `clf`, a name that is never defined in this notebook (NameError on a fresh kernel).
joblib.dump(knn, 'model_knn6-test9.pkl')

# Gradient Boosting

In [None]:
# Gradient boosting with 100 depth-1 trees (stumps) on the raw, unscaled embeddings.
grad = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
# fit() returns the estimator itself, so the name stays bound to the fitted model.
grad = grad.fit(X_train_2d, y_train)

In [None]:
# Predict the held-out split with gradient boosting and report per-class metrics.
y_pred = grad.predict(X=X_test_2d)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for gradient boosting: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.figure(figsize=(10, 7))
sn.heatmap(data=cm, annot=True, fmt='d')  # integer counts written in each cell
plt.xlabel('Prediction')
plt.ylabel('Truth')

# Model testing

In [None]:
import joblib
import spacy
import numpy as np
# Restore a previously saved classifier and the spaCy pipeline used for embedding.
# NOTE(review): this reads 'model_tree1.pkl', while the tree section of this notebook
# writes 'model_tree3.pkl' — confirm this is the intended file.
# joblib.load unpickles the file, so only load files from a trusted source.
model = joblib.load(filename='model_tree1.pkl')
nlp = spacy.load(name="ru2_combined_400ks_96")

In [None]:
# Classify a single ad-hoc text with the loaded model.
test_text = 'бестселле'
vector = nlp(test_text).vector
vector_2d = np.stack(vector)

# predict() expects a 2-D array, so present the vector as one row.
# NOTE(review): the decision tree in this notebook is trained on MinMax-scaled
# vectors, but no scaling is applied here — confirm how the loaded model was trained.
sample = vector_2d.reshape(1, -1)
res = model.predict(sample)
print(res)