# Необходимые импорты

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from cross_val import knn_cv_score, kfold_split

import numpy as np
import json
import pickle

# Загрузка данных

In [6]:
# Read the raw dataset. The file is opened in binary mode; json.loads accepts
# bytes and detects the text encoding on its own.
with open('dev-dataset-task2022-04.json', 'rb') as f:
    data = json.loads(f.read())

In [18]:
# Split the records column-wise: index 0 of every record goes to X (samples),
# index 1 goes to y (targets).
X, y = (np.array([record[column] for record in data]) for column in (0, 1))

# Инициализация нормализаторов

In [25]:
# Both vectorizers share the same document-frequency cut-offs
# (max_df=0.8, min_df=10); only the vectorizer class differs.
count_vec, tf_idf = (
    vectorizer_cls(max_df=0.8, min_df=10)
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
)

# Кросс-валидация

In [28]:
# Hyperparameter grid that is passed to knn_cv_score.
parameters = {
    'n_neighbors': list(range(1, 11)),  # k = 1..10
    'metrics': ['euclidean', 'cosine'],
    'weights': ['uniform', 'distance'],
    # Each vectorizer is paired with the label that shows up in the result keys.
    'normalizers': [
        (count_vec, 'CountVectorizer'),
        (tf_idf, 'TfidfVectorizer'),
    ],
}

In [29]:
# Build 6 folds over the sample indices, then score every combination from
# `parameters` with accuracy.
# NOTE(review): the exact contract of kfold_split / knn_cv_score lives in the
# local cross_val module and is not visible here — confirm before changing the call.
folds = kfold_split(len(X), 6)
out = knn_cv_score(
    X,
    y,
    parameters,
    accuracy_score,
    folds,
    neighbors.KNeighborsClassifier,
)
out

{('CountVectorizer', 1, 'euclidean', 'uniform'): 0.6515792910727595,
 ('CountVectorizer', 1, 'euclidean', 'distance'): 0.6515792910727595,
 ('CountVectorizer', 1, 'cosine', 'uniform'): 0.8033039581501711,
 ('CountVectorizer', 1, 'cosine', 'distance'): 0.8033039581501711,
 ('CountVectorizer', 2, 'euclidean', 'uniform'): 0.596352562194393,
 ('CountVectorizer', 2, 'euclidean', 'distance'): 0.6494067274476819,
 ('CountVectorizer', 2, 'cosine', 'uniform'): 0.766073278736567,
 ('CountVectorizer', 2, 'cosine', 'distance'): 0.8026832256858634,
 ('CountVectorizer', 3, 'euclidean', 'uniform'): 0.5640860118285302,
 ('CountVectorizer', 3, 'euclidean', 'distance'): 0.6177655246573858,
 ('CountVectorizer', 3, 'cosine', 'uniform'): 0.7545920357025007,
 ('CountVectorizer', 3, 'cosine', 'distance'): 0.7961684192551672,
 ('CountVectorizer', 4, 'euclidean', 'uniform'): 0.5668746928066568,
 ('CountVectorizer', 4, 'euclidean', 'distance'): 0.6428878827946345,
 ('CountVectorizer', 4, 'cosine', 'uniform'): 0

In [30]:
# Best configuration as a (score, params) pair; tuples compare by score first,
# so ties are broken by the parameter key itself.
max((score, params) for params, score in out.items())

(0.8337071573452957, ('TfidfVectorizer', 1, 'cosine', 'uniform'))

# Создание модели с лучшими параметрами

In [32]:
# TF-IDF vectorizer with the same cut-offs as the one used in cross-validation.
scaler = TfidfVectorizer(max_df=0.8, min_df=10)
# Fit on the whole dataset and vectorize it in a single pass.
X_tf = scaler.fit_transform(X)

# NOTE(review): cross-validation reported weights='uniform' as the best setting,
# while 'distance' is used here. With n_neighbors=1 a single neighbour decides
# the label, so the weighting scheme presumably does not change predictions — confirm.
knn = neighbors.KNeighborsClassifier(n_neighbors=1, weights='distance', metric='cosine')
knn.fit(X_tf, y)

KNeighborsClassifier(metric='cosine', n_neighbors=1, weights='distance')

# Сохранение данных

In [33]:
# Persist the fitted classifier, the fitted vectorizer and the raw arrays,
# each into its own pickle file (written in this order).
artifacts = (
    ('knn.pkl', knn),
    ('normalizer.pkl', scaler),
    ('X.pkl', X),
    ('y.pkl', y),
)
for file_name, payload in artifacts:
    with open(file_name, 'wb') as f:
        pickle.dump(payload, f)