In [1]:
import os

from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
)

from ya_cup_2022.knn import KNNModel

In [2]:
ARTISTS_FN = None  # 'track_artists.csv'

VEC_CLS = CountVectorizer  #TfidfVectorizer

INDEX_S = 'cosinesimil_sparse_fast'
INDEX_M = 50
INDEX_EFC = 500
INDEX_P = 2
QUERY_EFS = 500

TRAIN_FN = '_train_s'
K = 30

N = 100

TA_MRK = 't' if ARTISTS_FN is None else 'a'
VR_MRK = 'tfidf' if VEC_CLS == TfidfVectorizer else 'cv'
SUFFIX = f'{TRAIN_FN}-{VR_MRK}-s{INDEX_S}-m{INDEX_M}-efc{INDEX_EFC}-p{INDEX_P}-{TA_MRK}'

TEST_FN = '_test_xxl_1k'
PRED_FN = f'_pred_xxl_1k_knn_{SUFFIX}-efs{QUERY_EFS}_@{N}'
INDEX_FN = f'nmslib_index_{SUFFIX}'

WD = os.path.join('D:', 'education', 'yaintern', 'yandex_cup_2022', 'data')
WD

'D:education\\yaintern\\yandex_cup_2022\\data'

In [3]:
track_artist_file = None
if ARTISTS_FN is not None:
    track_artist_file = os.path.join(WD, ARTISTS_FN)

model = KNNModel(
    vec_cls=VEC_CLS,
    n_neighbors=K,
    space=INDEX_S,
    index_params={'M': INDEX_M, 'efConstruction': INDEX_EFC, 'post': INDEX_P},
    query_params={'efSearch': QUERY_EFS},
    num_threads=12,
    track_artist_file=track_artist_file,
)

index_path = os.path.join(WD, INDEX_FN)
index_exists = os.path.exists(index_path)

model.fit(
    os.path.join(WD, TRAIN_FN),
    index_file=index_path if index_exists else None)

if not index_exists:
    model.index.saveIndex(index_path, save_data=True)

 ---> Fitting KNN model
Reading file: D:education\yaintern\yandex_cup_2022\data\_train_s
Fitting CV
Space dimension: 70000 x 372579
Fitting NN
 ---> KNN model fit finished


In [4]:
with open(os.path.join(WD, TEST_FN), 'rt') as f_test:
    queries = [line.strip() for line in f_test]

In [5]:
with open(os.path.join(WD, PRED_FN), 'wt') as f_pred:
    for preds in model.batch_predict(queries, cutoff=N):
        f_pred.write(f'{" ".join(p.track for p in preds)}\n')