In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.1-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.1


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import catboost
from catboost.core import CatBoostRanker, CatBoostClassifier
import joblib

In [3]:
from google.colab import output
output.enable_custom_widget_manager()

### Загрузка матрицы признаков, индексов и данных, полученных на предыдущем этапе

In [4]:
with open('drive/MyDrive/Colab Notebooks/data/preprocessed_data.npy', 'rb') as f:
  preprocessed_data = np.load(f)

In [5]:
data_train = pd.read_csv('drive/MyDrive/Colab Notebooks/data/train.csv', index_col=0)
target_train = data_train['Target']

data_answers = pd.read_csv('drive/MyDrive/Colab Notebooks/data/validation_answer.csv', index_col=0)

In [6]:
with open('drive/MyDrive/Colab Notebooks/data/preprocessed_data_v.npy', 'rb') as f:
  preprocessed_data_v = np.load(f)

In [7]:
with open('drive/MyDrive/Colab Notebooks/data/common_index.pkl', 'rb') as f:
    common_index = pickle.load(f)

In [8]:
X = preprocessed_data[:, 0:140]
y = preprocessed_data[:, 140]
X_valid = preprocessed_data_v[:, 0:140]

### Выбор модели для точного поиска

*В этот раздел включены не все эксперименты, которые были проведены*

### Логистическая регрессия. Для полной матицы и для сокращенной: дистанция и метка "0-1"

In [9]:
model_LR = LogisticRegression(class_weight='balanced', random_state=2908, C=.25, solver='lbfgs')
cross_val_score(model_LR, X, y, cv=5, scoring='accuracy', verbose=2)

[CV] END .................................................... total time=   6.4s
[CV] END .................................................... total time=   5.8s
[CV] END .................................................... total time=   3.9s
[CV] END .................................................... total time=   5.5s
[CV] END .................................................... total time=   6.0s


array([0.993445, 0.99345 , 0.99345 , 0.99345 , 0.99345 ])

In [10]:
X_short = preprocessed_data[:, 138].reshape(-1,1)

In [11]:
y_t = y.astype('int')

In [12]:
w1 = sum(y) / y.shape[0]
w0 = (y.shape[0] - sum(y)) / y.shape[0]

In [17]:
model_LR_S = LogisticRegression(class_weight='balanced', random_state=2908, C=.25, solver='lbfgs')
cross_val_score(model_LR, X_short, y_t, cv=5, scoring='accuracy', verbose=2)

[CV] END .................................................... total time=   1.2s
[CV] END .................................................... total time=   1.3s
[CV] END .................................................... total time=   1.2s
[CV] END .................................................... total time=   1.2s
[CV] END .................................................... total time=   1.8s


array([0.846465, 0.854935, 0.86152 , 0.85995 , 0.856445])

In [18]:
model_LR = model_LR.fit(X,y)

In [19]:
model_LR_S = model_LR_S.fit(X_short,y_t)

In [20]:
joblib.dump(model_LR, 'drive/MyDrive/Colab Notebooks/data/model_LR.joblib')

['drive/MyDrive/Colab Notebooks/data/model_LR.joblib']

In [21]:
joblib.dump(model_LR_S, 'drive/MyDrive/Colab Notebooks/data/model_LR_S.joblib')

['drive/MyDrive/Colab Notebooks/data/model_LR_S.joblib']

### CatBoost Ranker [Источник](https://github.com/catboost/catboost/blob/master/catboost/tutorials/ranking/ranking_tutorial.ipynb)

In [22]:
group_id = []
for _ in range(int(preprocessed_data.shape[0] / 100)):
  group_id += (np.repeat(_, 100)).tolist()

In [23]:
batch1 = catboost.Pool(data=X,
                  label=y_t,
                       group_id=group_id)

In [55]:
params = {"iterations": 200,
          "depth": 2,
          "loss_function":'RMSE',
          "verbose": False}

scores = catboost.cv(batch1,
            params,
            fold_count=3,
            plot="True")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.07086805195
bestIteration = 199

Training on fold [1/3]

bestTest = 0.07092150056
bestIteration = 199

Training on fold [2/3]

bestTest = 0.07054659101
bestIteration = 199



In [78]:
params = {"iterations": 200,
          "depth": 2,
          "loss_function":'RMSE',
          "verbose": 0,
          "eval_metric": "AUC"}
model_CR = CatBoostRanker(**params, random_state=2908)
model_CR.fit(batch1)



<catboost.core.CatBoostRanker at 0x79157a12b8b0>

In [26]:
joblib.dump(model_CR, 'drive/MyDrive/Colab Notebooks/data/model_CR.joblib')

['drive/MyDrive/Colab Notebooks/data/model_CR.joblib']

### CatBoost Classifier

In [56]:
params_cl = {"iterations": 200,
          "depth": 5,
          "eval_metric":'AUC',
          "class_weights":(w0, w1), #еще раз обучить FAISS на GPU и брать оттуда баланс классов
          "verbose": 0}
model_CC = CatBoostClassifier(**params_cl, random_state=2908)
model_CC.fit(X, y_t)

<catboost.core.CatBoostClassifier at 0x7914da17fca0>

In [57]:
model_CC.best_score_

{'learn': {'Logloss': 0.00029362152143965056}}

In [58]:
joblib.dump(model_CC, 'drive/MyDrive/Colab Notebooks/data/model_CC.joblib')

['drive/MyDrive/Colab Notebooks/data/model_CC.joblib']

### Предсказания вероятностей для валидационной выборки

In [59]:
prob_pred_LR = model_LR.predict_proba(X_valid)

In [60]:
prob_pred_LR_S = model_LR_S.predict_proba(X_valid[:, 138].reshape(-1,1))

In [61]:
prob_pred_LR = prob_pred_LR[:,1]

In [62]:
prob_pred_LR_S = prob_pred_LR_S[:,1]

### Для Catboost Ranker - [костыльное решение для вероятностей](https://habr.com/ru/articles/599827/)

In [63]:
raw_predictions = model_CR.predict(X_valid)

In [64]:
sigmoid = lambda x: 1 / (1 + np.exp(-x))
prob_pred_CR = sigmoid(raw_predictions)

In [65]:
prob_pred_CR

array([0.53927891, 0.52850297, 0.53401933, ..., 0.50092435, 0.50083365,
       0.4992888 ])

In [66]:
prob_pred_CC = model_CC.predict_proba(X_valid)

In [67]:
prob_pred_CC = prob_pred_CC[:,1]

In [39]:
#p.save('drive/MyDrive/Colab Notebooks/data/prob_LR.npy', prob_pred_LR)
#np.save('drive/MyDrive/Colab Notebooks/data/prob_CR.npy', prob_pred_CR)

### Сборка результатов

In [68]:
valid_batch_size = 5000
ans = data_answers[:valid_batch_size]

In [69]:
candidates = (X_valid[:, 139]).astype('int')

In [70]:
true_answer = pd.DataFrame(np.repeat(ans.values, 100, axis=0))[0]

In [71]:
NeiNum = 100 # из FAISS

### Расчет accuracy @ 100 и accuracy @ 5

In [72]:
def acc_final(candidates, true_values, probs, number):
  result_100 = 0
  result_5 = 0
  answer = pd.DataFrame({'id':candidates.tolist(), 'probability':probs.tolist(), 'true_answer':true_values})
  for i in range(0, answer.shape[0], number):
    ids_5 = answer[i:i+number].sort_values(by='probability', ascending=False)[:5]['id']
    ids_100 = answer[i:i+number]['id']
    real_ans = answer[i:i+number]['true_answer'].values[0]
    if real_ans in [common_index[_] for _ in ids_100.values]:
      result_100 += 1
    if real_ans in [common_index[_] for _ in ids_5.values]:
      result_5 += 1
    size = answer.shape[0] / number

  return result_100 / size, result_5 / size

In [73]:
acc_100, acc_5 = acc_final(candidates, true_answer, prob_pred_LR, NeiNum)
print('RESULTS FOR VALID DATASET FOR LOGISTIC REGRESSION')
print(f'ACCURACY @ 100 --- {acc_100}, ACCURACY @ 5 --- {acc_5},')

RESULTS FOR VALID DATASET FOR LOGISTIC REGRESSION
ACCURACY @ 100 --- 0.6428, ACCURACY @ 5 --- 0.0776,


In [74]:
acc_100, acc_5 = acc_final(candidates, true_answer, prob_pred_LR_S, NeiNum)
print('RESULTS FOR VALID DATASET FOR LOGISTIC REGRESSION WITH SINGLE FEATURE')
print(f'ACCURACY @ 100 --- {acc_100}, ACCURACY @ 5 --- {acc_5},')

RESULTS FOR VALID DATASET FOR LOGISTIC REGRESSION WITH SINGLE FEATURE
ACCURACY @ 100 --- 0.6428, ACCURACY @ 5 --- 0.567,


In [75]:
acc_100, acc_5 = acc_final(candidates, true_answer, prob_pred_CR, NeiNum)
print('RESULTS FOR VALID DATASET FOR CATBOOST RANKER')
print(f'ACCURACY @ 100 --- {acc_100}, ACCURACY @ 5 --- {acc_5},')

RESULTS FOR VALID DATASET FOR CATBOOST RANKER
ACCURACY @ 100 --- 0.6428, ACCURACY @ 5 --- 0.5616,


In [76]:
acc_100, acc_5 = acc_final(candidates, true_answer, prob_pred_CC, NeiNum)
print('RESULTS FOR VALID DATASET FOR CATBOOST CLASSIFIER')
print(f'ACCURACY @ 100 --- {acc_100}, ACCURACY @ 5 --- {acc_5},')

RESULTS FOR VALID DATASET FOR CATBOOST CLASSIFIER
ACCURACY @ 100 --- 0.6428, ACCURACY @ 5 --- 0.5608,


**Вывод**

Модель требует доработки.

Также надо заново обучить на этапе грубого поиска на GPU и повторить точный поиск