In [93]:
import pandas as pd
from sklearn.metrics import ndcg_score
import numpy as np
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ndcg_score

import category_encoders as ce

In [104]:
def sample_target_split(data): #выделение признаков и целевой переменной
    X = data.drop(['target'], axis=1)
    y = data['target']
    return X, y

def test_model(model, X_train, X_test, y_train, y_test): #тестирование модели
    model.fit(X_train, y_train)
    predictions = model.predict_proba(X_test)[:,1]
    validation_ndcg = ndcg_score([y_test], [predictions])
    print(f'Model: {model.__class__}\nNDCG on validation set: {validation_ndcg}\n\n')

def scaling(X_train, X_test): #масштабирование признаков
    sc = StandardScaler()
    X_train_scaled = sc.fit_transform(X_train)
    X_test_scaled = sc.transform(X_test)
    return X_train_scaled, X_test_scaled

def add_context(data): #добавление контекста об остальных объектах в группе
    features = data.drop(['search_id'], axis=1).columns
    context_of_group = pd.DataFrame()

    context_ways =['min','max','mean','median']

    for context_trainsform in context_ways:
        context_of_group = pd.concat([context_of_group, data.groupby('search_id')[features].transform(context_trainsform).add_suffix("_"+context_trainsform)], axis=1)

    return pd.concat([data, context_of_group], axis=1).drop('search_id', axis=1)

def find_const_features(X_train, X_test): #выделение константных признаков
    constant_columns_train = X_train.loc[:, X_train.nunique() == 1].columns.to_list()
    constant_columns_test = X_test.loc[:, X_test.nunique() == 1].columns.to_list()

    if constant_columns_train == constant_columns_test:
        constant_columns = constant_columns_train
    else:
        raise Exception("!!!")
    return constant_columns

def find_cat_features(X_train, X_test): #выделение категориальных признаков
    categorical_columns_train = X_train.loc[:, (1 < X_train.nunique()) & (X_train.nunique() <= 10)].columns.to_list()
    categorical_columns_test = X_test.loc[:, (1 < X_test.nunique()) & (X_test.nunique() <= 9)].columns.to_list()

    if categorical_columns_train == categorical_columns_test:
        categorical_columns = categorical_columns_train
    else:
        raise Exception("!!!")
    return categorical_columns


In [105]:
test = pd.read_csv('test_df.csv')
train = pd.read_csv('train_df.csv')

In [106]:
X_train, y_train = sample_target_split(train)
X_test, y_test = sample_target_split(test)

In [107]:
constant_columns = find_const_features(X_train, X_test)
categorical_columns = find_cat_features(X_train, X_test)

X_train = X_train.drop(constant_columns, axis=1)
X_test = X_test.drop(constant_columns, axis=1)

In [108]:
encoder = ce.OneHotEncoder(cols=categorical_columns)

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [109]:
X_train = add_context(X_train)
X_test = add_context(X_test)

In [110]:
X_train, X_test = scaling(X_train, X_test)

In [102]:
list_classifiers = [LogisticRegression(), CatBoostClassifier(verbose=False), DummyClassifier(strategy='most_frequent'), KNeighborsClassifier(), SVC(probability=True), \
                DecisionTreeClassifier(), GradientBoostingClassifier(), RandomForestClassifier(),]

for model in list_classifiers:
    test_model(model, X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: <class 'sklearn.linear_model._logistic.LogisticRegression'>
NDCG on validation set: 0.5238461260485848


Model: <class 'catboost.core.CatBoostClassifier'>
NDCG on validation set: 0.6260610646995569


Model: <class 'sklearn.dummy.DummyClassifier'>
NDCG on validation set: 0.3899032693596343


Model: <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
NDCG on validation set: 0.37889392362474755


Model: <class 'sklearn.svm._classes.SVC'>
NDCG on validation set: 0.4335656749473137


Model: <class 'sklearn.tree._classes.DecisionTreeClassifier'>
NDCG on validation set: 0.5008501123213425


Model: <class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
NDCG on validation set: 0.5307288562116922


Model: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
NDCG on validation set: 0.4888098195053024




In [111]:
clf = CatBoostClassifier()

clf.fit(X_train, y_train, verbose=False)

predictions = clf.predict_proba(X_test)
ndcg_score([y_test.values], [predictions[:,1]])

0.6260610646995569