In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import json
import string
import requests
import pymorphy2
from bs4 import BeautifulSoup
from collections import Counter

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, ClassifierMixin

Загружаем данные

In [2]:
train_pool = pd.read_csv('train.csv', encoding='utf-8')
test_pool = pd.read_csv('test.csv', encoding='utf-8')

with open('train_org_information.json', encoding='utf8') as f:
    train_orgs = json.load(f)

with open('test_org_information.json', encoding='utf8') as f:
    test_orgs = json.load(f)

with open('train_rubric_information.json', encoding='utf8') as f:
    train_rubrics = json.load(f)

with open('test_rubric_information.json', encoding='utf8') as f:
    test_rubrics = json.load(f)

with open('train_clicks_information.json', encoding='utf8') as f:
    train_clicks = json.load(f)

with open('test_clicks_information.json', encoding='utf8') as f:
    test_clicks = json.load(f)

In [3]:
table = str.maketrans({key: None for key in string.punctuation})

def preprocess(s):
    return s.strip().translate(table).casefold()

def get_keywords(org_id, orgs, rubrics, locale):
    keywords = ''
    org_rubrics = orgs[org_id]['rubrics']
    for rubric_id in org_rubrics:
        rubric = rubrics[str(rubric_id)]
        for obj in rubric['descriptions']:
            # if obj['value']['locale'] == locale:
                keywords += ' ' + preprocess(obj['value']['value'])
        for key in ['keywords', 'phrases', 'names']:
            for keywords_with_locale in rubric[key]:
                # if keywords_with_locale['locale'] == locale:
                    keywords += ' ' + preprocess(keywords_with_locale['value'])
    
    return ' '.join(set(keywords.split(' ')))

def get_org_name(org_id, orgs, locale):
    for org_name in orgs[org_id]['names']:
        if org_name['value']['locale'] == locale:
            return preprocess(org_name['value']['value'])
    return ''

def get_words_in_query(words, query):
    query = preprocess(query).split(' ')
    words = preprocess(words).split(' ')
    
    count_words = Counter()
    for word in words:
        if len(word) > 2:
            count_words[word] += 1
    
    coef = 0
    for word in query:
        if word in count_words:
            coef += count_words[word]
    return coef

def get_org_clicks(org_id, clicks, query):
    if org_id not in clicks:
        return 0
    return get_words_in_query(' '.join(clicks[org_id]), query)

def get_org_addr(query, org_addr):
    return get_words_in_query(org_addr, query)

def enrich(pool, orgs, rubrics, clicks):
    pool['query'] = pool['query'].map(preprocess)
    pool['org_name'] = pool['org_name'].map(preprocess)
    pool['org_name_ru'] = pool['org_id'].map(lambda x: get_org_name(str(x), orgs, 'ru'))
    pool['org_region'] = pool['org_id'].map(lambda x: orgs[str(x)]['address']['region_code'])
    pool['org_geo_id'] = pool['org_id'].map(lambda x: orgs[str(x)]['address']['geo_id'])
    pool['org_address'] = pool['org_id'].map(lambda x: orgs[str(x)]['address']['formatted']['value'])
    pool['org_keywords_ru'] = pool['org_id'].map(lambda x: get_keywords(str(x), orgs, rubrics, 'ru'))
    pool['org_clicks_in_query'] = pool.apply(lambda row: get_org_clicks(str(row.org_id), clicks, row.query), axis=1)
    pool['org_addr_in_query'] = pool.apply(lambda row: get_org_addr(row.query, row.org_address), axis=1)
    pool['org_keyw_in_query'] = pool.apply(lambda row: get_words_in_query(row.org_keywords_ru, row.query), axis=1)
    pool['same_region'] = pool.apply(lambda row: 1 if row.region == orgs[str(row.org_id)]['address']['geo_id'] else 0, axis=1)
    return pool

In [4]:
train_pool = enrich(train_pool, train_orgs, train_rubrics, train_clicks)
test_pool = enrich(test_pool, test_orgs, test_rubrics, test_clicks)

In [5]:
train_pool.head()

Unnamed: 0,query_id,query,region,org_name,org_id,window_center,window_size,relevance,org_name_ru,org_region,org_geo_id,org_address,org_keywords_ru,org_clicks_in_query,org_addr_in_query,org_keyw_in_query,same_region
0,11,суд украина днепропетровская область днепродзе...,21775,суд жовтневого району міста дніпропетровськ,1021049127,"34.613119,48.506531","0.025928,0.017380",0.0,суд жовтневого района города днепропетровск,UA,141,"Днепропетровская обл., Днепр г., ул. Паторжинс...",işlemler adliyeleri gerichtsbehörde судьи isl...,11,1,1,0
1,11,суд украина днепропетровская область днепродзе...,21775,дніпропетровський окружний адміністративний суд,1602348889,"34.613119,48.506531","0.025928,0.017380",0.0,днепропетровский окружной административный суд,UA,141,"Украина, Днепр, улица Академика Янгеля, 4",işlemler adliyeleri gerichtsbehörde судьи isl...,10,1,1,0
2,11,суд украина днепропетровская область днепродзе...,21775,бабушкінський районний суд,1105837793,"34.613119,48.506531","0.025928,0.017380",0.0,бабушкинский районный суд,UA,141,"Дніпропетровська обл., Дніпро, просп. Дмитра Я...",işlemler adliyeleri gerichtsbehörde судьи isl...,9,0,1,0
3,11,суд украина днепропетровская область днепродзе...,21775,красногвардійський районний суд,1066267658,"34.613119,48.506531","0.025928,0.017380",0.0,красногвардейский районный суд,UA,141,"Дніпропетровська обл., Дніпро, просп. Пушкіна,...",işlemler adliyeleri gerichtsbehörde судьи isl...,4,0,1,0
4,11,суд украина днепропетровская область днепродзе...,21775,жовтневий суд,1661586235,"34.613119,48.506531","0.025928,0.017380",0.0,жовтневый суд,UA,141,"Украина, Днепр, улица Паторжинского, 18А",işlemler adliyeleri gerichtsbehörde судьи isl...,5,1,1,0


Напишем функции, генерирующие простые признаки основанные на пересечении триграмм между запросом и названием организации

In [6]:
def get_trigrams(string):
    string = '^^' + string + '$$'
    trigrams = set()
    trigrams_count = 0
    
    for i in range(len(string) - 2):
        trigrams.add(string[i:i+3])
        trigrams_count += 1
        
    return trigrams, trigrams_count

def common_trigrams_factors(query, org_name, org_name_ru, org_address, org_keywords_ru):
    query_trigrams, query_trigrams_count = get_trigrams(query)
    org_name_trigrams, org_name_trigrams_count = get_trigrams(org_name if org_name_ru == '' else org_name_ru)

    org_addr_trigrams, org_addr_trigrams_count = get_trigrams(org_address)
    org_keyw_trigrams, org_keyw_trigrams_count = get_trigrams(org_keywords_ru)
    
    
    factors = [float(len(query_trigrams.intersection(org_name_trigrams)))]

    factors.append(0. if query_trigrams_count == 0. else 0.1 + factors[0] / query_trigrams_count)
    factors.append(0. if org_name_trigrams_count == 0. else 0.1 + factors[0] / org_name_trigrams_count)    
    
    factors.append(float(len(query_trigrams.intersection(org_addr_trigrams))))
    factors.append(0. if org_addr_trigrams_count == 0. else 0.1 + factors[3] / org_addr_trigrams_count)  
    
    factors.append(float(len(query_trigrams.intersection(org_keyw_trigrams))))
    factors.append(0. if org_keyw_trigrams_count == 0. else 0.1 + factors[5] / org_keyw_trigrams_count)
    
    return factors

Посчитаем данные факторы для каждого файла

In [7]:
def calc_trigram_factors(row):
    return row.append(
        pd.Series(common_trigrams_factors(row.query, row.org_name, row.org_name_ru, row.org_address, row.org_keywords_ru)))
    
train_factors = train_pool.apply(calc_trigram_factors, axis=1)
test_factors = test_pool.apply(calc_trigram_factors, axis=1)

In [8]:
train_factors.head()

Unnamed: 0,query_id,query,region,org_name,org_id,window_center,window_size,relevance,org_name_ru,org_region,...,org_addr_in_query,org_keyw_in_query,same_region,0,1,2,3,4,5,6
0,11,суд украина днепропетровская область днепродзе...,21775,суд жовтневого району міста дніпропетровськ,1021049127,"34.613119,48.506531","0.025928,0.017380",0.0,суд жовтневого района города днепропетровск,UA,...,1,1,0,23.0,0.419444,0.611111,22.0,0.485965,10.0,0.1125
1,11,суд украина днепропетровская область днепродзе...,21775,дніпропетровський окружний адміністративний суд,1602348889,"34.613119,48.506531","0.025928,0.017380",0.0,днепропетровский окружной административный суд,UA,...,1,1,0,18.0,0.35,0.475,6.0,0.239535,10.0,0.1125
2,11,суд украина днепропетровская область днепродзе...,21775,бабушкінський районний суд,1105837793,"34.613119,48.506531","0.025928,0.017380",0.0,бабушкинский районный суд,UA,...,0,1,0,7.0,0.197222,0.359259,10.0,0.25873,10.0,0.1125
3,11,суд украина днепропетровская область днепродзе...,21775,красногвардійський районний суд,1066267658,"34.613119,48.506531","0.025928,0.017380",0.0,красногвардейский районный суд,UA,...,0,1,0,6.0,0.183333,0.2875,10.0,0.292308,10.0,0.1125
4,11,суд украина днепропетровская область днепродзе...,21775,жовтневий суд,1661586235,"34.613119,48.506531","0.025928,0.017380",0.0,жовтневый суд,UA,...,1,1,0,2.0,0.127778,0.233333,11.0,0.361905,10.0,0.1125


Обучаем модель

In [9]:
""" Reference from https://gist.github.com/bwhite/3726239
"""

def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


print(ndcg_at_k([0],5, method=1))
print(ndcg_at_k([1],5, method=1))
print(ndcg_at_k([1,0],5, method=1))
print(ndcg_at_k([0,1],5, method=1))
print(ndcg_at_k([0,1,1],5, method=1))
print(ndcg_at_k([0,1,1,1],5, method=1))

0.0
1.0
1.0
0.6309297535714575
0.6934264036172708
0.7328286204777911


In [10]:
print(ndcg_at_k([.3,.2,.1,0], 10))

1.0


In [11]:
print(ndcg_at_k([0, .1, .2, .3], 10))

0.6680707577211257


В итоге сформировались такие признаки:
* 0, 1 - факторы для имени организации (как в бейзлайне)
* 2, 3 - факторы для адреса организации
* `org_clicks_in_query` - количество слов из кликов по организации, присутствующих в запросах
* `org_addr_in_query` - количество слов из адреса организации, присутствующих в запросе
* `same_region` - совпадает ли регион организации с регионом запроса
* `org_keyw_in_query` - количество ключевых слов по организации, присутствующих в запросе

In [21]:
train_features = [0, 1, 2, 'org_clicks_in_query', 'org_addr_in_query']
train_features = [0, 1, 2, 3, 4, 5, 'org_clicks_in_query', 'org_addr_in_query']
train_features = [0, 1, 2, 3, 'org_clicks_in_query', 'org_addr_in_query', 'same_region', 'org_keyw_in_query']

In [22]:
def k_fold(data, cv=5, random_state=None):
    query_ids = data.query_id.unique()
    queries = len(query_ids)//cv
    rng = np.random.RandomState(random_state)
    query_ids_divided = rng.choice(query_ids, size=(cv, queries), replace=False)
    for i in range(cv):
        test_ids = query_ids_divided[i]
        train_ids = np.delete(query_ids_divided, i, axis=0).flatten()
        train = data[np.isin(data.query_id, train_ids)].reset_index()
        test = data[np.isin(data.query_id, test_ids)].reset_index()
        yield train, test

In [23]:
def cross_validation(clf, data, features, target, cv=5, random_state=None):
    ndcgs_all = []
    for train, test in k_fold(data, cv, random_state):
        ndcgs = []
        clf.fit(train[features], train[target])
        y_pred = clf.predict(test[features])
        for test_id in test.query_id.unique():
            test_by_query_id = test[test.query_id == test_id][['index', target]]
            test_index = test_by_query_id.index
            relevances_pred = y_pred[test_index]
            positions_pred = np.argsort(relevances_pred)[::-1]
            relevances_by_positions = test_by_query_id.iloc[positions_pred][target]
            ndcgs.append(ndcg_at_k(relevances_by_positions, k=10, method=1))
        ndcgs_all.append(np.mean(ndcgs))
    return np.mean(ndcgs_all)

In [24]:
def cross_validation_ranking(params, data, features, target, cv=5, random_state=None, clf2=None, alpha=0.5):
    ndcgs_all = []
    for train, test in k_fold(data, cv, random_state):
        ndcgs = []
        
        train_queries = train.query_id.unique()
        train_counts = train.query_id.value_counts()
        train_dmatrix = xgb.DMatrix(train[features], train[target])
        train_dmatrix.set_group(train_counts[train_queries].values)
        
        test_queries = test.query_id.unique()
        test_counts = test.query_id.value_counts()
        test_dmatrix = xgb.DMatrix(test[features])
        test_dmatrix.set_group(test_counts[test_queries].values)
        
        clf = xgb.train(params, train_dmatrix, 200)
        y_pred = clf.predict(test_dmatrix)
        if clf2 is not None:
            clf2.fit(train[features], train[target])
            clf2_pred = clf2.predict(test[features])
            y_pred = y_pred*alpha + clf2_pred*(1 - alpha)
        for test_id in test.query_id.unique():
            test_by_query_id = test[test.query_id == test_id][['index', target]]
            test_index = test_by_query_id.index
            relevances_pred = y_pred[test_index]
            positions_pred = np.argsort(relevances_pred)[::-1]
            relevances_by_positions = test_by_query_id.iloc[positions_pred][target]
            ndcgs.append(ndcg_at_k(relevances_by_positions, k=10, method=1))
        ndcgs_all.append(np.mean(ndcgs))
    return np.mean(ndcgs_all)

In [25]:
clf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
cross_validation(clf, train_factors, train_features, 'relevance', random_state=42)

0.7213761546212488

In [26]:
clf = XGBRegressor(n_estimators=100)
cross_validation(clf, train_factors, train_features, 'relevance', random_state=42)

0.7368110809063395

In [27]:
clf = XGBRegressor(n_estimators=100, objective='rank:pairwise')
cross_validation(clf, train_factors, train_features, 'relevance', random_state=42)

0.7115195759750484

In [28]:
params = {'objective': 'rank:pairwise', 'silent': True}
cross_validation_ranking(params, train_factors, train_features, 'relevance', random_state=42)

0.7444459251168742

In [29]:
params = []
for max_features in ['auto', 'sqrt', 'log2']:
    for min_samples_leaf in [5, 7, 9, 11, 12]:
        clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42, max_features=max_features, min_samples_leaf=min_samples_leaf)
        cv = cross_validation(clf, train_factors, train_features, 'relevance', random_state=42)
        print(f'max_features={max_features}, min_samples_leaf={min_samples_leaf}', cv)
        params.append(((max_features, min_samples_leaf), cv))

max_features=auto, min_samples_leaf=5 0.7301539483357561
max_features=auto, min_samples_leaf=7 0.7303341354914473
max_features=auto, min_samples_leaf=9 0.7308422558882863
max_features=auto, min_samples_leaf=11 0.7314003763976309
max_features=auto, min_samples_leaf=12 0.7317502035199684
max_features=sqrt, min_samples_leaf=5 0.7368603108414223
max_features=sqrt, min_samples_leaf=7 0.7357126718668784
max_features=sqrt, min_samples_leaf=9 0.7349020028806139
max_features=sqrt, min_samples_leaf=11 0.7346955858719417
max_features=sqrt, min_samples_leaf=12 0.734898277846845
max_features=log2, min_samples_leaf=5 0.735132560907155
max_features=log2, min_samples_leaf=7 0.7374298863250618
max_features=log2, min_samples_leaf=9 0.7361794694356987
max_features=log2, min_samples_leaf=11 0.7349086633526896
max_features=log2, min_samples_leaf=12 0.7346392456943419


In [32]:
print(sorted(params, key=lambda x: -x[1])[0])

(('log2', 7), 0.7374298863250618)


In [34]:
params_xgb = []
for max_depth in [2, 3, 4]:
    for learning_rate in [0.05, 0.1, 0.5]:
        for colsample_bytree in [0.7, 1]:
            params = {'objective': 'rank:pairwise', 'silent': True, 'colsample_bytree': colsample_bytree, 'max_depth': max_depth, 'learning_rate': learning_rate}
            cv = cross_validation_ranking(params, train_factors, train_features, 'relevance', random_state=42)
            print(f'max_depth={max_depth}, learning_rate={learning_rate}, colsample_bytree={colsample_bytree}', cv)
            params_xgb.append(((max_depth, learning_rate, colsample_bytree), cv))

max_depth=2, learning_rate=0.05, colsample_bytree=0.7 0.7725087082242401
max_depth=2, learning_rate=0.05, colsample_bytree=1 0.772546663122022
max_depth=2, learning_rate=0.1, colsample_bytree=0.7 0.7758517156981153
max_depth=2, learning_rate=0.1, colsample_bytree=1 0.7762676463411127
max_depth=2, learning_rate=0.5, colsample_bytree=0.7 0.770820618820302
max_depth=2, learning_rate=0.5, colsample_bytree=1 0.7707165570126264
max_depth=3, learning_rate=0.05, colsample_bytree=0.7 0.7732282963231759
max_depth=3, learning_rate=0.05, colsample_bytree=1 0.7748738044456009
max_depth=3, learning_rate=0.1, colsample_bytree=0.7 0.7730455358407755
max_depth=3, learning_rate=0.1, colsample_bytree=1 0.7728854975973662
max_depth=3, learning_rate=0.5, colsample_bytree=0.7 0.766027813232627
max_depth=3, learning_rate=0.5, colsample_bytree=1 0.7630931924624571
max_depth=4, learning_rate=0.05, colsample_bytree=0.7 0.7700483091859418
max_depth=4, learning_rate=0.05, colsample_bytree=1 0.7701605042402144
max

In [35]:
print(sorted(params_xgb, key=lambda x: -x[1])[0])

((2, 0.1, 1), 0.7762676463411127)


In [36]:
for max_depth in [1, 2]:
    for learning_rate in [0.05, 0.1, 0.5, 0.8]:
        for colsample_bytree in [0.6, 0.8, 1]:
            params = {'objective': 'rank:pairwise', 'silent': True, 'colsample_bytree': colsample_bytree, 'max_depth': max_depth, 'learning_rate': learning_rate}
            cv = cross_validation_ranking(params, train_factors, train_features, 'relevance', random_state=42)
            print(f'max_depth={max_depth}, learning_rate={learning_rate}, colsample_bytree={colsample_bytree}', cv)
            params_xgb.append(((max_depth, learning_rate, colsample_bytree), cv))

max_depth=1, learning_rate=0.05, colsample_bytree=0.6 0.7648316915232478
max_depth=1, learning_rate=0.05, colsample_bytree=0.8 0.7655348737102792
max_depth=1, learning_rate=0.05, colsample_bytree=1 0.7655162774763604
max_depth=1, learning_rate=0.1, colsample_bytree=0.6 0.7679442416388768
max_depth=1, learning_rate=0.1, colsample_bytree=0.8 0.7669297193292857
max_depth=1, learning_rate=0.1, colsample_bytree=1 0.7669036901205029
max_depth=1, learning_rate=0.5, colsample_bytree=0.6 0.7756254383249124
max_depth=1, learning_rate=0.5, colsample_bytree=0.8 0.7744627085844897
max_depth=1, learning_rate=0.5, colsample_bytree=1 0.7751155334918174
max_depth=1, learning_rate=0.8, colsample_bytree=0.6 0.7744513247730833
max_depth=1, learning_rate=0.8, colsample_bytree=0.8 0.7764170532931025
max_depth=1, learning_rate=0.8, colsample_bytree=1 0.7753673632361378
max_depth=2, learning_rate=0.05, colsample_bytree=0.6 0.7709907875323398
max_depth=2, learning_rate=0.05, colsample_bytree=0.8 0.772874619887

In [37]:
print(sorted(params_xgb, key=lambda x: -x[1])[0])

((1, 0.8, 0.8), 0.7764170532931025)


In [51]:
for i in [0.2, 0.3, 0.5, 0.7, 0.9]:
    rf = RandomForestRegressor(n_estimators=200, criterion='mse', max_features='sqrt', min_samples_leaf=7, n_jobs=-1, random_state=42)
    params = {'objective': 'rank:pairwise', 'silent': True, 'colsample_bytree': 0.8, 'max_depth': 1, 'learning_rate': 0.8}
    print(i, cross_validation_ranking(params, train_factors, train_features, 'relevance', random_state=42, clf2=rf, alpha=i))

0.2 0.7764858495399266
0.3 0.7752615971949454
0.5 0.774209294784679
0.7 0.7735902514455598
0.9 0.7738404699521434


Используем Blending RandomForest и XGBoost с подобраными гиперпараметрами

In [52]:
train_queries = train_factors.query_id.unique()
train_counts = train_factors.query_id.value_counts()
train_dmatrix = xgb.DMatrix(train_factors[train_features], train_factors['relevance'])
train_dmatrix.set_group(train_counts[train_queries].values)

test_queries = test_factors.query_id.unique()
test_counts = test_factors.query_id.value_counts()
test_dmatrix = xgb.DMatrix(test_factors[train_features])
test_dmatrix.set_group(test_counts[test_queries].values)

params = {'objective': 'rank:pairwise', 'silent': True, 'colsample_bytree': 0.8, 'max_depth': 1, 'learning_rate': 0.8}
clf = xgb.train(params, train_dmatrix, 200)
xgb_pred = clf.predict(test_dmatrix)

rf = RandomForestRegressor(n_estimators=200, max_features='sqrt', min_samples_leaf=7, n_jobs=-1, random_state=42)
rf.fit(train_factors[train_features], train_pool['relevance'].ravel())
rf_pred = rf.predict(test_factors[train_features])

test_pool['relevance'] = 0.2*xgb_pred + 0.8*rf_pred

Сортируем организации по предсказанной релевантности для каждой организации и записываем их в файл

In [53]:
test_pool.sort_values(['query_id', 'relevance'], ascending=[True, False])[['query_id', 'org_id']].to_csv('baseline.csv', index=None)