# Links
- https://www.kaggle.com/c/crowdflower-search-relevance


# Solutions
- 1st place: https://github.com/ChenglongChen/Kaggle_CrowdFlower
- 42nd place: https://github.com/marknagelberg/search-relevance/
- 107th place: https://www.kaggle.com/lancerts/crowdflower-search-relevance/combined

# Discussions
- Александр Дьяконов: https://www.youtube.com/watch?v=kzNJEMR4ltY
- CrowdFlower Winner's Interview: 1st place, Chenglong Chen: https://github.com/ChenglongChen/Kaggle_CrowdFlower

In [2]:
import pandas as pd
import numpy as np
from functools import partial

from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer

from sklearn.tree import DecisionTreeClassifier

from nltk.util import ngrams

In [3]:
%run ds_tools/dstools/ml/metrics.py

In [4]:
%run ds_tools/dstools/ml/ensemble.py

In [5]:
%run ds_tools/dstools/ml/xgboost_tools.py



In [1]:
def preds_to_rank(preds):
    """Bucket continuous predictions into integer ranks.

    Each value is placed with ``np.digitize`` against the fixed cut points
    0, 1.5, 2.5, ..., 7.5: values in [0, 1.5) become 1, [1.5, 2.5) become 2,
    and so on. Values below 0 map to 0 and values >= 7.5 map to 8.

    Returns the array of bin indices, one per prediction.
    """
    thresholds = (0, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
    return np.digitize(preds, thresholds)

In [2]:
def same_terms_count(left, right):
    """Count the distinct terms shared by each aligned pair of collections.

    ``left`` and ``right`` are iterated in lockstep; for every pair the
    result holds the size of the intersection of their term sets. Pairing
    stops at the shorter of the two inputs.
    """
    counts = []
    for lhs_terms, rhs_terms in zip(left, right):
        shared = set(lhs_terms) & set(rhs_terms)
        counts.append(len(shared))
    return counts


def text2ngrams(text):
    """Return the 3-grams of ``text``, each joined into a single string.

    For a string input this yields its overlapping three-character
    substrings, in order.
    """
    trigram_tuples = ngrams(text, 3)
    return list(map(''.join, trigram_tuples))


def query_match(df):
    """Derive n-gram overlap features between the query and product texts.

    Reads the ``query``, ``product_title`` and ``product_description``
    columns of ``df`` (missing values are treated as empty strings) and
    returns a new DataFrame with:

    * ``query_len``, ``title_len``, ``desc_len`` - number of n-grams in
      each text;
    * ``query_ngrams_in_title``, ``query_ngrams_in_desc`` - count of
      distinct n-grams shared between the query and the title/description;
    * ``ratio_title``, ``ratio_description`` - those counts divided by the
      query n-gram count.
    """
    # Small offset keeps the ratios finite when the query yields no n-grams.
    eps = .00001

    q_grams = df['query'].fillna('').map(text2ngrams)
    t_grams = df['product_title'].fillna('').map(text2ngrams)
    d_grams = df['product_description'].fillna('').map(text2ngrams)

    out = pd.DataFrame()
    length_columns = (
        ('query_len', q_grams),
        ('title_len', t_grams),
        ('desc_len', d_grams),
    )
    for column, grams in length_columns:
        out[column] = grams.map(len)

    out['query_ngrams_in_title'] = same_terms_count(q_grams, t_grams)
    out['query_ngrams_in_desc'] = same_terms_count(q_grams, d_grams)

    out['ratio_title'] = out['query_ngrams_in_title'] / (out['query_len'] + eps)
    out['ratio_description'] = out['query_ngrams_in_desc'] / (out['query_len'] + eps)

    return out

In [10]:
def cv_test(est):
    """Cross-validate ``est`` on the training data and print the scores.

    Loads ``train.csv.gz`` (indexed by ``id``), replaces missing values in
    the three text columns with empty strings, and uses
    ``median_relevance`` as the target. Every remaining column except
    ``relevance_variance`` is passed as a feature. The estimator is scored
    with ``qwk_score`` over 3 folds, and the mean and standard deviation of
    the fold scores are printed.
    """
    # Bind the loaded frame to a local name: the result of read_csv used to
    # be discarded, leaving the code below dependent on a global ``df``.
    df = pd.read_csv('train.csv.gz', index_col='id')

    df.fillna({'query': '', 'product_title': '', 'product_description': ''}, inplace=True)

    # Both relevance columns are label-derived and must not leak into features.
    features = df.drop(['median_relevance', 'relevance_variance'], axis=1)
    target = df['median_relevance']

    scores = cross_val_score(
        estimator=est,
        X=features,
        y=target,
        cv=3,
        scoring=qwk_score,
        n_jobs=1,
        verbose=1)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [11]:
def qwk_score(est, features, labels):
    """Score a fitted estimator with quadratic weighted kappa.

    Predicts on ``features``, converts the raw predictions to ranks with
    ``preds_to_rank`` and compares them against ``labels``. The
    ``(estimator, X, y)`` signature lets it be passed as ``scoring`` to
    ``cross_val_score``.
    """
    raw_predictions = est.predict(features)
    ranked = preds_to_rank(raw_predictions)
    return quadratic_weighted_kappa(labels, ranked)

In [12]:
def column_transformer(name):
    """Build a transformer that picks column ``name`` out of a DataFrame.

    The returned ``FunctionTransformer`` calls ``DataFrame.__getitem__`` on
    its input with ``key=name``; input validation is disabled so the
    DataFrame is passed through unchanged.
    """
    select_column = partial(pd.DataFrame.__getitem__, key=name)
    return FunctionTransformer(select_column, validate=False)

# Bag-of-words features: each text column gets its own CountVectorizer
# pipeline, and the three outputs are combined with make_union.
transf1 = make_union(*(
    make_pipeline(column_transformer(text_column), CountVectorizer())
    for text_column in ('query', 'product_title', 'product_description')
))

In [13]:
# Pipeline 1: bag-of-words features (transf1) -> random forest regressor.
# Recorded run:
# mean: 0.48871016400118511, std: 0.0085599656942586966
# cv execution time: 1725.75149703 sec
pl1 = make_pipeline(
    transf1,
    RandomForestRegressor(
        n_estimators=200, min_samples_split=4,
        n_jobs=2, random_state=1, verbose=1,
    ),
)

In [14]:
# Keyword arguments shared by every XGBoostRegressor built below.
# NOTE(review): "num_rounds", "num_es_rounds" and "es_share" are presumably
# consumed by the XGBoostRegressor wrapper loaded via %run (number of rounds
# and early-stopping settings) - confirm against xgboost_tools.py.
# NOTE(review): recent XGBoost releases renamed "reg:linear" and deprecated
# "silent" - confirm against the installed version before upgrading.
xgb_params = dict(
    objective="reg:linear",
    eta=0.01,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.7,
    silent=1,
    max_depth=6,
    num_rounds=10000,
    num_es_rounds=120,
    es_share=.05,
)

In [15]:
# Pipeline 2: bag-of-words features (transf1) -> XGBoost regressor.
# Recorded run:
# mean: 0.455176433113, std: 0.00380420601188
# cv execution time: 313.007899046 sec
pl2 = make_pipeline(transf1, XGBoostRegressor(**xgb_params))

In [16]:
# Pipeline 3: bag-of-words features (transf1) -> ensemble whose single
# intermediate XGBoost model is combined by a depth-2 decision tree.
# Recorded run:
# mean: 0.501266249762, std: 0.0286023436439
# cv execution time: 209.756793976 sec
pl3 = make_pipeline(
    transf1,
    ModelEnsembleRegressor(
        intermediate_estimators=[XGBoostRegressor(**xgb_params)],
        assembly_estimator=DecisionTreeClassifier(max_depth=2),
        ensemble_train_size=1,
    ),
)

In [17]:
# Pipeline 4: hand-crafted n-gram overlap features (query_match) only ->
# random forest regressor.
# Recorded run:
# mean: 0.437250268495, std: 0.0101637687806
# cv execution time: 19.1492609978 sec
pl4 = make_pipeline(
    FunctionTransformer(query_match, validate=False),
    RandomForestRegressor(
        n_estimators=200, min_samples_split=4,
        n_jobs=2, random_state=1, verbose=1,
    ),
)

In [18]:
# Pipeline 5: n-gram overlap features (query_match) combined with the
# bag-of-words features (transf1) -> random forest regressor.
# Recorded run:
# mean: 0.557947182125, std: 0.00811891669329
# cv execution time: 1239.4671309 sec
pl5 = make_pipeline(
    make_union(FunctionTransformer(query_match, validate=False), transf1),
    RandomForestRegressor(
        n_estimators=200, min_samples_split=4,
        n_jobs=2, random_state=1, verbose=1,
    ),
)