# Links
- https://www.kaggle.com/c/home-depot-product-search-relevance/
    
# Discussions
- https://www.youtube.com/watch?v=LJH6tnN0WDE

In [6]:
from functools import partial

import pandas as pd
import numpy as np
import re
from nltk import SnowballStemmer
from nltk.corpus import stopwords
from gensim import matutils
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_union, make_pipeline
from gensim.models import Word2Vec
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Executes the helper script inside this notebook's namespace. Presumably it
# defines XGBoostRegressor (and possibly the `rmse` scorer) used further down
# -- TODO confirm against ds_tools.
%run ds_tools/dstools/ml/xgboost_tools.py



In [10]:
# Executes hdsr.py inside this notebook's namespace. Presumably it defines the
# custom transformers/tokenizers referenced below (W2VTransformer,
# StopwordTokenizer, QueryMatch*Transformer, CleanupStemTokenizer) -- TODO confirm.
%run hdsr.py

In [1]:
def dataset(query_file,
            product_file='product_descriptions.csv.gz',
            attr_file='attributes.csv.gz'):
    """Load a query file and join product descriptions, brand and attributes.

    Parameters
    ----------
    query_file : str
        CSV with an ``id`` column (used as index) plus ``product_uid``,
        ``search_term`` and ``product_title``.
    product_file : str, optional
        CSV with ``product_uid`` and ``product_description`` columns.
    attr_file : str, optional
        CSV with ``product_uid``, ``name`` and ``value`` columns.

    Returns
    -------
    pandas.DataFrame
        The query rows (indexed by ``id``) extended with
        ``product_description``, ``brand`` and ``attrs`` columns.  Missing
        values are replaced by the string ``'none'``.
    """
    csv_encoding = 'ISO-8859-1'

    # `str is bytes` is only true on Python 2, where the original code turned
    # unicode into utf-8 byte strings.  On Python 3 the same call would yield
    # `bytes`, which never equals "MFG Brand Name" and cannot be ' '.join()-ed,
    # so text columns are left as `str` there.
    encode_text = str is bytes

    def _text(column):
        return column.str.encode('utf-8') if encode_text else column

    q_df = pd.read_csv(query_file, encoding=csv_encoding, index_col='id')
    for col in ('search_term', 'product_title'):
        q_df[col] = _text(q_df[col])

    prod_df = pd.read_csv(product_file, encoding=csv_encoding, index_col='product_uid')
    prod_df['product_description'] = _text(prod_df['product_description'])

    # Rows with any missing field (including product_uid) are discarded.
    df_attr = pd.read_csv(attr_file, encoding=csv_encoding).dropna()
    df_attr['product_uid'] = df_attr['product_uid'].astype(int)
    for col in ('name', 'value'):
        df_attr[col] = _text(df_attr[col])

    df_brand = df_attr.loc[df_attr['name'] == "MFG Brand Name", ["product_uid", "value"]]\
        .rename(columns={"value": "brand"}).set_index("product_uid")
    # Keep one brand per product: a non-unique right index would duplicate
    # query rows (and therefore ids) in the left merge below.
    df_brand = df_brand[~df_brand.index.duplicated(keep='first')]

    # All attribute values of a product concatenated into one text field.
    gr_attr_df = df_attr[["product_uid", "value"]].groupby('product_uid')\
        .agg(lambda x: ' '.join(x)).rename(columns={"value": "attrs"})

    df = q_df.merge(prod_df, left_on='product_uid', right_index=True, how='left')
    df = df.merge(df_brand, left_on='product_uid', right_index=True, how='left')
    df = df.merge(gr_attr_df, left_on='product_uid', right_index=True, how='left')

    # Products without description/brand/attributes get the placeholder 'none'.
    df = df.fillna('none')

    return df

In [3]:
def cv_test(est):
    """Cross-validate a ``(transformer, estimator)`` pair on the training set.

    The transformer is fitted on the complete training frame once, before the
    folds are split; only the estimator is refitted inside each of the 3
    folds.  The mean and standard deviation of the fold scores are printed.
    """
    train_df = dataset('train.csv.gz')

    transformer, model = est

    target = train_df.relevance
    raw_features = train_df.drop('relevance', axis=1)
    feature_matrix = transformer.fit_transform(raw_features, target)

    fold_scores = cross_val_score(
        estimator=model, X=feature_matrix, y=target,
        cv=3, n_jobs=1, verbose=1, scoring=rmse)

    summary = 'mean: {mean}, std: {std}'.format(
        mean=fold_scores.mean(), std=fold_scores.std())
    print(summary)

In [4]:
def submission(est, name='results'):
    """Fit a ``(transformer, estimator)`` pair on train and write test predictions.

    Predictions are limited to the interval [1, 3] and saved to
    ``<name>.csv`` with the test ids under an ``id`` column and the
    predictions under ``relevance``.
    """
    train_df = dataset('train.csv.gz')
    X_train = train_df.drop(['relevance'], axis=1)
    y_train = train_df.relevance

    transformer, regressor = est
    fitted = make_pipeline(transformer, regressor).fit(X_train, y_train)

    test_df = dataset('test.csv.gz')

    # Limit predictions to the [1, 3] interval.
    predictions = np.clip(fitted.predict(test_df), 1, 3)

    result = pd.Series(predictions, index=test_df.index, name='relevance')
    result.to_csv(name + '.csv', index_label='id', header=True)

In [7]:
def column_transformer(name):
    """Return a stateless transformer that picks column ``name`` out of a DataFrame."""
    # Equivalent to calling X[name] on whatever frame the transformer receives.
    select_column = partial(pd.DataFrame.__getitem__, key=name)
    return FunctionTransformer(func=select_column, validate=False)

def count_vec():
    """Return a fresh CountVectorizer that drops NLTK's English stop words."""
    english_stopwords = stopwords.words("english")
    return CountVectorizer(stop_words=english_stopwords)

# Term counts of the three text columns, computed by independent vectorizers
# and concatenated side by side.
transf_count = make_union(*(
    make_pipeline(column_transformer(text_column), count_vec())
    for text_column in ('search_term', 'product_title', 'product_description')
))

In [11]:
# Feature transformer built from W2VTransformer with a StopwordTokenizer.
# Neither class is defined in this notebook; presumably both come from the
# `%run hdsr.py` cell -- TODO confirm.
transf_wv = W2VTransformer(tokenizer=StopwordTokenizer())

In [12]:
# Concatenates the count-vectorizer features with the W2VTransformer features.
transf3 = make_union(transf_count, transf_wv)

In [13]:
def tfidf_vec():
    """Return a fresh TfidfVectorizer that drops NLTK's English stop words."""
    english_stopwords = stopwords.words("english")
    return TfidfVectorizer(stop_words=english_stopwords)

def tsvd(n_components=10, random_state=42):
    """Return a fresh TruncatedSVD used to compress sparse text features.

    Parameters
    ----------
    n_components : int, optional
        Number of output dimensions (default 10, the value previously
        hard-coded).
    random_state : int or None, optional
        Seed for the decomposition.  A fixed default makes the resulting
        features reproducible between runs; pass ``None`` to get the former
        unseeded behaviour.
    """
    return TruncatedSVD(n_components=n_components, random_state=random_state)

# TF-IDF of each text column reduced by tsvd(), one independent pipeline per
# column, concatenated side by side.
transf_tfidf = make_union(*(
    make_pipeline(column_transformer(text_column), tfidf_vec(), tsvd())
    for text_column in ('search_term', 'product_title', 'product_description')
))

In [14]:
# QueryMatchTransformer is not defined in this notebook; presumably it comes
# from the `%run hdsr.py` cell -- TODO confirm what features it emits.
transf_qm = QueryMatchTransformer()

In [15]:
# Converts the selected column into a list of one-entry record dicts, the
# input format expected by DictVectorizer.
col2dict = FunctionTransformer(
    func=lambda column: pd.DataFrame(column).to_dict(orient='records'),
    validate=False,
)

# Brand column -> record dicts -> DictVectorizer encoding.
transf_br = make_pipeline(column_transformer('brand'), col2dict, DictVectorizer())

In [16]:
# Concatenates the query-match features with the vectorized brand features.
transf5 = make_union(transf_qm, transf_br)

In [17]:
# Same TF-IDF + tsvd() recipe as used for the description-based union, but
# applied to the brand column instead of the product description.
transf_tfidf2 = make_union(*(
    make_pipeline(column_transformer(text_column), tfidf_vec(), tsvd())
    for text_column in ('search_term', 'product_title', 'brand')
))

In [18]:
# Query-match features + vectorized brand + reduced TF-IDF (term/title/brand).
transf7 = make_union(transf_qm, transf_br, transf_tfidf2)

In [19]:
# QueryMatchScoreTransformer is not defined in this notebook; presumably it
# comes from the `%run hdsr.py` cell -- TODO confirm what features it emits.
transf_qms = QueryMatchScoreTransformer()

In [20]:
def tfidf_vec2():
    """Return a fresh TfidfVectorizer whose analyzer is CleanupStemTokenizer.tokenize."""
    # A new tokenizer instance is created for every vectorizer.
    analyzer = CleanupStemTokenizer().tokenize
    return TfidfVectorizer(analyzer=analyzer)

# TF-IDF with the CleanupStemTokenizer analyzer, reduced by tsvd(), one
# independent pipeline per column, concatenated side by side.
transf_tfidf3 = make_union(*(
    make_pipeline(column_transformer(text_column), tfidf_vec2(), tsvd())
    for text_column in ('search_term', 'product_title', 'brand')
))

In [21]:
# Concatenates the query-match, query-match-score and W2VTransformer features.
transf10 = make_union(transf_qm, transf_qms, transf_wv)

In [22]:
# QueryMatchAttrTransformer is not defined in this notebook; presumably it
# comes from the `%run hdsr.py` cell -- TODO confirm what features it emits.
transf_qma = QueryMatchAttrTransformer()

In [23]:
# Concatenates the query-match features with the query-match-attr features.
transf11 = make_union(transf_qm, transf_qma)

In [24]:
# Largest feature set in this notebook: every custom query-match transformer,
# the W2VTransformer features, the vectorized brand and the stemmed TF-IDF union.
transf12 = make_union(
    transf_qm, transf_qma, transf_qms,
    transf_wv, transf_br, transf_tfidf3,
)

In [25]:
# Settings handed to the XGBoostRegressor wrapper. `num_rounds`,
# `num_es_rounds` and `es_share` are wrapper-specific (presumably boosting
# rounds and early-stopping controls -- confirm in xgboost_tools.py).
xgb_params = dict(
    objective="reg:linear",
    eta=0.01,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.7,
    silent=1,
    max_depth=6,
    num_rounds=10000,
    num_es_rounds=120,
    es_share=0.05,
)

est_xgb = XGBoostRegressor(**xgb_params)

In [26]:
# Variant of the first XGBoost configuration: identical except that
# `subsample` and `colsample_bytree` are 0.6 instead of 0.7.
xgb_params2 = dict(
    objective="reg:linear",
    eta=0.01,
    min_child_weight=6,
    subsample=0.6,
    colsample_bytree=0.6,
    silent=1,
    max_depth=6,
    num_rounds=10000,
    num_es_rounds=120,
    es_share=0.05,
)

est_xgb2 = XGBoostRegressor(**xgb_params2)

In [27]:
# Random-forest baseline. `random_state` is fixed so that repeated runs build
# the same forest and CV scores are comparable between sessions.
est_rf = RandomForestRegressor(
    n_estimators=500,
    n_jobs=2,
    max_features=10,
    max_depth=20,
    random_state=42,
)

In [28]:
# pl1: count-vectorizer features (transf_count) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.524247008053, std: 0.0133952508428
# cv execution time: 5424.41613007 sec
pl1 = transf_count, est_xgb

In [29]:
# pl2: W2VTransformer features (transf_wv) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.531791158177, std: 0.00301742594142
# cv execution time: 273.821596146 sec
pl2 = transf_wv, est_xgb

In [30]:
# pl3: count-vectorizer + W2VTransformer features (transf3) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.51385278959, std: 0.0149009902673
# cv execution time: 4414.65872407 sec
pl3 = transf3, est_xgb

In [31]:
# pl4: reduced TF-IDF features (transf_tfidf) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.512169639091, std: 0.0127859600449
# cv execution time: 969.822750092 sec
pl4 = transf_tfidf, est_xgb

In [32]:
# pl5: query-match + vectorized brand features (transf5) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.495558379131, std: 0.00875619528366
# cv execution time: 1017.98022795 sec
pl5 = transf5, est_xgb

In [33]:
# pl6: transf7 features paired with the random forest (est_rf).
# Recorded result (presumably printed by cv_test) -- mean: 0.526997056765, std: 0.0113952020154
# cv execution time: 467.067214966 sec
pl6 = transf7, est_rf

In [34]:
# pl7: transf7 features paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.474707501422, std: 0.0129297984942
# cv execution time: 2155.61947298 sec
pl7 = transf7, est_xgb

In [35]:
# pl8: query-match-score features only (transf_qms) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.511426873236, std: 0.00848039053981
# cv execution time: 673.606554031 sec
pl8 = transf_qms, est_xgb

In [36]:
# pl9: query-match features only (transf_qm) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.498169906059, std: 0.00804649346606
# cv execution time: 329.776066065 sec
pl9 = transf_qm, est_xgb

In [37]:
# pl10: query-match + query-match-score + W2VTransformer features (transf10) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.491104751269, std: 0.00663380880545
# cv execution time: 1032.76950693 sec
pl10 = transf10, est_xgb

In [38]:
# pl11: query-match features only (transf_qm) paired with the second XGBoost config (est_xgb2).
# Recorded result (presumably printed by cv_test) -- mean: 0.498106474773, std: 0.00859122009882
# cv execution time: 345.627609015 sec
pl11 = transf_qm, est_xgb2

In [39]:
# pl12: query-match-attr features only (transf_qma) paired with est_xgb.
# Recorded result (presumably printed by cv_test) -- mean: 0.514115117917, std: 0.00339352964921
# cv execution time: 221.978795052 sec
pl12 = transf_qma, est_xgb

In [40]:
# pl13: the full feature union (transf12) paired with the second XGBoost config (est_xgb2).
# Lowest recorded mean among the pipelines listed in this notebook.
# Recorded result (presumably printed by cv_test) -- mean: 0.472303356125, std: 0.0111595095046
# cv execution time: 2385.81992888 sec
pl13 = transf12, est_xgb2