# Improving a winning ML model with Target Encoding : MDSF2018.

#### First let's import the necessary Python packages.

In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from stop_words import get_stop_words

import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from category_encoders import *

#### Now let's read the data.

In [2]:
%%time
X_train = pd.read_csv("X_train.csv", index_col=0, error_bad_lines=False)
X_test = pd.read_csv("X_test.csv", index_col=0, error_bad_lines=False)
y_train = pd.read_csv("y_train.csv", index_col=0)

CPU times: user 111 ms, sys: 24 ms, total: 135 ms
Wall time: 133 ms


b'Skipping line 2168: expected 31 fields, saw 33\nSkipping line 4822: expected 31 fields, saw 37\nSkipping line 4859: expected 31 fields, saw 37\nSkipping line 7342: expected 31 fields, saw 37\n'


#### Now let's concatenate the train and test datasets for joint pre-processing.

In [3]:
train_test = pd.concat((X_train, X_test), axis=0)
train_test=train_test.reset_index(drop=True)
train_test.head(3)

Unnamed: 0,images_count,image_width,image_height,image_url,product_description,product_size,material,age,warranty,year,...,product_height,weight,price,category,sub_category_1,sub_category_2,sub_category_3,sub_category_4,product_name,store_name
0,3,3458.0,2552.0,https://d1kvfoyrif6wzg.cloudfront.net/assets/i...,Superbe petit top bustier avec explosion de co...,44.0,100 % polyester,,,,...,,200.0,4.5,mode,"tops, t-shirts, débardeurs femme",,,,Top bustier multicolore,Emmaüs 88 Neufchateau
1,2,2486.0,2254.0,https://d1kvfoyrif6wzg.cloudfront.net/assets/i...,"Radio ITT Océnic Flirt, année 70\nPour déco",,Plastique,,,,...,,1000.0,15.0,mobilier - deco,bibelots et objets déco,,,,Radio ITT Océanic,Communauté Emmaüs Thouars (magasin Parthenay)
2,3,1536.0,1536.0,https://d1kvfoyrif6wzg.cloudfront.net/assets/i...,Veste boléro à manches courtes NÛMPH. Gris chi...,40.0,"Polyester, coton, laine",,,,...,,360.0,16.0,label selection,mode,mode femme,,,,Label Emmaüs Chambéry


###       

### Now let's focus on feature engineering.

#### First let's convert some categorical columns into integers. 

In [4]:
train_test['image_url'] = train_test['image_url'].astype('category').cat.codes
train_test['material'] = train_test['material'].astype('category').cat.codes
train_test['product_name'] = train_test['product_name'].astype('category').cat.codes
train_test['product_size'] = train_test['product_size'].astype('category').cat.codes
train_test['age'] = train_test['age'].astype('category').cat.codes
train_test['warranty'] = train_test['warranty'].astype('category').cat.codes
train_test['color'] = train_test['color'].astype('category').cat.codes
train_test['condition'] = train_test['condition'].astype('category').cat.codes
train_test['wifi'] = train_test['wifi'].astype('category').cat.codes
train_test['vintage'] = train_test['vintage'].astype('category').cat.codes

#### Now let's derive a new set of features from the product_description column.

In [5]:
stop_words = get_stop_words('fr')
nltk_stopwords = stopwords.words('french')
stop_words.extend(nltk_stopwords)

In [6]:
pd_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3),
                                stop_words = stop_words, lowercase=True,
                                max_features=50, binary=True, norm=None,
                                use_idf=False)
pd_tfidf = pd_vectorizer.fit_transform(train_test.product_description)
pd_tfidf_cols = pd_vectorizer.get_feature_names()
pd_temp = pd.DataFrame(data=pd_tfidf.todense(), columns=['tfidf_' + 'pd' + '_' + i for i in pd_tfidf_cols])
train_test = pd.concat([train_test, pd_temp], axis=1)
train_test.drop('product_description', inplace=True, axis=1)
train_test.head(3)

Unnamed: 0,images_count,image_width,image_height,image_url,product_size,material,age,warranty,year,color,...,tfidf_pd_réemploi,tfidf_pd_sein,tfidf_pd_sein librairie,tfidf_pd_solidaire,tfidf_pd_soutenez,tfidf_pd_strong,tfidf_pd_taille,tfidf_pd_vente,tfidf_pd_écrit,tfidf_pd_éditions
0,3,3458.0,2552.0,3238,7,100,-1,-1,,10,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,2486.0,2254.0,4878,-1,1391,-1,-1,,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1536.0,1536.0,5654,5,1448,-1,-1,,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Let's also do the same for the sub_category_1 and sub_category_2 columns.

In [7]:
#custom tokenizer for tfifd
def custom_tokenizer(s):
    return s.split(',')

In [8]:
train_test['sub_category_1'] = train_test['sub_category_1'].astype(str)
sc1_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3),
                                 stop_words = stop_words, lowercase=True,
                                 max_features=50, binary=True, norm=None,
                                 use_idf=False, tokenizer=custom_tokenizer)
sc1_tfidf = sc1_vectorizer.fit_transform(train_test.sub_category_1)
sc1_tfidf_cols = sc1_vectorizer.get_feature_names()
sc1_temp = pd.DataFrame(data=sc1_tfidf.todense(), columns=['tfidf_' + 'sc1' + '_' + i for i in sc1_tfidf_cols])
train_test = pd.concat([train_test, sc1_temp], axis=1)
train_test.drop('sub_category_1', inplace=True, axis=1)

train_test['sub_category_2'] = train_test['sub_category_2'].astype(str)
sc2_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3),
                                 stop_words = stop_words, lowercase=True,
                                 max_features=50, binary=True, norm=None,
                                 use_idf=False)
sc2_tfidf = sc2_vectorizer.fit_transform(train_test.sub_category_2)
sc2_tfidf_cols = sc2_vectorizer.get_feature_names()
sc2_temp = pd.DataFrame(data=sc2_tfidf.todense(), columns=['tfidf_' + 'sc2' + '_' + i for i in sc2_tfidf_cols])
train_test = pd.concat([train_test, sc2_temp], axis=1)
train_test.drop('sub_category_2', inplace=True, axis=1)
train_test.head(3)

Unnamed: 0,images_count,image_width,image_height,image_url,product_size,material,age,warranty,year,color,...,tfidf_sc2_objets déco,tfidf_sc2_objets déco vaisselle,tfidf_sc2_pulls,tfidf_sc2_pulls femme,tfidf_sc2_robes,tfidf_sc2_selection,tfidf_sc2_vaisselle,tfidf_sc2_vendeurs,tfidf_sc2_vintage,tfidf_sc2_vêtements
0,3,3458.0,2552.0,3238,7,100,-1,-1,,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2486.0,2254.0,4878,-1,1391,-1,-1,,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1536.0,1536.0,5654,5,1448,-1,-1,,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Finally let's compute some additional features. 

In [9]:
#count agregate
def get_count(df, field, by_field):
    tmp=df[[by_field]+[field]].copy()
    tmp[field].fillna('xxx', inplace=True)
    tmp = tmp.groupby([by_field]).count()[[field]].reset_index()
    tmp.columns = [i for i in [by_field]]+['count_of_' + field + '_by_' + str(by_field)]
    df = df.merge(tmp, on=by_field, how='left')
    return df


#distinct count agregate
def get_distinct_count(df, field, by_field):
    tmp=df[[by_field]+[field]].copy()
    tmp[field].fillna('xxx', inplace=True)
    tmp = tmp[[by_field]+[field]]
    tmp = tmp.drop_duplicates(inplace=False)
    tmp = tmp.groupby([by_field]).count()[[field]].reset_index()
    tmp.columns = [i for i in [by_field]]+['distinct_count_of_' + field + '_by_' + str(by_field)]
    df = df.merge(tmp, on=by_field, how='left')
    return df

train_test=get_count(train_test, 'product_name', 'store_name')
train_test=get_distinct_count(train_test, "product_name", 'store_name')
train_test['cat_store'] = train_test['category'] + '/' + train_test['store_name']

###       

### Let's now focus on building our xgboost model. 

#### First let's retrieve the train and test datasets from the global dataset.

In [10]:
X_train = train_test.iloc[:X_train.shape[0], :]
X_test = train_test.iloc[X_train.shape[0]:, :]    
X_train.head(3)

Unnamed: 0,images_count,image_width,image_height,image_url,product_size,material,age,warranty,year,color,...,tfidf_sc2_pulls femme,tfidf_sc2_robes,tfidf_sc2_selection,tfidf_sc2_vaisselle,tfidf_sc2_vendeurs,tfidf_sc2_vintage,tfidf_sc2_vêtements,count_of_product_name_by_store_name,distinct_count_of_product_name_by_store_name,cat_store
0,3,3458.0,2552.0,3238,7,100,-1,-1,,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1092,1070,mode/Emmaüs 88 Neufchateau
1,2,2486.0,2254.0,4878,-1,1391,-1,-1,,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45,45,mobilier - deco/Communauté Emmaüs Thouars (mag...
2,3,1536.0,1536.0,5654,5,1448,-1,-1,,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,886,194,label selection/Label Emmaüs Chambéry


#### Now let's cross-validate our new xgboost model. This new model will also apply a target encoder to a group of categorical columns on the fly. This specific pre-processing step will allow us to surpass the winning solution. 

In [11]:
numeric_features = ["images_count", "image_width", "image_height",
                    "year", "product_width", "product_length",
                    "shoe_size", "product_height", "weight", "price"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-999))])

extra_features = ['category', 'store_name', 'cat_store',
                  'sub_category_3', 'sub_category_4', 'brand',
                  'author', 'editor']
extra_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('targetencoder', TargetEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ext', extra_transformer, extra_features)
    ], remainder='passthrough')

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('xgb', xgb.XGBClassifier())])

param_grid = {
    "xgb__n_estimators": [350],
    "xgb__learning_rate": [0.05],
    'xgb__n_jobs':[4],
    'xgb__objective':['multi:softprob'],
    'xgb__colsample_bytree': [1],
    'xgb__verbosity': [0],
    'xgb__subsample': [0.8],
    'xgb__max_depth': [6],
    'xgb__min_child_weight': [7],
    'xgb__lambda': [1],
    'xgb__alpha': [1],
    'preprocessor__ext__targetencoder__min_samples_leaf': [20],
    'preprocessor__ext__targetencoder__smoothing': [0.1]
}

In [12]:
searchCV = GridSearchCV(clf, cv=5, scoring='neg_log_loss', param_grid=param_grid, n_jobs=-1, verbose=1)
searchCV.fit(X_train, np.ravel(y_train))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
             

#### Let's inspect the results :

In [13]:
cv_results_df = pd.DataFrame.from_dict(searchCV.cv_results_)
cv_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__ext__targetencoder__min_samples_leaf,param_preprocessor__ext__targetencoder__smoothing,param_xgb__alpha,param_xgb__colsample_bytree,param_xgb__lambda,param_xgb__learning_rate,...,param_xgb__verbosity,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,119.626819,42.417202,0.27281,0.11854,20,0.1,1,1,1,0.05,...,0,{'preprocessor__ext__targetencoder__min_sample...,-0.891002,-0.93006,-0.919564,-0.907156,-0.898357,-0.90923,0.014112,1


#### We can see that our model has improved the log-loss of the winning model by ~0.01. 