# read in data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('mercari-price-suggestion-challenge/train.tsv', sep='\t')
df_train, df_test = train_test_split(df)

In [2]:
df_train.shape, df_test.shape

((1111901, 8), (370634, 8))

In [3]:
df_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
381548,381548,Dance gymnastics leotard and shorts,3,Sports & Outdoors/Exercise/Dance/Ballet,,13.0,0,Child medium- large. Fits size 10. Like new. U...
87881,87881,Fossil key fob hang tag,3,Women/Women's Accessories/Wallets,Fossil,7.0,1,Euc Green leather hang tag Penny for size Smok...
1411934,1411934,Shopkins valentine target retired,1,Women/Jewelry/Bracelets,,14.0,0,Shopkins exclusive target
841742,841742,AirPort Express,3,Electronics/Computers & Tablets/Networking & C...,Apple,24.0,0,Working conditions Model no A1264
588200,588200,Under Armour Dri Fit Shirt Capt America,3,Kids/Boys (4+)/Top & T-shirts,,16.0,0,Under Armour Fitted heat gear size YSM Don't f...


In [4]:
df_train.describe(include='all')

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
count,1111901.0,1111901,1111901.0,1107161,637059,1111901.0,1111901.0,1111898
unique,,936806,,1260,4446,,,967812
top,,Bundle,,"Women/Athletic Apparel/Pants, Tights, Leggings",PINK,,,No description yet
freq,,1699,,45146,40633,,,61768
mean,741171.7,,1.907756,,,26.74288,0.4470389,
std,428013.4,,0.9031632,,,38.65995,0.4971874,
min,0.0,,1.0,,,0.0,0.0,
25%,370585.0,,1.0,,,10.0,0.0,
50%,741045.0,,2.0,,,17.0,0.0,
75%,1111970.0,,3.0,,,29.0,1.0,


In [5]:
import keras

def make_Xy(df, *, tokenizer=None, num_words=2000, maxlen=35):
    category_ids = {v: i for i, v in enumerate(df.category_name.unique(), start=1)}
    brand_ids = {v: i for i, v in enumerate(df.brand_name.unique(), start=1)}
    df['category_id'] = df.category_name.map(category_ids)
    df['brand_id'] = df.brand_name.map(brand_ids)
    
    df[['category_id', 'brand_id', 'item_condition_id']].fillna(0, inplace=True)
    df['text'] = df.name + ' ' + df.item_description.str.replace('No description yet', '')
    df['text'] = df.text.astype(str)

    X = {
        'category_input': df.category_id,
        'brand_input': df.brand_id,
        'item_condition_input': df.item_condition_id
    }
    y = df.price

    return X, y, tokenizer

X_train, y_train, tokenizer = make_Xy(df_train, num_words=2000, maxlen=35)
X_test, y_test, _ = make_Xy(df_test, tokenizer=tokenizer, num_words=2000, maxlen=35)

Using TensorFlow backend.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
df_train.text.str.count(' ').describe()

count    1.111901e+06
mean     2.896458e+01
std      3.086041e+01
min      0.000000e+00
25%      1.000000e+01
50%      1.900000e+01
75%      3.500000e+01
max      2.460000e+02
Name: text, dtype: float64

In [7]:
df_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_id,brand_id,text
381548,381548,Dance gymnastics leotard and shorts,3,Sports & Outdoors/Exercise/Dance/Ballet,,13.0,0,Child medium- large. Fits size 10. Like new. U...,1,1,Dance gymnastics leotard and shorts Child medi...
87881,87881,Fossil key fob hang tag,3,Women/Women's Accessories/Wallets,Fossil,7.0,1,Euc Green leather hang tag Penny for size Smok...,2,2,Fossil key fob hang tag Euc Green leather hang...
1411934,1411934,Shopkins valentine target retired,1,Women/Jewelry/Bracelets,,14.0,0,Shopkins exclusive target,3,1,Shopkins valentine target retired Shopkins exc...
841742,841742,AirPort Express,3,Electronics/Computers & Tablets/Networking & C...,Apple,24.0,0,Working conditions Model no A1264,4,3,AirPort Express Working conditions Model no A1264
588200,588200,Under Armour Dri Fit Shirt Capt America,3,Kids/Boys (4+)/Top & T-shirts,,16.0,0,Under Armour Fitted heat gear size YSM Don't f...,5,1,Under Armour Dri Fit Shirt Capt America Under ...


# build byte pair encoder

In [8]:
%load_ext Cython

In [45]:
%%cython
def get_stats(list vocab, set removed_indices):
    cdef int i_left, i_right
    cdef dict pairs = {}
    cdef dict indices = {}
    valid_indices = (i for i in range(len(vocab) - 1)
                     if not i in removed_indices)
    i_left = next(valid_indices)
    for i_right in valid_indices:
        pair = vocab[i_left], vocab[i_right]
        if not pair in pairs:
            pairs[pair] = 0
        pairs[pair] += 1
        if not pair in indices:
            indices[pair] = []
        indices[pair].append(i_left)
    return pairs, indices

def merge_vocab(tuple pair, list vocab, list pair_indices, set removed_indices):
    cdef str new = ''.join(pair)
    cdef int i
    for i in pair_indices:
        vocab[i] = new
    removed_indices.update(pair_indices)
    return vocab

In [32]:
# import collections


# def pairwise(iterable):
#     "s -> (s0,s1), (s1,s2), (s2, s3), ..."
#     a, b = itertools.tee(iterable)
#     next(b, None)
#     return zip(a, b)

# def get_stats(vocab, removed_indices):
#     pairs = collections.defaultdict(int)
#     indices = collections.defaultdict(list)
#     valid_indices = (i for i in range(len(vocab) - 1)
#                      if not i in removed_indices)
#     for i_left, i_right in pairwise(valid_indices):
#         pair = vocab[i_left], vocab[i_right]
#         pairs[pair] += 1
#         indices[pair].append(i_left)
#     return pairs, indices

# def merge_vocab(pair, vocab, pair_indices, removed_indices):
#     new = ''.join(pair)
#     for i in reversed(pair_indices):
#         vocab[i] = new
#     removed_indices.update(pair_indices)
#     return vocab

In [60]:
import collections
import sklearn
import itertools
import time
import multiprocessing as mp
import numpy as np


def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    iterable = itertools.zip_longest(*args, fillvalue=fillvalue)
    return ([g for g in group if g] for group in iterable)


class BytePairEncoder(sklearn.base.TransformerMixin):
    def __init__(self, n_merges, n_jobs=None, chunksize=None, log_level=None):
        self.n_merges = n_merges
        self.n_jobs = n_jobs
        self.chunksize = chunksize
        self.log_level = log_level
        self._space_escape = '▁'
        self._unkown_token = 0

    def fit(self, X):
        vocab = list(self._process_X(X))
        initial_vocab = set(vocab)
        removed_indices = set()
        for i in range(self.n_merges):
            if self.log_level is not None and i % self.log_level == 0:
                print(f'{i+1} iterations complete')
            pairs, pair_index = get_stats(vocab, removed_indices)
            best = max(pairs, key=pairs.get)
            vocab = merge_vocab(best, vocab, pair_index[best], removed_indices)

        # reserve 0 for unkowns
        vocab = set(vocab)
        vocab.update(initial_vocab)
        self.vocab = {k: i for i, k in enumerate(vocab, start=1)}
        bpe._reverse_vocab = {v: k for k, v in bpe.vocab.items()}
        self._bpe_tree = build_bpe_tree(self.vocab)

    def transform(self, X):
        X = self._process_X(X)
        tokens = apply_bpe_tree(X, self._bpe_tree)
        return np.array([self._unkown_token if t is None else t for t in tokens])

    def inverse_transform(self, X):
        return [bpe._reverse_vocab[t] if t > 0 else '<unk>' for t in tokens]

    def _process_X(self, X):
         return self._space_escape.join(X.split())    

In [61]:
class Node:
    def __init__(self):
        self.children = {}
        self.index = None

    def __repr__(self):
        return f'Node(index={self.index}, children={self.children})'
    
    def get(self, key, default=None):
        return self.children.get(key, default)
    
    def __getitem__(self, key):
        return self.children[key]
    
    def __setitem__(self, key, value):
        self.children[key] = value
        
    def __contains__(self, key):
        return key in self.children   

def build_bpe_tree(vocab):
    root = Node()
    for word, index in vocab.items():
        current_node = root
        for n, c in enumerate(word, start=1):
            if not c in current_node:
                current_node[c] = Node()
            current_node = current_node[c]
            if n == len(word):
                current_node.index = index
    return root
    
def apply_bpe_tree(text, tree):
    output = []
    last_node = tree
    pos = 0
    while pos <= len(text) - 1:
        node = last_node.get(text[pos])
        if node is None:
            output.append(last_node.index)
            if last_node is not tree:
                last_node = tree
                continue
            node = tree
        last_node = node
        pos += 1
    output.append(last_node.index)
    return output

In [62]:
bpe_text = ' '.join(df_train.item_description.sample(30000))

In [63]:
len(bpe_text)

4401094

In [64]:
bpe = BytePairEncoder(2000, log_level=50)
%time bpe.fit(bpe_text)

1 iterations complete
51 iterations complete
101 iterations complete
151 iterations complete
201 iterations complete
251 iterations complete
301 iterations complete
351 iterations complete
401 iterations complete
451 iterations complete
501 iterations complete
551 iterations complete
601 iterations complete
651 iterations complete
701 iterations complete
751 iterations complete
801 iterations complete
851 iterations complete
901 iterations complete
951 iterations complete
1001 iterations complete
1051 iterations complete
1101 iterations complete
1151 iterations complete
1201 iterations complete
1251 iterations complete
1301 iterations complete
1351 iterations complete
1401 iterations complete
1451 iterations complete
1501 iterations complete
1551 iterations complete
1601 iterations complete
1651 iterations complete
1701 iterations complete
1751 iterations complete
1801 iterations complete
1851 iterations complete
1901 iterations complete
1951 iterations complete
CPU times: user 1h 5s, 

In [65]:
len(bpe.vocab)

372

In [66]:
bpe.vocab

{'\x16': 357,
 '!': 63,
 '!▁': 305,
 '"': 187,
 '#': 370,
 '$': 146,
 '%': 25,
 '%▁': 96,
 '&': 193,
 "'": 92,
 "'▁": 263,
 '(': 269,
 ')': 175,
 '*': 50,
 '*▁': 323,
 '+': 348,
 ',': 3,
 ',▁': 158,
 '-': 298,
 '-▁': 366,
 '.': 18,
 '.▁': 229,
 '/': 59,
 '/▁': 148,
 '0': 116,
 '0▁': 22,
 '1': 302,
 '1▁': 210,
 '2': 170,
 '2▁': 248,
 '3': 307,
 '4': 267,
 '5': 88,
 '6': 179,
 '7': 349,
 '7▁': 33,
 '8': 131,
 '9': 120,
 ':': 74,
 ':▁': 157,
 ';': 100,
 '=': 284,
 '?': 206,
 '@': 38,
 'A': 141,
 'A▁': 85,
 'B': 352,
 'B▁': 197,
 'C': 314,
 'C▁': 340,
 'D': 198,
 'D▁': 313,
 'E': 86,
 'F': 174,
 'F▁': 250,
 'G': 294,
 'G▁': 254,
 'H': 371,
 'I': 337,
 'I▁': 14,
 'J': 311,
 'K': 266,
 'K▁': 39,
 'L': 114,
 'L▁': 261,
 'M': 67,
 'M▁': 112,
 'N': 6,
 'N▁': 205,
 'O': 17,
 'P': 333,
 'P▁': 2,
 'Q': 281,
 'Q▁': 106,
 'R': 293,
 'R▁': 165,
 'S': 272,
 'S▁': 223,
 'T': 228,
 'T▁': 336,
 'U': 221,
 'U▁': 234,
 'V': 82,
 'W': 7,
 'W▁': 140,
 'X': 318,
 'X▁': 160,
 'Y': 145,
 'Z': 117,
 '[': 211,
 '

In [67]:
%time tokens = bpe.transform(bpe_text)
tokens

CPU times: user 5.21 s, sys: 120 ms, total: 5.33 s
Wall time: 5.43 s


array([174, 295, 154, ..., 352, 257, 361])

In [68]:
bpe._reverse_vocab = {v: k for k, v in bpe.vocab.items()}
inv_tokens = [bpe._reverse_vocab[t] if t > 0 else '<unk>' for t in tokens]

In [69]:
inv_tokens[:30]

['F',
 'r',
 'e',
 'e▁',
 's',
 'h',
 'i',
 'p',
 'p',
 'i',
 'n',
 'g▁',
 'L',
 'i',
 'k',
 'e▁',
 'n',
 'e',
 'w▁',
 'L',
 'a',
 'l',
 'i▁',
 'L',
 'a',
 'y',
 'l',
 'a▁',
 'A',
 'r']

In [70]:
''.join(inv_tokens[:30])

'Free▁shipping▁Like▁new▁Lali▁Layla▁Ar'

In [71]:
bpe_text[:100]

'Free shipping Like new Lali Layla Ariel Top, NWT in Quartz. Super sparkly!! No longer sold, rare and'

# add encodings to training data

In [None]:
%time X_train['text_input'] = keras.preprocessing.sequence.pad_sequences(df_train.text.apply(bpe.transform), maxlen=40)
%time X_test['text_input'] = keras.preprocessing.sequence.pad_sequences(df_test.text.apply(bpe.transform), maxlen=40)

# build model

In [None]:
category_input = keras.layers.Input(shape=(1,), name='category_input')
brand_input = keras.layers.Input(shape=(1,), name='brand_input')
item_condition_input = keras.layers.Input(shape=(1,), name='item_condition_input')
text_input = keras.layers.Input(shape=(None,), name='text_input')
inputs = [category_input, brand_input, item_condition_input, text_input]

# categorical feature embeddings
category_embedding = keras.layers.Embedding(
    input_dim=df_train.category_id.nunique()+1,
    output_dim=3, input_length=1)(category_input)

brand_embedding = keras.layers.Embedding(
    input_dim=df_train.brand_id.nunique()+1,
    output_dim=3, input_length=1)(brand_input)

item_condition_embedding = keras.layers.Embedding(
    input_dim=df_train.item_condition_id.nunique()+1,
    output_dim=3, input_length=1)(item_condition_input)

embedding_tensors = [category_embedding, brand_embedding, item_condition_embedding]
x_embeddings = keras.layers.Concatenate()([
    keras.layers.Flatten()(embedding) for embedding in embedding_tensors
])


# text features
import keras.backend as K
Sum = keras.layers.Lambda(lambda x: K.sum(x, axis=1))

def SelfAttention(X):
    dim = K.int_shape(X)[-1]
    q = keras.layers.Dense(dim)(X)
    k = keras.layers.Dense(dim)(X)
    v = keras.layers.Dense(dim)(X)
    w = keras.layers.Dot((2, 2))([q, k])
    w = keras.layers.Softmax(axis=1)(w)
    return keras.layers.Dot((2, 1))([w, v])
    

text_embeddings = keras.layers.Embedding(
    input_dim=len(bpe.vocab)+1, output_dim=5, input_length=None)(text_input)
text_embeddings = keras.layers.SpatialDropout1D(0.4)(text_embeddings)
attention = SelfAttention(text_embeddings)
x_text = Sum(attention)


x = keras.layers.Concatenate()([x_embeddings, x_text])
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(K.int_shape(x)[-1], activation='relu')(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Dense(1, activation='relu')(x)

model = keras.models.Model(inputs=inputs, outputs=x)

In [None]:
model.summary()

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
def rmsle(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(y_pred+1.) - K.log(y_true+1.))))
model.compile(loss=rmsle, optimizer='adam')

In [None]:
model.fit(
    X_train, y_train,
    epochs=25,
    validation_data=(X_test, y_test),
    callbacks=[keras.callbacks.ReduceLROnPlateau(patience=2),
               keras.callbacks.EarlyStopping(patience=3),
               keras.callbacks.TerminateOnNaN()]
)

In [None]:
attention_model = keras.models.Model(inputs=descr_input, outputs=attention)
counties, county_descriptions = df[['county', 'county_description']].drop_duplicates().T.values

# process descriptions through the tokenizer
tokens = [s[:250] for s in tokenizer.texts_to_sequences(county_descriptions)]
county_descriptions = [t.split(' ') for t in tokenizer.sequences_to_texts(tokens)]

attention_scores = attention_model.predict(keras.preprocessing.sequence.pad_sequences(tokens, maxlen=250))
# resize the scores to eliminate redundant axis
attention_scores = descr_attention.reshape(descr_attention.shape[:2])

In [None]:
zipped = zip(counties, descr_texts, attention_scores)
word_importances = [
    (county, tuple([(w, i) for w, i in zip(description, description_importances)]))
    for county, description, description_importances in zipped
]

In [None]:
word_importances

In [None]:
county_word_importances = {
    county: sorted(set(importances), key=lambda x: x[-1], reverse=True)[:10]
     for county, importances in word_importances
}
county_word_importances

In [None]:
county_word_importances['Del Norte County']