# read in data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('mercari-price-suggestion-challenge/train.tsv', sep='\t')
df_train, df_test = train_test_split(df)

In [2]:
df_train.shape, df_test.shape

((1111901, 8), (370634, 8))

In [3]:
df_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
968857,968857,Charlotte Russe Skirt,3,Women/Skirts/Mini,Charlotte Russe,12.0,0,size small in good condition not worn much
13766,13766,Adidas Shadow Tubular,1,Women/Shoes/Athletic,Adidas,96.0,0,Recently released a few days ago. Cream/grey/t...
1006475,1006475,2x dress,2,"Women/Dresses/Above Knee, Mini",Charlotte Russe,15.0,0,Charlotte Russe. Only worn for a couple hours
1175496,1175496,LA Rams vs ATL Falcons 2 PREMIUM SEATS,1,Sports & Outdoors/Fan Shop/NFL,,310.0,1,Pair of Los Angeles Rams vs Atlanta Falcons (2...
163835,163835,LulaRoe Randy XL,1,Women/Tops & Blouses/T-Shirts,,32.0,1,Brand new never worn. Dark purple and navy wit...


In [4]:
df_train.describe(include='all')

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
count,1111901.0,1111901,1111901.0,1107127,637437,1111901.0,1111901.0,1111898
unique,,936401,,1263,4457,,,967722
top,,Bundle,,"Women/Athletic Apparel/Pants, Tights, Leggings",PINK,,,No description yet
freq,,1690,,45073,40663,,,61976
mean,741763.9,,1.907211,,,26.71932,0.4474265,
std,427969.6,,0.9030715,,,38.64842,0.4972286,
min,0.0,,1.0,,,0.0,0.0,
25%,371187.0,,1.0,,,10.0,0.0,
50%,742160.0,,2.0,,,17.0,0.0,
75%,1112531.0,,3.0,,,29.0,1.0,


In [5]:
import keras

def make_Xy(df, *, tokenizer=None, num_words=2000, maxlen=35):
    category_ids = {v: i for i, v in enumerate(df.category_name.unique(), start=1)}
    brand_ids = {v: i for i, v in enumerate(df.brand_name.unique(), start=1)}
    df['category_id'] = df.category_name.map(category_ids)
    df['brand_id'] = df.brand_name.map(brand_ids)
    
    df[['category_id', 'brand_id', 'item_condition_id']].fillna(0, inplace=True)
    df['text'] = df.name + ' ' + df.item_description.str.replace('No description yet', '')
    df['text'] = df.text.astype(str)

    X = {
        'category_input': df.category_id,
        'brand_input': df.brand_id,
        'item_condition_input': df.item_condition_id
    }
    y = df.price

    return X, y, tokenizer

X_train, y_train, tokenizer = make_Xy(df_train, num_words=2000, maxlen=35)
X_test, y_test, _ = make_Xy(df_test, tokenizer=tokenizer, num_words=2000, maxlen=35)

Using TensorFlow backend.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
df_train.text.str.count(' ').describe()

count    1.111901e+06
mean     2.898562e+01
std      3.089176e+01
min      0.000000e+00
25%      1.000000e+01
50%      1.900000e+01
75%      3.500000e+01
max      2.510000e+02
Name: text, dtype: float64

In [20]:
df_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_id,brand_id,text
968857,968857,Charlotte Russe Skirt,3,Women/Skirts/Mini,Charlotte Russe,12.0,0,size small in good condition not worn much,1,1,Charlotte Russe Skirt size small in good condi...
13766,13766,Adidas Shadow Tubular,1,Women/Shoes/Athletic,Adidas,96.0,0,Recently released a few days ago. Cream/grey/t...,2,2,Adidas Shadow Tubular Recently released a few ...
1006475,1006475,2x dress,2,"Women/Dresses/Above Knee, Mini",Charlotte Russe,15.0,0,Charlotte Russe. Only worn for a couple hours,3,1,2x dress Charlotte Russe. Only worn for a coup...
1175496,1175496,LA Rams vs ATL Falcons 2 PREMIUM SEATS,1,Sports & Outdoors/Fan Shop/NFL,,310.0,1,Pair of Los Angeles Rams vs Atlanta Falcons (2...,4,3,LA Rams vs ATL Falcons 2 PREMIUM SEATS Pair of...
163835,163835,LulaRoe Randy XL,1,Women/Tops & Blouses/T-Shirts,,32.0,1,Brand new never worn. Dark purple and navy wit...,5,3,LulaRoe Randy XL Brand new never worn. Dark pu...


# build byte pair encoder

In [6]:
%load_ext Cython

In [7]:
%%cython
def get_stats(list vocab):
    cdef int i
    cdef dict pairs = {}
    cdef dict indices = {}
    for i in range(len(vocab) - 1):
        pair = vocab[i], vocab[i+1]
        if not pair in pairs:
            pairs[pair] = 0
        pairs[pair] += 1
        if not pair in indices:
            indices[pair] = []
        indices[pair].append(i)
    return pairs, indices

def merge_vocab(tuple pair, list vocab, list indices):
    cdef str new = ''.join(pair)
    cdef int i
    for i in reversed(indices):
        vocab[i] = new
        vocab.pop(i+1)
    return vocab

In [10]:
import collections
import sklearn

class BytePairEncoder(sklearn.base.TransformerMixin):
    def __init__(self, n_merges, n_jobs=None):
        self.n_merges = n_merges
        self.n_jobs = n_jobs
        self._space_escape = '▁'
        self._unkown_token = 0

    def fit(self, X):
        vocab = list(self._process_X(X))
        initial_vocab = set(vocab)
        for i in range(self.n_merges):
            if self.n_jobs is None:
                pairs, pair_index = get_stats(vocab)
            else:
                
            best = max(pairs, key=pairs.get)
            vocab = merge_vocab(best, vocab, pair_index[best])

        # reserve 0 for unkowns
        vocab = set(vocab)
        vocab.update(initial_vocab)
        self.vocab = {k: i for i, k in enumerate(vocab, start=1)}
        bpe._reverse_vocab = {v: k for k, v in bpe.vocab.items()}
        self._bpe_tree = build_bpe_tree(self.vocab)

    def transform(self, X):
        X = self._process_X(X)
        tokens = apply_bpe_tree(X, self._bpe_tree)
        return np.array([self._unkown_token if t is None else t for t in tokens])

    def inverse_transform(self, X):
        return [bpe._reverse_vocab[t] if t > 0 else '<unk>' for t in tokens]

    def _process_X(self, X):
         return self._space_escape.join(X.split())
    
    def _build_encoding_map(self):
        pass
    

In [11]:
class Node:
    def __init__(self):
        self.children = {}
        self.index = None

    def __repr__(self):
        return f'Node(index={self.index}, children={self.children})'
    
    def get(self, key, default=None):
        return self.children.get(key, default)
    
    def __getitem__(self, key):
        return self.children[key]
    
    def __setitem__(self, key, value):
        self.children[key] = value
        
    def __contains__(self, key):
        return key in self.children   

def build_bpe_tree(vocab):
    root = Node()
    for word, index in vocab.items():
        current_node = root
        for n, c in enumerate(word, start=1):
            if not c in current_node:
                current_node[c] = Node()
            current_node = current_node[c]
            if n == len(word):
                current_node.index = index
    return root
    
def apply_bpe_tree(text, tree):
    output = []
    last_node = tree
    pos = 0
    while pos <= len(text) - 1:
        node = last_node.get(text[pos])
        if node is None:
            output.append(last_node.index)
            if last_node is not tree:
                last_node = tree
                continue
            node = tree
        last_node = node
        pos += 1
    output.append(last_node.index)
    return output

In [12]:
bpe_text = ' '.join(df_train.item_description.sample(100))

In [13]:
bpe = BytePairEncoder(2000, -1)
%time bpe.fit(bpe_text)

CPU times: user 13.1 s, sys: 85 ms, total: 13.2 s
Wall time: 13.3 s


In [14]:
len(bpe.vocab)

1140

In [15]:
bpe.vocab

{'very▁': 1,
 'back▁': 2,
 'ad': 3,
 '▁re': 4,
 'gg': 5,
 'ard▁': 6,
 "'s▁": 7,
 'for▁looking': 8,
 'clip▁on▁earrings-▁Length': 9,
 'open': 10,
 'clip▁on▁': 11,
 'of▁the▁': 12,
 'NEW▁': 13,
 'great▁condition': 14,
 'No▁description▁yet▁': 15,
 'm▁s': 16,
 'mor': 17,
 'min': 18,
 '2': 19,
 'All▁': 20,
 'ensitiv': 21,
 'wom': 22,
 'e▁with▁': 23,
 'oun': 24,
 's,▁': 25,
 'edress▁#': 26,
 'Thank▁you': 27,
 'or': 28,
 'air▁': 29,
 'a▁few▁times.▁': 30,
 '0%': 31,
 'Soothing▁': 32,
 'your▁p': 33,
 'Comes▁with▁': 34,
 'ed▁in▁': 35,
 'h▁': 36,
 'as▁': 37,
 'mater': 38,
 'great▁': 39,
 "men's▁": 40,
 'peach': 41,
 '"': 42,
 'co': 43,
 'but▁': 44,
 'otherwise▁': 45,
 'urns.▁': 46,
 '&▁': 47,
 'any▁questions▁': 48,
 'lu': 49,
 'ray▁': 50,
 'plus': 51,
 'from▁': 52,
 'ed▁on▁': 53,
 'out▁': 54,
 'gap▁': 55,
 '!▁Th': 56,
 '14': 57,
 'men': 58,
 'ater▁': 59,
 'e,▁': 60,
 'ab': 61,
 'day,▁': 62,
 'for▁iPhone▁': 63,
 'es▁': 64,
 'da': 65,
 'eets▁': 66,
 'ad▁': 67,
 'me▁': 68,
 'S▁': 69,
 'av': 70,
 '2▁x▁

In [16]:
%time tokens = bpe.transform(bpe_text)
tokens

CPU times: user 18.5 ms, sys: 1.07 ms, total: 19.5 ms
Wall time: 19.5 ms


array([ 915,  367, 1069, ...,    4,    0,    0])

In [17]:
bpe._reverse_vocab = {v: k for k, v in bpe.vocab.items()}
inv_tokens = [bpe._reverse_vocab[t] if t > 0 else '<unk>' for t in tokens]
''.join(inv_tokens)[:100]

'Size▁6▁gap▁Jean▁shorts▁New▁All▁new▁black▁phone▁grip▁and▁Stan.▁Never▁drop▁your▁phone,▁take▁better▁sel'

In [18]:
bpe_text[:100]

'Size 6 gap Jean shorts New All new black phone grip and Stan. Never drop your phone, take better sel'

# build model

In [None]:
category_input = keras.layers.Input(shape=(1,), name='category_input')
brand_input = keras.layers.Input(shape=(1,), name='brand_input')
item_condition_input = keras.layers.Input(shape=(1,), name='item_condition_input')
text_input = keras.layers.Input(shape=(None,), name='text_input')
inputs = [category_input, brand_input, item_condition_input, text_input]

# categorical feature embeddings
category_embedding = keras.layers.Embedding(
    input_dim=df_train.category_id.nunique()+1,
    output_dim=3, input_length=1)(category_input)

brand_embedding = keras.layers.Embedding(
    input_dim=df_train.brand_id.nunique()+1,
    output_dim=3, input_length=1)(brand_input)

item_condition_embedding = keras.layers.Embedding(
    input_dim=df_train.item_condition_id.nunique()+1,
    output_dim=3, input_length=1)(item_condition_input)

embedding_tensors = [category_embedding, brand_embedding, item_condition_embedding]
x_embeddings = keras.layers.Concatenate()([
    keras.layers.Flatten()(embedding) for embedding in embedding_tensors
])


# text features
import keras.backend as K
Sum = keras.layers.Lambda(lambda x: K.sum(x, axis=1))

def SelfAttention(X):
    dim = K.int_shape(X)[-1]
    q = keras.layers.Dense(dim)(X)
    k = keras.layers.Dense(dim)(X)
    v = keras.layers.Dense(dim)(X)
    w = keras.layers.Dot((2, 2))([q, k])
    w = keras.layers.Softmax(axis=1)(w)
    return keras.layers.Dot((2, 1))([w, v])
    

text_embeddings = keras.layers.Embedding(
    input_dim=2000, output_dim=5, input_length=None)(text_input)
text_embeddings = keras.layers.SpatialDropout1D(0.4)(text_embeddings)
attention = SelfAttention(text_embeddings)
x_text = Sum(attention)


x = keras.layers.Concatenate()([x_embeddings, x_text])
x = keras.layers.Dense(K.int_shape(x)[-1], activation='relu')(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Dense(1, activation='relu')(x)

model = keras.models.Model(inputs=inputs, outputs=x)

In [None]:
model.summary()

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
def rmsle(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(y_pred+1.) - K.log(y_true+1.))))
model.compile(loss=rmsle, optimizer='adam')

In [None]:
model.fit(
    X_train, y_train,
    epochs=25,
    validation_data=(X_test, y_test),
    callbacks=[keras.callbacks.ReduceLROnPlateau(patience=2),
               keras.callbacks.EarlyStopping(patience=3),
               keras.callbacks.TerminateOnNaN()]
)

In [None]:
attention_model = keras.models.Model(inputs=descr_input, outputs=attention)
counties, county_descriptions = df[['county', 'county_description']].drop_duplicates().T.values

# process descriptions through the tokenizer
tokens = [s[:250] for s in tokenizer.texts_to_sequences(county_descriptions)]
county_descriptions = [t.split(' ') for t in tokenizer.sequences_to_texts(tokens)]

attention_scores = attention_model.predict(keras.preprocessing.sequence.pad_sequences(tokens, maxlen=250))
# resize the scores to eliminate redundant axis
attention_scores = descr_attention.reshape(descr_attention.shape[:2])

In [None]:
zipped = zip(counties, descr_texts, attention_scores)
word_importances = [
    (county, tuple([(w, i) for w, i in zip(description, description_importances)]))
    for county, description, description_importances in zipped
]

In [None]:
word_importances

In [None]:
county_word_importances = {
    county: sorted(set(importances), key=lambda x: x[-1], reverse=True)[:10]
     for county, importances in word_importances
}
county_word_importances

In [None]:
county_word_importances['Del Norte County']