In [1]:
import pandas as pd
import numpy as np

train_file = './train.tsv'

train = pd.read_csv(train_file, sep='\t')

In [2]:
from sklearn.preprocessing import LabelEncoder

def categorinizer(df, 
                  col_lists=[
                      'brand_name', 
                      'general_cat', 
                      'subcat_1', 
                      'subcat_2'
                  ]):
    for col in col_lists:
        df[col] = \
            df[col].apply(lambda x: str(x))
        encoder = LabelEncoder()
        encoder.fit(df[col])
        df[col] = encoder.transform(df[col])
        del encoder
    
    return df

def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")

train['general_cat'], train['subcat_1'], train['subcat_2'] = \
    zip(*train['category_name'].apply(lambda x: split_cat(x)))

# remove missing values in item description
train = train[pd.notnull(train['item_description'])]

train['item_description'] = train['name'] + ' ' + train['item_description']
train_data = categorinizer(train)
train_data = train_data.drop(columns=['name', 'category_name', 'train_id'])

In [3]:
def one_hot_encoder(alphabet):
    encoder_dict = {}
    encoder = []
    
    encoder_dict['UNK'] = 0
    encoder.append(np.zeros(len(alphabet), dtype='float32'))
    
    for i, alpha in enumerate(alphabet):
        onehot = np.zeros(len(alphabet), dtype='float32')
        encoder_dict[alpha] = i + 1
        onehot[i] = 1
        encoder.append(onehot)
    
    encoder = np.array(encoder, dtype='float32')
    return encoder, encoder_dict

def doc_process(desc, e_dict, l=256):
        desc = desc.strip().lower()
        min_len = min(l, len(desc))
        doc_vec = np.zeros(l, dtype='int64')
        for j in range(min_len):
            if desc[j] in e_dict:
                doc_vec[j] = e_dict[desc[j]]
            else:
                doc_vec[j] = e_dict['UNK']
        return doc_vec

def make_alphabet(alstr='abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}]'):
    return [char for char in alstr]

alphabet = make_alphabet()
encoder, e_dict = one_hot_encoder(alphabet)

In [4]:
desc = train['item_description'][1]
print(doc_process(desc, e_dict))
print(encoder.shape)

[18  1 26  5 18 58  2 12  1  3 11 23  9  4 15 23 58  3  8 18 15 13  1 58 11
  5 25  2 15  1 18  4 58 20  8  9 19 58 11  5 25  2 15  1 18  4 58  9 19 58]
(73, 72)


In [90]:
import mxnet as mx
from mxnet import gluon
from mxnet.gluon.loss import Loss

def embedding_lookup(params, ids):
    # ids must be 2-D tensor [batch_size, encoded_words]
    vecs = []
    for i in ids:
        vec = []
        for j in i:
            vec.append(params[j])
        vecs.append(vec)
    return list(reversed(vecs))

class RMSLE(Loss):
    def __init__(self, weight=1., batch_axis=0, **kwargs):
        super(RMSLE, self).__init__(weight, batch_axis, **kwargs)

    def hybrid_forward(self, F, pred, label, sample_weight=None):
#         label = super()._reshape_like(F, label, pred)
        loss = F.square(F.log(pred+1) - F.log(label+1))
        loss = _apply_weighting(F, loss, self._weight/2, sample_weight)
        return F.mean(loss, axis=self._batch_axis, exclude=True)


def CharNet():
    "See Zhang and LeCun, 2015"
    
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Conv1D(256, 7, activation='relu'))
        net.add(gluon.nn.MaxPool1D(3, 1))
        net.add(gluon.nn.Conv1D(256, 7, activation='relu'))
        net.add(gluon.nn.MaxPool1D(3, 1))
        net.add(gluon.nn.Conv1D(256, 3, activation='relu'))
        net.add(gluon.nn.Conv1D(256, 3, activation='relu'))
        net.add(gluon.nn.Conv1D(256, 3, activation='relu'))
        net.add(gluon.nn.Conv1D(256, 3, activation='relu'))
        net.add(gluon.nn.MaxPool1D(3, 1))
        net.add(gluon.nn.Flatten())
        net.add(gluon.nn.Dense(1024, activation="relu"))
        net.add(gluon.nn.Dropout(0.5))
        net.add(gluon.nn.Dense(1024, activation="relu"))
        net.add(gluon.nn.Dropout(0.5))
        net.add(gluon.nn.Dense(1))
        
    return net

ctx = mx.cpu()
model = CharNet()
model.initialize(ctx=ctx)
batch_size = 128
rmsle = RMSLE()
trainer = gluon.Trainer(model.collect_params(), 'adam', {'wd': 1e-4})

In [None]:
X = train_data['item_description'].apply(lambda x: doc_process(x, e_dict))
X = np.array([x.tolist() for x in X]).tolist()
X = embedding_lookup(encoder, X)

y = train_data.price.values.tolist()

training = gluon.data.DataLoader(gluon.data.ArrayDataset(X, y),
                            batch_size=batch_size, shuffle=True)

In [None]:
from mxnet import autograd
epochs = 10
loss_sequence = []

for e in range(epochs):
    cumulative_loss = 0
    # inner loop
    for i, (data, label) in enumerate(training):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = model(data)
            loss = rmsle(output, label)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += nd.mean(loss).asscalar()
    print("Epoch %s, loss: %s" % (e, cumulative_loss / num_examples))
    loss_sequence.append(cumulative_loss)

In [81]:
print(model)

Sequential(
  (0): Conv1D(None -> 256, kernel_size=(7,), stride=(1,))
  (1): MaxPool1D(size=(3,), stride=(1,), padding=(0,), ceil_mode=False)
  (2): Conv1D(None -> 256, kernel_size=(7,), stride=(1,))
  (3): MaxPool1D(size=(3,), stride=(1,), padding=(0,), ceil_mode=False)
  (4): Conv1D(None -> 256, kernel_size=(3,), stride=(1,))
  (5): Conv1D(None -> 256, kernel_size=(3,), stride=(1,))
  (6): Conv1D(None -> 256, kernel_size=(3,), stride=(1,))
  (7): Conv1D(None -> 256, kernel_size=(3,), stride=(1,))
  (8): MaxPool1D(size=(3,), stride=(1,), padding=(0,), ceil_mode=False)
  (9): Flatten
  (10): Dense(None -> 1024, Activation(relu))
  (11): Dropout(p = 0.5)
  (12): Dense(None -> 1024, Activation(relu))
  (13): Dropout(p = 0.5)
  (14): Dense(None -> 1, linear)
)


In [82]:
mx.nd.array(X[3]).shape

(50, 72)