In [56]:
from collections import Counter
from fastcache import clru_cache as lru_cache
import numpy as np
from nltk.stem.porter import PorterStemmer
import os
import pandas as pd
from time import time
import tensorflow as tf
import re

In [57]:
# 正则匹配
whitespace = re.compile(r'\s+')
non_letter = re.compile(r'\W+')

In [58]:
stemmer = PorterStemmer()

@lru_cache(1024)
def stem(s):
    return stemmer.stem(s)

def tokenize(text):
    """提取词干"""
    text = text.lower()
    text = non_letter.sub(' ', text)

    tokens = []

    for t in text.split():
        t = stem(t)
        tokens.append(t)

    return tokens

In [59]:
def paths(tokens):
    all_paths = ['/'.join(tokens[0:(i+1)]) for i in range(len(tokens))]
    return ' '.join(all_paths)

@lru_cache(1024)
def cat_process(cat):
    """category不同层级类目串联
    
    eg: 输入 a/b/c， 输出 "a a/b a/b/c"
    """
    cat = cat.lower()
    cat = whitespace.sub('', cat)
    split = cat.split('/')

    return paths(split)

In [60]:
class Tokenizer:
    def __init__(self, min_df=10, tokenizer=str.split):
        self.min_df = min_df
        self.tokenizer = tokenizer
        self.doc_freq = None
        self.vocab = None
        self.vocab_idx = None
        self.max_len = None

    def fit_transform(self, texts):
        """输入sentence数组，输出二维数组"""
        tokenized = []
        doc_freq = Counter()
        n = len(texts)

        # 收集sentence list中的词语频次
        for text in texts:
            sentence = self.tokenizer(text)
            tokenized.append(sentence)
            doc_freq.update(set(sentence))

        # 生成词表
        vocab = sorted([t for (t, c) in doc_freq.items() if c >= self.min_df])
        vocab_idx = {t: (i + 1) for (i, t) in enumerate(vocab)}
        doc_freq = [doc_freq[t] for t in vocab]

        self.doc_freq = doc_freq
        self.vocab = vocab
        self.vocab_idx = vocab_idx

        # 将sentence list中的元素转换为数字
        max_len = 0
        result_list = []
        for text in tokenized:
            text = self.text_to_idx(text)
            max_len = max(max_len, len(text))
            result_list.append(text)

        # 每一个一维数组等长的二维数组
        self.max_len = max_len
        result = np.zeros(shape=(n, max_len), dtype=np.int32)
        for i in range(n):
            text = result_list[i]
            result[i, :len(text)] = text

        return result    

    def text_to_idx(self, tokenized):
        return [self.vocab_idx[t] for t in tokenized if t in self.vocab_idx]

    def transform(self, texts):
        """输出类似fit_transform的二维数组"""
        n = len(texts)
        result = np.zeros(shape=(n, self.max_len), dtype=np.int32)

        count = 0
        for text_raw in texts:
            text = self.tokenizer(text_raw)
            text = self.text_to_idx(text)[:self.max_len]
            result[count, :len(text)] = text
            count += 1

        return result
    
    def vocabulary_size(self):
        return len(self.vocab) + 1

In [61]:
print('reading data...')
train_ratio = 0.7
valid_ratio = 0.2
test_ratio = 0.1 # 暴露test数据集，方便后续拓展

df = pd.read_csv('../input/train.tsv', sep='\t')
df = df[df.price != 0].reset_index(drop=True)
df_index = df.index

# train
df_train = df.sample(int(train_ratio * len(df)))
train_index = df_train.index
print("len(df_train) is {}".format(len(df_train)))

# minus train
residue_index = df_index.difference(train_index)
df_residue = df.loc[residue_index]

# valid
valid_ratio_in_residue = float(valid_ratio) / (valid_ratio + test_ratio)
df_valid = df_residue.sample(int(valid_ratio_in_residue * len(df_residue)))
valid_index = df_valid.index
print("len(df_valid) is {}".format(len(df_valid)))

# test
test_index = residue_index.difference(valid_index)
df_test = df_residue.loc[test_index]
print("len(df_test) is {}".format(len(df_test)))

reading data...
len(df_train) is 1037162
len(df_valid) is 296332
len(df_test) is 148167


In [62]:
def get_label(df):
    """生成label"""
    price = df.pop('price')
    
    # 平滑处理
    y = np.log1p(price.values)
    y = (y - y.mean()) / y.std()
    
    return y.reshape(-1, 1)

y_train = get_label(df_train)
y_valid = get_label(df_valid)
y_test = get_label(df_test)

In [63]:
df_train.name.fillna('unkname', inplace=True)
df_train.category_name.fillna('unk_cat', inplace=True)
df_train.brand_name.fillna('unk_brand', inplace=True)
df_train.item_description.fillna('nodesc', inplace=True)

df_valid.name.fillna('unkname', inplace=True)
df_valid.category_name.fillna('unk_cat', inplace=True)
df_valid.brand_name.fillna('unk_brand', inplace=True)
df_valid.item_description.fillna('nodesc', inplace=True)

df_test.name.fillna('unkname', inplace=True)
df_test.category_name.fillna('unk_cat', inplace=True)
df_test.brand_name.fillna('unk_brand', inplace=True)
df_test.item_description.fillna('nodesc', inplace=True)

In [74]:
df_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
592471,592781,Size 2 Womens Bundle,2,Women/Pants/Dress Pants,old navy,0,1 pair of capris- Rockies Jeans Brand. Midrise...
881053,881541,Federal bowl,3,Vintage & Collectibles/Housewares/Bowl,unk_brand,0,Vintage federal mixing bowl. milk glass yellow...
1376697,1377503,Victoria secret pink long sleeve,2,Women/Tops & Blouses/T-Shirts,pink,1,No rips or tears Really soft lettering on it S...
1411685,1412514,H&M basics bundle,2,Women/Tops & Blouses/T-Shirts,h&m,1,-free ship- Includes 2 basic tees from H&M in ...
1191720,1192406,Feminist Pin,1,Women/Women's Accessories/Other,unk_brand,1,"Handmade. One of a kind button that reads ""I'm..."


In [64]:
# 每一个category、title、description都可以类比成一个句子，因而需要用Tokenizer生成二维数组
print('processing category...')

cat_tok = Tokenizer(min_df=50)

# 不同类目下的相同词语代表的含义不同比如nike/fashionsneakers和shoes/fashionsneakers的含义不同
df_train_category_name = df_train.category_name.apply(cat_process)
X_cat = cat_tok.fit_transform(df_train_category_name)
cat_voc_size = cat_tok.vocabulary_size()

df_valid_category_name = df_valid.category_name.apply(cat_process)
X_cat_valid = cat_tok.transform(df_valid_category_name)

df_test_category_name = df_test.category_name.apply(cat_process)
X_cat_test = cat_tok.transform(df_test_category_name)

processing category...


In [65]:
print('processing title...')

name_tok = Tokenizer(min_df=10, tokenizer=tokenize)
X_name = name_tok.fit_transform(df_train.name)
name_voc_size = name_tok.vocabulary_size()

X_name_valid = name_tok.transform(df_valid.name)
X_name_test = name_tok.transform(df_test.name)

processing title...


In [66]:
print('processing description...')

desc_num_col = 40
desc_tok = Tokenizer(min_df=50, tokenizer=tokenize)

X_desc = desc_tok.fit_transform(df_train.item_description)
X_desc_valid = desc_tok.transform(df_valid.item_description)
X_desc_test = desc_tok.transform(df_test.item_description)

X_desc = X_desc[:, :desc_num_col] # 截断长尾特征
X_desc_valid = X_desc_valid[:, :desc_num_col]
X_desc_test = X_desc_test[:, :desc_num_col]

desc_voc_size = desc_tok.vocabulary_size()

processing description...


In [67]:
print('processing brand...')

df_train.brand_name = df_train.brand_name.str.lower().replace(' ', '_')
df_valid.brand_name = df_valid.brand_name.str.lower().replace(' ', '_')
df_test.brand_name = df_test.brand_name.str.lower().replace(' ', '_')

# 生成词表
brand_cnt = Counter(df_train.brand_name[df_train.brand_name != 'unk_brand'])
brands = sorted(b for (b, c) in brand_cnt.items() if c >= 50) # 词表
brands_idx = {b: (i + 1) for (i, b) in enumerate(brands)} # 单词编号

X_brand = df_train.brand_name.apply(lambda b: brands_idx.get(b, 0))
X_brand_valid = df_valid.brand_name.apply(lambda b: brands_idx.get(b, 0))
X_brand_test = df_test.brand_name.apply(lambda b: brands_idx.get(b, 0))

X_brand = X_brand.values.reshape(-1, 1) 
X_brand_valid = X_brand_valid.values.reshape(-1, 1) 
X_brand_test = X_brand_test.values.reshape(-1, 1)

brand_voc_size = len(brands) + 1

processing brand...


In [68]:
print('processing other features...')

X_item_cond = (df_train.item_condition_id - 1).astype('uint8').values.reshape(-1, 1)
X_item_cond_valid = (df_valid.item_condition_id - 1).astype('uint8').values.reshape(-1, 1)
X_item_cond_test = (df_test.item_condition_id - 1).astype('uint8').values.reshape(-1, 1)

X_shipping = df_train.shipping.astype('float32').values.reshape(-1, 1)
X_shipping_valid = df_valid.shipping.astype('float32').values.reshape(-1, 1)
X_shipping_test = df_test.shipping.astype('float32').values.reshape(-1, 1)

processing other features...


In [69]:
def conv1d(inputs, num_filters, filter_size, padding='same'):
    he_std = np.sqrt(2 / (filter_size * num_filters))
    out = tf.layers.conv1d(
        inputs=inputs, filters=num_filters, padding=padding,
        kernel_size=filter_size,
        activation=tf.nn.relu, 
        kernel_initializer=tf.random_normal_initializer(stddev=he_std))
    return out

def dense(X, size, reg=0.0, activation=None):
    he_std = np.sqrt(2 / int(X.shape[1]))
    out = tf.layers.dense(X, units=size, activation=activation, 
                     kernel_initializer=tf.random_normal_initializer(stddev=he_std),
                     kernel_regularizer=tf.contrib.layers.l2_regularizer(reg))
    return out

def embed(inputs, size, dim):
    """输出[sie, dim]中经过inputs过滤的embedding向量"""
    std = np.sqrt(2 / dim)
    emb = tf.Variable(tf.random_uniform([size, dim], -std, std))
    lookup = tf.nn.embedding_lookup(emb, inputs)
    return lookup

In [70]:
name_embeddings_dim = 32
name_seq_len = X_name.shape[1]

desc_embeddings_dim = 32
desc_seq_len = X_desc.shape[1]

brand_embeddings_dim = 4

cat_embeddings_dim = 12
cat_seq_len = X_cat.shape[1]

In [71]:
graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    place_name = tf.placeholder(tf.int32, shape=(None, name_seq_len))
    place_desc = tf.placeholder(tf.int32, shape=(None, desc_seq_len))
    place_brand = tf.placeholder(tf.int32, shape=(None, 1))
    place_cat = tf.placeholder(tf.int32, shape=(None, cat_seq_len))
    place_ship = tf.placeholder(tf.float32, shape=(None, 1))
    place_cond = tf.placeholder(tf.uint8, shape=(None, 1))
    place_y = tf.placeholder(dtype=tf.float32, shape=(None, 1))
    place_lr = tf.placeholder(tf.float32, shape=(), )
    
    # title
    name = embed(place_name, name_voc_size, name_embeddings_dim)
    name = conv1d(name, num_filters=10, filter_size=3)
    name = tf.nn.dropout(name, keep_prob=0.5)
    name = tf.layers.flatten(name)
    tf.summary.histogram("name", name)
    print("name.shape is {}".format(name.shape))
    
    # description 
    desc = embed(place_desc, desc_voc_size, desc_embeddings_dim)
    desc = conv1d(desc, num_filters=10, filter_size=3)
    desc = tf.nn.dropout(desc, keep_prob=0.5)
    desc = tf.layers.flatten(desc)
    tf.summary.histogram("desc", desc)
    print("desc.shape is {}".format(desc.shape))
    
    # brand
    brand = embed(place_brand, brand_voc_size, brand_embeddings_dim)
    brand = tf.layers.flatten(brand)
    tf.summary.histogram("brand", brand)
    print("brand.shape is {}".format(brand.shape))
    
    # category 
    cat = embed(place_cat, cat_voc_size, cat_embeddings_dim)
    cat = tf.layers.average_pooling1d(cat, pool_size=cat_seq_len, strides=1, padding='valid')
    cat = tf.layers.flatten(cat)
    tf.summary.histogram("cat", cat)
    print("cat.shape is {}".format(cat.shape))
    
    # ship
    ship = place_ship

    # condition
    cond = tf.one_hot(place_cond, 5)
    cond = tf.layers.flatten(cond)

    out = tf.concat([name, desc, brand, cat, ship, cond], axis=1)
    print('concatenated dim:', out.shape)
    
    out = dense(out, size=100, activation=None)
    out = tf.nn.dropout(out, keep_prob=0.5)
    out = dense(out, size=1)
    
    loss = tf.losses.mean_squared_error(place_y, out)
    tf.summary.scalar("loss", loss)
    rmse = tf.sqrt(loss)
    
    train_step = tf.train.AdamOptimizer(learning_rate=place_lr).minimize(loss)
    init = tf.global_variables_initializer()
    merged = tf.summary.merge_all() # merge_all需要在graph的定义中声明，否则无效

name.shape is (?, 170)
desc.shape is (?, 400)
brand.shape is (?, 4)
cat.shape is (?, 12)
concatenated dim: (?, 592)
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.


In [72]:
def prepare_batches(inputs, batch_size):
    """提供batch_size数据"""
    n = len(inputs)
    for i in range(0, n, batch_size):
        yield inputs[i: i + batch_size]

In [73]:
writer = tf.summary.FileWriter("loss_log/", session.graph)
session = tf.Session(graph=graph)
session.run(init)

count = 0
for i in range(20):
    np.random.seed(i)
    train_indices = list(range(len(X_name)))
    np.random.shuffle(train_indices)
    
    # 控制lr 
    if i <= 2:
        lr = 0.001
    elif i <= 10:
        lr = 0.0001
    else:
        lr = 0.00005
    
    for idx in prepare_batches(train_indices, 500):
        feed_dict_op = {
            place_name: X_name[idx],
            place_desc: X_desc[idx],
            place_brand: X_brand[idx],
            place_cat: X_cat[idx],
            place_ship: X_shipping[idx],
            place_cond: X_item_cond[idx],
            place_y: y_train[idx],
            place_lr: lr
        }
        
        _ = session.run(train_step, feed_dict=feed_dict_op)
        
        count += 1
        if count % 300 == 0:
            feed_dict_op = {
                place_name: X_name_valid,
                place_desc: X_desc_valid,
                place_brand: X_brand_valid,
                place_cat: X_cat_valid,
                place_ship: X_shipping_valid,
                place_cond: X_item_cond_valid,
                place_y: y_valid
            }
            loss_op, merged_op = session.run([loss, merged], feed_dict_op)
            print("iter: {}, loss: {}".format(count, loss_op))
            writer.add_summary(merged_op, count)

iter: 300, loss: 0.6367616057395935
iter: 600, loss: 0.5309112071990967
iter: 900, loss: 0.4952414035797119
iter: 1200, loss: 0.47608503699302673
iter: 1500, loss: 0.4624232053756714
iter: 1800, loss: 0.45448222756385803
iter: 2100, loss: 0.44892504811286926
iter: 2400, loss: 0.4470916986465454
iter: 2700, loss: 0.441667765378952
iter: 3000, loss: 0.438157320022583
iter: 3300, loss: 0.4363723695278168
iter: 3600, loss: 0.433505654335022
iter: 3900, loss: 0.43136122822761536
iter: 4200, loss: 0.4304935932159424
iter: 4500, loss: 0.43019935488700867
iter: 4800, loss: 0.4289141297340393
iter: 5100, loss: 0.4275100827217102
iter: 5400, loss: 0.4270131587982178
iter: 5700, loss: 0.42648443579673767
iter: 6000, loss: 0.42400550842285156
iter: 6300, loss: 0.4231116473674774
iter: 6600, loss: 0.42257654666900635
iter: 6900, loss: 0.421375572681427
iter: 7200, loss: 0.4211971163749695
iter: 7500, loss: 0.4205610752105713
iter: 7800, loss: 0.4205142855644226
iter: 8100, loss: 0.42000412940979004