In [1]:
import pandas
import numpy as np
import tensorflow as tf

# Show which TensorFlow build is in use; later cells use tf.contrib and tf.placeholder.
tf.__version__

'1.4.1'

In [2]:
# Read the raw table and report its size before any filtering.
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)

# Keep only the two columns used below, and only rows that have a `products` value.
data = (
    data.loc[data['products'].notnull(), ['description', 'products']]
    .copy()
    .reset_index(drop=True)
)
print(data.shape)

(83897, 18)
(47516, 2)


In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter
from nltk.stem.lancaster import LancasterStemmer

# nltk.download('stopwords')

# Tokens are maximal runs of ASCII letters; digits and punctuation act as separators.
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
# English stop-word list, fetched once and read as a global by the cleaning helpers below.
stopwords_cached = stopwords.words('english')
# Lemmatizer shared (as a global) by the cleaning helpers below.
lemma = nltk.wordnet.WordNetLemmatizer()
# NOTE(review): `st` is not referenced anywhere in the visible part of this notebook.
st = LancasterStemmer()

# lemmatize categories to make 'detail' and 'details' as one category
def clean_categories(x):
    """Normalise an iterable of raw category strings.

    Every category is lower-cased, split with the global ``tokenizer``,
    stripped of words listed in ``stopwords_cached`` and lemmatised word by
    word with ``lemma`` (so that e.g. 'detail' and 'details' become one
    category). Categories that end up empty are discarded.

    Returns:
        A NumPy array with the cleaned category strings, or ``np.nan`` when
        no category survives the cleaning.
    """
    cleaned = []
    for category in x:
        kept_words = []
        for word in tokenizer.tokenize(category.lower()):
            if word in stopwords_cached:
                continue
            kept_words.append(lemma.lemmatize(word))
        normalised = ' '.join(kept_words)
        if normalised:
            cleaned.append(normalised)
    if not cleaned:
        return np.nan
    return np.array(cleaned)

# Clean every row's category list, then discard rows left without any category.
data['products'] = data['products'].apply(clean_categories)
data = data[data['products'].notnull()].copy().reset_index(drop=True)

# Flatten all per-row category arrays into one list of re-tokenised names.
all_products = []
for prod_list in data['products'].values:
    all_products.extend(' '.join(tokenizer.tokenize(product))
                        for product in prod_list.tolist())

# Frequency table over every category occurrence in the data set.
counter = Counter(all_products)
print('unique categories', len(counter))

# Names of the 100 most frequent categories, most frequent first.
most_common = [name for name, _count in counter.most_common(100)]

# remove all categories which is not in top 100
def filter_categories(x):
    """Keep only the categories of one row that belong to ``most_common``.

    Args:
        x: NumPy array of category strings for a single row.

    Returns:
        A NumPy array with the retained categories in their original order,
        or ``np.nan`` when none of them is in ``most_common``.
    """
    kept = [product for product in x.tolist() if product in most_common]
    if not kept:
        return np.nan
    return np.array(kept)

# Re-fetches the English stop-word list (an identical assignment already appears earlier in this cell).
stopwords_cached = stopwords.words('english')

# basic text processing on descriptions
def filter_descriptions(text):
    """Clean one free-text description.

    The text is lower-cased, split with the global ``tokenizer``, stripped of
    words listed in ``stopwords_cached`` and lemmatised token by token with
    ``lemma``.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example ``None`` or a float NaN from a missing cell) is treated
            as a missing description.

    Returns:
        The cleaned tokens joined by single spaces, or ``np.nan`` when the
        description is missing or no token survives the cleaning.
    """
    # Only `products` is null-filtered when the data is loaded, so a missing
    # description can still reach this function; report it as missing instead
    # of failing on `.lower()`. Rows with a NaN description are dropped later.
    if not isinstance(text, str):
        return np.nan
    cleaned_text = [lemma.lemmatize(token)
                    for token in tokenizer.tokenize(text.lower())
                    if token not in stopwords_cached]
    if not cleaned_text:
        return np.nan
    return ' '.join(cleaned_text)

# Restrict labels to the most common categories and normalise the description text.
data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

# A row is usable only if both its labels and its text survived the cleaning.
has_labels = data['products'].notnull()
has_text = data['description'].notnull()
data = data[has_labels & has_text].reset_index(drop=True)
data.shape

unique categories 66682


(23413, 2)

# TEXT PROCESSING

In [7]:
# Peek at the first rows to sanity-check the cleaned descriptions and label lists.
data.head()

Unnamed: 0,description,products
0,welch allyn combine practical understanding cl...,"[power supply, medical]"
1,line company intention support international g...,[imo]
2,service redaelli ricambi offer ability produce...,"[auto spare part, auto spare, spare part]"
3,established tien jiang enterprise co ltd one s...,"[rubber, polyester]"
4,songwei dedicate become benchmark manufacturin...,[electric]


### fasttext model

In [8]:
from gensim.models.wrappers.fasttext import FastText

# Load the fastText model from its binary file; the `model.wv[word]` lookups further down read from it.
model = FastText.load_fasttext_format('../models/wiki.simple.bin')

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import MultiLabelBinarizer


# nltk.download('stopwords')

# tokenizer = RegexpTokenizer(r'\w+')
# Rebinds the global `tokenizer`: from here on only runs of lower-case ASCII letters are tokens.
# NOTE: this replaces the `[a-zA-Z]+` tokenizer created in the earlier preprocessing cell, so any
# helper that reads the global `tokenizer` uses this pattern once this cell has run.
tokenizer = RegexpTokenizer(r'[a-z]+')
# Re-created here; lemmatisation is commented out in get_embedding_matrix below.
lemma = nltk.wordnet.WordNetLemmatizer()
# Re-fetched here; not read by the code in the visible remainder of this cell.
stopwords_cached = stopwords.words('english')

def get_embedding_matrix(text, first_n=50, embedding_dim=300):
    """Turn a text into a fixed-size matrix of word embeddings.

    The text is split with the global ``tokenizer`` and the first ``first_n``
    tokens are looked up in the global fastText ``model``. Tokens the model
    cannot embed are reported on stdout and represented by a zero vector.
    Texts shorter than ``first_n`` tokens are padded at the end with zero rows.

    Args:
        text: The (already cleaned) text to embed.
        first_n: Number of leading tokens to keep, i.e. the number of rows of
            the result.
        embedding_dim: Width of one embedding vector. Must match the vectors
            returned by ``model.wv``; defaults to the 300 used so far.

    Returns:
        A NumPy array of shape ``(first_n, embedding_dim)``.
    """
    tokens = tokenizer.tokenize(text)[:first_n]

    rows = []
    for word in tokens:
        try:
            # Single lookup; the vector is used directly.
            rows.append(model.wv[word])
        except KeyError:
            print("error in word", word)
            rows.append(np.zeros(embedding_dim))

    # With no tokens at all, np.array([]) has shape (0,) and cannot be
    # stacked with the zero padding, so return the all-padding matrix directly.
    if not rows:
        return np.zeros((first_n, embedding_dim))

    matrix = np.array(rows)
    # Pad texts shorter than first_n tokens with zero rows at the end.
    if matrix.shape[0] < first_n:
        padding = np.zeros((first_n - matrix.shape[0], embedding_dim))
        matrix = np.vstack((matrix, padding))
    return matrix

# One padded embedding matrix per description, stacked into a single array.
embeddings = [get_embedding_matrix(text) for text in data['description'].values]
train_data = np.array(embeddings)

# Multi-hot encode each row's category list with a freshly fitted binarizer.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([labels.tolist() for labels in data['products'].values])

error in word zc
error in word zc
error in word vz
error in word vz
error in word vz


In [10]:
# TODO: merge near-duplicate categories (e.g. "aluminium" / "aluminum", 'spare part' / 'spare parts')
# into a single class before fitting the binarizer.
# Show the label vocabulary learned by the binarizer.
mlb.classes_

array(['aluminium', 'aluminum', 'assembly', 'auto part', 'auto spare',
       'auto spare part', 'automotive part', 'automotive spare',
       'automotive spare part', 'ball', 'battery', 'bearing', 'book',
       'bottle', 'brake', 'cable', 'cap', 'car', 'car part', 'ceramic',
       'ceramic tile', 'chair', 'coil', 'component', 'compressor',
       'computer', 'computer part', 'connector', 'cotton', 'cover',
       'cylinder', 'display', 'electric', 'electrical', 'engine',
       'fabric', 'film', 'filter', 'fitting', 'frame', 'furniture',
       'gear', 'glass', 'granite', 'hand', 'hand tool', 'hose',
       'hydraulic', 'imo', 'industrial', 'lamp', 'leather', 'led',
       'light', 'lighting', 'machinery', 'machinery part', 'medical',
       'motor', 'motor vehicle', 'nut', 'oil', 'panel', 'pipe',
       'plastic part', 'plate', 'polyester', 'powder', 'power',
       'power supply', 'pump', 'pvc', 'ring', 'rubber', 'screw', 'sheet',
       'shoe', 'slab', 'software', 'solid wood', '

# LSTM

In [11]:
import math

class DataBatcher():
    """Endless supplier of shuffled, fixed-size mini-batches.

    The examples are shuffled at the start of every epoch and cut into
    consecutive slices of ``_batch_size`` indexes. Every call to
    ``next_batch`` returns exactly ``_batch_size`` examples; a short final
    slice is topped up with indexes from the first slice of the same epoch.
    After the last slice of an epoch the data is reshuffled.

    Args:
        _X: Indexable feature array (first axis = examples).
        _y: Label array aligned with ``_X``; its first dimension defines the
            number of examples.
        _batch_size: Number of examples per batch.
    """

    def __init__(self, _X, _y, _batch_size=30):
        self._X = _X
        self._y = _y
        self._batch_size = _batch_size
        # True whenever the next call must start a new (reshuffled) epoch.
        self._resplit = True
        self._num_examples = self._y.shape[0]

    def next_batch(self):
        """Return the next ``(X_batch, y_batch)`` pair.

        Raises:
            ValueError: If there are no examples to draw from.
        """
        if self._num_examples == 0:
            raise ValueError("DataBatcher has no examples to batch")

        if self._resplit:
            perm0 = np.arange(self._num_examples)
            np.random.shuffle(perm0)
            # Slice explicitly: np.array_split would produce near-equal
            # chunks, so a short chunk could appear before the end of the
            # epoch (skipping data) or never appear at all (running off the
            # end of the list).
            self._batches_indexes = [perm0[start:start + self._batch_size]
                                     for start in range(0, self._num_examples, self._batch_size)]
            self._batch_counter = -1
            self._resplit = False

        self._batch_counter += 1
        indexes = self._batches_indexes[self._batch_counter]

        # The epoch ends with its last slice, whether or not that slice is short.
        if self._batch_counter == len(self._batches_indexes) - 1:
            self._resplit = True

        missing_num = self._batch_size - indexes.shape[0]
        if missing_num > 0:
            # Top up from the first slice; np.resize repeats it cyclically
            # when there are fewer examples than one batch.
            filler = np.resize(self._batches_indexes[0], missing_num)
            indexes = np.hstack((indexes, filler))

        return self._X[indexes], self._y[indexes]



In [None]:
from sklearn.model_selection import train_test_split

# Dedicated graph for the model; every op and variable below is created inside it.
# NOTE: variables are created in statement order, so reordering the statements in this
# block may change their auto-generated names and break restoring existing checkpoints.
graph = tf.Graph()

with graph.as_default():

    # Training Parameters
    learn_rate = 0.001  # initial learning rate; halved twice by the training loop
    training_steps = 220000
    batch_size = 128
    display_step = 1000  # report loss / accuracy every this many steps
    checkpoint_step = 10000  # save a checkpoint every this many steps
    
    # Network Parameters
    num_input = 300  # width of one word embedding (get_embedding_matrix pads with 300-wide rows)
    timesteps = 50  # tokens per description (get_embedding_matrix defaults to first_n=50)
    num_hidden = 40  # units per LSTM cell
    num_classes = 100  # one output per category kept by the top-100 filter
    num_layers = 3
    cell = tf.contrib.rnn.LSTMCell

    # tf Graph input
    X = tf.placeholder("float", [None, timesteps, num_input])
    Y = tf.placeholder("float", [None, num_classes])
    lr = tf.placeholder("float")  # learning rate, fed at every training step

    # Define weights
    # Output projection takes 2 * num_hidden inputs — presumably the forward and
    # backward outputs concatenated; confirm against stack_bidirectional_rnn.
    weights = {
        'out': tf.Variable(tf.random_normal([2 * num_hidden, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    def RNN(x, weights, biases):
        """Build the stacked bidirectional LSTM and return per-class logits.

        `x` is unstacked along axis 1 into `timesteps` tensors, fed through
        `num_layers` forward and backward LSTM cells (each wrapped in input
        dropout), and the output at the last time step is projected with
        `weights['out']` and `biases['out']`.
        """
        x = tf.unstack(x, timesteps, 1)
        initializer = tf.truncated_normal_initializer(stddev=0.18632568, dtype=tf.float32)

        # Create forward and backward cells
        cells_fw = [cell(num_hidden, activation=tf.tanh, 
                         initializer=initializer) for _ in range(num_layers)]
        cells_bw = [cell(num_hidden, activation=tf.tanh,
                         initializer=initializer) for _ in range(num_layers)]
        
        # Input keep probability for layer 1, 2 and 3 respectively.
        keep_probs = [0.75, 0.65, 0.65]
        # Add dropout
        # NOTE(review): the keep probabilities are Python constants, so dropout appears to be
        # active whenever the graph runs, including the evaluation and test `run` calls —
        # confirm whether that is intended.
        cells_fw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=keep_prob_
                                                 ) for cell, keep_prob_ in zip(cells_fw, keep_probs)]
        cells_bw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=keep_prob_
                                                 ) for cell, keep_prob_ in zip(cells_bw, keep_probs)]
        
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            dtype=tf.float32)
        
        # Only the output of the last time step feeds the classifier.
        return tf.matmul(outputs[-1], weights['out']) + biases['out']

    logits = RNN(X, weights, biases)
    # Independent per-class probabilities (multi-label setting).
    prediction = tf.nn.sigmoid(logits)

    # Define loss and optimizer
    loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(loss_op)

    init = tf.global_variables_initializer()
    
def evaluate_multilabel(y_pred, y_true):
    """Average per-sample share of true labels that were predicted.

    For every sample, the positions of the non-zero entries of the true row
    and of the predicted row are compared; the sample's score is the number
    of true positions that are also predicted, divided by the number of true
    positions. A sample without any true label scores 0.0.

    Args:
        y_pred: Iterable of predicted rows (non-zero = predicted label).
        y_true: Iterable of ground-truth rows, aligned with ``y_pred``.

    Returns:
        The mean of the per-sample scores.
    """
    scores = []
    for predicted_row, true_row in zip(y_pred, y_true):
        true_positions = set(np.nonzero(true_row)[0].tolist())
        predicted_positions = set(np.nonzero(predicted_row)[0].tolist())
        if not true_positions:
            # No true label at all counts as zero right answers.
            scores.append(0.0)
        else:
            hits = true_positions & predicted_positions
            scores.append(len(hits) / len(true_positions))
    return np.array(scores).mean()

# Limit this process to 40% of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))

# Fixed random_state keeps the train/test split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Build the auxiliary ops ONCE, before the loop. Calling tf.round(...) or
# tf.train.Saver() inside the loop adds new nodes to the graph on every
# display / checkpoint step, so the graph keeps growing during training.
with graph.as_default():
    rounded_prediction = tf.round(prediction)
    saver = tf.train.Saver()
checkpoint_prefix = '../models/lstm_big_dropout/lstm_100_classes'

# Run the initializer
sess.run(init)
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)

for step in range(1, training_steps+1):
    batch_x, batch_y = batcher.next_batch()
    # Halve the learning rate at 75% and again at 90% of the training steps.
    if step == int(training_steps * 0.75):
        learn_rate /= 2
        print("lr now " + str(learn_rate))
    if step == int(training_steps * 0.90):
        learn_rate /= 2
        print("lr now " + str(learn_rate))
    sess.run(train_op, feed_dict={X: batch_x, Y: batch_y, lr: learn_rate})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, pred = sess.run([loss_op, rounded_prediction], feed_dict={X: batch_x, Y: batch_y})

        # Score the whole held-out set in one run.
        test_out = sess.run(rounded_prediction, feed_dict={X: X_test})
        print("Step " + str(step) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Train Acc= " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))

    if step % checkpoint_step == 0:
        saver.save(sess, checkpoint_prefix, global_step=step)


# Always store the final state, even if the last step is not a checkpoint step.
saver.save(sess, checkpoint_prefix, global_step=step)


print("Optimization Finished!")

Step 1, Loss= 0.9236, Train Acc= 0.452, Test Acc = 0.482
Step 1000, Loss= 0.0747, Train Acc= 0.000, Test Acc = 0.000
Step 2000, Loss= 0.0694, Train Acc= 0.000, Test Acc = 0.000
Step 3000, Loss= 0.0697, Train Acc= 0.000, Test Acc = 0.001
Step 4000, Loss= 0.0641, Train Acc= 0.016, Test Acc = 0.021
Step 5000, Loss= 0.0651, Train Acc= 0.040, Test Acc = 0.036
Step 6000, Loss= 0.0562, Train Acc= 0.042, Test Acc = 0.041
Step 7000, Loss= 0.0598, Train Acc= 0.051, Test Acc = 0.056
Step 8000, Loss= 0.0584, Train Acc= 0.096, Test Acc = 0.060
Step 9000, Loss= 0.0555, Train Acc= 0.124, Test Acc = 0.069
Step 10000, Loss= 0.0537, Train Acc= 0.148, Test Acc = 0.072
Step 11000, Loss= 0.0569, Train Acc= 0.118, Test Acc = 0.073
Step 12000, Loss= 0.0482, Train Acc= 0.182, Test Acc = 0.086
Step 13000, Loss= 0.0502, Train Acc= 0.141, Test Acc = 0.085
Step 14000, Loss= 0.0522, Train Acc= 0.209, Test Acc = 0.097
Step 15000, Loss= 0.0556, Train Acc= 0.161, Test Acc = 0.092


# Testing

In [None]:
# Restore a saved checkpoint into a fresh session on the same graph and print, for every
# test example, its true classes and (when there are any) its predicted classes.
with tf.Session(graph=graph) as session:
    save = tf.train.Saver()
    # The step number of the checkpoint to load is hard-coded in the path (-40000).
    save.restore(session, '../models/lstm_big_dropout/lstm_100_classes-40000')
    # Rounded sigmoid outputs: 1.0 marks a predicted class.
    pred = tf.round(tf.nn.sigmoid(logits))
    test_preds = session.run(pred, feed_dict={X: X_test})
    for num in range(0, test_preds.shape[0]):
        real = y_test[num]

        # Positions of the classes that are really assigned to this example.
        real = np.nonzero(real)[0].tolist()
        # NOTE(review): `right_num` is not used in the visible part of this cell.
        right_num = len(real)
        # NOTE: from here on `pred` no longer names the TF op above but this example's predictions.
        pred = test_preds[num]
        pred = np.nonzero(pred)[0].tolist()
#         print(real, pred)
        print('=================================\nreal_classes', mlb.classes_[np.array(real)])
        # Skip the second line when nothing was predicted for this example.
        if len(pred) > 0:
            print('predicted classes', mlb.classes_[np.array(pred)])