In [1]:
import pandas
import numpy as np
import tensorflow as tf

tf.__version__

'1.4.1'

In [2]:
# Load the raw export and report its size.
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)

# Keep only the two columns used below, restricted to rows that have a
# non-null product list, and renumber the rows from zero.
has_products = pandas.notnull(data['products'])
data = data.loc[has_products, ['description', 'products']].reset_index(drop=True)
print(data.shape)

(83897, 18)
(47516, 2)


In [3]:
data.head()

Unnamed: 0,description,products
0,Welch allyn combines its practical understandi...,"[power supply, body sub assy, medical, valve b..."
1,In line with the company s intention to ...,"[imo, advertising materials, point, imo label,..."
2,Services redaelli ricambi offers the ability t...,"[auto spare parts, tie rod, tie rod end, auto ..."
3,STROTHMANN not only delivers suitable mechanic...,"[covers non automated, demurrage rules form, r..."
4,"Established\nin 1991, tien jiang enterprise co...","[rubber, polyester, nylon, boot, support]"


In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter
from nltk.stem.lancaster import LancasterStemmer

# nltk.download('stopwords')

# Tokens are runs of ASCII letters only; digits and punctuation never match.
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
# English stop-word list, fetched once and reused by the cleaning helpers below.
stopwords_cached = stopwords.words('english')
lemma = nltk.wordnet.WordNetLemmatizer()
# NOTE(review): `st` is not referenced anywhere else in the visible notebook.
st = LancasterStemmer()

# Lemmatize category names so that variants such as 'detail' and 'details' collapse into one category
def clean_categories(x):
    """Normalise every category name in ``x``.

    Each name is lower-cased and split with the module-level ``tokenizer``;
    tokens found in ``stopwords_cached`` are discarded and the rest are
    lemmatized and re-joined with single spaces. Names that end up empty
    are dropped.

    Returns a numpy array of the cleaned names, or ``np.nan`` when no name
    survives.
    """
    cleaned = []
    for raw_name in x:
        kept_words = []
        for word in tokenizer.tokenize(raw_name.lower()):
            if word in stopwords_cached:
                continue
            kept_words.append(lemma.lemmatize(word))
        normalised = ' '.join(kept_words)
        if normalised:
            cleaned.append(normalised)
    if not cleaned:
        return np.nan
    return np.array(cleaned)

# Normalise the category lists and drop the rows left without any category.
data['products'] = data['products'].apply(clean_categories)
data = data.loc[data['products'].notnull()].reset_index(drop=True)

# Flatten every row's categories into one list, re-tokenizing each name.
all_products = [' '.join(tokenizer.tokenize(product))
                for prod_list in data['products'].values
                for product in prod_list.tolist()]

counter = Counter(all_products)
print('unique categories', len(counter))

# Names of the 100 most frequent categories, most frequent first.
most_common = [name for name, _count in counter.most_common(100)]

# Remove every category that is not among the 100 most frequent
def filter_categories(x):
    """Restrict a row's categories to those listed in ``most_common``.

    ``x`` is expected to expose ``tolist()`` (the code calls it directly).

    Returns a numpy array with the retained categories in their original
    order, or ``np.nan`` when none is retained.
    """
    kept = [name for name in x.tolist() if name in most_common]
    if not kept:
        return np.nan
    return np.array(kept)

stopwords_cached = stopwords.words('english')

# basic text processing on descriptions
def filter_descriptions(text):
    """Clean one free-text description.

    The text is lower-cased and split with the module-level ``tokenizer``;
    tokens found in ``stopwords_cached`` are discarded and the rest are
    lemmatized.

    Returns the surviving tokens joined by single spaces, or ``np.nan`` when
    nothing survives.
    """
    kept = []
    for token in tokenizer.tokenize(text.lower()):
        if token not in stopwords_cached:
            kept.append(lemma.lemmatize(token))
    if not kept:
        return np.nan
    return ' '.join(kept)

# Reduce categories to the frequent ones and clean the description text.
data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

# Keep only the rows where both columns still hold a value.
has_products = data['products'].notnull()
has_description = data['description'].notnull()
data = data[has_products & has_description].reset_index(drop=True)
data.shape

In [None]:
data

# doc2vec

In [None]:
from nltk.tokenize import RegexpTokenizer
import gensim

lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')
# NOTE: this rebinds the module-level `tokenizer`. From here on tokens are runs
# of word characters (\w+) rather than the letters-only pattern set up earlier,
# so re-running the earlier cleaning cells after this cell changes their output.
tokenizer = RegexpTokenizer(r'\w+')

def read_corpus(tokens_only=False):
    """Yield one gensim ``TaggedDocument`` per row of ``data``.

    The words are the tokens of the row's ``description`` produced by the
    module-level ``tokenizer``; the single tag is ``'_*'`` followed by the
    row's index label. ``tokens_only`` is accepted but not used.
    """
    descriptions = data['description'].values
    row_ids = data.index.values
    for description, row_id in zip(descriptions, row_ids):
        words = list(tokenizer.tokenize(description))
        tag = '_*' + str(row_id)
        yield gensim.models.doc2vec.TaggedDocument(words, [tag])


# Materialise the generator: the corpus is iterated again by later cells.
train_corpus = list(read_corpus())

In [None]:
train_corpus[:2]

In [None]:
# PV-DBOW 

import gensim

# dm=0 is the PV-DBOW mode named in this cell's header comment. size=200 is the
# document-vector width and has to agree with `num_input` in the network cell.
# NOTE(review): the remaining hyper-parameters are passed through to gensim
# unchanged; no random seed is set here — confirm whether reproducibility matters.
model = gensim.models.doc2vec.Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=7)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)
# Persist the trained model; a later cell reloads it from this same path.
model.save('../models/company_descriptions.bin')


In [None]:
import gensim
model = gensim.models.doc2vec.Doc2Vec.load('../models/company_descriptions.bin')

In [None]:
model.docvecs['_*1']

In [None]:
# Pair every row's document vector (looked up by its '_*<index>' tag) with the
# row's category list.
row_ids = data.index.values
X_ = np.array([model.docvecs['_*' + str(row_id)] for row_id in row_ids])
y_ = [data.loc[row_id]['products'] for row_id in row_ids]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each row's category list with scikit-learn's MultiLabelBinarizer.
mlb = MultiLabelBinarizer()
# NOTE: `y_` is rebound from the list of category lists to the encoded matrix,
# so this cell is not idempotent — rebuild `y_` before running it again.
y_ = mlb.fit_transform(y_)

In [None]:
mlb.classes_

In [None]:
model.docvecs

# simple NN

In [None]:
import math

class DataBatcher():
    """Serve shuffled, fixed-size mini-batches from two parallel arrays.

    Every epoch the example indices are shuffled and cut into consecutive
    chunks of ``_batch_size``. Each call to ``next_batch`` returns the next
    chunk; once the last chunk of an epoch has been served, the following
    call reshuffles and starts a new epoch. Every example is therefore served
    at least once per epoch.

    Parameters
    ----------
    _X, _y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Features and targets; ``_y.shape[0]`` defines the number of examples.
    _batch_size : int
        Number of rows in every returned batch.
    """
    def __init__(self, _X, _y, _batch_size=30):
        self._X = _X
        self._y = _y
        self._batch_size = _batch_size
        self._resplit = True
        self._num_examples = self._y.shape[0]

    def next_batch(self):
        """Return the next ``(X_batch, y_batch)`` pair of ``_batch_size`` rows.

        The final chunk of an epoch may be short; it is topped up with indices
        taken from the start of that epoch's first chunk (repeated cyclically
        if the whole dataset is smaller than one batch).

        Raises
        ------
        ValueError
            If the dataset contains no examples.
        """
        if self._resplit:
            if self._num_examples == 0:
                raise ValueError('cannot draw a batch from an empty dataset')
            perm0 = np.arange(self._num_examples)
            np.random.shuffle(perm0)
            # Slice into chunks of exactly `_batch_size` (only the last may be
            # shorter). np.array_split(perm0, k) would instead produce k
            # near-equal chunks, most of them smaller than a batch.
            self._batches_indexes = [perm0[start:start + self._batch_size]
                                     for start in range(0, self._num_examples, self._batch_size)]
            self._batch_counter = -1
            self._resplit = False

        self._batch_counter += 1
        ind = self._batches_indexes[self._batch_counter]
        # The epoch ends at the last chunk whether or not that chunk is full,
        # so the counter can never run past the end of the list.
        if self._batch_counter == len(self._batches_indexes) - 1:
            self._resplit = True
            missing_num = self._batch_size - ind.shape[0]
            if missing_num > 0:
                # np.resize repeats its input as needed, so the batch reaches
                # full size even when the dataset is smaller than one batch.
                padding = np.resize(self._batches_indexes[0], missing_num)
                ind = np.hstack((ind, padding))

        return self._X[ind], self._y[ind]

In [None]:
from sklearn.model_selection import train_test_split
graph = tf.Graph()

with graph.as_default():
    # Training schedule
    learn_rate = 0.001       # initial learning rate, fed through the `lr` placeholder
    num_steps = 200000       # total number of optimisation steps
    batch_size = 128
    display_step = 1000      # metrics are reported every `display_step` steps
    checkpoint_step = 20000  # a checkpoint is written every `checkpoint_step` steps

    # Network dimensions
    n_hidden = 70      # units in each hidden layer
    num_input = 200    # width of one input vector
    num_classes = 100  # one output unit per label

    # Placeholders fed at run time
    X = tf.placeholder("float", [None, num_input])
    y = tf.placeholder("float", [None, num_classes])
    lr = tf.placeholder("float")

    def simple_nn(x):
        """Build four tanh hidden layers of ``n_hidden`` units and a linear output.

        All layers share one truncated-normal kernel initializer and one L2
        kernel regularizer. Returns the un-activated output tensor with
        ``num_classes`` units.
        """
        initializer = tf.truncated_normal_initializer(stddev=0.18632568, dtype=tf.float32)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        hidden = x
        for _layer in range(4):
            hidden = tf.layers.dense(inputs=hidden, units=n_hidden, activation=tf.tanh,
                                     kernel_initializer=initializer, kernel_regularizer=regularizer)
        # Output fully connected layer with a neuron for each class
        return tf.layers.dense(hidden, num_classes, activation=None,
                               kernel_initializer=initializer, kernel_regularizer=regularizer)

    logits = simple_nn(X)
    prediction = tf.nn.sigmoid(logits)
    # NOTE(review): only the sigmoid cross-entropy term goes into `loss_op`; the
    # penalties produced by the kernel regularizers are not added to it here —
    # confirm whether the L2 regularisation is meant to affect training.
    per_label_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)
    loss_op = tf.reduce_mean(per_label_loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(loss_op)
    init = tf.global_variables_initializer()

def evaluate_multilabel(y_pred, y_true):
    """Mean per-row fraction of true labels that were also predicted.

    For every (predicted, true) row pair, the score is the number of positions
    that are non-zero in both rows divided by the number of non-zero positions
    in the true row. A true row with no non-zero positions scores 0.0.

    Returns the mean of the per-row scores.
    """
    per_row = []
    for predicted_row, true_row in zip(y_pred, y_true):
        true_idx = set(np.nonzero(true_row)[0].tolist())
        predicted_idx = set(np.nonzero(predicted_row)[0].tolist())
        if not true_idx:
            per_row.append(0.0)
        else:
            per_row.append(len(true_idx & predicted_idx) / len(true_idx))
    return np.array(per_row).mean()

# Start training
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))

# Build the thresholding op and the saver exactly once. Creating them inside
# the loop adds fresh nodes to the graph on every reporting/checkpoint step,
# so the graph keeps growing for the whole run.
with graph.as_default():
    rounded_prediction = tf.round(prediction)
    # max_to_keep=None keeps every checkpoint on disk, which is what creating
    # a new Saver per checkpoint used to result in.
    saver = tf.train.Saver(max_to_keep=None)

# Run the initializer
sess.run(init)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)


for step in range(1, num_steps + 1):
    batch_x, batch_y = batcher.next_batch()
    # Halve the learning rate at 75% and again at 90% of the run.
    if step == int(num_steps * 0.75):
        learn_rate /= 2
        print("lr now " + str(learn_rate))
    if step == int(num_steps * 0.90):
        learn_rate /= 2
        print("lr now " + str(learn_rate))
    sess.run(train_op, feed_dict={X: batch_x, y: batch_y, lr: learn_rate})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, pred = sess.run([loss_op, rounded_prediction], feed_dict={X: batch_x, y: batch_y})

        test_out = sess.run(rounded_prediction, feed_dict={X: X_test})
        print("Step " + str(step) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Train Acc= " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))

    if step % checkpoint_step == 0:
        saver.save(sess, '../models/simple_nn/mlp', global_step=step)

print("done")

## test

In [None]:
# Restore the most recent checkpoint written by the training cell and compute
# thresholded predictions for the held-out split.
with tf.Session(graph=graph) as session:
    save = tf.train.Saver()
    # The training cell saves under '../models/simple_nn/mlp-<step>'. The path
    # used here before pointed at a checkpoint of a different (LSTM) model,
    # whose variables do not belong to this graph.
    checkpoint_path = tf.train.latest_checkpoint('../models/simple_nn')
    if checkpoint_path is None:
        raise ValueError("no checkpoint found in '../models/simple_nn'")
    save.restore(session, checkpoint_path)
    pred_op = tf.round(tf.nn.sigmoid(logits))
    test_preds = session.run(pred_op, feed_dict={X: X_test})

# Print the true classes of every test row and, when any were predicted, the
# predicted classes as well.
for num in range(0, test_preds.shape[0]):
    real = np.nonzero(y_test[num])[0].tolist()
    pred = np.nonzero(test_preds[num])[0].tolist()
    # dtype=int keeps the index array valid even when the list is empty
    # (np.array([]) is float and cannot be used as an index).
    print('=================================\nreal_classes', mlb.classes_[np.array(real, dtype=int)])
    if len(pred) > 0:
        print('predicted classes', mlb.classes_[np.array(pred, dtype=int)])

In [None]:
query = ['seat']
# NOTE(review): this rebinds `filter_categories`, replacing the top-100 filter
# defined earlier in the notebook; re-running the earlier preprocessing cell
# after this one would pick up this version instead.
def filter_categories(cat_list):
    """Return True when every entry of ``query`` occurs in ``cat_list``."""
    return set(query).issubset(cat_list)
data[data['products'].apply(filter_categories)]

In [None]:
data.to_csv('../data/categ_preprocessed.csv')