In [1]:
import pandas
import numpy as np
import tensorflow as tf

tf.__version__

'1.4.1'

In [2]:
data = pandas.read_parquet('../data/data.csv')
print(data.shape)
data = data[['description', 'products']][pandas.notnull(data['products'])].copy().reset_index(drop=True)
print(data.shape)

(83897, 18)
(47516, 2)


# TEXT PROCESSING

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter

# nltk.download('stopwords')

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

data['products'] = data['products'].apply(lambda x: np.array([' '.join(tokenizer.tokenize(product)).lower()
                                                              for product in x.tolist()]))

all_products = []
for prod_list in data['products'][pandas.notnull(data['products'])].values:
    all_products += [' '.join(tokenizer.tokenize(product))
                     for product in prod_list.tolist()]
    
counter = Counter(all_products)
print('unique categories', len(counter.most_common()))

most_common = [product[0] for product in counter.most_common(600)]

def filter_categories(x):
    new_categories = np.array([product 
                               for product in x.tolist()
                               if product in most_common])
    if new_categories.shape[0] == 0:
        return np.nan
    return new_categories

lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def filter_descriptions(text):
    cleaned_text = [token#lemma.lemmatize(token)
                      for token in tokenizer.tokenize(text.lower())
                      if token not in stopwords_cached]
    if len(cleaned_text) == 0:
        return np.nan
    return ' '.join(cleaned_text)

data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

cleaned_data = data[(data['products'].notnull()) & (data['description'].notnull())].copy().reset_index(drop=True)
cleaned_data.shape

unique categories 71111


(35452, 2)

In [7]:
cleaned_data.head()

Unnamed: 0,description,products
0,welch allyn combines practical understanding c...,"[power supply, medical]"
1,line company intention support international g...,"[imo, point, marine pollutant]"
2,services redaelli ricambi offers ability produ...,"[auto spare parts, auto spare, spare parts]"
3,strothmann delivers suitable mechanical system...,[line]
4,established tien jiang enterprise co ltd one s...,"[rubber, polyester, nylon, support]"


### fasttext model

In [8]:
from gensim.models.wrappers.fasttext import FastText

model = FastText.load_fasttext_format('../models/wiki.simple.bin')

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import MultiLabelBinarizer


# nltk.download('stopwords')

# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = RegexpTokenizer(r'[a-z]+')
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def get_embedding_matrix(text, first_n=50):
#     print(text)
    matrix = []
    text = [token#lemma.lemmatize(token)
            for token in tokenizer.tokenize(text)]
    for word in text[:first_n]:
        try:
            word_embedding = model.wv[word]
            matrix.append(model.wv[word])
        except KeyError:
            print("error in word", word)
            matrix.append(np.zeros(300))
    
    matrix = np.array(matrix)
    # fill text embeddings with seq_len < first_n with zeros
#     print(matrix.shape[0])
    if matrix.shape[0] < first_n:
        matrix = np.vstack((matrix, np.zeros((first_n - matrix.shape[0], 300))))
    return matrix

embeddings = []
for text in cleaned_data['description'].values:
    embeddings.append(get_embedding_matrix(text))
#     print(embeddings[-1].shape)
    
# onehot_y_df = pandas.get_dummies(data['severity'])

train_data = np.array(embeddings)

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([item.tolist() for item in cleaned_data['products'].values])

error in word zc
error in word zc
error in word vz
error in word vz
error in word vz


# LSTM

In [10]:
import math

class DataBatcher():
    def __init__(self, _X, _y, _batch_size=30):
        self._X = _X
        self._y = _y
        self._batch_size = _batch_size
        self._resplit = True
        self._num_examples = self._y.shape[0]
    
    def next_batch(self):
        if self._resplit:
#             print('splitting')
            perm0 = np.arange(self._num_examples)
            np.random.shuffle(perm0)
            self._batches_indexes = np.array_split(perm0, math.ceil(perm0.shape[0] / self._batch_size))
#             print(self._batches_indexes)
            self._batch_counter = -1
            self._resplit = False

        self._batch_counter += 1
        if self._batches_indexes[self._batch_counter].shape[0] < self._batch_size:
#             print('hstacking')
            self._resplit = True
            ind = self._batch_counter
#             self._batch_counter = -1
            missing_num = self._batch_size - self._batches_indexes[ind].shape[0]
            return self._X[np.hstack((self._batches_indexes[ind], self._batches_indexes[0][:missing_num]))],\
                   self._y[np.hstack((self._batches_indexes[ind], self._batches_indexes[0][:missing_num]))]
        
        return self._X[self._batches_indexes[self._batch_counter]], self._y[self._batches_indexes[self._batch_counter]]



In [11]:
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
from sklearn.model_selection import train_test_split

graph = tf.Graph()

with graph.as_default():

    # Training Parameters
    learning_rate = 0.001
    training_steps = 220000
    batch_size = 80
    display_step = 1000
    checkpoint_step = 20000
    
    # Network Parameters
    num_input = 300 # MNIST data input (img shape: 28*28)
    timesteps = 50 # timesteps
    num_hidden = 400 # hidden layer num of features
    num_classes = 600 # MNIST total classes (0-9 digits)
    num_layers = 3
    input_keep_prob = 0.5
    output_keep_prob = 0.5
    cell = tf.contrib.rnn.LSTMCell

    # tf Graph input
    X = tf.placeholder("float", [None, timesteps, num_input])
    Y = tf.placeholder("float", [None, num_classes])

    # Define weights
    weights = {
        'out': tf.Variable(tf.random_normal([2 * num_hidden, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    def RNN(x, weights, biases):

        # Prepare data shape to match `rnn` function requirements
        # Current data input shape: (batch_size, timesteps, n_input)
        # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

        # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
        x = tf.unstack(x, timesteps, 1)

        # Cerate forward and backward cells
        cells_fw = [cell(num_hidden, activation=tf.tanh) for _ in range(num_layers)]
        cells_bw = [cell(num_hidden, activation=tf.tanh) for _ in range(num_layers)]
#         cells_fw.append(cell(num_hidden, activation=tf.tanh))
#         cells_bw.append(cell(num_hidden, activation=tf.tanh))
        
        # Add dropout
        cells_fw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=input_keep_prob,
                                                  output_keep_prob=output_keep_prob
                                                 ) for cell in cells_fw]
        cells_bw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=input_keep_prob,
                                                  output_keep_prob=output_keep_prob
                                                 ) for cell in cells_bw]
        
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            dtype=tf.float32)
        
#         rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
        
        return tf.matmul(outputs[-1], weights['out']) + biases['out']
        

    logits = RNN(X, weights, biases)
    prediction = tf.nn.sigmoid(logits)

    # Define loss and optimizer
    

# tf.nn.sigmoid_cross_entropy_with_logits 
#     loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
#         logits=logits, labels=Y))
    loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    # Evaluate model (with test logits, for dropout to be disabled)
#     correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
#     accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    
def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
#         print('y_true_tmp', y_true_tmp, 'y_pred_tmp', y_pred_tmp)
        real_ = np.nonzero(y_true_tmp)[0].tolist()
#         right_num = len(real)
        pred_ = np.nonzero(y_pred_tmp)[0].tolist()
#         print('real', real, 'pred', pred)
        if len(real_) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real_).intersection(set(pred_))) / len(real_))
#     print(acc)
    return(np.array(acc).mean())

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))

# with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
# Run the initializer
sess.run(init)
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)

for step in range(1, training_steps+1):
    batch_x, batch_y = batcher.next_batch()
    # Reshape data to get 28 seq of 28 elements
#         batch_x = batch_x.reshape((batch_size, timesteps, num_input))
    # Run optimization op (backprop)
    sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, pred = sess.run([loss_op, tf.round(prediction)], feed_dict={X: batch_x, Y: batch_y})
        
        test_out = sess.run(tf.round(prediction), feed_dict={X: X_test})
        print("Step " + str(step) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Train Acc= " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))
        
    if step % checkpoint_step == 0:
        saver = tf.train.Saver()
        saver.save(sess, '../models/lstm_big_dropout/lstm_big_dropout', global_step=step)

saver = tf.train.Saver()
saver.save(sess, '../models/lstm_big_dropout/lstm_big_dropout', global_step=step)
        

print("Optimization Finished!")

#     # Calculate accuracy for 128 mnist test images
#     print("Testing Accuracy:", \
#         sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))

Step 1, Loss= 0.9152, Train Acc= 0.551, Test Acc = 0.417
Step 1000, Loss= 0.0243, Train Acc= 0.000, Test Acc = 0.000
Step 2000, Loss= 0.0240, Train Acc= 0.000, Test Acc = 0.001
Step 3000, Loss= 0.0203, Train Acc= 0.003, Test Acc = 0.005
Step 4000, Loss= 0.0223, Train Acc= 0.000, Test Acc = 0.012
Step 5000, Loss= 0.0227, Train Acc= 0.000, Test Acc = 0.001
Step 6000, Loss= 0.0217, Train Acc= 0.000, Test Acc = 0.013
Step 7000, Loss= 0.0201, Train Acc= 0.006, Test Acc = 0.013
Step 8000, Loss= 0.0182, Train Acc= 0.017, Test Acc = 0.020
Step 9000, Loss= 0.0216, Train Acc= 0.025, Test Acc = 0.023
Step 10000, Loss= 0.0192, Train Acc= 0.019, Test Acc = 0.024
Step 11000, Loss= 0.0171, Train Acc= 0.020, Test Acc = 0.024
Step 12000, Loss= 0.0200, Train Acc= 0.042, Test Acc = 0.025
Step 13000, Loss= 0.0175, Train Acc= 0.019, Test Acc = 0.027
Step 14000, Loss= 0.0164, Train Acc= 0.054, Test Acc = 0.028
Step 15000, Loss= 0.0172, Train Acc= 0.040, Test Acc = 0.036
Step 16000, Loss= 0.0166, Train Acc= 

Step 134000, Loss= 0.0025, Train Acc= 0.791, Test Acc = 0.070
Step 135000, Loss= 0.0015, Train Acc= 0.887, Test Acc = 0.073
Step 136000, Loss= 0.0022, Train Acc= 0.866, Test Acc = 0.070
Step 137000, Loss= 0.0021, Train Acc= 0.841, Test Acc = 0.070
Step 138000, Loss= 0.0021, Train Acc= 0.804, Test Acc = 0.071
Step 139000, Loss= 0.0019, Train Acc= 0.897, Test Acc = 0.067
Step 140000, Loss= 0.0016, Train Acc= 0.904, Test Acc = 0.071
Step 141000, Loss= 0.0015, Train Acc= 0.926, Test Acc = 0.071
Step 142000, Loss= 0.0022, Train Acc= 0.823, Test Acc = 0.069
Step 143000, Loss= 0.0013, Train Acc= 0.864, Test Acc = 0.069
Step 144000, Loss= 0.0012, Train Acc= 0.914, Test Acc = 0.073
Step 145000, Loss= 0.0020, Train Acc= 0.805, Test Acc = 0.071
Step 146000, Loss= 0.0022, Train Acc= 0.863, Test Acc = 0.070
Step 147000, Loss= 0.0018, Train Acc= 0.876, Test Acc = 0.069
Step 148000, Loss= 0.0023, Train Acc= 0.871, Test Acc = 0.072
Step 149000, Loss= 0.0021, Train Acc= 0.890, Test Acc = 0.071
Step 150

# Testing

In [None]:
num = 1
pred = tf.round(tf.nn.sigmoid(logits))

def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
#         print('y_true_tmp', y_true_tmp, 'y_pred_tmp', y_pred_tmp)
        real_ = np.nonzero(y_true_tmp)[0].tolist()
#         right_num = len(real)
        pred_ = np.nonzero(y_pred_tmp)[0].tolist()
#         print('real', real, 'pred', pred)
        if len(real) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real_).intersection(set(pred_))) / len(real_))
#     print(acc)
    return(np.array(acc).mean())

# correct_predictions = tf.equal(tf.round(tf.nn.sigmoid(logits)), Y)
# accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
# pred = sess.run(accuracy, feed_dict={X: X_test, Y: y_test})
pred = sess.run(pred, feed_dict={X: X_test, Y: y_test})
evaluate_multilabel(pred, y_test)
# real = y_test[num]
# tf.equal(tf.round(tf.nn.sigmoid(pred)), tf.round(y_))
# pred > 0.5

# real = np.nonzero(real)[0].tolist()
# right_num = len(real)
# pred = np.argpartition(pred, -right_num)[-right_num:]
# print(real, pred)
# print('real_classes', mlb.classes_[np.array(real)])
# print('predicted classes', mlb.classes_[np.array(pred)])

In [None]:
with tf.Session() as session:
    save = tf.train.Saver()
    save.restore(session, '../models/lstm_relu/lstm_relu-150000')
    pred = tf.round(tf.nn.sigmoid(logits))
    print(session.run(pred, feed_dict={X: X_test, Y: y_test}))

In [None]:
query = ['dthc']
def filter_categories(cat_list):
    return set(cat_list).intersection(query) == set(query)
data[data['products'].apply(filter_categories)]

In [None]:
set(['qwe', 'asd']).intersection(['qwe', 'asd']) == set(['asd', 'qwe'])

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

In [None]:
batcher = DataBatcher(X_train, y_train, batch_size_=batch_size)


In [None]:
print(mnist.train.images.shape)
print(mnist.train.labels.shape)
print(mnist.test.images.shape)

In [None]:
predict_ = sess.run(prediction, feed_dict={X: [batch_x[3]]})

In [None]:
np.argmax(predict_)
# pred
# batch_y[3]
predict_

In [None]:
obj = 5
predict_ = sess.run(prediction, feed_dict={X: [batch_x[obj]]})
real = np.nonzero(batch_y[obj])[0].tolist()
# batch_y[1]
pred = np.argpartition(predict_[0], -len(real))[-len(real):]
print('real', real)
print('pred', pred)
print('classes real', mlb.classes_[real])
print('classes predicted', mlb.classes_[pred])

len(set(real).intersection(set(pred))) / len(real)