In [1]:
import pandas
import numpy as np
import tensorflow as tf

tf.__version__

'1.4.1'

In [2]:
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)
data = data[['description', 'products']][pandas.notnull(data['products'])].copy().reset_index(drop=True)
print(data.shape)

(83897, 18)
(47516, 2)


In [3]:
data.head()

Unnamed: 0,description,products
0,Welch allyn combines its practical understandi...,"[power supply, body sub assy, medical, valve b..."
1,In line with the company s intention to ...,"[imo, advertising materials, point, imo label,..."
2,Services redaelli ricambi offers the ability t...,"[auto spare parts, tie rod, tie rod end, auto ..."
3,STROTHMANN not only delivers suitable mechanic...,"[covers non automated, demurrage rules form, r..."
4,"Established\nin 1991, tien jiang enterprise co...","[rubber, polyester, nylon, boot, support]"


In [4]:
data.loc[0]['description']

'Welch allyn combines its practical understanding of clinical needs with its visionary spirit to develop solutions that assess, diagnose, treat, and manage a variety of illnesses and diseases.as a leading global manufacturer of medical diagnostic equipment, we offer a range of connected solutions. with nearly 2,500 employees working in 26 countries, we focus on the customer and imagine how healthcare will be delivered in the future to develop tools and future-proof technologies.our professional customers include physicians’ practices, community clinics, skilled nursing facilities, and emergency departments—places where 95% of patients first seek medical treatment.our welch allyn home division delivers solutions that make home monitoring of blood pressure readings accurate and easy. with the same attention to quality that we offer to our professional customers, we’re now bringing physician-trusted solutions to the home, helping people better manage their health.'

In [5]:
data.loc[0]['products']

array(['power supply', 'body sub assy', 'medical', 'valve body sub assy',
       'digital blood pressure'], dtype=object)

# TEXT PROCESSING

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter

# nltk.download('stopwords')

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

data['products'] = data['products'].apply(lambda x: np.array([' '.join(tokenizer.tokenize(product)).lower()
                                                              for product in x.tolist()]))

all_products = []
for prod_list in data['products'][pandas.notnull(data['products'])].values:
    all_products += [' '.join(tokenizer.tokenize(product))
                     for product in prod_list.tolist()]
    
counter = Counter(all_products)
print('unique categories', len(counter.most_common()))

most_common = [product[0] for product in counter.most_common(600)]

def filter_categories(x):
    new_categories = np.array([product 
                               for product in x.tolist()
                               if product in most_common])
    if new_categories.shape[0] == 0:
        return np.nan
    return new_categories

lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def filter_descriptions(text):
    cleaned_text = [token#lemma.lemmatize(token)
                      for token in tokenizer.tokenize(text.lower())
                      if token not in stopwords_cached]
    if len(cleaned_text) == 0:
        return np.nan
    return ' '.join(cleaned_text)

data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

cleaned_data = data[(data['products'].notnull()) & (data['description'].notnull())].copy().reset_index(drop=True)
cleaned_data.shape

unique categories 71111


(35475, 2)

In [7]:
cleaned_data.head()

Unnamed: 0,description,products
0,welch allyn combines practical understanding c...,"[power supply, medical]"
1,line company intention support international g...,"[imo, point, marine pollutant]"
2,services redaelli ricambi offers ability produ...,"[auto spare parts, auto spare, spare parts]"
3,strothmann delivers suitable mechanical system...,[line]
4,established tien jiang enterprise co ltd one s...,"[rubber, polyester, nylon, support]"


### fasttext model

In [8]:
from gensim.models.wrappers.fasttext import FastText

model = FastText.load_fasttext_format('../models/wiki.simple.bin')

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import MultiLabelBinarizer


# nltk.download('stopwords')

# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = RegexpTokenizer(r'[a-z]+')
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def get_embedding_matrix(text, first_n=50):
#     print(text)
    matrix = []
    text = [token#lemma.lemmatize(token)
            for token in tokenizer.tokenize(text)]
    for word in text[:first_n]:
        try:
            word_embedding = model.wv[word]
            matrix.append(model.wv[word])
        except KeyError:
            print("error in word", word)
            matrix.append(np.zeros(300))
    
    matrix = np.array(matrix)
    # fill text embeddings with seq_len < first_n with zeros
#     print(matrix.shape[0])
    if matrix.shape[0] < first_n:
        matrix = np.vstack((matrix, np.zeros((first_n - matrix.shape[0], 300))))
    return matrix

embeddings = []
for text in cleaned_data['description'].values:
    embeddings.append(get_embedding_matrix(text))
#     print(embeddings[-1].shape)
    
# onehot_y_df = pandas.get_dummies(data['severity'])

train_data = np.array(embeddings)

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([item.tolist() for item in cleaned_data['products'].values])

error in word zc
error in word zc
error in word vz
error in word vz
error in word vz


# LSTM

In [10]:
import math

class DataBatcher():
    def __init__(self, _X, _y, _batch_size=30):
        self._X = _X
        self._y = _y
        self._batch_size = _batch_size
        self._resplit = True
        self._num_examples = self._y.shape[0]
    
    def next_batch(self):
        if self._resplit:
#             print('splitting')
            perm0 = np.arange(self._num_examples)
            np.random.shuffle(perm0)
            self._batches_indexes = np.array_split(perm0, math.ceil(perm0.shape[0] / self._batch_size))
#             print(self._batches_indexes)
            self._batch_counter = -1
            self._resplit = False

        self._batch_counter += 1
        if self._batches_indexes[self._batch_counter].shape[0] < self._batch_size:
#             print('hstacking')
            self._resplit = True
            ind = self._batch_counter
#             self._batch_counter = -1
            missing_num = self._batch_size - self._batches_indexes[ind].shape[0]
            return self._X[np.hstack((self._batches_indexes[ind], self._batches_indexes[0][:missing_num]))],\
                   self._y[np.hstack((self._batches_indexes[ind], self._batches_indexes[0][:missing_num]))]
        
        return self._X[self._batches_indexes[self._batch_counter]], self._y[self._batches_indexes[self._batch_counter]]



In [11]:
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
from sklearn.model_selection import train_test_split

graph = tf.Graph()

with graph.as_default():

    # Training Parameters
    learning_rate = 0.001
    training_steps = 150000
    batch_size = 80
    display_step = 1000
    checkpoint_step = 20000
    
    # Network Parameters
    num_input = 300 # MNIST data input (img shape: 28*28)
    timesteps = 50 # timesteps
    num_hidden = 400 # hidden layer num of features
    num_classes = 600 # MNIST total classes (0-9 digits)
    num_layers = 3
    input_keep_prob = 0.75
    output_keep_prob = 0.75
    cell = tf.nn.rnn_cell.BasicLSTMCell

    # tf Graph input
    X = tf.placeholder("float", [None, timesteps, num_input])
    Y = tf.placeholder("float", [None, num_classes])

    # Define weights
    weights = {
        'out': tf.Variable(tf.random_normal([2 * num_hidden, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    def RNN(x, weights, biases):

        # Prepare data shape to match `rnn` function requirements
        # Current data input shape: (batch_size, timesteps, n_input)
        # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

        # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
        x = tf.unstack(x, timesteps, 1)

        # Cerate forward and backward cells
        cells_fw = [cell(num_hidden, activation=tf.sigmoid) for _ in range(num_layers)]
        cells_bw = [cell(num_hidden, activation=tf.sigmoid) for _ in range(num_layers)]
        
        # Add dropout
        cells_fw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=input_keep_prob,
                                                  output_keep_prob=output_keep_prob
                                                 ) for cell in cells_fw]
        cells_bw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=input_keep_prob,
                                                  output_keep_prob=output_keep_prob
                                                 ) for cell in cells_bw]
        
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            dtype=tf.float32)
        
#         rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
        
        return tf.matmul(outputs[-1], weights['out']) + biases['out']
        

    logits = RNN(X, weights, biases)
    prediction = tf.nn.softmax(logits)

    # Define loss and optimizer
    

# tf.nn.sigmoid_cross_entropy_with_logits 
#     loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
#         logits=logits, labels=Y))
    loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    # Evaluate model (with test logits, for dropout to be disabled)
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    
def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
#         print('y_true_tmp', y_true_tmp, 'y_pred_tmp', y_pred_tmp)
        real = np.nonzero(y_true_tmp)[0].tolist()
        right_num = len(real)
        pred = np.argpartition(y_pred_tmp, -right_num)[-right_num:]
#         print('real', real, 'pred', pred)
        if len(real) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real).intersection(set(pred))) / len(real))
#     print(acc)
    return(np.array(acc).mean())

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))

# with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
# Run the initializer
sess.run(init)
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)

for step in range(1, training_steps+1):
    batch_x, batch_y = batcher.next_batch()
    # Reshape data to get 28 seq of 28 elements
#         batch_x = batch_x.reshape((batch_size, timesteps, num_input))
    # Run optimization op (backprop)
    sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, pred = sess.run([loss_op, prediction], feed_dict={X: batch_x, Y: batch_y})
        
        test_out = sess.run(prediction, feed_dict={X: X_test})
        print("Step " + str(step) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Train Acc= " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))
        
    if step % checkpoint_step == 0:
        saver = tf.train.Saver()
        saver.save(sess, '../models/lstm_3_layers/lstm_3_layers', global_step=step)

saver = tf.train.Saver()
saver.save(sess, '../models/lstm_3_layers/lstm_3_layers', global_step=step)
        

print("Optimization Finished!")

#     # Calculate accuracy for 128 mnist test images
#     print("Testing Accuracy:", \
#         sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))

Step 1, Loss= 2.8073, Train Acc= 0.003, Test Acc = 0.006
Step 1000, Loss= 0.0260, Train Acc= 0.021, Test Acc = 0.010
Step 2000, Loss= 0.0262, Train Acc= 0.000, Test Acc = 0.017
Step 3000, Loss= 0.0223, Train Acc= 0.036, Test Acc = 0.020
Step 4000, Loss= 0.0219, Train Acc= 0.037, Test Acc = 0.022
Step 5000, Loss= 0.0209, Train Acc= 0.049, Test Acc = 0.026
Step 6000, Loss= 0.0211, Train Acc= 0.015, Test Acc = 0.032
Step 7000, Loss= 0.0210, Train Acc= 0.039, Test Acc = 0.040
Step 8000, Loss= 0.0208, Train Acc= 0.050, Test Acc = 0.048
Step 9000, Loss= 0.0204, Train Acc= 0.067, Test Acc = 0.049
Step 10000, Loss= 0.0198, Train Acc= 0.030, Test Acc = 0.059
Step 11000, Loss= 0.0180, Train Acc= 0.060, Test Acc = 0.062
Step 12000, Loss= 0.0197, Train Acc= 0.067, Test Acc = 0.070
Step 13000, Loss= 0.0187, Train Acc= 0.085, Test Acc = 0.074
Step 14000, Loss= 0.0192, Train Acc= 0.085, Test Acc = 0.080
Step 15000, Loss= 0.0172, Train Acc= 0.100, Test Acc = 0.087
Step 16000, Loss= 0.0183, Train Acc= 

Step 134000, Loss= 0.0053, Train Acc= 0.735, Test Acc = 0.082
Step 135000, Loss= 0.0047, Train Acc= 0.757, Test Acc = 0.082
Step 136000, Loss= 0.0040, Train Acc= 0.809, Test Acc = 0.083
Step 137000, Loss= 0.0047, Train Acc= 0.724, Test Acc = 0.083
Step 138000, Loss= 0.0042, Train Acc= 0.769, Test Acc = 0.078
Step 139000, Loss= 0.0048, Train Acc= 0.721, Test Acc = 0.082
Step 140000, Loss= 0.0048, Train Acc= 0.729, Test Acc = 0.082
Step 141000, Loss= 0.0051, Train Acc= 0.745, Test Acc = 0.085
Step 142000, Loss= 0.0047, Train Acc= 0.768, Test Acc = 0.081
Step 143000, Loss= 0.0037, Train Acc= 0.800, Test Acc = 0.086
Step 144000, Loss= 0.0045, Train Acc= 0.787, Test Acc = 0.081
Step 145000, Loss= 0.0048, Train Acc= 0.709, Test Acc = 0.076
Step 146000, Loss= 0.0048, Train Acc= 0.759, Test Acc = 0.085
Step 147000, Loss= 0.0042, Train Acc= 0.792, Test Acc = 0.082
Step 148000, Loss= 0.0041, Train Acc= 0.780, Test Acc = 0.083
Step 149000, Loss= 0.0032, Train Acc= 0.883, Test Acc = 0.081
Step 150

In [None]:
from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

graph = tf.Graph()

with graph.as_default():
    # Training Parameters
    learning_rate = 0.001
    training_steps = 10000
    batch_size = 128
    display_step = 200

    # Network Parameters
    num_input = 28 # MNIST data input (img shape: 28*28)
    timesteps = 28 # timesteps
    num_hidden = 128 # hidden layer num of features
    num_classes = 10 # MNIST total classes (0-9 digits)

    # tf Graph input
    X = tf.placeholder("float", [None, timesteps, num_input])
    Y = tf.placeholder("float", [None, num_classes])



    # Define weights
    weights = {
        'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    def RNN(x, weights, biases):

        # Prepare data shape to match `rnn` function requirements
        # Current data input shape: (batch_size, timesteps, n_input)
        # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

        # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
        x = tf.unstack(x, timesteps, 1)

        # Define a lstm cell with tensorflow
        lstm_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

        # Get lstm cell output
        outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

        # Linear activation, using rnn inner loop last output
        return tf.matmul(outputs[-1], weights['out']) + biases['out']

    logits = RNN(X, weights, biases)
    prediction = tf.nn.softmax(logits)

    # Define loss and optimizer
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    # Evaluate model (with test logits, for dropout to be disabled)
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()


gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))
# Start training
# with tf.Session() as sess:

# Run the initializer
sess.run(init)

for step in range(1, training_steps+1):
    batch_x, batch_y = mnist.train.next_batch(batch_size)
#     batcher = DataBatcher(mnist.train.images, mnist.train.labels, _batch_size=batch_size)
#     batch_x, batch_y = batcher.next_batch()
    # Reshape data to get 28 seq of 28 elements
    batch_x = batch_x.reshape((batch_size, timesteps, num_input))
    # Run optimization op (backprop)
    sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
                                                             Y: batch_y})
        print("Step " + str(step) + ", Minibatch Loss= " + \
              "{:.4f}".format(loss) + ", Training Accuracy= " + \
              "{:.3f}".format(acc))

print("Optimization Finished!")

# Calculate accuracy for 128 mnist test images
test_len = 128
test_data = mnist.test.images[:test_len].reshape((-1, timesteps, num_input))
test_label = mnist.test.labels[:test_len]
print("Testing Accuracy:", \
    sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))



In [None]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

In [None]:
batcher = DataBatcher(X_train, y_train, batch_size_=batch_size)


In [None]:
print(mnist.train.images.shape)
print(mnist.train.labels.shape)
print(mnist.test.images.shape)

In [None]:
predict_ = sess.run(prediction, feed_dict={X: [batch_x[3]]})

In [None]:
np.argmax(predict_)
# pred
# batch_y[3]
predict_

In [None]:
obj = 5
predict_ = sess.run(prediction, feed_dict={X: [batch_x[obj]]})
real = np.nonzero(batch_y[obj])[0].tolist()
# batch_y[1]
pred = np.argpartition(predict_[0], -len(real))[-len(real):]
print('real', real)
print('pred', pred)
print('classes real', mlb.classes_[real])
print('classes predicted', mlb.classes_[pred])

len(set(real).intersection(set(pred))) / len(real)

In [None]:
mlb.classes_[512]

In [None]:
def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
        print('y_true_tmp', y_true_tmp, 'y_pred_tmp', y_pred_tmp)
        real = np.nonzero(y_true_tmp)[0].tolist()
        right_num = len(real)
        pred = np.argpartition(y_pred_tmp, -right_num)[-right_num:]
        print('real', real, 'pred', pred)
        if len(real) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real).intersection(set(pred))) / len(real))
    print(acc)
    return(np.array(acc).mean())
        
evaluate_multilabel([[0,0,1], [0,0,0]], [[0,0,1], [1,1,0]])

In [None]:
np.nonzero([0,1,1])[0].tolist()