In [1]:
import pandas
import numpy as np
import tensorflow as tf

tf.__version__

'1.4.1'

In [2]:
# Read the raw parquet dump, then keep only the two columns used downstream
# and only the rows that actually carry a product list.
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)
has_products = data['products'].notnull()
data = data.loc[has_products, ['description', 'products']].copy().reset_index(drop=True)
print(data.shape)

(83897, 18)
(47516, 2)


In [3]:
# Preview the first rows of the frame after dropping rows without products.
data.head()

Unnamed: 0,description,products
0,Welch allyn combines its practical understandi...,"[power supply, body sub assy, medical, valve b..."
1,In line with the company s intention to ...,"[imo, advertising materials, point, imo label,..."
2,Services redaelli ricambi offers the ability t...,"[auto spare parts, tie rod, tie rod end, auto ..."
3,STROTHMANN not only delivers suitable mechanic...,"[covers non automated, demurrage rules form, r..."
4,"Established\nin 1991, tien jiang enterprise co...","[rubber, polyester, nylon, boot, support]"


In [4]:
# Full description text of the first row (single .loc lookup, no chained indexing).
data.loc[0, 'description']

'Welch allyn combines its practical understanding of clinical needs with its visionary spirit to develop solutions that assess, diagnose, treat, and manage a variety of illnesses and diseases.as a leading global manufacturer of medical diagnostic equipment, we offer a range of connected solutions. with nearly 2,500 employees working in 26 countries, we focus on the customer and imagine how healthcare will be delivered in the future to develop tools and future-proof technologies.our professional customers include physicians’ practices, community clinics, skilled nursing facilities, and emergency departments—places where 95% of patients first seek medical treatment.our welch allyn home division delivers solutions that make home monitoring of blood pressure readings accurate and easy. with the same attention to quality that we offer to our professional customers, we’re now bringing physician-trusted solutions to the home, helping people better manage their health.'

In [5]:
# Product list attached to the first row (single .loc lookup, no chained indexing).
data.loc[0, 'products']

array(['power supply', 'body sub assy', 'medical', 'valve body sub assy',
       'digital blood pressure'], dtype=object)

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter

# nltk.download('stopwords')

# Tokens are maximal runs of ASCII letters; digits and punctuation act as separators.
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
# Loaded once so the cleaning functions below do not re-read the corpus on every call.
stopwords_cached = stopwords.words('english')

def clean_categories(x):
    """Normalise every category string in ``x``.

    Each category is lower-cased, split into tokens with the module-level
    ``tokenizer`` and stripped of tokens found in ``stopwords_cached``.
    Categories that end up empty are dropped.

    Returns a NumPy array of the cleaned strings (original order), or
    ``np.nan`` when no category survives.
    """
    cleaned = []
    for raw_category in x:
        kept_words = []
        for word in tokenizer.tokenize(raw_category.lower()):
            if word in stopwords_cached:
                continue
            kept_words.append(word)
        normalised = ' '.join(kept_words)
        if normalised:
            cleaned.append(normalised)
    if not cleaned:
        return np.nan
    return np.array(cleaned)

# Clean every product list and drop the rows whose list became empty.
data['products'] = data['products'].apply(clean_categories)
data = data[data['products'].notnull()].copy().reset_index(drop=True)

# Flatten all product lists into one list of normalised product names.
all_products = [
    ' '.join(tokenizer.tokenize(product))
    for prod_list in data['products'].values
    for product in prod_list.tolist()
]

counter = Counter(all_products)
print('unique categories', len(counter))

# The 100 most frequent product names become the label vocabulary.
most_common = [name for name, _count in counter.most_common(100)]

def filter_categories(x):
    """Keep only the products of ``x`` that appear in ``most_common``.

    ``x`` must expose ``tolist()`` (as a NumPy array does). Returns a NumPy
    array of the retained products in their original order, or ``np.nan``
    when nothing is retained.
    """
    retained = []
    for product in x.tolist():
        if product in most_common:
            retained.append(product)
    if not retained:
        return np.nan
    return np.array(retained)

# Lemmatizer used by filter_descriptions below.
lemma = nltk.wordnet.WordNetLemmatizer()
# Re-assigns the same NLTK English stop-word list that was already loaded earlier in this cell.
stopwords_cached = stopwords.words('english')

def filter_descriptions(text):
    """Lower-case, tokenize, drop stop words and lemmatize ``text``.

    Uses the module-level ``tokenizer``, ``stopwords_cached`` and ``lemma``.
    Returns the surviving lemmas joined by single spaces, or ``np.nan`` when
    no token survives.
    """
    lemmas = []
    for token in tokenizer.tokenize(text.lower()):
        if token in stopwords_cached:
            continue
        lemmas.append(lemma.lemmatize(token))
    if not lemmas:
        return np.nan
    return ' '.join(lemmas)

# Restrict labels to the frequent vocabulary and normalise the description text.
data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

# Keep only rows that still have at least one label and a non-empty description.
keep_row = data['products'].notnull() & data['description'].notnull()
data = data[keep_row].reset_index(drop=True)
data.shape

unique categories 70713


(22403, 2)

# TEXT PROCESSING

In [7]:
# Preview the frame after label filtering and description cleaning.
data.head()

Unnamed: 0,description,products
0,welch allyn combine practical understanding cl...,"[power supply, medical]"
1,line company intention support international g...,[imo]
2,service redaelli ricambi offer ability produce...,"[auto spare parts, auto spare, spare parts]"
3,established tien jiang enterprise co ltd one s...,"[rubber, polyester]"
4,songwei dedicate become benchmark manufacturin...,[electric]


### fasttext model

In [8]:
from gensim.models.wrappers.fasttext import FastText

# Load pretrained fastText vectors from the binary file; `model.wv[word]` is the
# per-token lookup used when building the embedding matrices.
# NOTE(review): downstream code assumes 300-wide vectors — confirm for this model file.
model = FastText.load_fasttext_format('../models/wiki.simple.bin')

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import MultiLabelBinarizer


# nltk.download('stopwords')

# tokenizer = RegexpTokenizer(r'\w+')
# Lower-case ASCII letters only; filter_descriptions already lower-cased the descriptions.
tokenizer = RegexpTokenizer(r'[a-z]+')
# NOTE: `lemma` and `stopwords_cached` are re-created here but get_embedding_matrix
# does not use them (its lemmatize call is commented out).
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def get_embedding_matrix(text, first_n=50, embedding_dim=300):
    """Turn ``text`` into a fixed-size ``(first_n, embedding_dim)`` matrix.

    The text is split with the module-level ``tokenizer``; the first
    ``first_n`` tokens are looked up in ``model.wv`` and written row by row.

    Args:
        text: Input string.
        first_n: Number of rows (tokens) in the result. Longer texts are
            truncated, shorter ones are zero-padded at the end.
        embedding_dim: Width of one word vector; must match ``model.wv``.

    Returns:
        A float64 NumPy array of shape ``(first_n, embedding_dim)``. Tokens
        for which ``model.wv`` raises ``KeyError`` are reported on stdout and
        left as an all-zero row. A text with no tokens yields an all-zero
        matrix (stacking an empty array onto the padding used to raise).
    """
    tokens = tokenizer.tokenize(text)[:first_n]
    # Pre-allocating the padded matrix makes truncation, padding and the
    # empty-text case one code path and avoids a second lookup per word.
    matrix = np.zeros((first_n, embedding_dim))
    for row, word in enumerate(tokens):
        try:
            matrix[row] = model.wv[word]
        except KeyError:
            print("error in word", word)
    return matrix

# One fixed-size embedding matrix per description, stacked into a single array.
embeddings = [get_embedding_matrix(text) for text in data['description'].values]
train_data = np.array(embeddings)

# Multi-hot label matrix: one column per distinct product name.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([item.tolist() for item in data['products'].values])

error in word vz
error in word vz
error in word vz


In [10]:
# TODO: merge near-duplicate labels (e.g. "aluminium"/"aluminum", "spare part"/"spare parts")
# into a single category before binarizing.
mlb.classes_

array(['aluminium', 'aluminum', 'assembly', 'auto parts', 'auto spare',
       'auto spare parts', 'automobile', 'automotive parts',
       'automotive spare', 'automotive spare parts', 'ball', 'battery',
       'book', 'bottle', 'brake', 'cable', 'cap', 'car', 'ceramic',
       'ceramic tile', 'chair', 'component', 'components', 'compressor',
       'computer', 'copper', 'cotton', 'cover', 'cylinder', 'display',
       'electric', 'electrical', 'engine', 'fabric', 'film', 'filter',
       'furniture', 'gear', 'glass', 'granite', 'hand', 'hardware',
       'hose', 'housing', 'hydraulic', 'imo', 'industrial', 'injection',
       'leather', 'led', 'light', 'lighting', 'machinery',
       'machinery parts', 'medical', 'mold', 'motor', 'motor vehicle',
       'nut', 'oil', 'pipe', 'plastic parts', 'plate', 'polyester',
       'powder', 'power', 'power supply', 'pump', 'pvc', 'ring', 'rubber',
       'screw', 'seat', 'sheet', 'slab', 'software', 'solid wood',
       'spare part', 'spare par

# LSTM

In [None]:
import math

class DataBatcher():
    """Serve shuffled mini-batches of exactly ``_batch_size`` rows, forever.

    Every epoch draws a fresh random permutation of the example indices
    (via ``np.random.shuffle``) and walks through it in consecutive slices of
    ``_batch_size``. The final, possibly shorter slice is topped up with
    indices from the start of the same permutation, so every returned batch
    has the same length. After the last slice the next call starts a new
    epoch.

    Args:
        _X: Features; must support indexing with an integer NumPy array.
        _y: Labels; must expose ``shape[0]`` and the same kind of indexing.
        _batch_size: Number of rows per batch (positive integer).
    """

    def __init__(self, _X, _y, _batch_size=30):
        self._X = _X
        self._y = _y
        self._batch_size = _batch_size
        self._resplit = True
        self._num_examples = self._y.shape[0]

    def _start_epoch(self):
        """Shuffle the indices and cut them into ``_batch_size``-sized slices."""
        if self._batch_size <= 0:
            raise ValueError("_batch_size must be a positive integer")
        if self._num_examples == 0:
            raise ValueError("cannot draw batches from an empty data set")
        perm0 = np.arange(self._num_examples)
        np.random.shuffle(perm0)
        self._permutation = perm0
        # Explicit slicing instead of np.array_split: array_split produces
        # near-equal chunks, which are usually *not* _batch_size long.
        self._batches_indexes = [
            perm0[start:start + self._batch_size]
            for start in range(0, self._num_examples, self._batch_size)
        ]
        self._batch_counter = -1
        self._resplit = False

    def next_batch(self):
        """Return the next ``(X_batch, y_batch)`` pair of ``_batch_size`` rows."""
        if self._resplit:
            self._start_epoch()

        self._batch_counter += 1
        indexes = self._batches_indexes[self._batch_counter]

        # Reshuffle after the last slice, whether or not it was a short one;
        # otherwise an evenly divisible data set runs off the end of the list.
        if self._batch_counter == len(self._batches_indexes) - 1:
            self._resplit = True

        missing_num = self._batch_size - indexes.shape[0]
        if missing_num > 0:
            # np.resize repeats the permutation cyclically, so the batch is
            # full even when there are fewer examples than _batch_size.
            padding = np.resize(self._permutation, missing_num)
            indexes = np.hstack((indexes, padding))

        return self._X[indexes], self._y[indexes]



In [None]:
from sklearn.model_selection import train_test_split

# Dedicated graph holding the bidirectional LSTM classifier and its training ops.
graph = tf.Graph()

with graph.as_default():

    # Training Parameters
    learning_rate = 0.001
    training_steps = 220000
    batch_size = 80
    display_step = 1000  # print loss / accuracy every this many steps
    checkpoint_step = 10000  # write a checkpoint every this many steps
    
    # Network Parameters
    num_input = 300  # width of one word vector fed per timestep
    timesteps = 50  # tokens per description; equals get_embedding_matrix's default first_n
    num_hidden = 450  # units per LSTM cell
    num_classes = 100  # presumably the 100 most common products — confirm against mlb.classes_
    num_layers = 3
#     input_keep_prob = 0.65
#     output_keep_prob = 0.65
    cell = tf.contrib.rnn.LSTMCell

    # tf Graph input
    X = tf.placeholder("float", [None, timesteps, num_input])
    Y = tf.placeholder("float", [None, num_classes])

    # Define weights
    # The output projection takes 2 * num_hidden inputs.
    # NOTE(review): presumably because forward and backward outputs are depth-concatenated — confirm.
    weights = {
        'out': tf.Variable(tf.random_normal([2 * num_hidden, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    def RNN(x, weights, biases):
        """Build the stacked bidirectional LSTM and return the class logits.

        ``x`` is the (batch, timesteps, num_input) input tensor; the result is
        the last timestep's output projected through ``weights['out']`` plus
        ``biases['out']``.
        """

        # Prepare data shape to match `rnn` function requirements
        # Current data input shape: (batch_size, timesteps, n_input)
        # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

        # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
        x = tf.unstack(x, timesteps, 1)

        # Create forward and backward cells
        cells_fw = [cell(num_hidden, activation=tf.tanh) for _ in range(num_layers)]
        cells_bw = [cell(num_hidden, activation=tf.tanh) for _ in range(num_layers)]
        
        # One input keep-probability per layer. zip() below truncates to the shorter
        # list, so this must have at least num_layers entries.
        keep_probs = [0.75, 0.65, 0.65]
        # Add dropout
        # NOTE(review): the keep probabilities are Python constants, so this graph has
        # no switch to turn dropout off; every run of `prediction` uses the wrapped cells.
        cells_fw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=keep_prob_
                                                 ) for cell, keep_prob_ in zip(cells_fw, keep_probs)]
        cells_bw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=keep_prob_
                                                 ) for cell, keep_prob_ in zip(cells_bw, keep_probs)]
        
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            dtype=tf.float32)
        
        # Only the output at the final timestep is projected to class logits.
        return tf.matmul(outputs[-1], weights['out']) + biases['out']

    logits = RNN(X, weights, biases)
    # Independent per-class probabilities (multi-label), hence sigmoid rather than softmax.
    prediction = tf.nn.sigmoid(logits)

    # Define loss and optimizer
    loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    init = tf.global_variables_initializer()
    
def evaluate_multilabel(y_pred, y_true):
    """Mean, over samples, of the share of true labels that were predicted.

    Both arguments are iterated row by row; the non-zero positions of a row
    are its label indices. A sample with no true label scores 0.0.
    """
    scores = []
    for predicted_row, true_row in zip(y_pred, y_true):
        true_idx = np.nonzero(true_row)[0].tolist()
        predicted_idx = np.nonzero(predicted_row)[0].tolist()
        if not true_idx:
            # No true labels: counted as zero correct answers.
            scores.append(0.0)
        else:
            hits = set(true_idx) & set(predicted_idx)
            scores.append(len(hits) / len(true_idx))
    return np.array(scores).mean()

# Hold out 20% of the examples for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=42)

# Cap this process at 40% of GPU memory and open an interactive session on `graph`.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.InteractiveSession(graph=graph, config=session_config)

# Run the initializer
sess.run(init)
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)

# Build the rounded-prediction op and the checkpoint writer once. Creating them
# inside the loop adds new nodes to the graph on every display/checkpoint step.
with graph.as_default():
    rounded_prediction = tf.round(prediction)
    saver = tf.train.Saver()

step = 0  # defined up front so the final save works even if training stops immediately

try:
    for step in range(1, training_steps + 1):
        batch_x, batch_y = batcher.next_batch()
        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, pred = sess.run([loss_op, rounded_prediction],
                                  feed_dict={X: batch_x, Y: batch_y})

            test_out = sess.run(rounded_prediction, feed_dict={X: X_test})
            print("Step " + str(step) + ", Loss= " + \
                  "{:.4f}".format(loss) + ", Train Acc= " + \
                  "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
                  "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))

        if step % checkpoint_step == 0:
            saver.save(sess, '../models/lstm_big_dropout/lstm_100_classes', global_step=step)
except KeyboardInterrupt:
    # The original `except` clause was left unfinished; interrupting the kernel is
    # presumably the case it was meant to cover, so the final save below still runs.
    print("Training interrupted at step " + str(step))

# Always leave a checkpoint for the last completed step.
saver.save(sess, '../models/lstm_big_dropout/lstm_100_classes', global_step=step)


print("Optimization Finished!")

#     # Calculate accuracy for 128 mnist test images
#     print("Testing Accuracy:", \
#         sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))

Step 1, Loss= 0.4753, Train Acc= 0.224, Test Acc = 0.204
Step 1000, Loss= 0.0625, Train Acc= 0.062, Test Acc = 0.045
Step 2000, Loss= 0.0615, Train Acc= 0.062, Test Acc = 0.056
Step 3000, Loss= 0.0519, Train Acc= 0.122, Test Acc = 0.085
Step 4000, Loss= 0.0480, Train Acc= 0.229, Test Acc = 0.102
Step 5000, Loss= 0.0490, Train Acc= 0.175, Test Acc = 0.105
Step 6000, Loss= 0.0270, Train Acc= 0.537, Test Acc = 0.126
Step 7000, Loss= 0.0209, Train Acc= 0.617, Test Acc = 0.122
Step 8000, Loss= 0.0168, Train Acc= 0.623, Test Acc = 0.127
Step 9000, Loss= 0.0160, Train Acc= 0.754, Test Acc = 0.125
Step 10000, Loss= 0.0122, Train Acc= 0.792, Test Acc = 0.124
Step 11000, Loss= 0.0089, Train Acc= 0.850, Test Acc = 0.126
Step 12000, Loss= 0.0088, Train Acc= 0.858, Test Acc = 0.125
Step 13000, Loss= 0.0064, Train Acc= 0.915, Test Acc = 0.131
Step 14000, Loss= 0.0048, Train Acc= 0.906, Test Acc = 0.135
Step 15000, Loss= 0.0053, Train Acc= 0.884, Test Acc = 0.135
Step 16000, Loss= 0.0029, Train Acc= 

# Testing

In [None]:
# `num` is only referenced by the commented-out lines at the end of this cell.
num = 1
# Op producing hard 0/1 predictions: sigmoid of the logits, rounded.
pred = tf.round(tf.nn.sigmoid(logits))

def evaluate_multilabel(y_pred, y_true):
    """Mean, over samples, of the share of true labels that were predicted.

    Args:
        y_pred: Iterable of per-sample prediction rows; non-zero entries mark
            predicted labels.
        y_true: Iterable of per-sample ground-truth rows; non-zero entries
            mark true labels.

    Returns:
        The mean of ``|true & predicted| / |true|`` over all samples. A
        sample with no true label contributes 0.0.
    """
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
        real_ = np.nonzero(y_true_tmp)[0].tolist()
        pred_ = np.nonzero(y_pred_tmp)[0].tolist()
        # Must test the local `real_`: the previous `len(real)` read an
        # unrelated (or undefined) global and raised NameError on a fresh kernel.
        if len(real_) == 0:
            # means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real_).intersection(set(pred_))) / len(real_))
    return np.array(acc).mean()

# correct_predictions = tf.equal(tf.round(tf.nn.sigmoid(logits)), Y)
# accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
# pred = sess.run(accuracy, feed_dict={X: X_test, Y: y_test})
# Evaluate the op on the held-out set. NOTE: `pred` is rebound from the op to its
# NumPy result here, so this line cannot be re-run without re-creating the op above.
pred = sess.run(pred, feed_dict={X: X_test, Y: y_test})
evaluate_multilabel(pred, y_test)
# real = y_test[num]
# tf.equal(tf.round(tf.nn.sigmoid(pred)), tf.round(y_))
# pred > 0.5

# real = np.nonzero(real)[0].tolist()
# right_num = len(real)
# pred = np.argpartition(pred, -right_num)[-right_num:]
# print(real, pred)
# print('real_classes', mlb.classes_[np.array(real)])
# print('predicted classes', mlb.classes_[np.array(pred)])

In [None]:
# Restore a saved model into a fresh session and print its rounded test predictions.
# NOTE(review): this reads '../models/lstm_relu/...', while the training cell writes to
# '../models/lstm_big_dropout/...' — confirm the checkpoint matches the current graph.
with tf.Session() as session:
    save = tf.train.Saver()
    save.restore(session, '../models/lstm_relu/lstm_relu-150000')
    pred = tf.round(tf.nn.sigmoid(logits))
    print(session.run(pred, feed_dict={X: X_test, Y: y_test}))

In [None]:
# Product names that a row must all contain to be selected.
query = ['dthc']
def filter_categories(cat_list):
    """Return True when every entry of the module-level ``query`` occurs in ``cat_list``.

    NOTE: this rebinds the name of the earlier ``filter_categories`` helper.
    """
    wanted = set(query)
    return wanted.issubset(cat_list)
# Show the rows whose product list contains every queried product.
data[data['products'].apply(filter_categories)]

In [None]:
# Scratch check of the set-intersection comparison used by the query filter.
set(['qwe', 'asd']).intersection(['qwe', 'asd']) == set(['asd', 'qwe'])

In [None]:
# NOTE(review): `mnist` is only printed in a later scratch cell and is not used by the
# product classifier in any visible cell — presumably a tutorial leftover; confirm.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

In [None]:
# Re-create the batcher; DataBatcher's keyword is `_batch_size` (the previous
# `batch_size_` spelling raised TypeError).
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)


In [None]:
# Scratch: array shapes of the MNIST tutorial data loaded in an earlier cell.
print(mnist.train.images.shape)
print(mnist.train.labels.shape)
print(mnist.test.images.shape)

In [None]:
# Class probabilities for one sample of the most recent training batch
# (wrapped in a list so the fed value has a leading batch dimension).
predict_ = sess.run(prediction, feed_dict={X: [batch_x[3]]})

In [None]:
# NOTE: the argmax result is discarded; only the last expression of a cell is displayed.
np.argmax(predict_)
# pred
# batch_y[3]
predict_

In [None]:
# Compare the true labels of one batch sample with the model's top-k classes,
# where k is that sample's number of true labels.
obj = 5
predict_ = sess.run(prediction, feed_dict={X: [batch_x[obj]]})
real = np.nonzero(batch_y[obj])[0].tolist()
n_real = len(real)
# Indices of the n_real highest-probability classes (unordered).
pred = np.argpartition(predict_[0], -n_real)[-n_real:]
print('real', real)
print('pred', pred)
print('classes real', mlb.classes_[real])
print('classes predicted', mlb.classes_[pred])

len(set(real) & set(pred)) / n_real

In [None]:
performance_log = None

with open('../perf.log', 'r') as f:
    performance_log = f.readlines()

# Starting at the 8th line, every 9th line of the log is sampled; its 6th field
# is collected as RAM and its 9th as CPU (decimal commas become dots).
ram = []
cpu = []
for raw_line in performance_log[7::9]:
    fields = [field for field in raw_line[:-1].split(' ') if field != '']
    if not fields:
        continue
    ram.append(fields[5])
    cpu.append(float(fields[8].replace(',', '.')))

# Values containing 'g' lose their last character and are used as-is;
# all other values are divided by 10**6.
ram = [float(value[:-1].replace(',', '.')) if 'g' in value else float(value) / (10**6)
       for value in ram]

import matplotlib.pyplot as plt

# Two stacked panels on figure 1: RAM on top, CPU below, one point per log sample.
fig = plt.figure(1, figsize=(15,6))

ram_ax = fig.add_subplot(211)
ram_ax.plot(range(len(ram)), ram)
ram_ax.set_title("RAM")

cpu_ax = fig.add_subplot(212)
cpu_ax.plot(range(len(cpu)), cpu)
cpu_ax.set_title("CPU")

plt.show()

    
# performance_log[:40]

In [None]:
# Scratch: builds two sets from integer lists; only the last expression is displayed.
set([1,2,3,4,5,6,7,8,9,10])
list(set([423,123,54,2341,3125,62,322]))