In [1]:
import pandas
import numpy as np
import tensorflow as tf

tf.__version__

'1.4.1'

In [2]:
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)
data = data[['description', 'products']][pandas.notnull(data['products'])].copy().reset_index(drop=True)
print(data.shape)

(83897, 18)
(47516, 2)


In [3]:
data.head()

Unnamed: 0,description,products
0,Welch allyn combines its practical understandi...,"[power supply, body sub assy, medical, valve b..."
1,In line with the company s intention to ...,"[imo, advertising materials, point, imo label,..."
2,Services redaelli ricambi offers the ability t...,"[auto spare parts, tie rod, tie rod end, auto ..."
3,STROTHMANN not only delivers suitable mechanic...,"[covers non automated, demurrage rules form, r..."
4,"Established\nin 1991, tien jiang enterprise co...","[rubber, polyester, nylon, boot, support]"


In [4]:
data.loc[0]['description']

'Welch allyn combines its practical understanding of clinical needs with its visionary spirit to develop solutions that assess, diagnose, treat, and manage a variety of illnesses and diseases.as a leading global manufacturer of medical diagnostic equipment, we offer a range of connected solutions. with nearly 2,500 employees working in 26 countries, we focus on the customer and imagine how healthcare will be delivered in the future to develop tools and future-proof technologies.our professional customers include physicians’ practices, community clinics, skilled nursing facilities, and emergency departments—places where 95% of patients first seek medical treatment.our welch allyn home division delivers solutions that make home monitoring of blood pressure readings accurate and easy. with the same attention to quality that we offer to our professional customers, we’re now bringing physician-trusted solutions to the home, helping people better manage their health.'

In [5]:
data.loc[0]['products']

array(['power supply', 'body sub assy', 'medical', 'valve body sub assy',
       'digital blood pressure'], dtype=object)

# TEXT PROCESSING

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter

# nltk.download('stopwords')

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

data['products'] = data['products'].apply(lambda x: np.array([' '.join(tokenizer.tokenize(product)).lower()
                                                              for product in x.tolist()]))

all_products = []
for prod_list in data['products'][pandas.notnull(data['products'])].values:
    all_products += [' '.join(tokenizer.tokenize(product))
                     for product in prod_list.tolist()]
    
counter = Counter(all_products)
print('unique categories', len(counter.most_common()))

most_common = [product[0] for product in counter.most_common(600)]

def filter_categories(x):
    new_categories = np.array([product 
                               for product in x.tolist()
                               if product in most_common])
    if new_categories.shape[0] == 0:
        return np.nan
    return new_categories

lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def filter_descriptions(text):
    cleaned_text = [token#lemma.lemmatize(token)
                      for token in tokenizer.tokenize(text.lower())
                      if token not in stopwords_cached]
    if len(cleaned_text) == 0:
        return np.nan
    return ' '.join(cleaned_text)

data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

cleaned_data = data[(data['products'].notnull()) & (data['description'].notnull())].copy().reset_index(drop=True)
cleaned_data.shape

unique categories 71111


(35442, 2)

In [7]:
cleaned_data.head()

Unnamed: 0,description,products
0,welch allyn combines practical understanding c...,"[power supply, medical]"
1,line company intention support international g...,"[imo, point, marine pollutant]"
2,services redaelli ricambi offers ability produ...,"[auto spare parts, auto spare, spare parts]"
3,strothmann delivers suitable mechanical system...,[line]
4,established tien jiang enterprise co ltd one s...,"[rubber, polyester, nylon, support]"


### fasttext model

In [8]:
from gensim.models.wrappers.fasttext import FastText

model = FastText.load_fasttext_format('../models/wiki.simple.bin')

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import MultiLabelBinarizer


# nltk.download('stopwords')

# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = RegexpTokenizer(r'[a-z]+')
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def get_embedding_matrix(text, first_n=50):
#     print(text)
    matrix = []
    text = [token#lemma.lemmatize(token)
            for token in tokenizer.tokenize(text)]
    for word in text[:first_n]:
        try:
            word_embedding = model.wv[word]
            matrix.append(model.wv[word])
        except KeyError:
            print("error in word", word)
            matrix.append(np.zeros(300))
    
    matrix = np.array(matrix)
    # fill text embeddings with seq_len < first_n with zeros
#     print(matrix.shape[0])
    if matrix.shape[0] < first_n:
        matrix = np.vstack((matrix, np.zeros((first_n - matrix.shape[0], 300))))
    return matrix

embeddings = []
for text in cleaned_data['description'].values:
    embeddings.append(get_embedding_matrix(text))
#     print(embeddings[-1].shape)
    
# onehot_y_df = pandas.get_dummies(data['severity'])

train_data = np.array(embeddings)

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([item.tolist() for item in cleaned_data['products'].values])

error in word zc
error in word zc
error in word vz
error in word vz
error in word vz


# LSTM

In [10]:
class DataBatcher():
    def __init__(self, X_, y_, batch_size_=30):
        self.X_ = X_
        self.y_ = y_
        self.batch_size_ = batch_size_
        self.from_ = 0
        self.to = self.from_ + self.batch_size_

    def next_batch(self):
        if self.to == len(self.X_):
            f = self.from_
            t = self.to
            self.from_ = 0
            self.to = self.to = self.from_ + self.batch_size_
            return self.X_[f:t], self.y_[f:t]
        elif self.to > len(self.X_):
            f = self.from_
            t = self.to - len(self.X_)
            self.from_ = self.to - len(self.X_)
            self.to = self.from_ + self.batch_size_
            return np.append(self.X_[f:], self.X_[:t], axis=0), np.append(self.y_[f:], self.y_[:t], axis=0)
        else:
            f = self.from_
            t = self.to
            self.from_ = self.to
            self.to = self.from_ + self.batch_size_
            return self.X_[f:t], self.y_[f:t]

In [12]:
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
from sklearn.model_selection import train_test_split

graph = tf.Graph()

with graph.as_default():

    # Training Parameters
    learning_rate = 0.0001
    training_steps = 150000
    batch_size = 80
    display_step = 1000
    checkpoint_step = 20000
    
    # Network Parameters
    num_input = 300 # MNIST data input (img shape: 28*28)
    timesteps = 50 # timesteps
    num_hidden = 128 # hidden layer num of features
    num_classes = 600 # MNIST total classes (0-9 digits)
    num_layers = 2
    input_keep_prob = 0.75
    output_keep_prob = 0.75
    cell = tf.nn.rnn_cell.BasicLSTMCell

    # tf Graph input
    X = tf.placeholder("float", [None, timesteps, num_input])
    Y = tf.placeholder("float", [None, num_classes])

    # Define weights
    weights = {
        'out': tf.Variable(tf.random_normal([2 * num_hidden, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    def RNN(x, weights, biases):

        # Prepare data shape to match `rnn` function requirements
        # Current data input shape: (batch_size, timesteps, n_input)
        # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

        # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
        x = tf.unstack(x, timesteps, 1)

        # Cerate forward and backward cells
        cells_fw = [cell(num_hidden, activation=tf.sigmoid) for _ in range(num_layers)]
        cells_bw = [cell(num_hidden, activation=tf.sigmoid) for _ in range(num_layers)]
        
        # Add dropout
        cells_fw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=input_keep_prob,
                                                  output_keep_prob=output_keep_prob
                                                 ) for cell in cells_fw]
        cells_bw = [tf.contrib.rnn.DropoutWrapper(cell, 
                                                  input_keep_prob=input_keep_prob,
                                                  output_keep_prob=output_keep_prob
                                                 ) for cell in cells_bw]
        
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            dtype=tf.float32)
        
#         rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
        
        return tf.matmul(outputs[-1], weights['out']) + biases['out']
        

    logits = RNN(X, weights, biases)
    prediction = tf.nn.softmax(logits)

    # Define loss and optimizer
    

# tf.nn.sigmoid_cross_entropy_with_logits 
#     loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
#         logits=logits, labels=Y))
    loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    # Evaluate model (with test logits, for dropout to be disabled)
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    
def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
#         print('y_true_tmp', y_true_tmp, 'y_pred_tmp', y_pred_tmp)
        real = np.nonzero(y_true_tmp)[0].tolist()
        right_num = len(real)
        pred = np.argpartition(y_pred_tmp, -right_num)[-right_num:]
#         print('real', real, 'pred', pred)
        if len(real) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real).intersection(set(pred))) / len(real))
#     print(acc)
    return(np.array(acc).mean())

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))

# with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
# Run the initializer
sess.run(init)
batcher = DataBatcher(X_train, y_train, batch_size_=batch_size)

for step in range(1, training_steps+1):
    batch_x, batch_y = batcher.next_batch()
    # Reshape data to get 28 seq of 28 elements
#         batch_x = batch_x.reshape((batch_size, timesteps, num_input))
    # Run optimization op (backprop)
    sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, pred = sess.run([loss_op, prediction], feed_dict={X: batch_x, Y: batch_y})
        
        test_out = sess.run(prediction, feed_dict={X: X_test})
        print("Step " + str(step) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Train Acc= " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))
        
    if step % checkpoint_step == 0:
        saver = tf.train.Saver()
        saver.save(sess, '../models/lstm_good/lstm_good', global_step=step)
        

print("Optimization Finished!")

#     # Calculate accuracy for 128 mnist test images
#     print("Testing Accuracy:", \
#         sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))

Step 1, Loss= 2.7806, Train Acc= 0.004, Test Acc = 0.002
Step 500, Loss= 0.5469, Train Acc= 0.006, Test Acc = 0.005
Step 1000, Loss= 0.3122, Train Acc= 0.003, Test Acc = 0.004
Step 1500, Loss= 0.1509, Train Acc= 0.003, Test Acc = 0.004
Optimization Finished!


In [25]:
predict_ = sess.run(prediction, feed_dict={X: [batch_x[3]]})

array([[2.18454562e-03, 1.15506053e-02, 2.30523201e-05, 3.32070363e-06,
        1.50353546e-04, 1.20563403e-04, 3.98125849e-06, 8.15389285e-05,
        2.25681234e-07, 9.43000373e-07, 1.05230291e-04, 4.86507815e-07,
        6.37763333e-06, 3.74643096e-05, 3.62851188e-06, 5.10727914e-05,
        2.91291758e-06, 1.54119061e-05, 1.75035275e-05, 1.34396760e-06,
        3.55263023e-06, 2.84224562e-03, 3.24832101e-04, 7.74649307e-05,
        1.01956475e-05, 4.77190333e-05, 4.19327553e-05, 4.11308829e-05,
        1.23437552e-04, 2.74339000e-05, 6.19934872e-06, 3.74953379e-05,
        4.49305105e-07, 7.40281030e-05, 7.03206868e-04, 1.16717001e-05,
        1.17967669e-04, 2.97253905e-07, 3.12561133e-06, 5.46272622e-06,
        8.61753833e-08, 2.12355371e-05, 6.49193581e-03, 3.89328704e-07,
        2.87322928e-06, 4.24616746e-06, 1.85999197e-05, 7.84599222e-04,
        3.65358716e-07, 1.68811653e-06, 2.68506426e-08, 3.66821109e-06,
        9.85488441e-06, 2.47749158e-05, 2.33883708e-04, 4.117130

In [23]:
np.argmax(pred)
# pred
# batch_y[3]

341

In [26]:
real = np.nonzero(batch_y[3])[0].tolist()
# batch_y[1]
pred = np.argpartition(predict_[0], -len(real))[-len(real):]
print('real', real)
print('pred', pred)

len(set(real).intersection(set(pred))) / len(real)

real [224, 339]
pred [285 107]


0.0

In [None]:
def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
        print('y_true_tmp', y_true_tmp, 'y_pred_tmp', y_pred_tmp)
        real = np.nonzero(y_true_tmp)[0].tolist()
        right_num = len(real)
        pred = np.argpartition(y_pred_tmp, -right_num)[-right_num:]
        print('real', real, 'pred', pred)
        if len(real) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real).intersection(set(pred))) / len(real))
    print(acc)
    return(np.array(acc).mean())
        
evaluate_multilabel([[0,0,1], [0,0,0]], [[0,0,1], [1,1,0]])

In [None]:
np.nonzero([0,1,1])[0].tolist()