In [1]:
import pandas
import numpy as np
import tensorflow as tf

tf.__version__

'1.4.1'

In [2]:
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)
data = data[['description', 'products']][pandas.notnull(data['products'])].copy().reset_index(drop=True)
print(data.shape)

(83897, 18)
(47516, 2)


In [3]:
data.head()

Unnamed: 0,description,products
0,Welch allyn combines its practical understandi...,"[power supply, body sub assy, medical, valve b..."
1,In line with the company s intention to ...,"[imo, advertising materials, point, imo label,..."
2,Services redaelli ricambi offers the ability t...,"[auto spare parts, tie rod, tie rod end, auto ..."
3,STROTHMANN not only delivers suitable mechanic...,"[covers non automated, demurrage rules form, r..."
4,"Established\nin 1991, tien jiang enterprise co...","[rubber, polyester, nylon, boot, support]"


In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter
from nltk.stem.lancaster import LancasterStemmer

# nltk.download('stopwords')
lemma = nltk.wordnet.WordNetLemmatizer()
st = LancasterStemmer()
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
stopwords_cached = stopwords.words('english')

def clean_categories(x):
    result = []
    for category in x:
        tmp_ = ' '.join([st.stem(word)
                         for word in tokenizer.tokenize(category.lower())
                         if word not in stopwords_cached])
        if tmp_ != '':
            result.append(tmp_)
    if len(result) == 0:
        return np.nan
    return np.array(result)

data['products'] = data['products'].apply(clean_categories)
data = data[data['products'].notnull()].copy().reset_index(drop=True)

all_products = []
for prod_list in data['products'].values:
    all_products += [' '.join(tokenizer.tokenize(product))
                     for product in prod_list.tolist()]
    
counter = Counter(all_products)
print('unique categories', len(counter.most_common()))

most_common = [product[0] for product in counter.most_common(100)]

def filter_categories(x):
    new_categories = np.array([product 
                               for product in x.tolist()
                               if product in most_common])
    if new_categories.shape[0] == 0:
        return np.nan
    return new_categories

lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')

def filter_descriptions(text):
    cleaned_text = [lemma.lemmatize(token)
                      for token in tokenizer.tokenize(text.lower())
                      if token not in stopwords_cached]
    if len(cleaned_text) == 0:
        return np.nan
    return ' '.join(cleaned_text)

data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

data = data[(data['products'].notnull()) & (data['description'].notnull())].copy().reset_index(drop=True)
data.shape

unique categories 62194


(24213, 2)

In [5]:
data

Unnamed: 0,description,products
0,welch allyn combine practical understanding cl...,"[pow supply, med]"
1,line company intention support international g...,[imo]
2,service redaelli ricambi offer ability produce...,"[auto spar part, auto spar, spar part]"
3,strothmann delivers suitable mechanical system...,[lin]
4,established tien jiang enterprise co ltd one s...,"[rub, polyest]"
5,songwei dedicate become benchmark manufacturin...,"[elect, med]"
6,salespider medium inc leading digital medium c...,[gear]
7,insight trend including weekly round industry ...,[hos]
8,since behl precision fabricating provided prec...,[car part]
9,nidec minster corporation world class supplier...,"[print, mot]"


# doc2vec

In [None]:
from nltk.tokenize import RegexpTokenizer
import gensim

lemma = nltk.wordnet.WordNetLemmatizer()
stopwords_cached = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')

def read_corpus(tokens_only=False):
#     with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
    for line, id_ in zip(data['description'].values, 
                         data.index.values):
        
        # For training data, add tags
        yield gensim.models.doc2vec.TaggedDocument([token
                                                    for token in tokenizer.tokenize(line)], 
                                                   ['_*' + str(id_)])
        
train_corpus = list(read_corpus())

In [None]:
train_corpus[:2]

In [None]:
# PV-DBOW 

import gensim

model = gensim.models.doc2vec.Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=7)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)
model.save('../models/company_descriptions.bin')


In [6]:
import gensim
model = gensim.models.doc2vec.Doc2Vec.load('../models/company_descriptions.bin')

In [7]:
X_ = []
y_ = []

for i in data.index.values:
    X_.append(model.docvecs['_*' + str(i)])
    y_.append(data.loc[i]['products'])
X_ = np.array(X_)

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_ = mlb.fit_transform(y_)

In [9]:
mlb.classes_

array(['alumin', 'assembl', 'auto part', 'auto spar', 'auto spar part',
       'automot part', 'automot spar', 'automot spar part', 'bal',
       'battery', 'bear', 'book', 'bottl', 'brak', 'cabl', 'cap', 'car',
       'car part', 'cast', 'ceram', 'ceram til', 'chair', 'coil',
       'compon', 'compress', 'comput', 'connect', 'cotton', 'cov',
       'cylind', 'display', 'elect', 'electron', 'engin', 'fabr', 'film',
       'filt', 'fit', 'flo', 'furnit', 'gear', 'glass', 'hand',
       'hand tool', 'heat', 'hos', 'hous', 'hydra', 'imo', 'indust',
       'lamp', 'light', 'lin', 'machin part', 'machinery',
       'machinery part', 'med', 'mold', 'mot', 'mot vehic', 'nut', 'oil',
       'pip', 'plast part', 'plat', 'polyest', 'pow', 'pow supply',
       'press', 'print', 'pump', 'pvc', 'ring', 'rol', 'rub', 'screw',
       'seat', 'sheet', 'softw', 'solid wood', 'spar part', 'stainless',
       'stainless steel', 'steel', 'system', 'tabl', 'til', 'tir', 'tool',
       'toy', 'tub', 'tyr', 

# simple NN

In [10]:
import math

class DataBatcher():
    def __init__(self, _X, _y, _batch_size=30):
        self._X = _X
        self._y = _y
        self._batch_size = _batch_size
        self._resplit = True
        self._num_examples = self._y.shape[0]
    
    def next_batch(self):
        if self._resplit:
#             print('splitting')
            perm0 = np.arange(self._num_examples)
            np.random.shuffle(perm0)
            self._batches_indexes = np.array_split(perm0, math.ceil(perm0.shape[0] / self._batch_size))
#             print(self._batches_indexes)
            self._batch_counter = -1
            self._resplit = False

        self._batch_counter += 1
        if self._batches_indexes[self._batch_counter].shape[0] < self._batch_size:
#             print('hstacking')
            self._resplit = True
            ind = self._batch_counter
#             self._batch_counter = -1
            missing_num = self._batch_size - self._batches_indexes[ind].shape[0]
            return self._X[np.hstack((self._batches_indexes[ind], self._batches_indexes[0][:missing_num]))],\
                   self._y[np.hstack((self._batches_indexes[ind], self._batches_indexes[0][:missing_num]))]
        
        return self._X[self._batches_indexes[self._batch_counter]], self._y[self._batches_indexes[self._batch_counter]]

In [11]:
from sklearn.model_selection import train_test_split
graph = tf.Graph()

with graph.as_default():
    # Parameters
    learn_rate = 0.001
    num_steps = 200000
    batch_size = 128
    display_step = 1000

    # Network Parameters
#     n_hidden_1 = 60 # 1st layer number of neurons
    n_hidden_2 = 70 # 2nd layer number of neurons
    num_input = 200 # MNIST data input (img shape: 28*28)
    num_classes = 100 # MNIST total classes (0-9 digits)

    # Graph input
    X = tf.placeholder("float", [None, num_input])
    y = tf.placeholder("float", [None, num_classes])
    lr = tf.placeholder("float")

    # Define the neural network
    def simple_nn(x):
        # TF Estimator input is a dict, in case of multiple inputs
#         initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=None, dtype=tf.float32)
        initializer = tf.truncated_normal_initializer(stddev=0.18632568, dtype=tf.float32)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        layer_1 = tf.layers.dense(inputs=x, units=n_hidden_2, activation=tf.tanh, 
                                  kernel_initializer=initializer, kernel_regularizer=regularizer)
        layer_2 = tf.layers.dense(inputs=layer_1, units=n_hidden_2, activation=tf.tanh, 
                                  kernel_initializer=initializer, kernel_regularizer=regularizer)
        layer_3 = tf.layers.dense(inputs=layer_2, units=n_hidden_2, activation=tf.tanh, 
                                  kernel_initializer=initializer, kernel_regularizer=regularizer)
        layer_4 = tf.layers.dense(inputs=layer_3, units=n_hidden_2, activation=tf.tanh, 
                                  kernel_initializer=initializer, kernel_regularizer=regularizer)
        # Output fully connected layer with a neuron for each class
        out_layer = tf.layers.dense(layer_4, num_classes, activation=None, 
                                  kernel_initializer=initializer, kernel_regularizer=regularizer)
        return out_layer

    logits = simple_nn(X)
    prediction = tf.nn.sigmoid(logits)
    loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(loss_op)
    init = tf.global_variables_initializer()

def evaluate_multilabel(y_pred, y_true):
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
        real_ = np.nonzero(y_true_tmp)[0].tolist()
        pred_ = np.nonzero(y_pred_tmp)[0].tolist()
        if len(real_) == 0:
            #means 0 right answers
            acc.append(0.0)
            continue
        acc.append(len(set(real_).intersection(set(pred_))) / len(real_))
    return(np.array(acc).mean())

# Start training
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
sess = tf.InteractiveSession(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options))


# Run the initializer
sess.run(init)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)
batcher = DataBatcher(X_train, y_train, _batch_size=batch_size)


for step in range(1, num_steps + 1):
    batch_x, batch_y = batcher.next_batch()
    # Run optimization op (backprop)
    if step == int(num_steps * 0.75):
        learn_rate /= 2
        print("lr now " + str(learn_rate))
    if step == int(num_steps * 0.90):
        learn_rate /= 2
        print("lr now " + str(learn_rate))
#     print("before run train_op")
    sess.run(train_op, feed_dict={X: batch_x, y: batch_y, lr: learn_rate})
    if step % display_step == 0 or step == 1:
        # Calculate batch loss and accuracy
        loss, pred = sess.run([loss_op, tf.round(prediction)], feed_dict={X: batch_x, y: batch_y})
        
        test_out = sess.run(tf.round(prediction), feed_dict={X: X_test})
        print("Step " + str(step) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Train Acc= " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=pred, y_true=batch_y)) +  ", Test Acc = " + \
              "{:.3f}".format(evaluate_multilabel(y_pred=test_out, y_true=y_test)))
        
print("Optimization Finished!")

Step 1, Loss= 0.7124, Train Acc= 0.467, Test Acc = 0.426
Step 1000, Loss= 0.0705, Train Acc= 0.000, Test Acc = 0.000
Step 2000, Loss= 0.0678, Train Acc= 0.008, Test Acc = 0.005
Step 3000, Loss= 0.0623, Train Acc= 0.066, Test Acc = 0.021
Step 4000, Loss= 0.0657, Train Acc= 0.031, Test Acc = 0.036
Step 5000, Loss= 0.0581, Train Acc= 0.094, Test Acc = 0.042
Step 6000, Loss= 0.0585, Train Acc= 0.062, Test Acc = 0.047
Step 7000, Loss= 0.0619, Train Acc= 0.076, Test Acc = 0.054
Step 8000, Loss= 0.0552, Train Acc= 0.148, Test Acc = 0.060
Step 9000, Loss= 0.0534, Train Acc= 0.115, Test Acc = 0.065
Step 10000, Loss= 0.0543, Train Acc= 0.098, Test Acc = 0.071
Step 11000, Loss= 0.0516, Train Acc= 0.105, Test Acc = 0.071
Step 12000, Loss= 0.0529, Train Acc= 0.123, Test Acc = 0.072
Step 13000, Loss= 0.0511, Train Acc= 0.118, Test Acc = 0.074
Step 14000, Loss= 0.0478, Train Acc= 0.168, Test Acc = 0.078
Step 15000, Loss= 0.0492, Train Acc= 0.171, Test Acc = 0.079
Step 16000, Loss= 0.0512, Train Acc= 

Step 134000, Loss= 0.0252, Train Acc= 0.520, Test Acc = 0.097
Step 135000, Loss= 0.0275, Train Acc= 0.457, Test Acc = 0.097
Step 136000, Loss= 0.0296, Train Acc= 0.530, Test Acc = 0.099
Step 137000, Loss= 0.0310, Train Acc= 0.451, Test Acc = 0.098
Step 138000, Loss= 0.0289, Train Acc= 0.521, Test Acc = 0.100
Step 139000, Loss= 0.0284, Train Acc= 0.484, Test Acc = 0.097
Step 140000, Loss= 0.0278, Train Acc= 0.496, Test Acc = 0.099
Step 141000, Loss= 0.0295, Train Acc= 0.491, Test Acc = 0.095
Step 142000, Loss= 0.0265, Train Acc= 0.528, Test Acc = 0.099
Step 143000, Loss= 0.0333, Train Acc= 0.495, Test Acc = 0.097
Step 144000, Loss= 0.0288, Train Acc= 0.462, Test Acc = 0.096
Step 145000, Loss= 0.0255, Train Acc= 0.565, Test Acc = 0.096
Step 146000, Loss= 0.0255, Train Acc= 0.410, Test Acc = 0.097
Step 147000, Loss= 0.0282, Train Acc= 0.472, Test Acc = 0.099
Step 148000, Loss= 0.0260, Train Acc= 0.507, Test Acc = 0.096
Step 149000, Loss= 0.0309, Train Acc= 0.403, Test Acc = 0.095
lr now 0

In [None]:
pred_ = sess.run(tf.round(prediction), feed_dict={X: X_test})


In [None]:
pred_[0]

In [None]:
data.loc[1]['products']

In [None]:
query = ['seat']
def filter_categories(cat_list):
    return set(cat_list).intersection(query) == set(query)
data[data['products'].apply(filter_categories)]

In [None]:
data.to_csv('../data/categ_preprocessed.csv')