In [None]:
import pandas
import numpy as np
# import tensorflow as tf

# tf.__version__

In [None]:
# Read the raw dataset and report its size before any filtering.
data = pandas.read_parquet('../data/to_send.pq')
print(data.shape)
# Keep only the two columns used below, drop the rows that have no 'products'
# value, renumber the index from zero, and report the reduced size.
data = (
    data.loc[pandas.notnull(data['products']), ['description', 'products']]
    .copy()
    .reset_index(drop=True)
)
print(data.shape)

In [None]:
# Preview the first rows of the filtered frame ('description' and 'products').
data.head()

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from collections import Counter
from nltk.stem.lancaster import LancasterStemmer

# nltk.download('stopwords')

# Tokenizer that keeps only runs of ASCII letters; digits and punctuation are
# dropped because they never match the pattern.
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
# Stored as a set: the cleaning code tests every token with `in`, which is a
# hash lookup on a set but a linear scan over the list NLTK returns.
stopwords_cached = set(stopwords.words('english'))
lemma = nltk.wordnet.WordNetLemmatizer()
st = LancasterStemmer()

# Normalise category names so that variants such as 'detail' and 'details'
# collapse into a single category.
def clean_categories(x):
    """Return the normalised category names of one row.

    Each category in ``x`` is lower-cased, split with the module-level
    ``tokenizer``, stripped of stop words, lemmatised word by word and joined
    back with single spaces. Categories that end up empty are dropped.

    Returns a numpy array of the surviving names, or ``np.nan`` when none
    survive (including when ``x`` is empty).
    """
    cleaned = []
    for raw_category in x:
        kept_words = []
        for word in tokenizer.tokenize(raw_category.lower()):
            if word in stopwords_cached:
                continue
            kept_words.append(lemma.lemmatize(word))
        normalised = ' '.join(kept_words)
        if normalised:
            cleaned.append(normalised)
    if not cleaned:
        return np.nan
    return np.array(cleaned)

# Normalise every row's categories and discard rows left without any.
data['products'] = data['products'].apply(clean_categories)
data = data[data['products'].notnull()].copy().reset_index(drop=True)

# Flatten all rows into one list of category names (re-tokenised and joined
# with single spaces) so their frequencies can be counted.
all_products = [' '.join(tokenizer.tokenize(product))
                for prod_list in data['products'].values
                for product in prod_list.tolist()]

counter = Counter(all_products)
print('unique categories', len(counter))

# Names of the 100 most frequent categories, most frequent first.
most_common = [name for name, _ in counter.most_common(100)]

# Drop every category that is not among the 100 most frequent ones.
def filter_categories(x):
    """Keep only the categories of one row that appear in ``most_common``.

    ``x`` must expose ``tolist()`` (the rows are numpy arrays of names).
    Returns a numpy array of the retained names in their original order, or
    ``np.nan`` when none of them is retained.
    """
    kept = [name for name in x.tolist() if name in most_common]
    if not kept:
        return np.nan
    return np.array(kept)

# Stop-word lookup used by filter_descriptions. Kept as a set so each token's
# membership test is a hash lookup rather than a scan of NLTK's list.
stopwords_cached = set(stopwords.words('english'))

# basic text processing on descriptions
def filter_descriptions(text):
    """Normalise one free-text description.

    The text is lower-cased, split with the module-level ``tokenizer``,
    stripped of stop words and lemmatised token by token; the surviving
    tokens are joined with single spaces.

    Returns the cleaned string, or ``np.nan`` when ``text`` is not a string
    or when no token survives the cleaning.
    """
    # Rows are only pre-filtered on 'products', so a description can still be
    # missing (None / NaN). Treat that like an empty description instead of
    # failing on `.lower()`; the NaN result is dropped together with the other
    # empty descriptions afterwards.
    if not isinstance(text, str):
        return np.nan
    cleaned_text = [lemma.lemmatize(token)
                    for token in tokenizer.tokenize(text.lower())
                    if token not in stopwords_cached]
    if not cleaned_text:
        return np.nan
    return ' '.join(cleaned_text)

# Reduce each row to its top-100 categories and normalise its description.
data['products'] = data['products'].apply(filter_categories)
data['description'] = data['description'].apply(filter_descriptions)

# Both helpers signal "nothing left" with NaN; drop those rows and renumber.
data = data.dropna(subset=['products', 'description']).reset_index(drop=True)
data.shape

In [None]:
# Display the frame as it stands after category and description cleaning.
data

In [None]:
from gensim.models.wrappers.fasttext import FastText

# Load the FastText model stored at ../models/wiki.simple.bin; the
# embedding-matrix cell reads word vectors from it through `model.wv[word]`.
model = FastText.load_fasttext_format('../models/wiki.simple.bin')

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# NOTE: this rebinds the global `tokenizer` (until now the NLTK
# RegexpTokenizer) to a Keras Tokenizer. clean_categories and
# filter_descriptions look the name up at call time, so re-running them after
# this cell would pick up this object instead.
tokenizer = Tokenizer()
# Build the word -> integer index from the cleaned descriptions, then encode
# each description as a list of those integers.
tokenizer.fit_on_texts(data['description'])
sequences = tokenizer.texts_to_sequences(data['description'])

# Bring every sequence to length 50, the same length the model's Input and
# Embedding layers are declared with.
X = pad_sequences(sequences, maxlen=50)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each row's array of category names as a binary indicator row with one
# column per distinct category; `mlb.classes_` (displayed last) gives the
# category name behind each column.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform([item.tolist() for item in data['products'].values])
mlb.classes_

In [None]:
# Sanity check: feature matrix shape, label matrix shape, vocabulary size.
X.shape, y.shape, len(tokenizer.word_index)

In [None]:
# Build the (token index -> 300-d FastText vector) lookup table.
# Keras' Tokenizer numbers words from 1 (index 0 is the padding value written
# by pad_sequences), so the table needs len(word_index) + 1 rows; with only
# len(word_index) rows the highest-numbered word has no row and its index falls
# outside the Embedding layer built from this matrix.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = model.wv[word]
    except KeyError:
        # No vector available for this word: report it and leave its row all-zeros.
        print("error in word " + word)
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Shape of the lookup table: (rows indexed by token id, 300 vector components).
embedding_matrix.shape

In [None]:
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.layers import Dense, Input
from keras.models import Model

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Create the TensorFlow session Keras will run on, with this process limited
# to a 0.4 fraction of GPU memory (per the option name), and register it as
# the Keras backend session.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))


def evaluate_multilabel(y_pred, y_true):
    """Mean per-sample recall of a multi-label prediction.

    For every sample, computes the fraction of its true labels that were also
    predicted, then averages that fraction over all samples.

    Parameters
    ----------
    y_pred : 2-D array-like, one row per sample and one column per label.
        May hold scores or 0/1 indicators; each value is rounded with
        ``np.round`` (the same rule the reporting cells use), so a label
        counts as predicted when its score is above 0.5.
    y_true : 2-D array-like of the same layout holding 0/1 indicators.

    Returns
    -------
    float
        Mean of the per-sample fractions. A sample without any true label
        contributes 0.0.
    """
    acc = []
    for y_pred_tmp, y_true_tmp in zip(y_pred, y_true):
        real_ = set(np.nonzero(y_true_tmp)[0].tolist())
        if not real_:
            # means 0 right answers
            acc.append(0.0)
            continue
        # Threshold the scores first: raw sigmoid outputs are almost never
        # exactly zero, so np.nonzero alone would mark every label as predicted.
        pred_ = set(np.nonzero(np.round(y_pred_tmp))[0].tolist())
        acc.append(len(real_ & pred_) / len(real_))
    return np.array(acc).mean()

# Embedding lookup initialised from the pre-computed matrix and kept frozen
# (trainable=False): one row per token index, 300 columns.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            300,
                            weights=[embedding_matrix],
                            input_length=50,
                            trainable=False)
# Model input: 50 integer token ids per sample (the pad_sequences length).
sequence_input = Input(shape=(50,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Bidirectional LSTM encoder followed by dropout.
x = Bidirectional(LSTM(64))(embedded_sequences)
x = Dropout(0.5)(x)
# x = Bidirectional(LSTM(64))(x)
# x = Dense(1, activation='tanh')(x)
# One sigmoid unit per label column of y, so every label is scored on its own.
# Note that `preds` is the output tensor here; the prediction cells later
# rebind the same name to arrays returned by nn.predict.
preds = Dense(len(y[0]), activation='sigmoid')(x)

nn = Model(sequence_input, preds)
# NOTE(review): 'categorical_accuracy' compares arg-max positions, which is
# presumably not the intended metric for a multi-label sigmoid output —
# 'binary_accuracy' may be what was meant; confirm.
nn.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])



from sklearn.model_selection import train_test_split
# Hold out 20% of the samples; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train for 100 epochs in batches of 128, passing the held-out split as
# validation data.
nn.fit(X_train, y_train,
          batch_size=128,
          epochs=100,
validation_data=(X_test, y_test))

In [None]:
# NOTE(review): this repeats the fit call of the previous cell on the same
# model object. Keras' fit continues from the model's current weights, so this
# appears to train for a further 100 epochs rather than restart — confirm it
# is intentional.
nn.fit(X_train, y_train,
          batch_size=128,
          epochs=100,
validation_data=(X_test, y_test))

In [None]:
# Model outputs for the held-out split. This rebinds `preds`, which the
# model-building cell used for the output tensor.
preds = nn.predict(X_test)

In [None]:

# Overall score on the held-out split, then a per-sample listing of the true
# label indices/names next to the predicted ones (scores rounded to 0/1).
print(evaluate_multilabel(preds, y_test))
for true_row, score_row in zip(y_test, preds):
    real = np.flatnonzero(true_row).tolist()
    pred = np.flatnonzero(np.round(score_row)).tolist()
    print(real, pred)
    print('real_classes', mlb.classes_[np.array(real)])
    # Nothing to look up when no label was predicted.
    if pred:
        print('predicted classes', mlb.classes_[np.array(pred)])

In [None]:
# Model outputs for the training split; overwrites the held-out predictions
# previously stored in `preds`.
preds = nn.predict(X_train)

In [None]:

# Overall score on the training split, then a per-sample listing of the true
# label indices/names next to the predicted ones (scores rounded to 0/1).
print(evaluate_multilabel(preds, y_train))
for true_row, score_row in zip(y_train, preds):
    real = np.flatnonzero(true_row).tolist()
    pred = np.flatnonzero(np.round(score_row)).tolist()
    print(real, pred)
    print('real_classes', mlb.classes_[np.array(real)])
    # Nothing to look up when no label was predicted.
    if pred:
        print('predicted classes', mlb.classes_[np.array(pred)])