In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.models import Model
from keras.layers import Dense, Dropout, Input, Conv2D, BatchNormalization, Embedding, Reshape, Permute, MaxPool1D, Concatenate,\
GRU, Bidirectional, Dot, Activation, RepeatVector, Flatten, Multiply, Lambda
from glob import glob
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam

from nltk.corpus import stopwords

# NLTK stores its stopword lists under lowercase file ids ('english'); the capitalised
# spelling only resolves on case-insensitive filesystems and fails elsewhere.
stopwords_set = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [8]:
from sklearn.metrics import f1_score


def keras_f1(y_true, y_pred):
    """Weighted F1 score between one-hot targets and per-class scores.

    Both arguments are 2-D arrays with one row per sample and one column per
    class; the predicted/true class of a row is the column holding its maximum.
    This runs on NumPy arrays through scikit-learn's ``f1_score``.

    Returns the support-weighted average of the per-class F1 scores.
    """
    true_labels = np.argmax(y_true, axis=1)
    predicted_labels = np.argmax(y_pred, axis=1)
    return f1_score(true_labels, predicted_labels, average='weighted')

In [83]:
# Load the KB-article tickets and keep only the free-text description and its label.
# An alternative source used with this cell was 'deepdive-bootcamp.csv', whose label
# column is spelled 'Category'; the rename below normalises either spelling.
selected_columns = ['short_description', 'category']
df = pd.read_csv('../data/rio_kb_article_dataset.csv', encoding='latin1')[selected_columns]
df.columns = ['short_description', 'category']
# Rows with a missing description or label are dropped (rather than relabelled, e.g. as 'Misc').
df = df.dropna()

def train_test_split(df, test_size=0.5):
    """Deterministically split ``df`` into (training_set, testing_set) by index value.

    Every ``period``-th row (``period = round(1 / test_size)``) goes to the test
    set and the remaining rows go to the training set, so ``test_size`` is the
    approximate fraction of rows held out. Rows are selected by
    ``df.index % period``, which means the index must be numeric and gaps in the
    index (e.g. after ``dropna``) make the fractions approximate.

    With the default ``test_size=0.5`` even index values form the training set
    and odd index values form the test set.

    Note: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric index.
    test_size : float, default 0.5
        Fraction of rows to hold out; must satisfy ``0 < test_size <= 0.5``.

    Returns
    -------
    (training_set, testing_set) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``(0, 0.5]``.
    """
    if not 0 < test_size <= 0.5:
        raise ValueError(
            'test_size must be in the interval (0, 0.5], got {!r}'.format(test_size)
        )

    # round() rather than floor division: 1 // 0.1 evaluates to 9.0 because of
    # floating-point representation, which would yield the wrong period.
    period = int(round(1 / test_size))

    # Hold out the LAST residue class so that the default (period == 2) keeps
    # even index values in training and odd ones in test.
    is_test = df.index % period == period - 1

    return df[~is_test], df[is_test]
    
# Calls the notebook-local train_test_split defined above (default test_size=0.5),
# which shadows the scikit-learn function of the same name imported earlier.
df_train, df_test = train_test_split(df)

In [82]:
# --- Text encoding configuration ---
vocab_size = 2000      # tokenizer keeps only the most frequent words (num_words)
embedding_size = 100   # width of the learned word embeddings
seq_len = 15           # every description is padded/truncated to this many tokens

# Fit the vocabulary on the training split only, then encode both splits with it.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=None)
tokenizer.fit_on_texts(df_train.short_description)
X_train = pad_sequences(tokenizer.texts_to_sequences(df_train.short_description), maxlen=seq_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(df_test.short_description), maxlen=seq_len)

# Build ONE label vocabulary shared by both splits. Encoding each split with its own
# pd.Categorical assigns codes from that split's sorted unique labels, so a rare class
# missing from one split would silently shift every later code and the train/test
# targets would no longer mean the same thing.
label_names = pd.Categorical(pd.concat([df_train.category, df_test.category])).categories

# Number of output classes, derived from the data instead of a hand-edited constant.
categories = len(label_names)

y_train = to_categorical(pd.Categorical(df_train.category, categories=label_names).codes, categories)
y_test = to_categorical(pd.Categorical(df_test.category, categories=label_names).codes, categories)


In [72]:


# Multi-width convolutional text classifier: one convolution branch per kernel size,
# each max-pooled over time, concatenated and fed to a softmax layer.
kernel_sizes = [2, 3, 4, 5]   # number of consecutive tokens each branch looks at
filters = 10                  # feature maps per branch

sentences = Input(shape=(seq_len, ))
embedded_sentences = Embedding(vocab_size, embedding_size)(sentences)

# Conv2D needs an explicit channel axis. This reshape does not depend on the kernel
# size, so it is built once and shared by every branch instead of once per branch.
reshaped_embedded_sentences = Reshape(target_shape=(seq_len, embedding_size, 1))(embedded_sentences)

feature_list = []
for kernel_size in kernel_sizes:
    # Number of window positions a 'valid' convolution of this height produces.
    n_positions = seq_len - kernel_size + 1

    # The kernel spans the full embedding width, so it only slides along the token axis.
    conv_filters = Conv2D(filters, kernel_size=(kernel_size, embedding_size),
                          padding='valid', activation='relu')(reshaped_embedded_sentences)
    reshaped_filters = Reshape([n_positions, filters])(conv_filters)

    # Pool over every position: one maximum activation per filter.
    maxpooled_filters = MaxPool1D(pool_size=n_positions)(reshaped_filters)
    final_filters = Reshape((filters, ))(maxpooled_filters)

    # Rate 0 disables dropout; the layer is kept as a tuning knob.
    dropout_filters = Dropout(0)(final_filters)
    feature_list.append(dropout_filters)

feature_vector = Concatenate(axis=1)(feature_list)
output = Dense(categories, activation='softmax')(feature_vector)

model = Model(inputs=sentences, outputs=output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 15)           0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 15, 100)      200000      input_15[0][0]                   
__________________________________________________________________________________________________
reshape_166 (Reshape)           (None, 15, 100, 1)   0           embedding_15[0][0]               
__________________________________________________________________________________________________
reshape_169 (Reshape)           (None, 15, 100, 1)   0           embedding_15[0][0]               
__________________________________________________________________________________________________
reshape_17

In [73]:
# Training setup for the CNN classifier: Adam optimiser, cross-entropy over the
# one-hot targets, accuracy reported during training.
model.compile(
    optimizer=Adam(0.0007),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [74]:
# Train the CNN. The test split is passed as validation data, so the per-epoch
# validation metrics are computed on the same rows later used for evaluation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
    validation_data=(X_test, y_test),
)

Train on 3490 samples, validate on 3485 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1ad24263940>

In [52]:
# Weighted F1 of the CNN on the test split.
cnn_test_scores = model.predict(X_test)
keras_f1(y_test, cnn_test_scores)

  'precision', 'predicted', average, warn_for)


0.8731732036235087

In [46]:
def get_mapping(x):
    """Map each label in ``x`` to the integer code ``pd.Categorical`` assigns to it.

    Keys appear in order of first occurrence in ``x``; values are the
    corresponding entries of ``pd.Categorical(x).codes``.
    """
    label_codes = pd.Categorical(x).codes
    return dict(zip(x, label_codes))

# Show which integer code each category label receives in the training split.
get_mapping(df_train.category)

{'User Access': 7,
 'Software': 6,
 'Other': 5,
 'Hardware': 2,
 'Network': 4,
 'Information': 3,
 'Database': 1,
 'Backup': 0}

In [47]:
# Write the model bound to `model` to 'rio-cnn.h5' (path is relative to the working directory).
model.save('rio-cnn.h5')

In [104]:
# Inspection only: integer codes pandas assigns to the category column of the full frame
# (the result is displayed, not stored).
pd.Categorical(df.category).codes

array([1, 1, 0, ..., 0, 1, 1], dtype=int8)

In [105]:
# Inspection only: display the raw category labels of the full frame.
df.category

1                                Application
2                                Application
3        Access Issues / Security Enablement
4                                Application
5        Access Issues / Security Enablement
6        Access Issues / Security Enablement
7        Access Issues / Security Enablement
8                                Application
9        Access Issues / Security Enablement
10                               Application
11       Access Issues / Security Enablement
12       Access Issues / Security Enablement
13       Access Issues / Security Enablement
15       Access Issues / Security Enablement
16       Access Issues / Security Enablement
17                               Application
18                               Application
19                                       H/W
21       Access Issues / Security Enablement
22       Access Issues / Security Enablement
23                               Application
24       Access Issues / Security Enablement
25       A

In [87]:
# Two-layer bidirectional GRU classifier over the padded token ids.
sentences = Input(shape=(seq_len,))
embedded_sentences = Embedding(input_dim=vocab_size, output_dim=embedding_size)(sentences)

# The first recurrent layer keeps its per-timestep outputs so that the second
# recurrent layer can consume a sequence.
gru1 = Bidirectional(
    GRU(units=128, return_sequences=True, recurrent_dropout=0.6)
)(embedded_sentences)

# The second layer emits a single vector per input sequence (no return_sequences).
gru2 = Bidirectional(
    GRU(units=128, recurrent_dropout=0.6)
)(gru1)

softmax = Dense(units=categories, activation='softmax')(gru2)

gru_model = Model(inputs=sentences, outputs=softmax)
gru_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        (None, 15)                0         
_________________________________________________________________
embedding_24 (Embedding)     (None, 15, 100)           200000    
_________________________________________________________________
bidirectional_32 (Bidirectio (None, 15, 256)           175872    
_________________________________________________________________
bidirectional_33 (Bidirectio (None, 256)               295680    
_________________________________________________________________
dense_38 (Dense)             (None, 7)                 1799      
Total params: 673,351
Trainable params: 673,351
Non-trainable params: 0
_________________________________________________________________


In [None]:
def batch_generator(X, y, batch_size=32):
    """Return one random mini-batch ``(X_batch, y_batch)``.

    Row positions are drawn uniformly with replacement from ``range(X.shape[0])``
    using NumPy's global random state, and the same positions index both ``X``
    and ``y``. Despite the name, this returns a single batch and is not a
    Python generator.
    """
    n_rows = X.shape[0]
    picks = np.random.randint(low=0, high=n_rows, size=batch_size)
    features = X[picks]
    targets = y[picks]
    return features, targets

In [75]:
# Bidirectional-GRU classifier with a softmax-weighted sum over timesteps
# (attention-style pooling). Rebinds `gru_model`, replacing any earlier model of that name.
import keras.backend as K
# NOTE(review): batch_size is not referenced again in this cell — confirm it is still needed.
batch_size = 32

sentences = Input(shape=(seq_len, ))
embedded_sentences = Embedding(vocab_size, embedding_size)(sentences)
# Both recurrent layers return their per-timestep outputs.
gru1 = Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.8))(embedded_sentences)
gru2 = Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.8))(gru1)

# One scalar score per timestep, computed from the second GRU layer's outputs.
x_t = Flatten()(Dense(1)(gru2))
# Scores are divided by 16 before the softmax.
# NOTE(review): presumably sqrt(256), the bidirectional output width — TODO confirm.
y_t = Lambda(lambda x: x / 16)(x_t)
# Normalise the scaled scores into weights over the timesteps.
a_t = Activation('softmax')(y_t)

#h_weighted = Dot(axes=(1, 1))([a_t, gru1])

# Swap the time and feature axes of the FIRST GRU layer's outputs so the weights
# can be contracted against the time axis.
# NOTE(review): the weights come from gru2 but are applied to gru1 — confirm this is intended.
gru_shuffled = Permute([2, 1])(gru1)

# Weighted sum of the per-timestep vectors; expected to yield one vector per sample.
h_weighted = Lambda(lambda x: K.batch_dot(x[0], x[1]))([gru_shuffled, a_t])
#h_weighted = K.batch_dot(gru_shuffled, a_t)

#x_t = Dot(axes=(2,2))([u_t, u_c_repeated])
#a_t = Activation('softmax')(x_t)

softmax = Dense(categories, activation='softmax')(h_weighted)

gru_model = Model(inputs=sentences, outputs=softmax)
gru_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           (None, 15)           0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 15, 100)      200000      input_22[0][0]                   
__________________________________________________________________________________________________
bidirectional_30 (Bidirectional (None, 15, 256)      175872      embedding_22[0][0]               
__________________________________________________________________________________________________
bidirectional_31 (Bidirectional (None, 15, 256)      295680      bidirectional_30[0][0]           
__________________________________________________________________________________________________
dense_35 (

In [65]:
# Sanity check: class probabilities for the first training example only.
first_example = X_train[:1]
gru_model.predict(first_example)

array([[0.12640281, 0.12690303, 0.12647498, 0.12246154, 0.12528576,
        0.12535304, 0.12491726, 0.12220164]], dtype=float32)

In [88]:
# Training setup for the GRU classifier: Adam optimiser, cross-entropy over the
# one-hot targets, accuracy reported during training.
gru_model.compile(
    optimizer=Adam(0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [89]:
# Train the GRU model. The test split is passed as validation data, so the
# per-epoch validation metrics are computed on the same rows later used for evaluation.
gru_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
    validation_data=(X_test, y_test),
)

Train on 16262 samples, validate on 16310 samples
Epoch 1/40
 1664/16262 [==>...........................] - ETA: 2:21 - loss: 1.7143 - acc: 0.5234

KeyboardInterrupt: 

In [78]:
# Weighted F1 of the GRU model on the test split.
gru_test_scores = gru_model.predict(X_test)
keras_f1(y_test, gru_test_scores)

  'precision', 'predicted', average, warn_for)


0.8610506528210605

In [90]:
# Inspection only: display the one-hot encoded test targets.
y_test

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [12]:
# Load both ticket datasets; their label columns are inspected in the following cells.
# NOTE(review): these paths are relative to the working directory, whereas the earlier
# load cell reads '../data/rio_kb_article_dataset.csv' — confirm which location is correct.
deepdive = pd.read_csv('deepdive-bootcamp.csv')
rio = pd.read_csv('rio_kb_article_dataset.csv',encoding='latin1')

In [21]:
# Distinct labels in the deepdive dataset (the column is spelled 'Category' here).
deepdive.Category.unique()

array([nan, 'Application', 'Access Issues / Security Enablement', 'H/W',
       'S/W', 'N/W', 'Job Failures'], dtype=object)

In [17]:
# Number of rows per label in the rio dataset.
rio.category.value_counts()

User Access    4178
Software       2576
Hardware        107
Network          76
Information      18
Other            15
Database          7
Backup            3
Name: category, dtype: int64