In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from numpy import array
from numpy import asarray
from numpy import zeros

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from keras import backend as K
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers import Input
from keras.layers.core import Activation, Dropout, Dense, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.utils.np_utils import to_categorical
from keras.utils.vis_utils import plot_model

# Corpora needed by the tokenizer, lemmatizer and stop-word list.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# English stop words that preprocess_text drops from every sentence.
STOPWORDS = set(stopwords.words('english'))
# Bracket / punctuation characters that preprocess_text replaces with a space.
# Raw string: the pattern is unchanged, but `\[`, `\]` and `\|` are no longer
# invalid escape sequences in a normal string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')

In [None]:
def preprocess_text(sen):
    """Clean one piece of text and return it as a space-joined string of lemmas.

    Steps, in order: replace non-word characters with spaces, drop isolated
    single letters, collapse whitespace, lowercase, remove English stop words
    (``STOPWORDS``) and lemmatize every remaining token with WordNet.

    Args:
        sen: The raw text; it is passed through ``str()`` first, so any object
            is accepted.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    lemmatizer = WordNetLemmatizer()

    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(sen))

    # Remove all single characters surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start of the text. The anchor must be
    # an unescaped `^`; an escaped caret matches a literal '^', which can no
    # longer be present after the \W substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    document = REPLACE_BY_SPACE_RE.sub(' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Lowercase, drop stop words, then lemmatize what is left.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in document.lower().split()
        if word not in STOPWORDS
    ]

    return ' '.join(tokens)

In [None]:
def getModel(length_long_sentence, embedding_matrix):
  """Build the uncompiled classifier: frozen embedding -> LSTM -> Dense -> softmax.

  Args:
      length_long_sentence: Number of token ids in each (padded) input sequence.
      embedding_matrix: 2-D array of shape (vocabulary size, embedding
          dimension) used as the fixed, non-trainable embedding weights.

  Returns:
      A Keras ``Model`` mapping a sequence of token ids to 4 class probabilities.
  """
  # Take both sizes from the matrix itself instead of relying on a notebook
  # global (`vocab_size`) and a hard-coded dimension that must match it.
  vocabulary_size, embedding_dim = embedding_matrix.shape

  deep_inputs = Input(shape=(length_long_sentence,))
  embedding_layer = Embedding(vocabulary_size, embedding_dim, weights=[embedding_matrix], trainable=False)(deep_inputs)
  LSTM_Layer_1 = LSTM(70, activation='tanh')(embedding_layer)
  dense_layer_1 = Dense(35, activation='tanh')(LSTM_Layer_1)
  dense_layer_2 = Dense(4, activation='softmax')(dense_layer_1)

  return Model(inputs=deep_inputs, outputs=dense_layer_2)

In [None]:
def weighted_categorical_crossentropy(weights):
    """
    A weighted version of keras.objectives.categorical_crossentropy

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Returns:
        A function ``loss(y_true, y_pred)`` suitable for ``model.compile``;
        it returns the per-sample weighted cross-entropy.

    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """
    # `K` is the Keras backend module (`from keras import backend as K`).
    weights = K.variable(weights)

    def loss(y_true, y_pred):
        # scale predictions so that the class probas of each sample sum to 1
        y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # clip to prevent NaN's and Inf's
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # per-class log-likelihood, scaled by the class weights
        weighted_log_likelihood = y_true * K.log(y_pred) * weights
        # sum over the class axis and negate
        return -K.sum(weighted_log_likelihood, -1)

    return loss

In [None]:
# Semicolon-separated training data, read with explicit column names.
train_path = 'Training_set.csv'

col_names = [
    'triggerTitle',
    'triggerChannelTitle',
    'actionChannelTitle',
    'actionTitle',
    'title',
    'desc',
    'target',
]
# skiprows=1 discards the file's first row because the names are given above.
train_df = pd.read_csv(
    train_path,
    sep=';',
    names=col_names,
    skiprows=1,
    encoding="ISO-8859-1",
)

In [None]:
# Pre-processing: one cleaned text per training row, built from title + description.

# Select the columns by name (positional integer keys on a labelled row are
# deprecated) and turn missing values into empty strings so the concatenation
# cannot fail on NaN.
train_titles = train_df['title'].fillna('').astype(str)
train_descriptions = train_df['desc'].fillna('').astype(str)

X_train = [
    preprocess_text(title + ". " + description + ".")
    for title, description in zip(train_titles, train_descriptions)
]

# Training labels. `y_train` is the name the later cells read; `y` is kept as
# an alias for the name this cell used originally.
y_train = train_df['target']
y = y_train

In [None]:
# Semicolon-separated test data, read with the same column layout as the training file.
test_path = 'Test_set.csv'

col_names = [
    'triggerTitle',
    'triggerChannelTitle',
    'actionChannelTitle',
    'actionTitle',
    'title',
    'desc',
    'target',
]
# skiprows=1 discards the file's first row because the names are given above.
test_df = pd.read_csv(
    test_path,
    sep=';',
    names=col_names,
    skiprows=1,
    encoding="ISO-8859-1",
)

In [None]:
# Pre-processing: one cleaned text per test row, built from title + description.

# Select the columns by name (positional integer keys on a labelled row are
# deprecated) and turn missing values into empty strings so the concatenation
# cannot fail on NaN.
test_titles = test_df['title'].fillna('').astype(str)
test_descriptions = test_df['desc'].fillna('').astype(str)

X_test = [
    preprocess_text(title + ". " + description + ". ")
    for title, description in zip(test_titles, test_descriptions)
]

y_test = test_df['target']

In [None]:
# One-hot encode the targets. Encoding directly from the data frames means the
# cell does not depend on a previously assigned `y_train` and gives the same
# result when it is re-run (it never re-encodes already encoded labels).
# NOTE(review): assumes the target values are integers starting at 0 so that
# the encoded width matches the model's output layer - confirm against the data.
y_train = to_categorical(train_df['target'])
y_test = to_categorical(test_df['target'])

In [None]:
# Fit the vocabulary on the training texts only, then map both splits to
# sequences of integer word ids.
# Raw string: same filter characters as before, without the invalid `\]`
# escape sequence of a normal string literal.
tokenizer = Tokenizer(lower=True, num_words=5000, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# NOTE: this overwrites the text lists with id sequences, so the cell must not
# be re-run without re-running the pre-processing cells first.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# One more than the number of words the tokenizer has seen.
vocab_size = 1 + len(tokenizer.word_index)

# Every sequence is padded at the end ('post') to this fixed length.
length_long_sentence = 50

X_train = pad_sequences(
    X_train,
    maxlen=length_long_sentence,
    padding='post',
)
X_test = pad_sequences(
    X_test,
    maxlen=length_long_sentence,
    padding='post',
)

In [None]:
# Load the GloVe vectors into a dict: word -> float32 vector.
embeddings_dictionary = dict()

# `with` closes the file even if a line fails to parse.
with open('Glove/glove.6B.50d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Row i holds the GloVe vector of the word with tokenizer id i; words without
# a GloVe entry keep an all-zero row.
embedding_matrix = zeros((vocab_size, 50))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# STRATIFIED K-FOLD CROSS VALIDATION { 4-fold }

splits = 4

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1)

# StratifiedKFold and compute_class_weight need 1-D class labels, so recover
# the class index of every sample from the one-hot encoded targets.
y_train_labels = np.argmax(np.asarray(y_train), axis=1)

class_weights_list = []
fold_accuracies = []

for train_index, test_index in skf.split(X_train, y_train_labels):
    # X_train (padded sequences) and y_train (one-hot targets) are NumPy
    # arrays at this point, so they are indexed directly rather than via .iloc.
    x_train_fold, x_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # Compute the class weights from this fold's integer labels.
    fold_labels = y_train_labels[train_index]
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(fold_labels),
        y=fold_labels,
    )

    class_weights_list.append(class_weights)

    ncce = weighted_categorical_crossentropy(weights=np.array(class_weights))

    model = getModel(length_long_sentence, embedding_matrix)

    model.compile(loss=ncce, optimizer='adam', metrics=['acc'])

    model.fit(x_train_fold, y_train_fold, batch_size=10, epochs=24, verbose=1)

    score = model.evaluate(x_test_fold, y_test_fold, verbose=0)
    fold_accuracies.append(score[1])

    print("Accuracy Validation: %.2f%%" % (score[1]*100))

# Index into class_weights_list of the fold with the highest validation
# accuracy. A later cell indexes class_weights_list with `best_class_weight`.
# NOTE(review): "best" is assumed to mean highest validation accuracy - confirm.
best_class_weight = int(np.argmax(fold_accuracies))

In [None]:
# Build the final loss from the class weights stored at index
# `best_class_weight` of the per-fold list.
selected_class_weights = np.array(class_weights_list[best_class_weight])
ncce = weighted_categorical_crossentropy(weights=selected_class_weights)

In [None]:
# Fresh network for the final training run, compiled with the weighted loss.
model = getModel(
    length_long_sentence=length_long_sentence,
    embedding_matrix=embedding_matrix,
)

model.compile(
    optimizer='adam',
    loss=ncce,
    metrics=['acc'],
)

In [None]:
# Train on the full training set; `history` keeps the object returned by fit().
history = model.fit(
    X_train,
    y_train,
    epochs=24,
    batch_size=10,
    verbose=1,
)

In [None]:
# Evaluate on the held-out test set; index 1 of the result is the 'acc' metric
# requested at compile time (index 0 is the loss).
score = model.evaluate(X_test, y_test, verbose=0)
test_accuracy_percent = score[1] * 100

print("Accuracy Test: %.2f%%" % test_accuracy_percent)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Softmax outputs of the model for every test sample.
y_pred = model.predict(X_test)

# Turn each output row into a one-hot row marking its highest-scoring class.
predicted_classes = np.argmax(y_pred, axis=1)
y_pred_clean = np.zeros_like(y_pred)
y_pred_clean[np.arange(len(predicted_classes)), predicted_classes] = 1

print(classification_report(y_test, y_pred_clean))