In [1]:
import numpy as np
import pandas as pd

import spacy

from sklearn import preprocessing

from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.data import Sentence

from tensorflow.keras.layers import Input, Dense, GRU, LSTM, Bidirectional, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [2]:
# Load the raw table and reduce it to rows that actually carry text.
df = (
    pd.read_csv('./data/NLData1.csv')
    .drop(columns=['Time', 'Speaker', 'ID'])  # columns not used below
    .drop(labels=0)                           # discard the row with index label 0
    .dropna(subset=['Text'])                  # drop rows whose Text is missing
    .reset_index(drop=True)
)
# df_train = df[:450]
# df_test = df[450:500] # Here I didn't use the full data instead just doing validations
# df_test = df_test.reset_index(drop=True)

In [3]:
class GetItAllDone:
    '''
    Bundles data preparation, Flair-embedding batch generators and a
    bidirectional-GRU classifier.

    Constructing an instance loads the 'ja-crawl' word embeddings and the
    'ja-forward' / 'ja-backward' Flair embeddings and stacks them.
    '''

    class clean_text:
        '''
        Class-based decorator applied to `_prepare_data`.

        Calling the wrapper runs the decorated function once, cleans the
        'Text' column, removes every row whose cleaned text occurs more than
        once, and splits the result into a training and a testing dataframe.
        '''

        # spaCy part-of-speech tags whose tokens are removed from a sentence.
        DELETE_POS = frozenset(["PUNCT", "SPACE", "SYM", "NUM"])

        def __init__(self, decorated):
            self.decorated = decorated
            # The spaCy pipeline is loaded lazily on the first `clean` call
            # and then reused; loading it once per sentence is very slow.
            self._nlp = None

        def clean(self, sentence):
            '''Remove all irrelevant tokens (see DELETE_POS) from the input sentence.'''
            if self._nlp is None:
                self._nlp = spacy.load("ja_core_news_sm")
            doc = self._nlp(sentence)
            return ' '.join(str(token) for token in doc if token.pos_ not in self.DELETE_POS)

        def __call__(self, *args, **kwargs):
            '''
            Inputs:
                the arguments of the decorated function, forwarded unchanged
            Returns:
                a tuple (df_train, df_test)
            '''
            # Run the decorated function exactly once and unpack its result.
            df, num_train, num_test = self.decorated(*args, **kwargs)
            df = df.assign(Text=df['Text'].apply(self.clean))
            # keep=False drops *every* member of a duplicate group.
            df = df.drop_duplicates(subset='Text', keep=False).reset_index(drop=True)
            df_train = df.iloc[:num_train]
            df_test = df.iloc[num_train:num_train + num_test].reset_index(drop=True)
            return (df_train, df_test)

    def __init__(self):
        self.ja_embedding = WordEmbeddings('ja-crawl')
        self.ja_forward_embedding = FlairEmbeddings('ja-forward')
        self.ja_backward_embedding = FlairEmbeddings('ja-backward')

        self.stacked_embedding = StackedEmbeddings([
            self.ja_embedding,
            self.ja_forward_embedding,
            self.ja_backward_embedding
        ])

    # This inner method prepares the data in the form ready for training.
    # It deliberately takes no `self`: the `clean_text` wrapper is a plain
    # callable object (not a function), so Python does not bind the instance
    # when it is looked up as `self._prepare_data`.
    @clean_text
    def _prepare_data(dataframe, column_to_prepare, num_train, num_test):
        '''
        Inputs:
            dataframe: a dataframe to get prepared; must contain a 'Text' column
            column_to_prepare: the column to get label-encoded into 'Code_Modified'
            num_train: the desired number of training data
            num_test: the desired number of testing data
        Returns:
            when called through the decorator (the only way it is exposed):
            a tuple with training and testing dataframe.
            The undecorated body itself returns (dataframe, num_train, num_test).
        '''
        prepared = dataframe[['Text', column_to_prepare]].copy()
        # Missing labels become their own class 'none' before encoding.
        labels = prepared[column_to_prepare].fillna('none')
        prepared['Code_Modified'] = preprocessing.LabelEncoder().fit_transform(labels)
        return (prepared, num_train, num_test)

    # Shared by both generators: one sentence -> max_length embedding vectors.
    def _embed_text(self, text, max_length, emb_size):
        '''
        Inputs:
            text: the (already cleaned) sentence to embed
            max_length: the number of tokens to take in one sentence
            emb_size: the dimension of the embedding space
        Returns:
            a list of exactly max_length vectors: the embeddings of the first
            max_length tokens, followed by zero vectors as padding
        '''
        sentence = Sentence(text)
        self.stacked_embedding.embed(sentence)
        vectors = []
        for token in sentence:
            if len(vectors) >= max_length:
                break
            vectors.append(token.embedding.cpu().detach().numpy())
        # float32 padding keeps a padded batch from being upcast to float64;
        # assumes the token vectors are float32 (PyTorch's default) -- TODO confirm.
        while len(vectors) < max_length:
            vectors.append(np.zeros(emb_size, dtype=np.float32))
        return vectors

    # This method will generate the training data
    def generate_train_data(self, data_to_train, column_to_prepare, num_train, num_test, batch_size, max_length, num_classes, emb_size):
        '''
        Inputs:
            data_to_train: the dataframe of training data
            column_to_prepare: the column to get label-encoded (then one-hot encoded per batch)
            num_train: the desired number of training data
            num_test: the desired number of testing data
            batch_size: number of training examples running through your network in one batch
            max_length: the number of tokens to take in one sentence
            num_classes: the number of classes in the target
            emb_size: the dimension of embedding space (4396 in this notebook)
        Returns:
            an endless generator of (x_batch, y_batch) numpy arrays ready to be fed into the model
        Raises:
            ValueError: if the prepared training split is empty
        '''
        # Prepare (and clean) the data once; doing this inside the epoch loop
        # repeats the whole spaCy pass for an identical result.
        train_data = self._prepare_data(data_to_train, column_to_prepare, num_train, num_test)[0]
        if train_data.empty:
            # Without this guard the loop below would spin forever without yielding.
            raise ValueError("No training rows left after preparation")

        x_batch = []
        y_batch = []
        while True:
            shuffled = train_data.sample(frac=1)  # new row order every pass
            for text, label in zip(shuffled['Text'], shuffled['Code_Modified']):
                x_batch.append(self._embed_text(text, max_length, emb_size))

                y = np.zeros(num_classes)
                y[int(label)] = 1
                y_batch.append(y)

                if len(y_batch) == batch_size:
                    yield np.array(x_batch), np.array(y_batch)
                    x_batch = []
                    y_batch = []

    # This method will generate the batches for testing
    def generate_test_data(self, data_to_test, column_to_prepare, num_train, num_test, batch_size, max_length, emb_size):
        '''
        Inputs:
            data_to_test: the dataframe of testing data
            column_to_prepare: the column to get label-encoded
            num_train: the desired number of training data
            num_test: the desired number of testing data
            batch_size: number of examples running through your network in one batch
            max_length: the number of tokens to take in one sentence
            emb_size: the dimension of embedding space (4396 in this notebook)
        Returns:
            an endless generator of x_batch numpy arrays, in dataframe row order
        Raises:
            ValueError: if the prepared testing split is empty
        '''
        test_data = self._prepare_data(data_to_test, column_to_prepare, num_train, num_test)[1]
        if test_data.empty:
            # Without this guard the loop below would spin forever without yielding.
            raise ValueError("No testing rows left after preparation")

        x_batch = []
        while True:
            for text in test_data['Text']:
                x_batch.append(self._embed_text(text, max_length, emb_size))
                if len(x_batch) == batch_size:
                    yield np.array(x_batch)
                    x_batch = []

    # Declare the model
    def declare_model(self, batch_size, max_len, emb_size, gru_size, num_classes):
        '''
        Inputs:
            batch_size: fixed batch dimension of the model input
            max_len: the number of tokens per sentence
            emb_size: the dimension of embedding space
            gru_size: positive integer, dimensionality of the output space
            num_classes: the number of classes in the target
        Returns:
            a compiled model
        '''
        sample = Input(batch_shape=(batch_size, max_len, emb_size))
        gru_out = Bidirectional(GRU(gru_size, return_sequences=True))(sample)
        gru_out = Flatten()(gru_out)
        # The targets are one-hot with exactly one class per sample, so the
        # output must be a distribution over classes: softmax, not sigmoid,
        # is the activation that matches categorical cross-entropy.
        predictions = Dense(num_classes, activation='softmax')(gru_out)

        model = Model(inputs=sample, outputs=[predictions])
        model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=["acc"])
        # summary() prints by itself and returns None, so it is not wrapped in print().
        model.summary()
        return model

    # Fetch the targets of testing data.
    # We will be using this method to make comparison with predicted values
    def fetch_target(self, data_to_test, column_to_prepare, num_train, num_test):
        '''
        Inputs:
            data_to_test: the dataframe of testing data
            column_to_prepare: the column to get label-encoded
            num_train: the desired number of training data
            num_test: the desired number of testing data
        Returns:
            a list with the 'Code_Modified' label of every testing row, in row order
        '''
        data_to_fetch = self._prepare_data(data_to_test, column_to_prepare, num_train, num_test)[1]
        return data_to_fetch['Code_Modified'].tolist()

In [4]:
# Endless generator of (x_batch, y_batch) pairs for training.
gen = GetItAllDone().generate_train_data(
    df,
    column_to_prepare='Relating Levels',
    num_train=300,
    num_test=100,
    batch_size=10,
    max_length=10,
    num_classes=6,
    emb_size=4396,
)

In [5]:
# Build and compile the classifier; the input shape must match the generator settings.
model = GetItAllDone().declare_model(
    batch_size=10,
    max_len=10,
    emb_size=4396,
    gru_size=20,
    num_classes=6,
)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(10, 10, 4396)]          0         
_________________________________________________________________
bidirectional (Bidirectional (10, 10, 40)              530160    
_________________________________________________________________
flatten (Flatten)            (10, 400)                 0         
_________________________________________________________________
dense (Dense)                (10, 6)                   2406      
Total params: 532,566
Trainable params: 532,566
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# 30 steps x batch_size 10 = 300 samples per epoch, i.e. num_train of the generator.
# NOTE(review): `gen` is a plain Python generator -- confirm it is safe to
# consume it with workers > 1.
model.fit(
    gen,
    steps_per_epoch=30,
    epochs=5,
    max_queue_size=15,
    workers=3,
)

Train for 30 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff95b086190>

In [19]:
# Endless generator of x batches over the testing split.
gen_test = GetItAllDone().generate_test_data(
    df,
    column_to_prepare='Relating Levels',
    num_train=300,
    num_test=100,
    batch_size=10,
    max_length=10,
    emb_size=4396,
)
# 10 steps x batch_size 10 = 100 predictions; keep the most probable class per row.
predict = model.predict(gen_test, steps=10).argmax(axis=1)

In [16]:
# Encoded labels of the testing split, in the same row order as the predictions.
ans = GetItAllDone().fetch_target(
    df,
    column_to_prepare='Relating Levels',
    num_train=300,
    num_test=100,
)

In [22]:
# Number of testing rows whose predicted class equals the true class (a count, not a ratio).
acc = sum(int(predicted == expected) for predicted, expected in zip(predict, ans))
print(acc)

97
