In [None]:
import keras

from keras.layers import LSTM,Embedding,Bidirectional,TimeDistributed,Dense
from keras_contrib.layers import CRF
import pickle
from keras.optimizers import Adam
import numpy as np
from keras.layers import Input
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [None]:
import numpy as np
import keras

class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that serves one (tokens, one-hot labels) pair per batch.

    Each call to ``__getitem__`` returns a single sample wrapped in a leading
    axis of length 1, and ``__len__`` reports one batch per sample.

    Args:
        tr: Indexable collection of token-id sequences.
        tr_y: Indexable collection of label-id sequences, indexed in parallel
            with ``tr``.
        batch_size: Stored on the instance but never read; batches always
            contain exactly one sample.
        n_classes: Number of classes passed to ``to_categorical``.
        shuffle: When truthy, the sample order is reshuffled in
            ``on_epoch_end`` (which also runs once at construction).
        remap_from: Token id that is replaced before a sample is returned.
        remap_to: Token id substituted for ``remap_from``. The defaults
            reproduce the remapping 514370 -> 170965 that used to be
            hard-coded in this class.
    """

    def __init__(self, tr, tr_y, batch_size=1, n_classes=12, shuffle=True,
                 remap_from=514370, remap_to=170965):
        """Store the data and options, then build the initial sample order."""
        self.batch_size = batch_size
        self.labels = tr_y
        self.X = tr
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.remap_from = remap_from
        self.remap_to = remap_to
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (one per sample)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``index``-th batch as ``(X, y)`` arrays of batch size 1."""
        # Translate the batch position into a (possibly shuffled) sample index.
        sample_index = self.indexes[index]

        X, y = self.__data_generation(sample_index)
        # Wrapping in a list adds the leading batch axis.
        return np.array([X]), np.array([y])

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Build one sample: remapped token ids and one-hot encoded labels.

        ``indexes`` is a single sample index, not a list of indexes.
        """
        X = [self.remap_to if token == self.remap_from else token
             for token in self.X[indexes]]
        y = keras.utils.to_categorical(self.labels[indexes],
                                       num_classes=self.n_classes)
        return X, y

In [None]:
class TDataGenerator(keras.utils.Sequence):
    """Keras Sequence that serves one unlabeled token sequence per batch.

    Each call to ``__getitem__`` returns a single sample wrapped in a leading
    axis of length 1, and ``__len__`` reports one batch per sample.

    Args:
        tr: Indexable collection of token-id sequences.
        batch_size: Stored on the instance but never read; batches always
            contain exactly one sample.
        shuffle: When truthy, the sample order is reshuffled in
            ``on_epoch_end`` (which also runs once at construction).
        remap_from: Token id that is replaced before a sample is returned.
        remap_to: Token id substituted for ``remap_from``. The defaults
            reproduce the remapping 514370 -> 170965 that used to be
            hard-coded in this class.
    """

    def __init__(self, tr, batch_size=1, shuffle=False,
                 remap_from=514370, remap_to=170965):
        """Store the data and options, then build the initial sample order."""
        self.batch_size = batch_size
        self.X = tr
        self.shuffle = shuffle
        self.remap_from = remap_from
        self.remap_to = remap_to
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (one per sample)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``index``-th batch as an array of batch size 1."""
        # Translate the batch position into a (possibly shuffled) sample index.
        sample_index = self.indexes[index]

        X = self.__data_generation(sample_index)
        # Wrapping in a list adds the leading batch axis.
        return np.array([X])

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Build one sample: the token ids with ``remap_from`` replaced.

        ``indexes`` is a single sample index, not a list of indexes.
        """
        return [self.remap_to if token == self.remap_from else token
                for token in self.X[indexes]]

In [None]:
# Tag vocabulary: every label string is paired with its integer class id,
# numbered consecutively from 0 in the order listed here.
t = dict(zip(
    ('0', 'TIT', 'CNP', 'STD', 'EFD', 'TED', 'PER', 'VAL', 'GOV', 'JUR', 'LEG', 'HEAD'),
    range(12),
))

In [None]:
# Number of output classes: one per entry of the tag dictionary `t`.
n_tags = len(t)
# Vocabulary size; the embedding layer is created with vocab_size + 1 rows.
vocab_size = 170965
# Sequence length used for the network input and for padding.
max_len = 69593

In [None]:
# Load the pickled training sequences. The context manager closes the file
# handle as soon as the object has been read (it was previously left open).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("Train", "rb") as file:
    tr = pickle.load(file)

In [None]:
# Load the pickled training labels. The context manager closes the file
# handle as soon as the object has been read (it was previously left open).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("Labels", "rb") as file:
    tr_y = pickle.load(file)

In [None]:
#tr=pad_sequences(tr, maxlen=max_len, dtype='int32', padding='post', truncating='post', value=0)

#tr_y=pad_sequences(tr_y, maxlen=max_len, dtype='int32', padding='post', truncating='post', value=0)

In [None]:
# Load the pickled embedding weight matrix, closing the file handle afterwards
# (the handle was previously never closed).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("embd_wt", "rb") as wt_file:
    wt = pickle.load(wt_file)

In [None]:
# Prepend a single all-zero row of width 225, shifting every existing row down
# by one. Presumably this row serves index 0, which the Embedding masks
# (mask_zero=True) -- confirm. Re-running this cell prepends another row.
wt = np.concatenate((np.zeros((1, 225), dtype=int), wt), axis=0)

Model

In [None]:
# Network graph: token ids -> embedding -> BiLSTM -> per-step dense -> CRF.
input = Input(shape=(max_len,))

# Embedding initialised from the matrix in `wt`; token id 0 is masked.
model = Embedding(
    input_dim=vocab_size + 1,
    output_dim=225,
    weights=[wt],
    input_length=max_len,
    mask_zero=True,
)(input)

# Bidirectional LSTM returning one vector per time step; recurrent_dropout
# is what the original note refers to as a "variational biLSTM".
model = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)
)(model)

# Dense ReLU layer applied at every time step (as suggested by neuralNer).
model = TimeDistributed(
    Dense(50, activation="relu")
)(model)

# CRF output layer with one unit per tag.
crf = CRF(n_tags)
out = crf(model)


In [None]:
# Wrap the graph in a Model. This rebinds `model`, which until now held the
# output tensor of the TimeDistributed layer.
model = Model(inputs=input, outputs=out)

In [None]:
# The loss and the metric both come from the CRF layer instance itself.
model.compile(
    optimizer="adam",
    loss=crf.loss_function,
    metrics=[crf.accuracy],
)

Training

In [None]:
# Training generator with its defaults (one sequence per batch, shuffled).
gen = DataGenerator(tr=tr, tr_y=tr_y)

In [None]:
# Train for a single epoch over the generator, with progress output.
history = model.fit_generator(
    generator=gen,
    epochs=1,
    verbose=1,
)

In [None]:
# Persist the trained model to disk.
# NOTE(review): the loading section below reads "PreProcessedModel.h5", not
# this file -- confirm which checkpoint the evaluation is meant to use.
model.save(filepath='my_model.h5')

Loading a Model

In [None]:
from keras.models import load_model
from keras_contrib.layers import CRF


In [None]:
def create_custom_objects():
    """Build the ``custom_objects`` mapping for loading a model with a CRF layer.

    The returned dict maps both "ClassWrapper" and "CRF" to a CRF subclass
    that records the most recently constructed instance, and maps "loss" and
    "accuracy" to functions that forward their arguments to that instance's
    ``loss_function`` and ``accuracy``.
    """
    latest = {"instance": None}

    class ClassWrapper(CRF):
        def __init__(self, *args, **kwargs):
            # Record this instance first, then run the normal CRF setup.
            latest["instance"] = self
            super(ClassWrapper, self).__init__(*args, **kwargs)

    def loss(*args):
        # Forward to whichever wrapper instance was constructed last.
        return latest["instance"].loss_function(*args)

    def accuracy(*args):
        # Forward to whichever wrapper instance was constructed last.
        return latest["instance"].accuracy(*args)

    custom_objects = {"ClassWrapper": ClassWrapper, "CRF": ClassWrapper}
    custom_objects["loss"] = loss
    custom_objects["accuracy"] = accuracy
    return custom_objects



In [None]:
# Restore a saved model; the custom objects let Keras resolve the CRF layer
# together with its loss and accuracy functions.
model = load_model(
    filepath="PreProcessedModel.h5",
    custom_objects=create_custom_objects(),
)

Evaluation

In [None]:
# Load the pickled test sequences, closing the file handle afterwards
# (the handle was previously never closed).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("Test", "rb") as test_file:
    test = pickle.load(test_file)

In [None]:
# Load the pickled test labels, closing the file handle afterwards
# (the handle was previously never closed).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("TLabels", "rb") as tlabels_file:
    tlabels = pickle.load(tlabels_file)

In [None]:
# Load the pickled "Tzones" object, closing the file handle afterwards
# (the handle was previously never closed).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("Tzones", "rb") as zones_file:
    zones = pickle.load(zones_file)

In [None]:
# Bring every test sequence to exactly max_len entries: shorter ones are
# zero-padded at the end, longer ones are cut at the end.
test = pad_sequences(
    test,
    maxlen=max_len,
    dtype='int32',
    padding='post',
    truncating='post',
    value=0,
)


In [None]:
# Pad/truncate the label sequences the same way as the test inputs, so that
# positions line up; padded positions receive label id 0.
tlabels = pad_sequences(
    tlabels,
    maxlen=max_len,
    dtype='int32',
    padding='post',
    truncating='post',
    value=0,
)


In [None]:
# Prediction generator with its defaults (one sequence per batch, unshuffled).
tgen = TDataGenerator(tr=test)

In [None]:
# Run inference over the whole test set: one step per test sequence, using
# eight worker processes.
pred = model.predict_generator(
    generator=tgen,
    steps=len(test),
    use_multiprocessing=True,
    workers=8,
    verbose=1,
)

In [None]:
# Collapse the per-class scores of every time step into a predicted class id.
# A single vectorised argmax over the last axis replaces the nested Python
# loop, which called np.argmax once per time step of every sequence.
# The result is an integer ndarray rather than a list of lists.
pred = np.argmax(pred, axis=-1)

In [None]:
# Flatten the padded label matrix into one long vector of class ids.
true_y = tlabels.reshape(-1)

In [None]:
# Flatten the predictions in the same way so they align with true_y.
pred = np.array(pred).reshape(-1)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of positions whose predicted class id equals the true one.
acc = accuracy_score(y_true=true_y, y_pred=pred)

In [None]:
# Show the accuracy value as this cell's output.
acc

In [None]:
# Per-class precision, recall, F-score and support (`score` is sklearn's
# precision_recall_fscore_support).
precision, recall, fscore, support = score(y_true=true_y, y_pred=pred)

# Print each metric array on its own line.
for template, values in (
    ('precision: {}', precision),
    ('recall: {}', recall),
    ('fscore: {}', fscore),
    ('support: {}', support),
):
    print(template.format(values))

In [None]:
# Print one row per tag: name, precision, recall, F-score, support.
# zip() walks the tag names and the metric arrays together, so the row count
# follows the data. The former hard-coded range(11) skipped the 12th tag and
# would have raised IndexError if fewer than 11 classes had been scored.
# NOTE(review): rows line up with tag names only if every tag id occurs in
# true_y or pred -- confirm, or pass labels= when computing the scores.
keys = list(t.keys())
for tag, tag_precision, tag_recall, tag_fscore, tag_support in zip(
        keys, precision, recall, fscore, support):
    print(tag, tag_precision, tag_recall, tag_fscore, tag_support)

In [None]:
# Collect the four per-class metric arrays under readable column names.
model1score = dict(zip(
    ("Precision", "Recall", "F1-score", "Support"),
    (precision, recall, fscore, support),
))

In [None]:
# Persist the metrics dict. The context manager guarantees the file is flushed
# and closed (the handle was previously never closed explicitly).
with open("score", "wb") as score_file:
    pickle.dump(model1score, score_file)

In [None]:
# Add the tag names as an "Id" column. This happens after the dict was
# pickled, so the file on disk does not contain this key.
model1score["Id"] = list(t)

In [None]:
# Show the collected metrics as a table. The cell previously passed `score`,
# which is the imported sklearn function precision_recall_fscore_support, not
# the results dict.
pd.DataFrame.from_dict(model1score)