In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [None]:
import tensorflow as tf
from tensorflow.python import keras

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
from keras.models import Sequential, Model
from tensorflow.keras.layers import Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
# from keras_contrib.layers import CRF
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
# from keras_contrib.layers import CRF
# from tensorflow_addons.layers import CRF
from tf_crf_layer.loss import crf_loss
from tf_crf_layer.metrics import crf_accuracy

In [None]:
# Load the prepared dataset; later cells read its 'text_ind' and 'tag_ind' columns.
# read_pickle unpickles the file, so only load files from a trusted source.
data = pd.read_pickle('data_all.pkl')

In [None]:
# Word dictionary (presumably word -> integer index; TODO confirm against the
# notebook that produced the file). Its length is used as the vocabulary size.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('dict_wordsB.pkl', 'rb') as words_file:
    # The handle has its own name so it is not shadowed by the loaded object.
    dict_words = pickle.load(words_file)

In [None]:
# Tag dictionary; it is indexed by tag name later (dict_tegs["o"] is used as the
# padding tag), so keys are tag strings and values are tag indices.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('dict_tegsB.pkl', 'rb') as tags_file:
    # The handle has its own name so it is not shadowed by the loaded object.
    dict_tegs = pickle.load(tags_file)

In [None]:
# Vocabulary size: number of entries in the word dictionary.
n_token = len(dict_words)
# Bare name as the last expression shows the value in the cell output.
n_token

In [None]:
# Number of entries in the tag dictionary.
n_tag = len(dict_tegs)
# Bare name as the last expression shows the value in the cell output.
n_tag

In [None]:
# Length of the longest sequence in the 'text_ind' column.
maxlen = max(map(len, data['text_ind'].tolist()))
maxlen

In [None]:
def get_pad_train_test_val(data, test_size=0.1, val_size=0.25, random_state=2020):
    """Pad the indexed sequences and split them into train / validation / test sets.

    Relies on the module-level ``n_token`` and ``dict_tegs`` defined in earlier cells.

    Args:
        data: DataFrame with the columns ``text_ind`` and ``tag_ind``; each cell is
            expected to hold one sequence of integer indices (assumption, the
            values are passed straight to ``pad_sequences`` with ``dtype='int32'``).
        test_size: fraction of all samples held out as the test set.
        val_size: fraction of the remaining samples held out as the validation set.
        random_state: seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags)``.

    Raises:
        ValueError: if ``data`` contains no sequences.
    """
    tokens = data['text_ind'].tolist()
    if not tokens:
        raise ValueError("data['text_ind'] contains no sequences")
    maxlen = max(len(s) for s in tokens)

    # Pad tokens (X) at the end of each sequence, using index n_token - 1 as filler.
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token - 1)

    # Pad tags (y) with the index of the "o" tag, then one-hot encode every sequence.
    tags = data['tag_ind'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=dict_tegs["o"])
    n_tags = len(dict_tegs) + 1
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # First carve out the test set, then split the remainder into train / validation.
    # train_size is passed explicitly (as the complement) to keep the original
    # rounding of the split sizes.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags,
        test_size=test_size, train_size=1 - test_size, random_state=random_state,
    )
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_,
        test_size=val_size, train_size=1 - val_size, random_state=random_state,
    )

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

In [None]:
# Build the padded splits once. train_* feed the training cell and test_tags the
# evaluation cell; val_tokens / val_tags are not referenced by any later visible
# cell (training uses validation_split instead).
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data)

In [None]:
# Fix the random states so that experiments are reproducible.
from numpy.random import seed
seed(2)                # NumPy's global RNG
tf.random.set_seed(2)  # TensorFlow's global RNG

In [None]:
# Hyper-parameters read by get_bilstm_lstm_model().
input_dim = n_token + 1   # size of the embedding table
output_dim = 300          # used both as embedding width and as LSTM unit count
input_length = max(map(len, data['text_ind'].tolist()))  # longest sequence
n_tags = len(dict_tegs) + 1  # number of output classes
print(
    'input_dim: ', input_dim,
    '\noutput_dim: ', output_dim,
    '\ninput_length: ', input_length,
    '\nn_tags: ', n_tags,
)

In [None]:
def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> per-token softmax tagger.

    Reads the module-level hyper-parameters ``input_dim``, ``output_dim``,
    ``input_length`` and ``n_tags``. Prints the model summary as a side effect.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = Sequential()

    # Embedding layer: token index -> dense vector of size output_dim.
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(Dropout(0.2))

    # Bidirectional LSTM; forward and backward outputs are concatenated.
    model.add(Bidirectional(
        LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        merge_mode='concat',
    ))

    # Additional unidirectional LSTM on top of the BiLSTM output.
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Per-timestep softmax over the tag classes.
    # (A CRF output layer was tried here previously and is currently not used.)
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    model.summary()

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras releases reject.
    adam = Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)

    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

In [None]:
# hist = model.fit(train_tokens, np.array(train_tags), batch_size=64, verbose=1, epochs=1, validation_split=0.2)

In [None]:
def train_model(X, y, model, n_epochs=5, batch_size=64, validation_split=0.2):
    """Fit ``model`` one epoch at a time and collect the training loss of each epoch.

    Args:
        X: training inputs, passed unchanged to ``model.fit``.
        y: training targets, passed unchanged to ``model.fit``.
        model: object exposing a Keras-style ``fit`` that returns an object with
            a ``history`` dict containing a ``'loss'`` list.
        n_epochs: number of single-epoch ``fit`` calls to perform.
        batch_size: batch size forwarded to ``fit``.
        validation_split: validation fraction forwarded to ``fit``.

    Returns:
        List with one training-loss value per epoch.
    """
    losses = []
    for _ in range(n_epochs):
        # Each call trains exactly one epoch; the model keeps its weights between calls.
        hist = model.fit(
            X, y,
            batch_size=batch_size,
            verbose=1,
            epochs=1,
            validation_split=validation_split,
        )
        losses.append(hist.history['loss'][0])
    return losses

In [None]:
# Collects the per-epoch training losses, one column per training run
# (the training cell below stores its losses under 'with_add_lstm').
results = pd.DataFrame()

In [None]:
# Build and compile the tagger (this also prints its summary), then render the
# layer graph. Note: plot_model needs pydot and graphviz to be installed.
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)

In [None]:
# Train the model and keep the per-epoch training loss as a column of `results`.
# train_tags is a list of one-hot matrices; np.array stacks it into one array.
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

In [None]:
plt.style.use('ggplot')

def plot_history(history):
    """Plot the training loss (and the validation loss, when available) per epoch.

    Args:
        history: either a Keras ``History`` object (its ``.history`` dict is
            used) or a plain mapping. ``'loss'`` is required; ``'val_loss'``
            is optional and plotted only when present.
    """
    metrics = getattr(history, 'history', history)
    loss = list(metrics['loss'])
    val_loss = metrics.get('val_loss')
    epochs = range(1, len(loss) + 1)

    plt.figure(figsize=(12, 5))
    plt.plot(epochs, loss, 'b', label='Training loss')
    if val_loss is not None:
        plt.plot(epochs, list(val_loss), 'r', label='Validation loss')
        plt.title('Training and validation loss')
    else:
        plt.title('Training loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

# train_model() only returns the per-epoch training losses (stored in `results`),
# so no Keras History object exists to pass here; plot the recorded losses.
plot_history({'loss': results['with_add_lstm'].tolist()})

In [None]:
def pred2label(pred, index_to_tag=None):
    """Convert per-token class scores into tag labels.

    Args:
        pred: iterable of sequences; each sequence is an iterable of per-token
            score vectors (model probabilities or one-hot rows).
        index_to_tag: optional mapping from class index to tag label. When
            omitted, the module-level ``idx2tag`` mapping is used.

    Returns:
        List of lists of tag labels, one inner list per input sequence.

    Raises:
        KeyError: if a predicted class index is missing from the mapping.
    """
    mapping = idx2tag if index_to_tag is None else index_to_tag
    # int() makes the lookup key a plain Python int rather than a NumPy scalar.
    return [
        [mapping[int(np.argmax(scores))] for scores in sequence]
        for sequence in pred
    ]
# dict_tegs maps tag -> index (it is indexed with "o" when padding), so invert
# it to decode class indices back into tag labels.
# NOTE(review): the model has len(dict_tegs) + 1 output classes, so a predicted
# index may be missing from this mapping; confirm how that extra class is used.
idx2tag = {index: tag for tag, index in dict_tegs.items()}

# Predict with the model trained above on the held-out test split.
test_pred = model_bilstm_lstm.predict(test_tokens, verbose=1)
pred_labels = pred2label(test_pred)
test_labels = pred2label(test_tags)

In [None]:
!pip install seqeval

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# seqeval F1 of the predicted tag sequences against the reference ones.
f1_value = f1_score(test_labels, pred_labels)
print("F1-score: {:.1%}".format(f1_value))

In [None]:
!pip install sklearn_crfsuite

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report

# Classification report of the predicted tags against the reference tags.
report = flat_classification_report(
    y_true=test_labels,
    y_pred=pred_labels,
)
print(report)