# Evaluation of BiLSTM-CRF

## Setup

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

### Imports

In [0]:
import pandas as pd
import time
import numpy as np
import math
from sklearn.model_selection import train_test_split, KFold

In [0]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Activation, concatenate, Flatten, Conv1D, MaxPooling1D, Masking
#!pip install -q keras_contrib
!pip install git+https://www.github.com/keras-team/keras-contrib.git
from keras_contrib.layers import CRF
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import classification_report

!pip install -q seqeval
from seqeval.metrics import classification_report as token_classification_report

### Data

In [0]:
dataset_path = F"/content/gdrive/My Drive/Colab Notebooks/Corpus/full_annotations_v10.csv"

In [0]:
df = pd.read_csv(dataset_path)
reviews = list(set(df['Review'].values))
reviews.sort(key=lambda x: int(x[6:]))

### Word Embeddings

In [0]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [0]:
def word2idx(word):
  if word not in word_to_index:
    idx = word_to_index['unk']
  else:
    idx = word_to_index[word]
  return idx

In [0]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
  vocab_len = len(word_to_index) + 1
  emb_dimension = word_to_vec_map['gurke'].shape[0]
  emb_matrix = np.zeros((vocab_len, emb_dimension))
  for word, index in word_to_index.items():
    emb_matrix[index, :] = word_to_vec_map[str(word)]
 
  embedding_layer = Embedding(vocab_len, emb_dimension, trainable=False)
  embedding_layer.build((None,))
  embedding_layer.set_weights([emb_matrix])

  return embedding_layer

In [0]:
#Fasttext

we_path_ft = F"/content/gdrive/My Drive/Colab Notebooks/Corpus/basket/cc.de.300.vec"

word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(we_path_ft)

In [0]:
#Glove

we_path_glove = F"/content/gdrive/My Drive/Colab Notebooks/Corpus/basket/vectors.txt"

word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(we_path_glove)

## Final Testing, all combined


#### Data Handling

In [0]:
df_new = df
reviews_train, reviews_dev_test = train_test_split(reviews, test_size=0.3, random_state=42)
reviews_dev, reviews_test = train_test_split(reviews_dev_test, test_size = (1/3), random_state=42)

z = 'O'
combined_labels = list()
for i in range(len(df_new)):
  if df_new['BIO'].iloc[i] == 'B':
    z = df_new['Ann_Ann'].iloc[i]
    combined_labels.append('B-' + z)
  elif df_new['BIO'].iloc[i] == 'I':
    combined_labels.append('I-' + z)
  else:
    z = 'O'
    combined_labels.append(z)

df_new['combined_labels'] = combined_labels

df_new_train = df_new[df_new['Review'].isin(reviews_train)]
df_new_dev = df_new[df_new['Review'].isin(reviews_dev)]
df_new_test = df_new[df_new['Review'].isin(reviews_test)]

n_tokens = len(df_new['Tokens'])

In [0]:
#Grouping the tokens per Review

reviews_train_tokens = dict()
reviews_train_labels = dict()
for review in reviews_train:
  df = df_new_train[df_new_train['Review'] == review]
  full_review = list(df['Tokens'].values)
  full_bio = list(df['combined_labels'].values)
  reviews_train_tokens[review] = full_review
  reviews_train_labels[review] = full_bio

reviews_dev_tokens = dict()
reviews_dev_labels = dict()
for review in reviews_dev:
  df = df_new_dev[df_new_dev['Review'] == review]
  full_review = list(df['Tokens'].values)
  full_bio = list(df['combined_labels'].values)
  reviews_dev_tokens[review] = full_review
  reviews_dev_labels[review] = full_bio

reviews_test_tokens = dict()
reviews_test_labels = dict()
for review in reviews_test:
  df = df_new_test[df_new_test['Review'] == review]
  full_review = list(df['Tokens'].values)
  full_bio = list(df['combined_labels'].values)
  reviews_test_tokens[review] = full_review
  reviews_test_labels[review] = full_bio

In [0]:
#preparing the labels

labels = list(set(df_new['combined_labels'].values))
labels.append('PAD')
n_labels = len(labels)
label2idx = {t: i for i, t in enumerate(labels)}
y_train = [[label2idx[i] for i in reviews_train_labels[rev]] for rev in reviews_train]
y_dev = [[label2idx[i] for i in reviews_dev_labels[rev]] for rev in reviews_dev]
y_test = [[label2idx[i] for i in reviews_test_labels[rev]] for rev in reviews_test]

In [0]:
n_reviews_train = len(reviews_train)
n_reviews_dev = len(reviews_dev)
n_reviews_test = len(reviews_test)

X_train = [[word2idx(str(i).lower()) for i in reviews_train_tokens[rev]] for rev in reviews_train]
X_dev = [[word2idx(str(i).lower()) for i in reviews_dev_tokens[rev]] for rev in reviews_dev]
X_test = [[word2idx(str(i).lower()) for i in reviews_test_tokens[rev]] for rev in reviews_test]

maxLen = len(max(X_train + X_dev + X_test, key=len))

In [1]:
X_train = pad_sequences(maxlen=maxLen, sequences=X_train, padding="post", value=word_to_index['pad'])
X_dev = pad_sequences(maxlen=maxLen, sequences=X_dev, padding="post", value=word_to_index['pad'])
X_test = pad_sequences(maxlen=maxLen, sequences=X_test, padding="post", value=word_to_index['pad'])

y_train = pad_sequences(maxlen=maxLen, sequences=y_train, padding="post", value=label2idx['PAD'])
y_train_cat = [to_categorical(i, num_classes=n_labels) for i in y_train]
y_dev = pad_sequences(maxlen=maxLen, sequences=y_dev, padding="post", value=label2idx['PAD'])
y_dev_cat = [to_categorical(i, num_classes=n_labels) for i in y_dev]
y_test = pad_sequences(maxlen=maxLen, sequences=y_test, padding="post", value=label2idx['PAD'])
y_test_cat = [to_categorical(i, num_classes=n_labels) for i in y_test]


NameError: ignored

In [0]:
X_train_arr = np.array(X_train)
X_dev_arr = np.array(X_dev)
X_test_arr = np.array(X_test)

y_train_arr = np.array(y_train_cat)
y_dev_arr = np.array(y_dev_cat)
y_test_arr = np.array(y_test_cat)

### Richtiger Ablauf

To test:


*   optimizer: sgd, adam
*   learning rate: 0.001, 0.05
*   dropout: 0.1, 0.5
*   number of hidden units: 200, 250




In [0]:
optimizer_list = ['sgd', 'adam']
learning_rate_list = [0.001, 0.05]
dropout_list =[0.1, 0.5]
hidden_units_list = [200, 250]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

#### 1/16
o 0, l 0, d 0, n 0

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[0]
dropout = dropout_list[0]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 2/16
o 1, l 0, d 0, n 0

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[0]
dropout = dropout_list[0]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
print(test_pred.shape)
print(y_dev_arr.shape)

(200, 775, 6)
(200, 775, 6)


In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred
print(final_pred)
print(len(final_pred))

final_test = list()
for test in test_labels:
  final_test += test
print(final_test)
print(len(final_test))


['B-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'O', 'B-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'O', 'B-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Claim', 'O', 'B-Claim', 'I-Claim', 'O', 'O', 'O', 'B-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'O', 'B-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Premise', 'I-Claim', 'I-Claim', 'I-Claim', 'O', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'O', 'O', 'O', 'B-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Clai

In [0]:
print(classification_report(final_test, final_pred))

              precision    recall  f1-score   support

     B-Claim       0.41      0.49      0.45      1584
   B-Premise       0.38      0.37      0.37      1806
     I-Claim       0.53      0.77      0.63     20576
   I-Premise       0.69      0.42      0.52     22254
           O       0.71      0.66      0.68      9389

    accuracy                           0.59     55609
   macro avg       0.54      0.54      0.53     55609
weighted avg       0.62      0.59      0.58     55609



#### 3/16
o 1, l 1, d 0, n 0

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[1]
dropout = dropout_list[0]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 4/16
o 1, l 1, d 1, n 0

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[1]
dropout = dropout_list[1]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 5/16
o 1, l 1, d 1, n 1

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[1]
dropout = dropout_list[1]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 6/16
o 0, l 1, d 1, n 1

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[1]
dropout = dropout_list[1]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 7/16
o 0, l 0, d 1, n 1

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[0]
dropout = dropout_list[1]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 8/16
o 0, l 0, d 0, n 1

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[0]
dropout = dropout_list[0]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 9/16
o 0, l 1, d 0, n 1

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[1]
dropout = dropout_list[0]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 10/16
o 1, l 1, d 0, n 1

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[1]
dropout = dropout_list[0]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 11/16
o 1, l 0, d 0, n 1

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[0]
dropout = dropout_list[0]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 12/16
o 1, l 0, d 1, n 1

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[0]
dropout = dropout_list[1]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 13/16
o 1, l 0, d 1, n 0

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[0]
dropout = dropout_list[1]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Eval

In [0]:
test_pred = model.predict(X_dev_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_dev_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

#### 14/16
o 0, l 0, d 1, n 0

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[0]
dropout = dropout_list[1]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 15/16
o 0, l 1, d 0, n 0

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[1]
dropout = dropout_list[0]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

#### 16/16
o 0, l 1, d 1, n 0

In [0]:
optimizer = optimizer_list[0]
learning_rate = learning_rate_list[1]
dropout = dropout_list[1]
hidden_units = hidden_units_list[0]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

##### Test

In [0]:
test_pred = model.predict(X_test_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
print(classification_report(final_test, final_pred))

## Final Model

o 1, l 1, d 0, n 1

#### Data Handling

In [0]:
df_new = df

z = 'O'
combined_labels = list()
for i in range(len(df_new)):
  if df_new['BIO'].iloc[i] == 'B':
    z = df_new['Ann_Ann'].iloc[i]
    combined_labels.append('B-' + z)
  elif df_new['BIO'].iloc[i] == 'I':
    combined_labels.append('I-' + z)
  else:
    z = 'O'
    combined_labels.append(z)

df_new['combined_labels'] = combined_labels

df_new_train = df_new[df_new['Review'].isin(reviews_train)]
df_new_dev = df_new[df_new['Review'].isin(reviews_dev)]
df_new_test = df_new[df_new['Review'].isin(reviews_test)]

n_tokens = len(df_new['Tokens'])
print(n_tokens)

272024


In [0]:
#Grouping the tokens per Review

reviews_train_tokens = dict()
reviews_train_labels = dict()
for review in reviews_train:
  df = df_new_train[df_new_train['Review'] == review]
  full_review = list(df['Tokens'].values)
  full_bio = list(df['combined_labels'].values)
  reviews_train_tokens[review] = full_review
  reviews_train_labels[review] = full_bio

reviews_dev_tokens = dict()
reviews_dev_labels = dict()
for review in reviews_dev:
  df = df_new_dev[df_new_dev['Review'] == review]
  full_review = list(df['Tokens'].values)
  full_bio = list(df['combined_labels'].values)
  reviews_dev_tokens[review] = full_review
  reviews_dev_labels[review] = full_bio

reviews_test_tokens = dict()
reviews_test_labels = dict()
for review in reviews_test:
  df = df_new_test[df_new_test['Review'] == review]
  full_review = list(df['Tokens'].values)
  full_bio = list(df['combined_labels'].values)
  reviews_test_tokens[review] = full_review
  reviews_test_labels[review] = full_bio

In [0]:
#preparing the labels

labels = list(set(df_new['combined_labels'].values))
labels.append('PAD')

n_labels = len(labels)

label2idx = {t: i for i, t in enumerate(labels)}



y_train = [[label2idx[i] for i in reviews_train_labels[rev]] for rev in reviews_train]

y_dev = [[label2idx[i] for i in reviews_dev_labels[rev]] for rev in reviews_dev]


y_test = [[label2idx[i] for i in reviews_test_labels[rev]] for rev in reviews_test]


['I-Claim', 'B-Premise', 'B-Claim', 'I-Premise', 'O', 'PAD']
6
{'I-Claim': 0, 'B-Premise': 1, 'B-Claim': 2, 'I-Premise': 3, 'O': 4, 'PAD': 5}
[[4, 4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 1, 3, 3, 3, 3, 3, 3, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 0, 0, 0, 0, 0, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 0, 0, 0, 0, 4, 4, 2, 0, 0, 0, 0, 4, 4, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 3

In [0]:
n_reviews_train = len(reviews_train)
n_reviews_dev = len(reviews_dev)
n_reviews_test = len(reviews_test)

X_train = [[word2idx(str(i).lower()) for i in reviews_train_tokens[rev]] for rev in reviews_train]

X_dev = [[word2idx(str(i).lower()) for i in reviews_dev_tokens[rev]] for rev in reviews_dev]

X_test = [[word2idx(str(i).lower()) for i in reviews_test_tokens[rev]] for rev in reviews_test]


maxLen = len(max(X_train + X_dev + X_test, key=len))


[[762172, 762172, 706905, 336685, 227451, 206624, 290678, 762172, 150874, 175140, 205046, 518175, 186291, 407367, 76458, 154185, 579565, 25904, 302675, 762172, 150753, 774320, 480114, 161009, 78996, 760032, 202924, 150753, 779309, 762172, 336685, 227451, 161009, 189977, 584791, 154183, 446200, 290678, 847042, 760032, 161009, 264436, 446200, 340563, 667682, 645124, 762172, 762172, 653340, 762172, 69279, 155857, 692429, 219859, 480114, 156895, 762172, 762172, 346974, 496190, 161009, 778257, 692428, 176962, 150753, 839220, 762172, 156895, 565751, 350270, 839819, 762172, 336685, 823218, 161750, 518166, 202678, 161009, 279871, 839220, 186099, 343735, 34258, 227451, 336685, 514125, 250986, 676824, 645124, 762172, 762172, 150753, 409405, 837518, 155857, 125957, 639885, 480114, 208977, 46958, 156895, 441794, 257007, 762172, 175140, 678431, 155857, 729451, 152098, 676824, 28830, 762172, 150874, 175140, 17065, 584791, 762172, 762172, 762172, 43187, 348812, 480114, 514125, 382317, 762172, 346206,

In [0]:
X_train = pad_sequences(maxlen=maxLen, sequences=X_train, padding="post", value=word_to_index['pad'])
X_dev = pad_sequences(maxlen=maxLen, sequences=X_dev, padding="post", value=word_to_index['pad'])
X_test = pad_sequences(maxlen=maxLen, sequences=X_test, padding="post", value=word_to_index['pad'])

y_train = pad_sequences(maxlen=maxLen, sequences=y_train, padding="post", value=label2idx['PAD'])
y_train_cat = [to_categorical(i, num_classes=n_labels) for i in y_train]
y_dev = pad_sequences(maxlen=maxLen, sequences=y_dev, padding="post", value=label2idx['PAD'])
y_dev_cat = [to_categorical(i, num_classes=n_labels) for i in y_dev]
y_test = pad_sequences(maxlen=maxLen, sequences=y_test, padding="post", value=label2idx['PAD'])
y_test_cat = [to_categorical(i, num_classes=n_labels) for i in y_test]


In [0]:
X_train_arr = np.array(X_train)
X_dev_arr = np.array(X_dev)
X_test_arr = np.array(X_test)

y_train_arr = np.array(y_train_cat)
y_dev_arr = np.array(y_dev_cat)
y_test_arr = np.array(y_test_cat)

### Training

In [0]:
optimizer_list = ['sgd', 'adam']
learning_rate_list = [0.001, 0.05]
dropout_list =[0.1, 0.5]
hidden_units_list = [200, 250]

In [0]:
optimizer = optimizer_list[1]
learning_rate = learning_rate_list[0]
dropout = dropout_list[0]
hidden_units = hidden_units_list[1]

In [0]:
def bio_tagger(input_shape=(maxLen, ), word_to_vec_map=word_to_vec_map, word_to_index=word_to_index):
  review_indices = Input(shape = input_shape, dtype='int32')
  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(review_indices)
  X = Bidirectional(LSTM(units=hidden_units, return_sequences=True, recurrent_dropout=dropout))(embeddings)
  X = TimeDistributed(Dense(units=hidden_units, activation="relu"))(X)
  crf = CRF(n_labels)
  out = crf(X)

  model = Model(inputs=review_indices, outputs=out)
  model.compile(optimizer=optimizer, loss=crf.loss_function, metrics=[crf.accuracy])

  return model

In [0]:
model = bio_tagger()

In [0]:
model.fit(X_train_arr, y_train_arr, epochs = 10, batch_size = 32, shuffle=True, validation_data=[X_dev_arr, y_dev_arr])

### Evaluation

In [0]:
test_pred = model.predict(X_test_arr)

In [0]:
idx2label = {i: w for w, i in label2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2label[p_i])
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test_arr)

In [0]:
pred_labels = [[x for x in pred if x != 'PAD'] for pred in pred_labels]
test_labels = [[x for x in test if x != 'PAD'] for test in test_labels]

In [0]:
final_pred = list()
for pred in pred_labels:
  final_pred += pred


final_test = list()
for test in test_labels:
  final_test += test



In [0]:
#new best model
print(classification_report(final_test, final_pred))

In [0]:
#new best model
print(token_classification_report(final_test, final_pred))