# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The working process looks like the following:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [1]:
from google.colab import drive

import pandas as pd
import numpy as np

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

Using TensorFlow backend.


#1. Data preparation

Reading the data from the file and storing it in a data frame

In [0]:
url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-bio.tsv?token=AFDEFD7WXGQPGEJK6X5OB6C53AMEC'
df = pd.read_csv(url, sep='\t',names=["document_id", "sent_number","idx_token_beginning", "idx_token_end", "token","bio_label"], quoting = 3)

Getting the data frame with sentences and saving tokens to the list

In [0]:
df_sents = df.groupby('sent_number')['token'].apply(list)
df_sents = df_sents.to_frame()
df_sents['sent_number'] = df_sents.index
df_sents["sentences"]= df_sents["token"].str.join(" ")

In [0]:
sentence_list = df_sents["token"].to_list()

Getting the data frame with labels and them to the list


In [0]:
df_labels = df.groupby('sent_number')['bio_label'].apply(list)
df_labels = df_labels.to_frame()

In [0]:
bio_sent_list = df_labels["bio_label"].to_list()

## Encoding data

In [0]:
MAX_SEQUENCE_LENGTH = 30

#### Encoding features

Reading the glove embeddings from the file.

In [8]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
embeddings_index = {}
file_path = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'
f = open(file_path)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [0]:
embedding_dim = 100
# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros([len(sentence_list), MAX_SEQUENCE_LENGTH, embedding_dim])
# for each word in out tokenizer lets try to find that work in our w2v model
for i, sentence in enumerate(sentence_list):
    for j, word in enumerate(sentence_list[i]):
        if j > MAX_SEQUENCE_LENGTH:
        #Split these longer sentences later
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # we found the word - add that words vector to the matrix
            embedding_matrix[i] = embedding_vector
        else:
            # doesn't exist, assign a random vector
            embedding_matrix[i] = np.random.randn(embedding_dim)

#### Encoding labels

In [0]:
# the following parameters should be changed
# if we switch back to three labels
label2idx = {"O": [1, 0], "I": [0, 1], "B": [0, 1]}
n_classes = 2

In [0]:
# first create a matrix of zeros, this is our embedding matrix
y = np.zeros([len(sentence_list), MAX_SEQUENCE_LENGTH, n_classes])
# for each word in out tokenizer lets try to find that work in our w2v model
for i, sentence in enumerate(sentence_list):
    for j, word in enumerate(bio_sent_list[i]):
        if j < MAX_SEQUENCE_LENGTH:
            y[i][j] = label2idx.get(word)
        else:
            break 

Splitting the data

In [0]:
X_train, X_test, y_train, y_test = train_test_split(embedding_matrix, y, test_size=0.1)

# 2. Creating the model

In [14]:
model = Sequential()

model.add(Bidirectional(CuDNNLSTM(512, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, embedding_dim)))
model.add(Dropout(0.25))

model.add(TimeDistributed(Dense(n_classes, activation='softmax')))

model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['categorical_accuracy'])
print(model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 30, 1024)          2514944   
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 1024)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 2)             2050      
Total params: 2,516,994
Trainable params: 2,516,994
Non-trainable params: 0
_________________________________________________________________
None


# 3. Training the model.

In [15]:
batch_size = 128
history = model.fit(X_train, y_train, epochs=10, batch_size=batch_size, verbose=1, validation_split=0.1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 12687 samples, validate on 1410 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 4. Testing the model

In [0]:
y_hat = model.predict(X_test)

Making the true and predicted labels flat for further analysis.

In [0]:
y_hat_flat = y_hat.reshape(-1, n_classes).argmax(axis=1)

In [0]:
y_test_flat = y_test.reshape(-1, n_classes).argmax(axis=1)

In [19]:
f1_score(y_hat_flat, y_test_flat, average="macro")

0.48639353028075866

In [0]:
target_names = ["O", "I"]

In [21]:
print(classification_report(y_test_flat, y_hat_flat, target_names=target_names))

              precision    recall  f1-score   support

           O       0.91      1.00      0.95     42922
           I       0.19      0.01      0.02      4088

    accuracy                           0.91     47010
   macro avg       0.55      0.50      0.49     47010
weighted avg       0.85      0.91      0.87     47010



In [22]:
confusion_matrix(y_true=y_test_flat, y_pred=y_hat_flat)

array([[42738,   184],
       [ 4045,    43]])