# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The working process looks like the following:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [1]:
from google.colab import drive

import pandas as pd
import numpy as np
import collections
from enum import Enum

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

Using TensorFlow backend.


In [0]:
# installing tools for oversampling
# !pip install -U imbalanced-learn

#1. Data preparation

Deciding on the data preprocessing type

In [0]:
class InputType(Enum):
    BASELINE = 1
    IMPROVED = 2

input_type = InputType.IMPROVED

Reading the data from the file and storing it in a data frame

In [11]:
if input_type == InputType.IMPROVED:
  train_url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-bio-improved.tsv?token=AD7GEDJXECH44VAMQAIRA7S54WBLU'
elif input_type == InputType.BASELINE:
  train_url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-bio-baseline.tsv?token=AF75TYYBV4BIFHV3R5VNB2C537DZO'

train_df = pd.read_csv(train_url, sep='\t',names=["document_id", "sent_number","idx_token_beginning", "idx_token_end", "token","bio_label"], quoting = 3)
train_df.head()

Unnamed: 0,document_id,sent_number,idx_token_beginning,idx_token_end,token,bio_label
0,111111111,1,0,4,Next,O
1,111111111,1,5,11,plague,O
2,111111111,1,12,20,outbreak,O
3,111111111,1,21,23,in,O
4,111111111,1,24,34,Madagascar,O


Getting the data frame with sentences and saving tokens to the list

In [0]:
def get_sentence_list(input_df):
  df_sents = input_df.groupby('sent_number')['token'].apply(list)
  df_sents = df_sents.to_frame()
  df_sents['sent_number'] = df_sents.index
  df_sents["sentences"]= df_sents["token"].str.join(" ")
  sentence_list = df_sents["token"].to_list()
  return sentence_list

In [0]:
train_sentence_list = get_sentence_list(train_df)

Getting the data frame with labels and them to the list


In [0]:
def get_bio_sent_list(input_df):
  df_labels = input_df.groupby('sent_number')['bio_label'].apply(list)
  df_labels = df_labels.to_frame()
  bio_sent_list = df_labels["bio_label"].to_list()
  return bio_sent_list

In [0]:
train_bio_sent_list = get_bio_sent_list(train_df)

## Encoding data

In [0]:
MAX_SEQUENCE_LENGTH = 35

#### Encoding features

Reading the glove embeddings from the file.

In [17]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [18]:
embeddings_index = {}
file_path = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'
f = open(file_path)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [0]:
EMBEDDING_DIM = 100

In [0]:
def get_X(sentence_list, 
          max_sequence_length,
          embedding_dim):
  # first create a matrix of zeros, this is our embedding matrix
  embedding_matrix = np.zeros([len(sentence_list), max_sequence_length, embedding_dim])
  # for each word in out tokenizer lets try to find that work in our w2v model
  for i, sentence in enumerate(sentence_list):
    for j, word in enumerate(sentence_list[i]):
      if j > max_sequence_length:
          #Split these longer sentences later
          continue
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          # we found the word - add that words vector to the matrix
          embedding_matrix[i] = embedding_vector
      else:
          # doesn't exist, assign a random vector
          embedding_matrix[i] = np.random.randn(embedding_dim)
  return embedding_matrix

In [0]:
train_features = get_X(sentence_list=train_sentence_list,
                       max_sequence_length=MAX_SEQUENCE_LENGTH,
                       embedding_dim=EMBEDDING_DIM)

#### Encoding labels

In [0]:
# the following parameters should be changed
# if we switch back to three labels
label2idx = {"O": [1, 0, 0], "B": [0, 1, 0], "I": [0, 0, 1]}
N_CLASSES = 3

In [0]:
def get_y(sentence_list, 
          bio_sent_list,
          label_dict, 
          max_sequence_length, 
          n_classes):
  # first create a matrix of zeros, this is our embedding matrix
  labels = np.zeros([len(sentence_list), max_sequence_length, n_classes])
  # for each word in out tokenizer lets try to find that work in our w2v model
  for i, sentence in enumerate(sentence_list):
    for j, word in enumerate(bio_sent_list[i]):
      if j < max_sequence_length:
        labels[i][j] = label_dict.get(word)
      else:
        break
  return labels

In [0]:
y = get_y(sentence_list=train_sentence_list,
          bio_sent_list=train_bio_sent_list,
          label_dict=label2idx,
          max_sequence_length=MAX_SEQUENCE_LENGTH,
          n_classes=N_CLASSES)

BONUS: Some attempts of applying oversampling

In [0]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(ratio='minority')
# y_train=y_train.astype('int')
# print(y_train[0])
# rows = y_train.shape[0]
# y_train = y_train.reshape(-1, n_classes).argmax(axis=1).flatten()
# print(y_train.shape)
# X_train = X_train.reshape(rows, MAX_SEQUENCE_LENGTH, 100).reshape(-1, 100)
# print(X_train.shape)
# X_sm, y_sm = smote.fit_sample(X_train, y_train)

# 2. Creating the model

In [25]:
model = Sequential()

model.add(Bidirectional(CuDNNLSTM(512, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
model.add(Dropout(0.25))

model.add(TimeDistributed(Dense(N_CLASSES, activation='softmax')))

model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['categorical_accuracy'])
print(model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 35, 1024)          2514944   
_________________________________________________________________
dropout_1 (Dropout)          (None, 35, 1024)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 35, 3)             3075      
Total params: 2,518,019
Trainable params: 2,518,019
Non-trainable params: 0
_________________________________________________________________
None


# 3. Training the model.

#### Training using all of the data. 

In [26]:
batch_size = 128
history = model.fit(train_features, y, epochs=10, batch_size=batch_size, verbose=1, validation_split=0.1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 14097 samples, validate on 1567 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Training using only some part of the data. For testing the model.

Splitting the data

In [0]:
test_split = True

In [42]:
if test_split:
  X_train, X_test, y_train, y_test = train_test_split(train_features, y, test_size=0.1)
  history = model.fit(X_train, y_train, epochs=10, batch_size=batch_size, verbose=1, validation_split=0.1)

Train on 12687 samples, validate on 1410 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 4. Testing the model

In [45]:
if test_split:
  y_hat = model.predict(X_test)
  y_hat_flat = y_hat.reshape(-1, N_CLASSES).argmax(axis=1)
  y_test_flat = y_test.reshape(-1, N_CLASSES).argmax(axis=1)
  print(f1_score(y_hat_flat, y_test_flat, average="macro"))

0.35392193810572214


  'recall', 'true', average, warn_for)


Making the true and predicted labels flat for further analysis.

In [44]:
if test_split:
  target_names = ["O", "B", "I"]
  print(classification_report(y_test_flat, y_hat_flat, target_names=target_names))
  print(confusion_matrix(y_true=y_test_flat, y_pred=y_hat_flat))

              precision    recall  f1-score   support

           O       0.93      1.00      0.96     50666
           B       0.00      0.00      0.00       494
           I       0.46      0.06      0.10      3685

    accuracy                           0.92     54845
   macro avg       0.46      0.35      0.35     54845
weighted avg       0.89      0.92      0.89     54845

[[50423     0   243]
 [  487     0     7]
 [ 3474     0   211]]


  'precision', 'predicted', average, warn_for)


# 5. Getting predictions for development data

Reading the development data frame

In [0]:
if input_type == InputType.IMPROVED:
  dev_url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved.tsv?token=AD7GEDK7D5VA3ADADCVIWHK54WCWM'
elif input_type == InputType.BASELINE:
  dev_url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-baseline.tsv?token=AF75TY24Z3BQ7Q3XIJ4A4TC54BBQI'

dev_df = pd.read_csv(dev_url, sep='\t',names=["document_id", "sent_number","idx_token_beginning", "idx_token_end", "token"], quoting = 3)

Getting the number of training instances (rows) in the data frame

In [0]:
n_rows_dev = dev_df.shape[0]

Preparing the dev data and making predictions

In [0]:
dev_sentence_list = get_sentence_list(dev_df)

In [0]:
dev_features = get_X(sentence_list=dev_sentence_list,
                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                     embedding_dim=EMBEDDING_DIM)

In [0]:
y_hat = model.predict(dev_features)

From one-hot encoding to integers

In [0]:
y_hat = y_hat.reshape(-1, N_CLASSES).argmax(axis=1).reshape(len(dev_sentence_list), MAX_SEQUENCE_LENGTH)

Mapping the predictions to the corresponding indeces in the dev data frame

In [0]:
def get_labels_vector(sentence_list,
                      predicted_labels,
                      max_sequence_length,
                      n_rows):

  labels_vector = []

  for i, _ in enumerate(sentence_list):
    for j, _ in enumerate(sentence_list[i]):
      if predicted_labels[i][j] == 0:
        label = "O"
      elif predicted_labels[i][j] == 1:
        label = "B"
      else:
        label = "I"
      labels_vector.append(label)
  return labels_vector

In [0]:
predicted_labels_column = get_labels_vector(dev_sentence_list, 
                                            y_hat, 
                                            MAX_SEQUENCE_LENGTH,
                                            n_rows_dev)

Concatenation of the original dev data frame and the prediction vector

In [0]:
result_df = pd.concat([dev_df, pd.DataFrame(predicted_labels_column, columns=["bio_label"])], axis=1, sort=False)

Overview of the results

In [37]:
result_df.head()

Unnamed: 0,document_id,sent_number,idx_token_beginning,idx_token_end,token,bio_label
0,730081389,1,0,6,Police,O
1,730081389,1,7,10,had,O
2,730081389,1,11,21,previously,O
3,730081389,1,22,26,gone,O
4,730081389,1,27,29,to,O


In [38]:
result_df["bio_label"].value_counts()

O    67532
I       32
Name: bio_label, dtype: int64

Saving data frame to a file

In [0]:
result_df.to_csv(path_or_buf="dev_predictions_bio.tsv",
                 sep="\t",
                 header=False,
                 index=False)