# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The working process looks like the following:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [1]:
from google.colab import drive

import pandas as pd
import numpy as np
import collections
from enum import Enum

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

Using TensorFlow backend.


In [0]:
# installing tools for oversampling
# !pip install -U imbalanced-learn

In [0]:
class InputType(Enum):
    """Identifies which preprocessed variant of the dataset the notebook uses."""
    # The member chosen in the config cell selects the train/dev files that are
    # loaded and, for the TWENTY/FOURTY/FIFTY baselines, the sequence length.
    BASELINE = 1
    IMPROVED = 2
    BASELINE_TWENTY = 3
    BASELINE_FOURTY = 4
    BASELINE_FIFTY = 5
    IMPROVED_SENTIMENT = 6

# Config

In [0]:
# Dataset variant to load; it also determines MAX_SEQUENCE_LENGTH further below.
# input_type = InputType.BASELINE
input_type = InputType.IMPROVED_SENTIMENT
# When True, a "sentiment" column is read from the data files and appended to
# every token's embedding as one extra input feature.
add_sentiment_features = True

# Size of one word vector (the GloVe file read below is the 100-dimensional one).
EMBEDDING_DIM = 100
batch_size = 128

# epochs = 10
epochs = 15

# Number of output classes: with 2, B and I share one class; with 3, O / I / B
# are kept apart (see label2idx).
N_CLASSES = 2
# N_CLASSES = 3
# class_weighting = False
# When True, per-token loss weights are built from the three weights below.
class_weighting = True
o_weight = 1.0
i_weight = 6.5
b_weight = 6.5

# When True, the hold-out training and evaluation cells (sections 3 and 4) run.
test_split = False


# 1. Data preparation

Deciding on the data preprocessing type

Reading the data from the file and storing it in a data frame

In [5]:
# Location of the training file for every supported preprocessing variant.
# NOTE(review): the raw.githubusercontent.com URLs embed access tokens. Tokens
# should not live in a shared notebook; consider reading the files from a
# local/Drive path or taking the token from the environment instead.
TRAIN_SOURCES = {
  InputType.IMPROVED: 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-bio-improved.tsv?token=AD7GEDJSZEXG4DJTXIDCSY2542WA4',
  InputType.BASELINE: 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-bio-baseline.tsv?token=AF75TYYBV4BIFHV3R5VNB2C537DZO',
  InputType.BASELINE_FIFTY: 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-bio-baseline-50.tsv?token=AD7GEDL2V5JNNRXKNTPVGAC54WIEQ',
  # The following three were uploaded as temp files to this colab project
  InputType.BASELINE_TWENTY: '/content/train-data-bio-baseline-20.tsv',
  InputType.BASELINE_FOURTY: '/content/train-data-bio-baseline-40.tsv',
  InputType.IMPROVED_SENTIMENT: '/content/train-data-bio-improved-sentiment.tsv',
}

# Fail right here with a clear message instead of a NameError further down
# when an input type has no training file configured.
if input_type not in TRAIN_SOURCES:
  raise ValueError('No training data configured for input type %r' % (input_type,))
train_url = TRAIN_SOURCES[input_type]

# The files have no header row, so the column names are supplied explicitly.
# NOTE(review): the sentiment column is added whenever add_sentiment_features
# is True -- confirm the selected file actually contains that column.
names = ["document_id", "sent_number","idx_token_beginning", "idx_token_end", "token", "bio_label"]
if add_sentiment_features:
  names.append("sentiment")

# quoting=3 is csv.QUOTE_NONE: quote characters inside tokens are kept as-is.
train_df = pd.read_csv(train_url, sep='\t', names=names, quoting=3)
train_df.head()

Unnamed: 0,document_id,sent_number,idx_token_beginning,idx_token_end,token,bio_label,sentiment
0,111111111,1,0,4,Next,O,0.0
1,111111111,1,5,11,plague,O,0.0
2,111111111,1,12,20,outbreak,O,0.0
3,111111111,1,21,23,in,O,0.0
4,111111111,1,24,34,Madagascar,O,0.0


Getting the data frame with sentences and saving tokens to the list

In [6]:
# Token-level label distribution of the training data (shows the class imbalance).
print(train_df["bio_label"].value_counts())

O    350354
I     45542
B      5392
Name: bio_label, dtype: int64


In [0]:
def get_sentence_list(input_df):
  """Group a token-per-row data frame into per-sentence token lists.

  Args:
    input_df: data frame with at least the columns 'sent_number' and 'token'.

  Returns:
    A list with one entry per distinct 'sent_number', ordered by ascending
    'sent_number' (the default groupby ordering); each entry is the list of
    that sentence's tokens in row order.
  """
  # Only the token lists are returned, so no helper columns are materialised.
  tokens_per_sentence = input_df.groupby('sent_number')['token'].apply(list)
  return tokens_per_sentence.to_list()

In [0]:
# One list of tokens per training sentence.
train_sentence_list = get_sentence_list(train_df)

Getting the data frame with labels and saving them to a list


In [0]:
def get_cols(input_df, col='bio_label'):
  """Collect the values of one column sentence by sentence.

  Args:
    input_df: data frame with a 'sent_number' column and the column `col`.
    col: name of the column whose values are gathered.

  Returns:
    A list with one entry per distinct 'sent_number'; each entry is the list
    of that sentence's `col` values in row order.
  """
  values_per_sentence = input_df.groupby('sent_number')[col].apply(list)
  return values_per_sentence.to_list()

In [0]:
# One list of BIO labels per training sentence, parallel to train_sentence_list.
train_bio_sent_list = get_cols(train_df)

In [0]:
# Per-sentence sentiment values; only defined when sentiment features are enabled.
if add_sentiment_features:
  train_sentiment = get_cols(train_df, 'sentiment')

## Encoding data

In [12]:
# The fixed-length baseline variants carry their sequence length in their name;
# every other input type is padded/truncated to 35 tokens.
MAX_SEQUENCE_LENGTH = {
  InputType.BASELINE_FIFTY: 50,
  InputType.BASELINE_TWENTY: 20,
  InputType.BASELINE_FOURTY: 40,
}.get(input_type, 35)

MAX_SEQUENCE_LENGTH

35

#### Encoding features

Reading the GloVe embeddings from the file.

In [13]:
# Mount Google Drive in the Colab runtime; the GloVe file is read from it below.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
# Map every word in the GloVe text file to its embedding vector.
embeddings_index = {}
file_path = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'
# The explicit encoding keeps the read independent of the platform default,
# and the context manager closes the file even if a line fails to parse.
with open(file_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Each line is "<word> <v1> <v2> ...".
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [0]:
def get_X(sentence_list, 
          max_sequence_length,
          embedding_dim,
          n_features,
          sentiment_features):
  """Build the zero-padded input tensor of word embeddings (plus features).

  Word vectors are looked up in the module-level `embeddings_index`.

  Args:
    sentence_list: list of sentences, each a list of tokens.
    max_sequence_length: number of token positions per sentence; shorter
      sentences are zero-padded, longer ones are truncated.
    embedding_dim: length of one word vector.
    n_features: number of extra feature slots appended after the embedding.
    sentiment_features: per-sentence lists of per-token sentiment values
      (parallel to `sentence_list`), or None when no sentiment is used.

  Returns:
    Float array of shape
    (len(sentence_list), max_sequence_length, embedding_dim + n_features).
  """
  embedding_matrix = np.zeros([len(sentence_list),
                               max_sequence_length,
                               embedding_dim + n_features])
  print(embedding_matrix.shape)

  # Sentiment can only be stored when a feature slot was reserved for it;
  # with n_features == 0 it would land on top of the first embedding value.
  use_sentiment = (sentiment_features is not None
                   and len(sentiment_features) > 0
                   and n_features > 0)

  for i, sentence in enumerate(sentence_list):
    # Slicing drops every token from index max_sequence_length onwards, so a
    # sentence that is too long can never index past the padded length.
    for j, word in enumerate(sentence[:max_sequence_length]):
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is None and isinstance(word, str):
        # The glove.6B vocabulary is lowercase, so retry capitalised tokens
        # in lowercase before giving up.
        embedding_vector = embeddings_index.get(word.lower())
      if embedding_vector is None:
        # Unknown word: use a random vector (drawn only when really needed).
        embedding_vector = np.random.randn(embedding_dim)
      embedding_matrix[i, j, :embedding_dim] = embedding_vector
      if use_sentiment:
        # First slot after the embedding (same position as index -n_features).
        embedding_matrix[i, j, embedding_dim] = sentiment_features[i][j]
  return embedding_matrix

In [23]:
# Sentiment, when enabled, contributes exactly one extra input feature per token.
sentiment_features = train_sentiment if add_sentiment_features else None
n_features = 1 if add_sentiment_features else 0

train_features = get_X(train_sentence_list,
                       MAX_SEQUENCE_LENGTH,
                       EMBEDDING_DIM,
                       n_features,
                       sentiment_features)

(21501, 35, 101)


#### Encoding labels

In [0]:
# One-hot target vector for every BIO tag. With three classes the class
# indices are 0 = O, 1 = I, 2 = B; with two classes B and I share index 1.
if N_CLASSES == 3:
  label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
elif N_CLASSES == 2:
  label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
else:
  # Without this, label2idx would stay undefined and fail later with a NameError.
  raise ValueError("N_CLASSES must be 2 or 3, got %r" % (N_CLASSES,))

In [0]:
def get_y(sentence_list, 
          bio_sent_list,
          label_dict, 
          max_sequence_length, 
          n_classes):
  """Encode per-token labels into a zero-padded array.

  Args:
    sentence_list: list of sentences; only its length is used (it fixes the
      number of rows).
    bio_sent_list: per-sentence lists of labels, parallel to `sentence_list`.
    label_dict: maps a label to its encoding -- a vector of length
      `n_classes`, or a scalar when `n_classes` is 1.
    max_sequence_length: number of token positions per sentence; labels of
      tokens beyond it are ignored.
    n_classes: length of one encoding; 1 yields a 2-D array of scalars.

  Returns:
    Float array of shape (len(sentence_list), max_sequence_length) when
    `n_classes` is 1, otherwise
    (len(sentence_list), max_sequence_length, n_classes). Positions past the
    end of a sentence stay zero.

  Raises:
    ValueError: if a label is missing from `label_dict`.
  """
  n_sentences = len(sentence_list)
  if n_classes == 1:
    labels = np.zeros([n_sentences, max_sequence_length])
  else:
    labels = np.zeros([n_sentences, max_sequence_length, n_classes])

  for i in range(n_sentences):
    for j, tag in enumerate(bio_sent_list[i][:max_sequence_length]):
      if tag not in label_dict:
        # Fail loudly rather than storing an invalid value for this token.
        raise ValueError("Unknown label %r in sentence %d" % (tag, i))
      labels[i][j] = label_dict[tag]
  return labels

In [0]:
# One-hot training targets, zero-padded to MAX_SEQUENCE_LENGTH positions.
y = get_y(sentence_list=train_sentence_list,
          bio_sent_list=train_bio_sent_list,
          label_dict=label2idx,
          max_sequence_length=MAX_SEQUENCE_LENGTH,
          n_classes=N_CLASSES)

In [29]:
if class_weighting:
  # get_y is reused with n_classes=1 to build a (sentences, max length) matrix
  # holding one loss weight per token; positions past the end of a sentence
  # keep the initial weight 0.
  label2weight = {'O': o_weight, 'I': i_weight, 'B': b_weight}
  sample_weight = get_y(sentence_list=train_sentence_list,
                        bio_sent_list=train_bio_sent_list,
                        label_dict=label2weight,
                        max_sequence_length=MAX_SEQUENCE_LENGTH,
                        n_classes=1)
  # Show the weights of one sentence as a sanity check.
  print(sample_weight[2])
else:
  sample_weight = None

[1.  6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5 1.  1.  1.  1.  1.  1.  1.  1.
 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0. ]


BONUS: Some attempts at applying oversampling

In [0]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(ratio='minority')
# y_train=y_train.astype('int')
# print(y_train[0])
# rows = y_train.shape[0]
# y_train = y_train.reshape(-1, n_classes).argmax(axis=1).flatten()
# print(y_train.shape)
# X_train = X_train.reshape(rows, MAX_SEQUENCE_LENGTH, 100).reshape(-1, 100)
# print(X_train.shape)
# X_sm, y_sm = smote.fit_sample(X_train, y_train)

# 2. Creating the model

In [32]:
model = Sequential()

# Bidirectional LSTM over the token sequence; return_sequences=True keeps one
# output per token position. Each input token is its embedding followed by the
# n_features extra values.
model.add(Bidirectional(CuDNNLSTM(512, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM + n_features)))
model.add(Dropout(0.25))

# Per-token softmax over the N_CLASSES labels.
model.add(TimeDistributed(Dense(N_CLASSES, activation='softmax')))

# sample_weight_mode='temporal' makes fit() accept one weight per token
# position, i.e. a (samples, timesteps) sample_weight matrix.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'],
              sample_weight_mode='temporal')
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 35, 1024)          2519040   
_________________________________________________________________
dropout_2 (Dropout)          (None, 35, 1024)          0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 35, 2)             2050      
Total params: 2,521,090
Trainable params: 2,521,090
Non-trainable params: 0
_________________________________________________________________
None


# 3. Training the model.

#### Training using all of the data. 

In [33]:
# Fit on the whole training set; validation_split=0.1 holds out 10% of the
# samples for validation, and sample_weight carries the per-token loss weights
# (None when class weighting is disabled).
history = model.fit(train_features, y, epochs=epochs, batch_size=batch_size, verbose=1, validation_split=0.1, sample_weight=sample_weight)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 19350 samples, validate on 2151 samples
Epoch 1/15





Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


#### Training using only some part of the data. For testing the model.

Splitting the data

In [0]:
if test_split:
  # NOTE(review): `model` was already fitted on all of train_features in the
  # cell above, so X_test is not unseen data unless that cell is skipped or
  # the model is rebuilt first.
  split_inputs = [train_features, y]
  if sample_weight is not None:
    # Split the per-token weights together with the data so they stay aligned
    # with the shuffled rows.
    split_inputs.append(sample_weight)
  split_outputs = train_test_split(*split_inputs, test_size=0.1)
  X_train, X_test, y_train, y_test = split_outputs[:4]
  w_train = split_outputs[4] if sample_weight is not None else None
  # Use the configured number of epochs and the same weighting as the full run
  # so both training variants are comparable.
  history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_split=0.1, sample_weight=w_train)

# 4. Testing the model

In [0]:
if test_split:
  y_hat = model.predict(X_test)
  # Padding positions have an all-zero target row (no class is set). Drop them
  # so they are not counted as trivially correct "O" predictions.
  is_token = y_test.reshape(-1, N_CLASSES).sum(axis=1) > 0
  # Flatten to one class index per real token.
  y_hat_flat = y_hat.reshape(-1, N_CLASSES).argmax(axis=1)[is_token]
  y_test_flat = y_test.reshape(-1, N_CLASSES).argmax(axis=1)[is_token]
  # scikit-learn metrics take (y_true, y_pred), in that order.
  print(f1_score(y_test_flat, y_hat_flat, average="macro"))

Making the true and predicted labels flat for further analysis.

In [0]:
if test_split:
  # Class indices follow label2idx: 0 = O, 1 = I, 2 = B. With two classes
  # only indices 0 and 1 exist (B and I share index 1).
  target_names = ["O", "I", "B"][:N_CLASSES]
  class_ids = list(range(N_CLASSES))
  # Passing the label ids explicitly keeps the names aligned with the classes
  # even when a class does not occur in this particular split.
  print(classification_report(y_test_flat, y_hat_flat, labels=class_ids, target_names=target_names))
  print(confusion_matrix(y_true=y_test_flat, y_pred=y_hat_flat, labels=class_ids))

# 5. Getting predictions for development data

Reading the development data frame

In [0]:
# Location of the development file for every supported preprocessing variant.
# NOTE(review): the raw.githubusercontent.com URLs embed access tokens. Tokens
# should not live in a shared notebook; consider reading the files from a
# local/Drive path or taking the token from the environment instead.
DEV_SOURCES = {
  InputType.IMPROVED: 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved.tsv?token=AD7GEDLLJXGKMJV76VSXIKC542WAY',
  InputType.BASELINE: 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-baseline.tsv?token=AF75TY24Z3BQ7Q3XIJ4A4TC54BBQI',
  InputType.BASELINE_FIFTY: 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-baseline-50.tsv?token=AD7GEDMOORVAKHCVCU2N2EK54WIN2',
  InputType.BASELINE_TWENTY: '/content/dev-baseline-20.tsv',
  InputType.BASELINE_FOURTY: '/content/dev-baseline-40.tsv',
  InputType.IMPROVED_SENTIMENT: '/content/dev-improved-sentiment.tsv',
}

# Fail right here with a clear message instead of a NameError further down
# when an input type has no development file configured.
if input_type not in DEV_SOURCES:
  raise ValueError('No development data configured for input type %r' % (input_type,))
dev_url = DEV_SOURCES[input_type]

# Same columns as the training data, minus the label column.
names = ["document_id", "sent_number","idx_token_beginning", "idx_token_end", "token"]
if add_sentiment_features:
  names.append("sentiment")
# quoting=3 is csv.QUOTE_NONE: quote characters inside tokens are kept as-is.
dev_df = pd.read_csv(dev_url, sep='\t', names=names, quoting=3)

Getting the number of tokens (rows) in the development data frame

In [0]:
# Number of rows in the dev data frame (the file holds one token per row).
n_rows_dev = dev_df.shape[0]

Preparing the dev data and making predictions

In [0]:
# One list of tokens per dev sentence.
dev_sentence_list = get_sentence_list(dev_df)

In [43]:
# Per-sentence sentiment values of the dev tokens; only defined when sentiment
# features are enabled.
if add_sentiment_features:
  dev_sentiment = get_cols(dev_df, 'sentiment')

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [48]:
# dev_sentiment only exists when sentiment features are enabled; pass None
# otherwise so this cell also runs with add_sentiment_features = False.
dev_features = get_X(sentence_list=dev_sentence_list,
                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                     embedding_dim=EMBEDDING_DIM,
                     n_features=n_features,
                     sentiment_features=dev_sentiment if add_sentiment_features else None)

(3830, 35, 101)


In [0]:
# Per-token class probabilities for every dev sentence.
y_hat = model.predict(dev_features)

From per-class probabilities to integer class indices

In [0]:
# Keep the index of the most probable class for every token position; the
# result has one row per dev sentence and one column per position.
y_hat = y_hat.argmax(axis=-1)

Mapping the predictions to the corresponding indices in the dev data frame

In [0]:
def get_labels_vector(sentence_list,
                      predicted_labels,
                      max_sequence_length,
                      n_rows):
  """Flatten padded per-sentence predictions into one label per token.

  Args:
    sentence_list: list of sentences, each a list of tokens.
    predicted_labels: per-sentence sequences of predicted class indices
      (one row per sentence, one entry per padded position).
    max_sequence_length: number of positions that carry a prediction.
    n_rows: not used; kept so existing calls keep working.

  Returns:
    A flat list with exactly one label per token of `sentence_list`, in
    sentence order: "O" for class 0, "I" for class 1, "B" for anything else.
    Tokens that lie beyond the predicted positions are labelled "O".
  """
  labels_vector = []

  for i, sentence in enumerate(sentence_list):
    row = predicted_labels[i]
    n_predicted = min(len(row), max_sequence_length)
    for j, _ in enumerate(sentence):
      if j >= n_predicted:
        # The sentence was truncated for the model, so there is no prediction
        # for this token. Emit "O" to keep one label per token; otherwise the
        # flat vector would no longer line up with the token rows.
        label = "O"
      elif row[j] == 0:
        label = "O"
      elif row[j] == 1:
        label = "I"
      else:
        label = "B"
      labels_vector.append(label)
  return labels_vector

In [0]:
# Flat list of predicted labels, one per dev token, in sentence order.
predicted_labels_column = get_labels_vector(dev_sentence_list, 
                                            y_hat, 
                                            MAX_SEQUENCE_LENGTH,
                                            n_rows_dev)

Concatenation of the original dev data frame and the prediction vector

In [0]:
# assign() appends the predictions as the last column and raises if their
# number differs from the number of rows, instead of padding with NaN and
# silently producing a misaligned frame.
result_df = dev_df.assign(bio_label=predicted_labels_column)

Overview of the results

In [54]:
# Preview of the dev rows together with their predicted labels.
result_df.head()

Unnamed: 0,document_id,sent_number,idx_token_beginning,idx_token_end,token,sentiment,bio_label
0,730081389,1,0,6,Police,0.0,O
1,730081389,1,7,10,had,0.0,O
2,730081389,1,11,21,previously,0.0,O
3,730081389,1,22,26,gone,0.0,O
4,730081389,1,27,29,to,0.0,O


In [55]:
# Distribution of the predicted labels over all dev tokens.
result_df["bio_label"].value_counts()

O    59211
I     7962
Name: bio_label, dtype: int64

Saving data frame to a file

In [0]:
# Write the predictions as a tab-separated file without header or index column.
result_df.to_csv(path_or_buf="dev_predictions_sentiment_bio.tsv",
                 sep="\t",
                 header=False,
                 index=False)