# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The workflow consists of the following steps:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [0]:
# Mount Google Drive under /content/gdrive so that files stored there can be
# read by later cells. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
import numpy as np
import collections
from enum import Enum
from itertools import takewhile
import urllib.request
import time

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [0]:
# installing tools for oversampling
# !pip install -U imbalanced-learn

# Config

In [0]:
class Config:
  """Hyperparameters, label weights and data locations for the BiLSTM baseline.

  Every setting is a plain instance attribute. Keyword arguments passed to the
  constructor override the defaults, e.g. ``Config(EPOCHS=3)``. Unknown option
  names raise a TypeError so that typos do not silently create unused
  settings.
  """

  def __init__(self, **overrides):
    # Input encoding
    self.MAX_SEQ_LEN = 35  # token positions per encoded sentence
    self.EMBED_DIM = 100   # length of one word vector
    # Training loop
    self.BATCH_SIZE = 32
    self.EPOCHS = 10

    # Labels: 2 classes merge B and I, 3 classes keep O/I/B apart.
    self.N_CLASSES = 2
    # Per-token sample weights, looked up by gold label.
    self.O_WEIGHT = 1.0
    self.I_WEIGHT = 6.5
    self.B_WEIGHT = 6.5

    # Network and optimisation
    self.LSTM_UNITS = 512
    self.DROPOUT = 0.25
    self.OPTIMIZER = 'adam'
    self.METRIC = 'categorical_accuracy'
    self.LOSS = 'categorical_crossentropy'

    # NOTE(review): both URLs carry a `token` query parameter. Hard-coded
    # access tokens should not live in a shared notebook; consider reading
    # the URLs from the environment instead.
    self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDLFTVHGUIDOG4EDKYK57FJJY'
    self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDKHMNRQLNNRBNDYWJK57FJJ6'

    self.EMBEDDING_PATH = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'

    # Apply caller-supplied overrides last so they win over the defaults.
    for name, value in overrides.items():
      if not hasattr(self, name):
        raise TypeError('Unknown config option: ' + name)
      setattr(self, name, value)

# Input data

In [0]:
# Version for files that are not specified via a URL:
# def get_comments(filename):
#   with open(filename, 'r', encoding='utf8') as f:
#     commentiter = takewhile(lambda s: s.startswith('#'), f)
#     comments = list(commentiter)
#   return comments

def get_comments(filename):
  """Collect the leading '#' comment lines of a data file.

  Args:
    filename: Either a URL (anything `urllib.request.urlopen` accepts) or the
      path of a local file. A string without a URL scheme is treated as a
      local path.

  Returns:
    List of the raw lines (bytes, line endings included) at the very top of
    the file that start with '#'. Reading stops at the first line that does
    not, so later '#' lines are not part of the result.
  """
  from urllib.parse import urlparse

  # A missing scheme means a plain path; a one-letter "scheme" is a Windows
  # drive letter such as C:\data\file.tsv.
  is_local_path = (isinstance(filename, str)
                   and len(urlparse(filename).scheme) <= 1)
  if is_local_path:
    # Binary mode so that local files yield bytes, like URL responses do.
    source = open(filename, 'rb')
  else:
    source = urllib.request.urlopen(filename)

  comments = []
  with source as f:
    for line in f:
      if not line.startswith(b'#'):
        break
      comments.append(line)
  return comments

In [0]:
def get_cols(input_df, col):
  """Collapse one token-level column into one list per sentence.

  Rows are grouped by 'sent_id'. The result is a single-column DataFrame
  (named `col`) indexed by 'sent_id' whose cells are Python lists.
  """
  per_sentence = input_df.groupby('sent_id')[col]
  as_lists = per_sentence.apply(list)
  return as_lists.to_frame()

In [0]:
def add_sent_lens(input_df, col='token'):
  """Add an 'n_toks' column holding the length of every entry of `col`.

  The frame is modified in place and returned for convenience.
  """
  lengths = input_df[col].apply(len)
  input_df['n_toks'] = lengths
  return input_df

In [0]:
def get_features(input_df, feature_cols):
  """Build the sentence-level feature table.

  Starts from the per-sentence token lists (plus their lengths in 'n_toks')
  and joins one list-valued column per name in `feature_cols`, matching rows
  on 'sent_id'.
  """
  features = get_cols(input_df, 'token')
  features = add_sent_lens(features)
  for name in feature_cols:
    feature_lists = get_cols(input_df, name)
    features = pd.merge(left=features, right=feature_lists,
                        left_on='sent_id', right_on='sent_id')
  return features

In [0]:
def encode_x(x, word2embedding, feature_header, max_seq_len, embed_dim):
  """Encode sentences as a zero-padded array of word vectors plus features.

  Args:
    x: DataFrame with one row per sentence and the columns 'token' (list of
      tokens), 'n_toks' (number of tokens) and one list-valued column per
      name in `feature_header`.
    word2embedding: dict mapping a token to a vector of length `embed_dim`.
    feature_header: names of the additional per-token feature columns.
    max_seq_len: number of token positions reserved per sentence.
    embed_dim: length of one word vector.

  Returns:
    Array of shape (len(x), max_seq_len, embed_dim + len(feature_header)).
    Rows follow the row order of `x`. Positions after the end of a sentence
    stay zero, and tokens beyond `max_seq_len` are dropped. Tokens missing
    from `word2embedding` get a fresh standard-normal random vector.
  """
  embedding_matrix = np.zeros([len(x),
                               max_seq_len,
                               embed_dim + len(feature_header)])
  # Use the row position rather than the sentence id, so the ids need not be
  # exactly 1..N.
  for sent_idx, row in enumerate(x.itertuples()):
    # Longer sentences are truncated instead of overflowing the array.
    n_toks = min(row.n_toks, max_seq_len)
    for tok_idx in range(n_toks):
      vector = word2embedding.get(row.token[tok_idx])
      if vector is None:
        # Only draw random numbers for genuinely unknown tokens.
        vector = np.random.randn(embed_dim)
      embedding_matrix[sent_idx, tok_idx, :embed_dim] = vector
      for i, feature in enumerate(feature_header):
        embedding_matrix[sent_idx, tok_idx, embed_dim + i] = \
          getattr(row, feature)[tok_idx]
  return embedding_matrix

In [0]:
def encode_y(y, label2idx, max_seq_len, n_classes):
  """Encode per-sentence label lists as a zero-padded array.

  Args:
    y: DataFrame with one row per sentence and a list-valued 'label' column.
    label2idx: dict mapping each label to the value stored for a token:
      a vector of length `n_classes`, or a scalar when `n_classes` is 1.
    max_seq_len: number of token positions reserved per sentence.
    n_classes: length of one encoded label. 1 selects a 2-D result.

  Returns:
    Array of shape (len(y), max_seq_len) if `n_classes` is 1, otherwise
    (len(y), max_seq_len, n_classes). Rows follow the row order of `y`.
    Positions after the end of a sentence stay zero, and labels beyond
    `max_seq_len` are dropped.
  """
  if n_classes == 1:
    labels = np.zeros([len(y), max_seq_len])
  else:
    labels = np.zeros([len(y), max_seq_len, n_classes])

  # Use the row position rather than the sentence id, so the ids need not be
  # exactly 1..N.
  for sent_idx, row in enumerate(y.itertuples()):
    # Slicing truncates over-long sentences instead of overflowing the array.
    for tok_idx, label in enumerate(row.label[:max_seq_len]):
      labels[sent_idx, tok_idx] = label2idx[label]
  return labels

In [0]:
def prepare_data(config, word2embedding, training):
  """Download one data split and encode it for the model.

  Args:
    config: settings object providing TRAIN_URL, DEV_URL, MAX_SEQ_LEN,
      EMBED_DIM, N_CLASSES and the O/I/B weights.
    word2embedding: dict mapping a token to its word vector.
    training: True reads config.TRAIN_URL, False reads config.DEV_URL.

  Returns:
    Tuple (df, x_raw, x_enc, sample_weight, y, comments):
      df: token-level table as read from the file.
      x_raw: sentence-level table of token and feature lists.
      x_enc: encoded model input built from x_raw.
      sample_weight: per-token weights, or None if the file has no labels.
      y: encoded labels, or None if the file has no labels.
      comments: the '#' header lines of the file, as returned by
        get_comments.

  Raises:
    ValueError: if the file has labels and config.N_CLASSES is not 2 or 3.
  """
  # We're getting the comments this way so we can:
  # - add them to the output
  # - parse lines that contain '#' as the token
  infile = config.TRAIN_URL if training else config.DEV_URL
  comments = get_comments(infile)
  # quoting=3 is csv.QUOTE_NONE: quote characters are ordinary tokens.
  df = pd.read_csv(infile, sep='\t', skiprows=len(comments), quoting=3)

  # Every column that is not part of the standard layout is a token feature.
  std_cols = ['document_id', 'sent_id', 'token_start',
              'token_end', 'token', 'label']
  feature_cols = [col for col in df.columns if col not in std_cols]

  x_raw = get_features(df, feature_cols)
  x_enc = encode_x(x_raw, word2embedding, feature_cols,
                   config.MAX_SEQ_LEN, config.EMBED_DIM)

  y = None
  sample_weight = None
  if 'label' in df.columns:
    y_raw = get_cols(df, 'label')
    if config.N_CLASSES == 3:
      label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
    elif config.N_CLASSES == 2:
      # B and I share one class in the binary setup.
      label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
    else:
      # Fail with a clear message instead of a NameError on label2idx.
      raise ValueError('N_CLASSES must be 2 or 3, got ' +
                       str(config.N_CLASSES))
    y = encode_y(y_raw, label2idx, config.MAX_SEQ_LEN, config.N_CLASSES)
    label2weight = {'O': config.O_WEIGHT, 'I': config.I_WEIGHT,
                    'B': config.B_WEIGHT}
    # n_classes=1 yields one scalar weight per token position.
    sample_weight = encode_y(y_raw, label2weight, config.MAX_SEQ_LEN,
                             n_classes=1)

  return df, x_raw, x_enc, sample_weight, y, comments

In [0]:
# Shared settings object used by the data preparation, model and log cells.
config = Config()

In [0]:
# Read the pre-trained word vectors. Each line of the embedding file is split
# on whitespace: the first field is the token, the rest are its components.
word2embedding = {}
# `with` closes the file even if parsing fails. The encoding is explicit so
# the result does not depend on the platform default.
with open(config.EMBEDDING_PATH, encoding='utf8') as f:
  for line in f:
    values = line.split()
    word2embedding[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(word2embedding))

Found 400000 word vectors.


In [0]:
# Training split: keep the encoded inputs, the per-token sample weights, the
# encoded labels and the file's header comments (written to the log later).
_, _, train_x, sample_weight, train_y, comments = prepare_data(config,
                                                               word2embedding,
                                                               training=True)
# Development split: keep the token-level table, the sentence-level table and
# the encoded inputs. Weights, labels and comments are discarded.
dev_df, dev_raw, dev_x, _, _, _ = prepare_data(config, word2embedding,
                                               training=False)

In [0]:
# Sanity check: array shapes, the token weights of one training sentence and
# the first rows of the sentence-level dev table.
print(train_x.shape)
print(dev_x.shape)
print(train_y.shape)
print(sample_weight[2])
dev_raw.head()

(21501, 35, 103)
(3830, 35, 103)
(21501, 35, 2)
[1.  6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5 1.  1.  1.  1.  1.  1.  1.  1.
 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0. ]


Unnamed: 0_level_0,token,n_toks,positive,negative,arglex
sent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"[Police, had, previously, gone, to, home, wher...",12,"[0.0625, 0.01875, 0.0, 0.03125, 0.0, 0.0147058...","[0.0, 0.05, 0.0, 0.40625, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[CLEVELAND, —, Police, invstigating, domestic,...",31,"[0.0, 0.0, 0.0625, 0.0, 0.02083333333333333, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.05, 0.0, 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[police, reports, from, the, Columbus, suburb,...",10,"[0.0625, 0.038461538461538464, 0.0, 0.0, 0.0, ...","[0.0, 0.009615384615384616, 0.0, 0.0, 0.0, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[Westerville, Officers, Eric, Joering, ,, 39, ...",34,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[The, suspect, ,, 30-year, -, old, Quentin, Sm...",30,"[0.0, 0.125, 0.0, 0.0, 0.0, 0.1527777777777778...","[0.0, 0.20833333333333331, 0.0, 0.0, 0.0, 0.06...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Model

In [0]:
# def custom_loss(y_true, y_pred):
#   # for test purposes
#   return K.variable(value=np.ones(1))

In [0]:
def get_bilstm(input_shape, config):
  """Build and compile the bidirectional LSTM tagger.

  The network is a bidirectional CuDNNLSTM that returns the full sequence,
  followed by dropout and a softmax layer applied to every time step.

  Args:
    input_shape: shape of one input sample, without the batch axis.
    config: settings object providing LSTM_UNITS, DROPOUT, N_CLASSES, LOSS,
      OPTIMIZER and METRIC.

  Returns:
    The compiled Sequential model. It is compiled with
    sample_weight_mode='temporal'.
  """
  layers = [
      Bidirectional(CuDNNLSTM(config.LSTM_UNITS, return_sequences=True),
                    input_shape=input_shape),
      Dropout(config.DROPOUT),
      TimeDistributed(Dense(config.N_CLASSES, activation='softmax')),
  ]
  model = Sequential(layers)

  compile_options = {
      'loss': config.LOSS,
      'optimizer': config.OPTIMIZER,
      'metrics': [config.METRIC],
      'sample_weight_mode': 'temporal',
  }
  model.compile(**compile_options)
  return model

In [0]:
# The input shape is one sample of train_x, i.e. without the batch axis.
model = get_bilstm(train_x.shape[1:], config)
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_4 (Bidirection (None, 35, 1024)          2527232   
_________________________________________________________________
dropout_4 (Dropout)          (None, 35, 1024)          0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 35, 2)             2050      
Total params: 2,529,282
Trainable params: 2,529,282
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# Train the model. validation_split=0.1 makes Keras hold out part of the
# training arrays for validation. `history` is written to the log file in the
# last cell.
history = model.fit(train_x, train_y,
                    epochs=config.EPOCHS,
                    batch_size=config.BATCH_SIZE,
                    verbose=1,
                    validation_split=0.1,
                    sample_weight=sample_weight)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 19350 samples, validate on 2151 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predictions

In [0]:
def get_predictions(model, x, x_raw, n_classes):
  """Predict one label per token and return them as a flat list.

  Args:
    model: object whose `predict(x)` returns class scores with `n_classes`
      values per token position.
    x: encoded input. Its first two axes are (sentence, token position).
    x_raw: DataFrame with one row per sentence, in the same order as `x`,
      with the sentence length in column 'n_toks'.
    n_classes: number of scores per token position.

  Returns:
    List of "O"/"I"/"B" strings with exactly `n_toks` entries per sentence,
    in sentence order. Class index 0 maps to "O", 1 to "I" and any other
    index to "B". Tokens beyond the encoded sequence length were never seen
    by the model and are labelled "O", which keeps the list aligned with the
    token table.
  """
  y_hat = model.predict(x)
  y_hat = y_hat.reshape(-1, n_classes).argmax(axis=1).reshape(x.shape[:2])
  idx2label = {0: "O", 1: "I"}
  max_seq_len = x.shape[1]

  labels = []
  # Use the row position rather than the sentence id, so the ids need not be
  # exactly 1..N.
  for sent_idx, row in enumerate(x_raw.itertuples()):
    n_encoded = min(row.n_toks, max_seq_len)
    for tok_idx in range(n_encoded):
      labels.append(idx2label.get(int(y_hat[sent_idx][tok_idx]), "B"))
    # Pad the tokens that were cut off when the sentence was encoded.
    labels.extend(["O"] * (row.n_toks - n_encoded))
  return labels

In [0]:
# Predict a label for every dev token and append the labels as a 'label'
# column to the token-level dev table (column-wise concat, aligned on index).
y_hat = get_predictions(model, dev_x, dev_raw, config.N_CLASSES)
result_df = pd.concat([dev_df, pd.DataFrame(y_hat, columns=['label'])],
                      axis=1, sort=False)
# Label distribution of the predictions, then a preview of the table.
print(result_df['label'].value_counts())
result_df.head()

In [0]:
def si_predictions_to_spans(label_df):
  """Merge token-level labels into (article, start, end) spans.

  Args:
    label_df: DataFrame with one row per token and the columns
      'document_id', 'token_start', 'token_end' and 'label' ('O', 'I', 'B').

  Returns:
    List of (document_id, span_start, span_end) tuples in row order. A span
    covers a run of consecutive non-'O' tokens of one article. A 'B' label
    always starts a new span. An empty frame yields an empty list.
  """
  spans = []
  prev_label = 'O'
  prev_span_start = '-1'
  prev_span_end = '-1'
  prev_article = ''

  for row in label_df.itertuples():
    span, prev_span_start = update_prediction(row.document_id, row.label,
                                              row.token_start, row.token_end,
                                              prev_article, prev_label,
                                              prev_span_start,
                                              prev_span_end)
    if span is not None:
      spans.append(span)

    prev_article = row.document_id
    prev_label = row.label
    prev_span_end = row.token_end

  # Make sure we get the last prediction: a span that is still open after
  # the final token has no following row to close it, so close it here.
  if prev_label != 'O':
    spans.append((prev_article, prev_span_start, prev_span_end))
  return spans

# Helper method for si_predictions_to_spans
def update_prediction(article, label, span_start, span_end,
                     prev_article, prev_label, prev_span_start, prev_span_end):
  """Advance the span state by one token.

  Args:
    article, label, span_start, span_end: values of the current token.
      `span_end` is accepted for symmetry but not used here.
    prev_article, prev_label: article and label of the previous token.
    prev_span_start: start offset of the span that is currently open.
    prev_span_end: end offset of the previous token.

  Returns:
    Tuple (span, cur_span_start). `span` is the span closed by this token as
    (prev_article, prev_span_start, prev_span_end), or None if nothing was
    closed. `cur_span_start` is the start offset to carry forward.
  """
  span = None
  cur_span_start = prev_span_start
  # Ending a span: I-O, B-O, I-B, B-B, new article
  if prev_label != 'O' and (label != 'I' or prev_article != article):
    span = (prev_article, prev_span_start, prev_span_end)

  # Starting a new span: O-B, O-I, I-B, B-B, new article
  if label == 'B' or (label == 'I' and prev_label == 'O') \
          or prev_article != article:
    # Update the start of the current label span
    cur_span_start = span_start

  return span, cur_span_start

In [0]:
# Merge the token-level predictions into (article, start, end) spans.
spans = si_predictions_to_spans(result_df)

In [0]:
# Both output files share one timestamp so a run's log and spans belong
# together.
now = time.strftime("%Y%m%d-%H%M%S", time.localtime())
outfile = 'spans_' + now + '.txt'
logfile = 'log_' + now + '.txt'

# Log file: preprocessing notes from the data file header, the configuration,
# the training history and the model summary.
with open(logfile, mode='w') as f:
  f.write('DATA PREPROCESSING\n\n')
  for comment in comments:
    comment = comment.decode("utf-8")
    comment = comment.replace('#', '')
    fields = comment.split(',')
    for field in fields:
      # Write each comma-separated field on its own line (previously the
      # whole comment was repeated once per field).
      f.write(field.strip() + '\n')
  f.write('\n\nCONFIG\n\n')
  f.write('max seq len: ' + str(config.MAX_SEQ_LEN) + '\n')
  f.write('embedding depth: ' + str(config.EMBED_DIM) + '\n')
  f.write('batch size: ' + str(config.BATCH_SIZE) + '\n')
  f.write('epochs: ' + str(config.EPOCHS) + '\n')
  f.write('number of labels: ' + str(config.N_CLASSES) + '\n')
  f.write('O weight: ' + str(config.O_WEIGHT) +
          ', I weight:' + str(config.I_WEIGHT) +
          ', B weight: ' + str(config.B_WEIGHT) + '\n')
  f.write('hidden units: ' + str(config.LSTM_UNITS) + '\n')
  f.write('dropout rate: ' + str(config.DROPOUT) + '\n')
  f.write('optimizer: ' + config.OPTIMIZER + '\n')
  f.write('metric: ' + config.METRIC + '\n')
  f.write('loss: ' + config.LOSS + '\n')
  f.write('\n\nMODEL HISTORY\n\n')
  f.write('Validation loss ' + config.LOSS + '\n')
  f.write(str(history.history['val_loss']) + '\n')
  f.write('Loss ' + config.LOSS + '\n')
  f.write(str(history.history['loss']) + '\n')
  f.write('Validation ' + config.METRIC + '\n')
  f.write(str(history.history['val_' + config.METRIC]) + '\n')
  f.write(config.METRIC + '\n')
  f.write(str(history.history[config.METRIC]) + '\n')
  f.write('\n\nMODEL SUMMARY\n\n')
  # Redirect the summary lines into the log instead of stdout.
  model.summary(print_fn=lambda x: f.write(x + '\n'))

# Span file: one tab-separated "article, start, end" line per span.
with open(outfile, mode='w') as f:
  for span in spans:
    f.write(str(span[0]) + '\t' + str(span[1]) + '\t' + str(span[2]) + '\n')