# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The workflow consists of the following steps:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [1]:
# Mount Google Drive at /content/gdrive (Colab only).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import collections
from enum import Enum
from itertools import takewhile
import urllib.request
import time

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

Using TensorFlow backend.


In [0]:
# installing tools for oversampling
# !pip install -U imbalanced-learn

# Config

In [0]:
class Config:
  """Hyperparameters, class weights and data locations of the baseline.

  All settings are stored as upper-case instance attributes. Any of them can
  be replaced at construction time by passing it as a keyword argument, e.g.
  ``Config(EPOCHS=3)``; calling ``Config()`` yields the default values.

  Raises:
    TypeError: if a keyword does not name an existing setting.
  """
  def __init__(self, **overrides):
    # Input encoding and training loop.
    self.MAX_SEQ_LEN = 35   # time-step dimension of the encoded arrays
    self.EMBED_DIM = 100    # width of the word-vector part of each token
    self.BATCH_SIZE = 128
    self.EPOCHS = 10

    # Label set: 2 merges B and I into one class, 3 keeps O/I/B apart.
    self.N_CLASSES = 2
    # Per-token sample weights, looked up by gold label.
    self.O_WEIGHT = 1.0
    self.I_WEIGHT = 6.5
    self.B_WEIGHT = 6.5

    # Network and optimisation settings.
    self.LSTM_UNITS = 512
    self.DROPOUT = 0.25
    self.OPTIMIZER = 'adam'
    self.METRIC = 'categorical_accuracy'

    # NOTE(review): both URLs carry an access token in the query string.
    # Prefer passing TRAIN_URL / DEV_URL as overrides over committing tokens.
    self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDLFTVHGUIDOG4EDKYK57FJJY'
    self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDKHMNRQLNNRBNDYWJK57FJJ6'

    self.EMBEDDING_PATH = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'

    # Overrides are applied last; unknown names are rejected so that a typo
    # cannot silently create a setting nobody reads.
    for name, value in overrides.items():
      if not hasattr(self, name):
        raise TypeError('Unknown config setting: %r' % name)
      setattr(self, name, value)

# Input data

In [0]:
# Version for files that are not specified via a URL:
# def get_comments(filename):
#   with open(filename, 'r', encoding='utf8') as f:
#     commentiter = takewhile(lambda s: s.startswith('#'), f)
#     comments = list(commentiter)
#   return comments

def get_comments(filename):
  """Return the leading '#' lines of the resource at URL `filename`.

  The resource is read as bytes. Lines are collected, unmodified, until the
  first line that does not start with b'#'; everything after it is ignored.
  """
  with urllib.request.urlopen(filename) as response:
    return list(takewhile(lambda raw: raw.startswith(b'#'), response))

In [0]:
def get_cols(input_df, col):
  """Gather the values of `col` into one list per sentence.

  Rows are grouped by the 'sent_id' column. The result is a one-column
  DataFrame (column `col`) indexed by sent_id, each cell holding a list.
  """
  per_sentence = input_df.groupby('sent_id')[col]
  return per_sentence.apply(list).to_frame()

In [0]:
def add_sent_lens(input_df, col='token'):
  """Add an 'n_toks' column holding the length of each entry of `col`.

  The frame is modified in place and also returned.
  """
  lengths = input_df[col].apply(len)
  input_df['n_toks'] = lengths
  return input_df

In [0]:
def get_features(input_df, feature_cols):
  """Build the sentence-level frame used as raw model input.

  Starts from the per-sentence token lists plus their lengths ('n_toks') and
  merges in, one after another, the per-sentence value lists of every column
  named in `feature_cols`, joining on 'sent_id'.
  """
  merged = add_sent_lens(get_cols(input_df, 'token'))
  for feature in feature_cols:
    feature_frame = get_cols(input_df, feature)
    merged = merged.merge(feature_frame,
                          left_on='sent_id', right_on='sent_id')
  return merged

In [0]:
def encode_x(x, word2embedding, feature_header, max_seq_len, embed_dim):
  """Encode sentences as a zero-padded float array for the network.

  Args:
    x: sentence-level DataFrame with a 'token' column (list of words), an
      'n_toks' column and one list-valued column per name in
      `feature_header`. The row position in the output is the frame's index
      value minus 1, so the index is expected to hold 1-based sentence ids.
    word2embedding: dict mapping a word to its vector of length `embed_dim`.
    feature_header: names of the extra per-token feature columns.
    max_seq_len: number of time steps per sentence in the output.
    embed_dim: length of the word vectors.

  Returns:
    Array of shape (len(x), max_seq_len, embed_dim + len(feature_header)).
    Each token row is the word vector followed by the feature values; unused
    time steps stay zero. Sentences longer than `max_seq_len` are truncated.
  """
  n_features = len(feature_header)
  embedding_matrix = np.zeros([len(x), max_seq_len, embed_dim + n_features])
  for row in x.itertuples():
    sent_idx = row.Index - 1
    # Only the first max_seq_len tokens fit; indexing past that would raise.
    n_toks = min(row.n_toks, max_seq_len)
    for tok_idx in range(n_toks):
      word = row.token[tok_idx]
      vector = word2embedding.get(word)
      if vector is None:
        # Out-of-vocabulary word: draw a random vector, and only in this
        # case, so known words neither cost a draw nor advance the RNG.
        vector = np.random.randn(embed_dim)
      embedding_matrix[sent_idx, tok_idx, :embed_dim] = vector
      for i, feature in enumerate(feature_header):
        embedding_matrix[sent_idx, tok_idx, embed_dim + i] = \
          getattr(row, feature)[tok_idx]
  return embedding_matrix

In [0]:
def encode_y(y, label2idx, max_seq_len, n_classes):
  """Encode per-sentence label lists as a zero-padded array.

  Args:
    y: DataFrame with a list-valued 'label' column. The row position in the
      output is the frame's index value minus 1, so the index is expected to
      hold 1-based sentence ids.
    label2idx: dict mapping a label to the value stored for a token: a scalar
      when `n_classes` is 1, otherwise a sequence of length `n_classes`.
    max_seq_len: number of time steps per sentence in the output.
    n_classes: 1 yields a 2-D array (len(y), max_seq_len); any other value
      yields a 3-D array (len(y), max_seq_len, n_classes).

  Returns:
    The encoded array. Unused time steps stay zero; label lists longer than
    `max_seq_len` are truncated.
  """
  if n_classes == 1:
    shape = [len(y), max_seq_len]
  else:
    shape = [len(y), max_seq_len, n_classes]
  labels = np.zeros(shape)

  for row in y.itertuples():
    sent_idx = row.Index - 1
    # Slice so that over-long sentences are cut instead of raising IndexError.
    for tok_idx, label in enumerate(row.label[:max_seq_len]):
      labels[sent_idx, tok_idx] = label2idx[label]
  return labels

In [0]:
def prepare_data(config, word2embedding, training):
  """Download one data split and encode it for the network.

  Args:
    config: settings object; TRAIN_URL / DEV_URL, MAX_SEQ_LEN, EMBED_DIM,
      N_CLASSES and the O/I/B weights are read from it.
    word2embedding: dict mapping a word to its embedding vector.
    training: True reads config.TRAIN_URL, False reads config.DEV_URL.

  Returns:
    Tuple (df, x_raw, x_enc, sample_weight, y, comments):
      df: the token-level table as read from the TSV file.
      x_raw: the sentence-level frame produced by get_features.
      x_enc: the encoded input array produced by encode_x.
      sample_weight: per-token weights, or None if df has no 'label' column.
      y: encoded labels, or None if df has no 'label' column.
      comments: the leading '#' lines of the file, as bytes.

  Raises:
    ValueError: if labels are present and config.N_CLASSES is neither 2 nor 3.
  """
  # We're getting the comments this way so we can:
  # - add them to the output
  # - parse lines that contain '#' as the token
  infile = config.TRAIN_URL if training else config.DEV_URL
  comments = get_comments(infile)
  # quoting=3 is csv.QUOTE_NONE: quote characters are ordinary token text.
  df = pd.read_csv(infile, sep='\t', skiprows=len(comments), quoting=3)

  # Every column that is not part of the standard layout is a token feature.
  std_cols = ['document_id', 'sent_id', 'token_start',
              'token_end', 'token', 'label']
  feature_cols = [col for col in df.columns if col not in std_cols]

  x_raw = get_features(df, feature_cols)
  x_enc = encode_x(x_raw, word2embedding, feature_cols,
                   config.MAX_SEQ_LEN, config.EMBED_DIM)

  y = None
  sample_weight = None
  if 'label' in df.columns:
    if config.N_CLASSES == 3:
      label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
    elif config.N_CLASSES == 2:
      # B and I share one class in the binary setting.
      label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
    else:
      # Previously this fell through and failed later on an unbound name.
      raise ValueError('Unsupported N_CLASSES: %r (expected 2 or 3)'
                       % config.N_CLASSES)
    y_raw = get_cols(df, 'label')
    y = encode_y(y_raw, label2idx, config.MAX_SEQ_LEN, config.N_CLASSES)
    label2weight = {'O': config.O_WEIGHT, 'I': config.I_WEIGHT,
                    'B': config.B_WEIGHT}
    sample_weight = encode_y(y_raw, label2weight, config.MAX_SEQ_LEN,
                             n_classes=1)

  return df, x_raw, x_enc, sample_weight, y, comments

In [0]:
# Settings object read by the data-loading, model and training cells below.
config = Config()

In [0]:
# Read the embedding file: each line is a word followed by its vector values,
# separated by whitespace.
word2embedding = {}
# The context manager closes the file even if parsing fails; the encoding is
# fixed so the result does not depend on the platform default.
with open(config.EMBEDDING_PATH, encoding='utf8') as f:
  for line in f:
    values = line.split()
    if not values:
      # Skip blank lines instead of failing on values[0].
      continue
    word2embedding[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(word2embedding))

In [0]:
# Training split: keep the encoded inputs, labels, sample weights and the
# file's header comments.
# NOTE(review): `comments` comes from the training file but is later written
# to the dev output file -- confirm that this is intended.
_, _, train_x, sample_weight, train_y, comments = prepare_data(config,
                                                               word2embedding,
                                                               training=True)
# Dev split: keep the token table, the sentence-level frame and the encoded
# inputs; the remaining return values are discarded.
dev_df, dev_raw, dev_x, _, _, _ = prepare_data(config, word2embedding,
                                               training=False)

In [0]:
# Sanity check: array shapes, the sample weights of the row at index 2, and
# the first rows of the sentence-level dev frame.
print(train_x.shape)
print(dev_x.shape)
print(train_y.shape)
print(sample_weight[2])
dev_raw.head()

# Model

In [0]:
def get_bilstm(input_shape, config):
  """Build and compile the BiLSTM tagger.

  The network is a bidirectional CuDNNLSTM that returns one output per time
  step, followed by dropout and a per-time-step softmax layer with
  config.N_CLASSES units. It is compiled with categorical cross-entropy and
  temporal sample weighting.

  Args:
    input_shape: shape of one input sample, passed to the first layer.
    config: settings object; LSTM_UNITS, DROPOUT, N_CLASSES, OPTIMIZER and
      METRIC are read from it.

  Returns:
    The compiled Sequential model.
  """
  recurrent = Bidirectional(
      CuDNNLSTM(config.LSTM_UNITS, return_sequences=True),
      input_shape=input_shape)
  regulariser = Dropout(config.DROPOUT)
  classifier = TimeDistributed(Dense(config.N_CLASSES, activation='softmax'))

  model = Sequential([recurrent, regulariser, classifier])
  model.compile(optimizer=config.OPTIMIZER,
                loss='categorical_crossentropy',
                sample_weight_mode='temporal',
                metrics=[config.METRIC])
  return model

In [0]:
# The per-sample input shape is everything after the batch dimension.
model = get_bilstm(train_x.shape[1:], config)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [0]:
# Train with per-token sample weights; a validation_split of 0.1 is passed to
# Keras. The returned history is read again when the output file is written
# (its 'val_<metric>' entry).
history = model.fit(train_x, train_y,
                    epochs=config.EPOCHS,
                    batch_size=config.BATCH_SIZE,
                    verbose=1,
                    validation_split=0.1,
                    sample_weight=sample_weight)

# Predictions

In [0]:
def get_predictions(model, x, x_raw, n_classes):
  """Predict one label per token, in sentence order.

  Args:
    model: object with a predict(x) method returning class scores whose last
      dimension has size `n_classes`.
    x: encoded input array; its first two dimensions (sentences, time steps)
      give the shape of the predicted class-id grid.
    x_raw: sentence-level DataFrame with an 'n_toks' column. The row used for
      a sentence is its index value minus 1, so the index is expected to hold
      1-based sentence ids.
    n_classes: number of classes scored by the model.

  Returns:
    Flat list of labels: class 0 is "O", class 1 is "I", any other class is
    "B". Tokens beyond the number of time steps were never seen by the model
    and are labelled "O", so the list always has sum(n_toks) entries.
  """
  probs = model.predict(x)
  # Use the shape of the argument `x`; the previous code read the notebook
  # global `dev_x` here and therefore only worked for that one array.
  class_ids = probs.reshape(-1, n_classes).argmax(axis=1).reshape(x.shape[:2])
  seq_len = class_ids.shape[1]
  idx2label = {0: "O", 1: "I"}

  labels = []
  for row in x_raw.itertuples():
    sent_idx = row.Index - 1
    for tok_idx in range(row.n_toks):
      if tok_idx < seq_len:
        class_id = int(class_ids[sent_idx, tok_idx])
        labels.append(idx2label.get(class_id, "B"))
      else:
        # No prediction exists for this position; keep the output aligned
        # with the token table instead of raising IndexError.
        labels.append("O")
  return labels

In [0]:
# Predict dev labels and attach them to the token-level dev table as a
# 'label' column (column-wise concat, aligned on the row index).
y_hat = get_predictions(model, dev_x, dev_raw, config.N_CLASSES)
result_df = pd.concat([dev_df, pd.DataFrame(y_hat, columns=['label'])],
                      axis=1, sort=False)

In [0]:
# Distribution of the predicted labels, then the first rows of the result.
print(result_df['label'].value_counts())
result_df.head()

In [0]:
# Name the output file after the current Unix time in whole seconds.
now = int(time.time())
outfile = 'dev_labels_' + str(now) + '.tsv'

In [0]:
# Add call to label->span conversion ?
# And use a separate log file?

# 1) Header comments (bytes), written in binary mode; this creates or
#    truncates the output file.
with open(outfile, mode='wb') as f:
  f.writelines(comments)

# 2) One extra comment line recording the validation metric history.
with open(outfile, mode='a') as f:
  f.write('# Validation_{}={}\n'.format(
      config.METRIC, history.history['val_' + config.METRIC]))

# 3) The labelled token table, appended as tab-separated values.
result_df.to_csv(outfile, sep='\t', mode='a', index=False)