# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The workflow consists of the following steps:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [0]:
from google.colab import drive

import pandas as pd
import numpy as np
import collections
from enum import Enum
from itertools import takewhile
import urllib.request

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [0]:
# installing tools for oversampling
# !pip install -U imbalanced-learn

# Config

In [0]:
# Fixed number of time steps per sentence: used as the sequence length of the
# encoded inputs/targets and of the model's input shape.
MAX_SEQUENCE_LENGTH = 35
# Width of the word-vector part of every token encoding (the embedding file
# loaded further below is glove.6B.100d.txt).
EMBEDDING_DIM = 100
# Passed to model.fit().
batch_size = 128

# Number of training epochs passed to model.fit().
epochs = 10

# 2: 'B' and 'I' are mapped to the same class; 3: 'O', 'I' and 'B' each get
# their own class (see the label2idx definition).
N_CLASSES = 2
# N_CLASSES = 3
# If True, per-token sample weights are built from the three weights below
# and passed to model.fit(); if False, no sample weights are used.
class_weighting = True
o_weight = 1.0
i_weight = 6.5
b_weight = 6.5

# NOTE(review): not referenced anywhere in the visible part of the notebook.
test_split = False

# Locations of the training and development TSV files (a local path or a URL).
# NOTE(review): the URLs below embed access tokens in the query string;
# hard-coded credentials should not be committed or shared with the notebook.
# train_url = '/content/train-data-improved-sentiwordnet-arguingfull.tsv'
train_url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDO4X6BQYQGURMKJB7C57DXFK'
# dev_url = '/content/dev-improved-sentiwordnet-arguingfull.tsv'
dev_url = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDLZIF6CIEOANPAMRVC57DXFQ'

# File the development-set predictions are written to at the end.
outfile = 'dev_predictions_bio.tsv'

# 1. Data preparation

Deciding on the data preprocessing type

Reading the data from the file and storing it in a data frame

In [0]:
def get_comments(filename):
  """Return the leading '#' comment lines of a data file.

  Works for both local paths and URLs, so switching between the local and
  the remote variants of the data files needs no code change.

  Args:
    filename: path of an existing local file, or a URL that
      urllib.request.urlopen can open.

  Returns:
    A list of bytes objects, one per leading line that starts with '#'
    (line endings included). Reading stops at the first line that does not
    start with '#', so later lines whose first character is '#' are not
    returned. Bytes are returned for both kinds of source so that the lines
    can be written back unchanged to a file opened in binary mode.
  """
  import os  # local import: only needed to tell local paths from URLs

  if os.path.isfile(filename):
    source = open(filename, 'rb')
  else:
    source = urllib.request.urlopen(filename)
  with source as f:
    return list(takewhile(lambda line: line.startswith(b'#'), f))

In [92]:
# The leading comment lines are collected separately so that they can be
# - copied into the prediction file later on
# - skipped by position, which keeps rows whose token is '#' parseable
train_comments = get_comments(train_url)
dev_comments = get_comments(dev_url)
assert train_comments == dev_comments

# quoting=3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
train_df = pd.read_csv(train_url, sep='\t', quoting=3,
                       skiprows=len(train_comments))
dev_df = pd.read_csv(dev_url, sep='\t', quoting=3,
                     skiprows=len(dev_comments))

# Every column that is not part of the standard layout is a token-level feature.
std_cols = ['document_id', 'sent_id', 'token_start', 'token_end', 'token', 'label']
feature_cols = [col for col in train_df.columns if col not in std_cols]

print('features:', feature_cols)
print(train_df["label"].value_counts())
train_df.head()

features: ['positive', 'negative', 'arglex']
O    350354
I     45542
B      5392
Name: label, dtype: int64


Unnamed: 0,document_id,sent_id,token_start,token_end,token,label,positive,negative,arglex
0,111111111,1,0,4,Next,O,0.0,0.03125,0
1,111111111,1,5,11,plague,O,0.071429,0.214286,0
2,111111111,1,12,20,outbreak,O,0.0,0.125,0
3,111111111,1,21,23,in,O,0.0,0.0,0
4,111111111,1,24,34,Madagascar,O,0.0,0.0,0


Getting the data frame with sentences and saving tokens to the list

In [0]:
def get_cols(input_df, col):
  """Collect the values of `col` into one list per sentence.

  Rows are grouped by their 'sent_id'; the result is a one-column data frame
  (column name `col`) indexed by 'sent_id', each cell holding the list of
  that sentence's values in row order.
  """
  per_sentence = input_df.groupby('sent_id')[col]
  value_lists = per_sentence.apply(list)
  return value_lists.to_frame()

In [0]:
def add_sent_lens(input_df, col='token'):
  """Add an 'n_toks' column holding the length of each entry of `col`.

  The column is added to `input_df` itself (in place); the same data frame
  is returned for convenience.
  """
  input_df['n_toks'] = input_df[col].map(len)
  return input_df

In [0]:
def get_features(input_df):
  """Build the sentence-level data frame used for encoding.

  The result is indexed by 'sent_id' and contains the token list ('token'),
  the sentence length ('n_toks') and, for every name in the global
  `feature_cols`, the list of that feature's values for the sentence.
  """
  sentences = add_sent_lens(get_cols(input_df, 'token'))
  for feature_name in feature_cols:
    feature_lists = get_cols(input_df, feature_name)
    sentences = sentences.merge(feature_lists,
                                left_on='sent_id', right_on='sent_id')
  return sentences

In [119]:
# Regroup the token-level frames into one row per sentence: token list,
# sentence length and one value list per feature column.
train_raw = get_features(train_df)
dev_raw = get_features(dev_df)

train_raw.head()

AttributeError: ignored

In [97]:
dev_raw.head()

Unnamed: 0_level_0,token,n_toks,positive,negative,arglex
sent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"[Police, had, previously, gone, to, home, wher...",12,"[0.0625, 0.01875, 0.0, 0.03125, 0.0, 0.0147058...","[0.0, 0.05, 0.0, 0.40625, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[CLEVELAND, —, Police, invstigating, domestic,...",31,"[0.0, 0.0, 0.0625, 0.0, 0.02083333333333333, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.05, 0.0, 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[police, reports, from, the, Columbus, suburb,...",10,"[0.0625, 0.038461538461538464, 0.0, 0.0, 0.0, ...","[0.0, 0.009615384615384616, 0.0, 0.0, 0.0, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[Westerville, Officers, Eric, Joering, ,, 39, ...",34,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[The, suspect, ,, 30-year, -, old, Quentin, Sm...",30,"[0.0, 0.125, 0.0, 0.0, 0.0, 0.1527777777777778...","[0.0, 0.20833333333333331, 0.0, 0.0, 0.0, 0.06...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [98]:
# One row per sentence, holding the list of that sentence's labels.
train_y = get_cols(train_df, 'label')

train_y.head()

KeyError: ignored

## Encoding data

#### Encoding features

Reading the GloVe embeddings from the file.

In [99]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [100]:
# Load the pre-trained word vectors into a {word: float32 vector} lookup table.
# Each line of the file holds a word followed by its vector components.
embeddings_index = {}
# NOTE(review): relative path, so it only resolves when the working directory
# is the one the drive was mounted under - confirm.
file_path = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open(file_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [0]:
def encode_X(x, feature_header, max_seq_len, embed_dim, embeddings=None):
  """Encode sentences as a zero-padded array of word vectors plus features.

  Args:
    x: sentence-level data frame with a 'token' column (list of tokens),
      an 'n_toks' column (number of tokens) and one list-valued column per
      entry of `feature_header`. The sentence with index label k is written
      to row k - 1 of the result, so the index labels are expected to run
      from 1 to len(x).
    feature_header: names of the feature columns to append to each vector.
    max_seq_len: number of time steps per sentence; positions after the last
      token stay all-zero.
    embed_dim: length of the word vectors.
    embeddings: optional {word: vector} mapping. Defaults to the global
      `embeddings_index`.

  Returns:
    Array of shape (len(x), max_seq_len, embed_dim + len(feature_header)).

  Word vectors are looked up by the exact token first and by the lower-cased
  token second (the GloVe 6B vectors are uncased, so capitalised tokens would
  otherwise never be found). Tokens without a vector get a fresh standard
  normal random vector; the random vector is only drawn when it is needed.
  """
  if embeddings is None:
    embeddings = embeddings_index
  embedding_matrix = np.zeros([len(x),
                               max_seq_len,
                               embed_dim + len(feature_header)])
  for row in x.itertuples():
    sent_idx = row.Index - 1
    feature_values = [getattr(row, feature) for feature in feature_header]
    for tok_idx in range(row.n_toks):
      word = row.token[tok_idx]
      vector = embeddings.get(word)
      # Tokens can be non-strings (e.g. NaN produced by the CSV parser), hence
      # the type check before lower-casing.
      if vector is None and isinstance(word, str):
        vector = embeddings.get(word.lower())
      if vector is None:
        vector = np.random.randn(embed_dim)
      embedding_matrix[sent_idx, tok_idx, :embed_dim] = vector
      for i, values in enumerate(feature_values):
        embedding_matrix[sent_idx, tok_idx, embed_dim + i] = values[tok_idx]
  return embedding_matrix

In [102]:
# Encode both data sets as arrays of shape
# (n_sentences, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM + number of feature columns).
train_x = encode_X(train_raw, feature_cols, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
dev_x = encode_X(dev_raw, feature_cols, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
print(train_x.shape)
print(dev_x.shape)

(21501, 35, 103)
(3830, 35, 103)


#### Encoding labels

In [0]:
# One-hot target vector for each BIO label. With two classes, 'B' and 'I'
# share the same vector, i.e. the model only separates 'O' from the rest.
if N_CLASSES == 3:
  label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
elif N_CLASSES == 2:
  label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
else:
  # Fail here instead of leaving label2idx undefined (or silently reusing a
  # mapping left over from an earlier run of this cell).
  raise ValueError('N_CLASSES must be 2 or 3, got %r' % (N_CLASSES,))

In [0]:
def encode_y(y, label2idx, max_seq_len, n_classes):
  """Turn per-sentence label lists into a zero-padded array.

  Args:
    y: data frame with a list-valued 'label' column; the sentence with index
      label k is written to row k - 1 of the result.
    label2idx: mapping from label to the value stored for a token - a vector
      of length `n_classes`, or a scalar when `n_classes` is 1.
    max_seq_len: number of time steps per sentence.
    n_classes: size of the last axis; with 1 the last axis is dropped.

  Returns:
    Array of shape (len(y), max_seq_len) if `n_classes` is 1, otherwise
    (len(y), max_seq_len, n_classes). Positions after a sentence's last
    label stay zero.
  """
  shape = [len(y), max_seq_len]
  if n_classes != 1:
    shape.append(n_classes)
  labels = np.zeros(shape)

  for row in y.itertuples():
    # View on the sentence's slice; writing to it fills `labels`.
    sentence_slots = labels[row.Index - 1]
    for position, tag in enumerate(row.label):
      sentence_slots[position] = label2idx[tag]
  return labels

In [105]:
# Training targets: one label vector per token position, zero-padded up to
# MAX_SEQUENCE_LENGTH.
y = encode_y(train_y, label2idx, MAX_SEQUENCE_LENGTH, N_CLASSES)
y.shape

(21501, 35, 2)

In [106]:
# Per-token loss weights for model.fit(). encode_y is reused with a
# label -> weight mapping and n_classes=1, which yields one scalar per token
# position; padding positions keep the initial value 0.
if class_weighting:
  label2weight = {'O': o_weight, 'I': i_weight, 'B': b_weight}
  sample_weight = encode_y(train_y, label2weight,
                           MAX_SEQUENCE_LENGTH, n_classes=1)
  # Sanity check: show the weights of one sentence.
  print(sample_weight[2])
else:
  sample_weight = None

[1.  6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5 1.  1.  1.  1.  1.  1.  1.  1.
 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0. ]


# 2. Creating the model

In [0]:
# import tensorflow as tf

# def f1(y_true, y_pred):
#   true_pos = tf.count_nonzero(y_true * y_pred)
#   false_pos = tf.count_nonzero((y_true - 1) * y_pred)
#   false_neg = tf.count_nonzero(y_true * (y_pred - 1))
#   if true_pos == 0:
#     return 0.0
#   if false_pos == 0:
#     return 0.0
#   prec = true_pos / (true_pos + false_pos)
#   if false_neg == 0:
#     return 0.0
#   return 2 * prec * rec / (prec + rec)

In [109]:
# Sequence tagger: a bidirectional LSTM over the encoded tokens, dropout, and
# a softmax over the N_CLASSES labels applied at every time step.
model = Sequential()

# return_sequences=True keeps one output per time step, as needed for
# token-level predictions.
model.add(Bidirectional(CuDNNLSTM(512, return_sequences=True),
                        input_shape=(MAX_SEQUENCE_LENGTH,
                                     EMBEDDING_DIM + len(feature_cols))))
model.add(Dropout(0.25))

model.add(TimeDistributed(Dense(N_CLASSES, activation='softmax')))

# sample_weight_mode='temporal' makes fit() accept one weight per time step
# (a 2D sample_weight array) instead of one weight per sample.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'],
              sample_weight_mode='temporal')
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 35, 1024)          2527232   
_________________________________________________________________
dropout_2 (Dropout)          (None, 35, 1024)          0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 35, 2)             2050      
Total params: 2,529,282
Trainable params: 2,529,282
Non-trainable params: 0
_________________________________________________________________
None


# 3. Training the model.

#### Training on the full training set (10% of it is held out for validation)

In [111]:
# Train the tagger. validation_split=0.1 holds out 10% of the training
# sentences for validation (the log reports 19350 training and 2151
# validation samples).
# NOTE(review): Keras documents that the held-out part is taken from the end
# of the arrays, before shuffling - confirm for the installed version.
# sample_weight is the per-token weight array, or None if class weighting is
# switched off.
history = model.fit(train_x, y,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=1,
                    validation_split=0.1,
                    sample_weight=sample_weight)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 19350 samples, validate on 2151 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 4. Getting predictions for development data

In [0]:
y_hat = model.predict(dev_x)

From one-hot encoding to integers

In [116]:
# Replace the per-class scores of every token by the index of the
# highest-scoring class: flatten to (tokens, N_CLASSES), take the argmax, and
# restore the (sentences, MAX_SEQUENCE_LENGTH) layout.
# Caution: this overwrites y_hat in place, so the cell must be run exactly
# once after model.predict(); re-running it on the already converted array
# does not work.
y_hat = y_hat.reshape(-1, N_CLASSES).argmax(axis=1).reshape(len(dev_x), MAX_SEQUENCE_LENGTH)
y_hat.shape

(3830, 35)

Mapping the predictions to the corresponding indices in the dev data frame

In [0]:
def get_labels_vector(x, y_hat):
  """Flatten the per-sentence predictions into one label per token.

  Args:
    x: sentence-level data frame with an 'n_toks' column; the predictions
      for the sentence with index label k are read from row k - 1 of `y_hat`.
    y_hat: 2D array of predicted class indices (sentences x time steps).

  Returns:
    A flat list of labels in sentence order, covering only the first
    'n_toks' positions of each sentence (padding is skipped). Class index 0
    becomes "O", 1 becomes "I" and every other value becomes "B".
  """
  def to_label(class_idx):
    if class_idx == 0:
      return "O"
    return "I" if class_idx == 1 else "B"

  return [to_label(y_hat[row.Index - 1][position])
          for row in x.itertuples()
          for position in range(row.n_toks)]

In [0]:
predicted_labels_column = get_labels_vector(dev_raw, y_hat)

Concatenation of the original dev data frame and the prediction vector

In [0]:
# Attach the predictions as a new 'label' column. Concatenating along axis=1
# aligns the two frames on their index, so the i-th predicted label ends up
# in the row of dev_df whose index label is i.
# NOTE(review): this relies on dev_df having a default 0..n-1 index and on
# the predictions being in the same token order as dev_df - confirm.
result_df = pd.concat([dev_df, pd.DataFrame(predicted_labels_column, columns=["label"])], axis=1, sort=False)

Overview of the results

In [126]:
result_df.head()

Unnamed: 0,document_id,sent_id,token_start,token_end,token,positive,negative,arglex,label
0,730081389,1,0,6,Police,0.0625,0.0,0,O
1,730081389,1,7,10,had,0.01875,0.05,0,O
2,730081389,1,11,21,previously,0.0,0.0,0,O
3,730081389,1,22,26,gone,0.03125,0.40625,0,O
4,730081389,1,27,29,to,0.0,0.0,0,O


In [127]:
result_df["label"].value_counts()

O    54005
I    13168
Name: label, dtype: int64

Saving data frame to a file

In [0]:
# Start the output file with the comment lines taken from the dev file
# (they are bytes, hence binary mode) ...
with open(outfile, mode='wb') as f:
  f.writelines(dev_comments)

# ... and append the predictions as tab-separated values below them.
result_df.to_csv(outfile, sep='\t', index=False, mode='a')