# Challenges in NLP, WS19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## Baseline model for Span Identification task
-----

The workflow consists of the following steps:
- Data preparation.
- Creating the model.
- Training the model.
- Testing the model.

In [0]:
# Mount Google Drive in the Colab runtime at /content/gdrive; later cells
# use paths below that mount point (e.g. the prediction output prefix).
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import pandas as pd
import numpy as np
import collections
from enum import Enum
from itertools import takewhile
import urllib.request
import time

# Creating the model
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, TimeDistributed
from keras.models import Sequential

# Results analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [0]:
# installing tools for oversampling
# !pip install -U imbalanced-learn

# Processing the Input

In [0]:
# Helper method for prepare_data
def get_comments(filename, url=True):
    """Collect the leading '#' comment lines of a data file.

    Arguments:
    filename -- a URL if url is true, otherwise a path to a local file
    url -- whether filename is fetched via urllib instead of open()

    Returns the comment lines as a list of strings (line endings kept).
    Reading stops at the first line that does not start with '#'.
    """
    if not url:
        with open(filename, 'r', encoding='utf8') as infile:
            return list(takewhile(lambda line: line.startswith('#'),
                                  infile))

    header = []
    with urllib.request.urlopen(filename) as response:
        for raw_line in response:
            # The response yields bytes, so compare against a bytes prefix.
            if not raw_line.startswith(b'#'):
                break
            header.append(raw_line.decode("utf-8"))
    return header


# Helper method for prepare_data
def get_cols(input_df, col):
    """Gather the values of one column into a list per sentence.

    Arguments:
    input_df -- a Pandas dataframe with a 'sent_id' column
    col -- the name of the column to collect

    Returns a single-column dataframe indexed by 'sent_id' whose cells are
    lists of the column's values for that sentence.
    """
    per_sentence = input_df.groupby('sent_id')[col]
    as_lists = per_sentence.apply(list)
    return as_lists.to_frame()


# Helper method for prepare_data
def add_sent_lens(input_df, col='token'):
    """Add an 'n_toks' column holding the length of each entry in col.

    Arguments:
    input_df -- a Pandas dataframe; it is modified in place
    col -- the name of the column whose entries are measured with len()

    Returns the same dataframe object that was passed in.
    """
    lengths = input_df[col].apply(len)
    input_df['n_toks'] = lengths
    return input_df


# Helper method for prepare_data
def get_features(input_df, feature_cols):
    """Build the per-sentence dataframe of tokens, lengths and features.

    Arguments:
    input_df -- a Pandas dataframe with one row per token
    feature_cols -- names of the additional feature columns to include

    Returns a dataframe with the per-sentence token lists, their lengths
    ('n_toks') and one list-valued column per entry of feature_cols.
    """
    features = get_cols(input_df, 'token')
    features = add_sent_lens(features)
    for name in feature_cols:
        per_sentence = get_cols(input_df, name)
        features = pd.merge(left=features, right=per_sentence,
                            left_on='sent_id', right_on='sent_id')
    return features


def encode_x(x, word2embedding, feature_header, max_seq_len, embed_dim):
    """Encode the input data.

    Arguments:
    x -- a Pandas dataframe
    word2embedding -- a dict(str -> np.array) from tokens to embeddings
    feature_header -- dataframe names of additional feature columns
    max_seq_len -- the maximum number of tokens per sentence in x
    embed_dim -- the array length of the vectors in word2embedding

    Returns an array of shape
    (len(x), max_seq_len, embed_dim + len(feature_header)). Positions after
    the end of a sentence stay zero. Tokens missing from word2embedding get
    a freshly drawn random vector.

    The row written for a sentence is its dataframe index minus one, so the
    index is expected to hold 1-based sentence numbers.
    """
    embedding_matrix = np.zeros([len(x), max_seq_len,
                                 embed_dim + len(feature_header)])
    for row in x.itertuples():
        sent_idx = row.Index - 1
        # Look the feature lists up once per sentence, not once per token.
        feature_values = [getattr(row, feature) for feature in feature_header]
        for tok_idx in range(row.n_toks):
            vector = word2embedding.get(row.token[tok_idx])
            if vector is None:
                # Only draw a random vector for out-of-vocabulary tokens;
                # passing it as the dict.get default would generate (and
                # discard) one for every known token as well.
                vector = np.random.randn(embed_dim)
            embedding_matrix[sent_idx][tok_idx][:embed_dim] = vector
            for i, values in enumerate(feature_values):
                embedding_matrix[sent_idx][tok_idx][embed_dim + i] = \
                    values[tok_idx]
    return embedding_matrix


def encode_y(y, label2idx, max_seq_len, n_classes):
    """Encode per-token labels as a zero-padded array.

    Arguments:
    y -- a Pandas dataframe with a list-valued 'label' column
    label2idx -- a dict from label to the value stored for that label
    max_seq_len -- the length of the token axis of the result
    n_classes -- size of the last axis; 1 means no class axis at all

    Returns an array of shape (len(y), max_seq_len) if n_classes is 1 and
    (len(y), max_seq_len, n_classes) otherwise. The row written for a
    sentence is its dataframe index minus one.
    """
    shape = [len(y), max_seq_len]
    if n_classes != 1:
        shape.append(n_classes)
    labels = np.zeros(shape)

    for row in y.itertuples():
        sent_idx = row.Index - 1
        for tok_idx, tag in enumerate(row.label):
            labels[sent_idx, tok_idx] = label2idx[tag]
    return labels


def prepare_data(config, word2embedding, training):
    """Load one data split and encode it for the model.

    Arguments:
    config -- the configuration (input locations, MAX_SEQ_LEN, EMBED_DIM,
              N_CLASSES and the label weights are read from it)
    word2embedding -- a dict(str -> np.array) from tokens to embeddings
    training -- if true read config.TRAIN_URL, otherwise config.DEV_URL

    Returns a tuple (df, x_raw, x_enc, y, sample_weight, comments):
    the token-level dataframe, the per-sentence dataframe, the encoded
    input, the encoded labels and per-token weights (both None if the
    file has no 'label' column) and the file's leading comment lines.

    Raises ValueError if labels are present and config.N_CLASSES is
    neither 2 nor 3.
    """
    # We're getting the comments this way so we can:
    # - add them to the output
    # - parse lines that actually contain '#' as token
    infile = config.TRAIN_URL if training else config.DEV_URL
    comments = get_comments(infile, config.ONLINE_SOURCES)
    df = pd.read_csv(infile, sep='\t', skiprows=len(comments), quoting=3)

    # Every column that is not part of the standard layout is a feature.
    std_cols = ['document_id', 'sent_id', 'token_start',
                'token_end', 'token', 'label']
    feature_cols = [col for col in df.columns if col not in std_cols]

    x_raw = get_features(df, feature_cols)
    x_enc = encode_x(x_raw, word2embedding, feature_cols,
                     config.MAX_SEQ_LEN, config.EMBED_DIM)

    y = None
    sample_weight = None
    if 'label' in df.columns:
        y_raw = get_cols(df, 'label')
        if config.N_CLASSES == 3:
            label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
        elif config.N_CLASSES == 2:
            # B and I share one class in the two-class setup.
            label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
        else:
            # Without this branch label2idx would be unbound below.
            raise ValueError('N_CLASSES must be 2 or 3, got '
                             + str(config.N_CLASSES))
        y = encode_y(y_raw, label2idx, config.MAX_SEQ_LEN, config.N_CLASSES)
        label2weight = {'O': config.O_WEIGHT, 'I': config.I_WEIGHT,
                        'B': config.B_WEIGHT}
        sample_weight = encode_y(y_raw, label2weight, config.MAX_SEQ_LEN,
                                 n_classes=1)

    return df, x_raw, x_enc, y, sample_weight, comments

def get_data(config, word2embedding=None):
    """Load the embeddings (if needed) and encode the train and dev data.

    Arguments:
    config -- the configuration; EMBEDDING_PATH is read from it when the
              embeddings have to be loaded
    word2embedding -- a dict(str -> np.array) from tokens to embeddings;
                      if it is None or empty, the embeddings are read from
                      config.EMBEDDING_PATH (one token followed by its
                      whitespace-separated vector values per line)

    Returns a Data object with the encoded training input, labels and
    sample weights, the training file's comment lines, and the dev
    dataframes and encoded dev input.
    """
    if not word2embedding:
        word2embedding = {}
        # The context manager closes the file even if a line fails to parse.
        with open(config.EMBEDDING_PATH, encoding='utf8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Skip blank lines instead of failing on values[0].
                    continue
                word2embedding[values[0]] = np.asarray(values[1:],
                                                       dtype='float32')

    _, _, train_x, train_y, sample_weight, comments = prepare_data(
        config, word2embedding, training=True)
    dev_df, dev_raw, dev_x, _, _, _ = prepare_data(config, word2embedding,
                                                   training=False)
    return Data(train_x, train_y, sample_weight, comments,
                dev_df, dev_raw, dev_x)


class Data:
    """Plain container for the encoded training data and the dev data."""

    def __init__(self, train_x, train_y, sample_weight,
                 comments, dev_df, dev_raw, dev_x):
        # Training split
        self.train_x, self.train_y = train_x, train_y
        self.sample_weight = sample_weight
        # Comment lines passed alongside the training data
        self.comments = comments
        # Development split
        self.dev_df, self.dev_raw, self.dev_x = dev_df, dev_raw, dev_x

# Creating the Model

In [0]:
# def custom_loss(y_true, y_pred):
#   # for test purposes
#   return K.variable(value=np.ones(1))

In [0]:
def get_bilstm(input_shape, config):
    """Build and compile the BiLSTM tagger.

    Arguments:
    input_shape -- the shape of one input sample (without the batch axis)
    config -- the configuration (LSTM_UNITS, DROPOUT, N_CLASSES, LOSS,
              OPTIMIZER and METRIC are read from it)

    Returns the compiled model: a bidirectional LSTM that returns full
    sequences, followed by dropout and a softmax layer applied per time
    step. It is compiled with temporal sample weighting.
    """
    recurrent = CuDNNLSTM(config.LSTM_UNITS, return_sequences=True)
    classifier = Dense(config.N_CLASSES, activation='softmax')

    model = Sequential()
    model.add(Bidirectional(recurrent, input_shape=input_shape))
    model.add(Dropout(config.DROPOUT))
    model.add(TimeDistributed(classifier))
    model.compile(loss=config.LOSS,
                  optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC],
                  sample_weight_mode='temporal')
    return model


def create_and_fit_bilstm(config, train_x, train_y, sample_weight):
    """Create a BiLSTM model and train it on the given data.

    Arguments:
    config -- the configuration (EPOCHS and BATCH_SIZE are read here)
    train_x -- the encoded training input
    train_y -- the encoded training labels
    sample_weight -- the weights passed to fit() as sample_weight

    Returns the trained model and the history object returned by fit().
    A tenth of the training data is held out for validation.
    """
    sample_shape = train_x.shape[1:]
    model = get_bilstm(sample_shape, config)
    history = model.fit(train_x, train_y,
                        epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        validation_split=0.1,
                        sample_weight=sample_weight,
                        verbose=1)
    return model, history

# Predictions

In [0]:
def get_bio_predictions(model, x, x_raw, n_classes):
    """Predict one label per token.

    Arguments:
    model -- an object whose predict(x) returns per-token class scores
    x -- the encoded input; its first two axes are sentences and tokens
    x_raw -- a dataframe with an 'n_toks' column; a sentence's prediction
             row is its dataframe index minus one
    n_classes -- the number of class scores per token

    Returns a flat list of labels covering the first n_toks tokens of each
    sentence, in the row order of x_raw. Class 0 is "O", class 1 is "I" and
    any other class is "B".
    """
    scores = model.predict(x)
    best_class = scores.reshape(-1, n_classes).argmax(axis=1)
    best_class = best_class.reshape(x.shape[:2])

    class2label = {0: "O", 1: "I"}
    labels = []
    for row in x_raw.itertuples():
        sent_idx = row.Index - 1
        labels.extend(class2label.get(best_class[sent_idx][tok_idx], "B")
                      for tok_idx in range(row.n_toks))
    return labels


def si_predictions_to_spans(label_df):
    """Convert per-token BIO labels into labelled character spans.

    Arguments:
    label_df -- a dataframe with one row per token and the columns
                'document_id', 'token_start', 'token_end' and 'label'
                (one of 'O', 'B', 'I')

    Returns a list of (document_id, span_start, span_end) tuples, one per
    maximal run of non-'O' tokens. A run is split at a 'B' label and at a
    change of document. An empty dataframe yields an empty list.
    """
    spans = []
    prev_label = 'O'
    prev_span_start = '-1'
    prev_span_end = '-1'
    prev_article = ''

    for row in label_df.itertuples():
        article = row.document_id
        span_start = row.token_start
        span_end = row.token_end
        label = row.label

        span, prev_span_start = update_prediction(article, label,
                                                  span_start, span_end,
                                                  prev_article, prev_label,
                                                  prev_span_start,
                                                  prev_span_end)
        if span is not None:
            spans.append(span)

        prev_article = article
        prev_label = label
        prev_span_end = span_end

    # Flush the span that is still open after the last token. Feeding the
    # last row through update_prediction again would not close a span that
    # ends on an 'I' label, because 'I' after 'I' counts as a continuation.
    if prev_label != 'O':
        spans.append((prev_article, prev_span_start, prev_span_end))
    return spans


# Helper method for si_predictions_to_spans
def update_prediction(article, label, span_start, span_end, prev_article,
                      prev_label, prev_span_start, prev_span_end):
    """Process one token transition.

    Returns a tuple (span, cur_span_start): span is the finished
    (article, start, end) tuple if the previous token closed a span and
    None otherwise; cur_span_start is the start offset of the span the
    current token belongs to (unchanged unless a new span starts here).
    """
    span = None
    cur_span_start = prev_span_start
    # Ending a span: I-O, B-O, I-B, B-B, new article
    if prev_label != 'O' and (label != 'I' or prev_article != article):
        span = (prev_article, prev_span_start, prev_span_end)

    # Starting a new span: O-B, O-I, I-B, B-B, new article
    if label == 'B' or (label == 'I' and prev_label == 'O') \
            or prev_article != article:
        # Update the start of the current label span
        cur_span_start = span_start

    return span, cur_span_start


def predict(config, model, history, dev_df, dev_raw, dev_x, comments,
            file_prefix, file_stem, file_suffix):
    """Predict the dev set spans and write a log file and a span file.

    Arguments:
    config -- the configuration (N_CLASSES, LOSS and METRIC are read here)
    model -- the trained model
    history -- the training history returned by fit()
    dev_df -- the token-level dev dataframe
    dev_raw -- the per-sentence dev dataframe
    dev_x -- the encoded dev input
    comments -- comment lines describing the data preprocessing
    file_prefix -- prepended to both output file names
    file_stem, file_suffix -- the remaining parts of the file names

    Writes <prefix>log_<stem>_<suffix>.txt (preprocessing notes, config,
    training history, model summary) and <prefix>spans_<stem>_<suffix>.txt
    (one tab-separated "document, start, end" line per predicted span).
    """
    y_hat = get_bio_predictions(model, dev_x, dev_raw, config.N_CLASSES)
    result_df = pd.concat([dev_df, pd.DataFrame(y_hat, columns=['label'])],
                          axis=1, sort=False)
    spans = si_predictions_to_spans(result_df)

    outfile = file_prefix + 'spans_' + file_stem + '_' + file_suffix + '.txt'
    logfile = file_prefix + 'log_' + file_stem + '_' + file_suffix + '.txt'

    with open(logfile, mode='w') as f:
        f.write('DATA PREPROCESSING\n\n')
        for comment in comments:
            comment = comment.replace('#', '')
            # One line per comma-separated field. Writing the whole comment
            # here would repeat it once for every field it contains.
            for field in comment.split(','):
                f.write(field.strip() + '\n')
        f.write('\n\nCONFIG\n\n')
        f.write(config.pretty_str())
        f.write('\n\nMODEL HISTORY\n\n')
        f.write('Validation loss ' + config.LOSS + '\n')
        f.write(str(history.history['val_loss']) + '\n')
        f.write('Loss ' + config.LOSS + '\n')
        f.write(str(history.history['loss']) + '\n')
        f.write('Validation ' + config.METRIC + '\n')
        f.write(str(history.history['val_' + config.METRIC]) + '\n')
        f.write(config.METRIC + '\n')
        f.write(str(history.history[config.METRIC]) + '\n')
        f.write('\n\nMODEL SUMMARY\n\n')
        # summary() prints line by line; redirect those lines into the log.
        model.summary(print_fn=lambda x: f.write(x + '\n'))

    with open(outfile, mode='w') as f:
        for span in spans:
            f.write(str(span[0]) + '\t' + str(span[1]) + '\t' +
                    str(span[2]) + '\n')

# Putting it all together

In [0]:
def run(config, file_stem, file_suffix, verbose=True,
        data=None, word2embedding=None, file_prefix=''):
    """Train a model for one configuration and write its predictions.

    Arguments:
    config -- the configuration to run
    file_stem, file_suffix, file_prefix -- parts of the output file names
    verbose -- whether to print progress messages
    data -- previously encoded data; encoded from scratch if missing
    word2embedding -- embeddings to use when the data has to be encoded

    Returns the data that was used, so that it can be passed to later runs.
    """
    def report(message):
        if verbose:
            print(message)

    if verbose:
        print('Running with config:')
        print(config.pretty_str())

    if not data:
        report('Encoding the data')
        data = get_data(config, word2embedding)

    report('Building the model')
    model, history = create_and_fit_bilstm(
        config, data.train_x, data.train_y, data.sample_weight)

    report('Predicting the test data spans')
    predict(config, model, history, data.dev_df, data.dev_raw, data.dev_x,
            data.comments, file_prefix, file_stem, file_suffix)

    report('Done!\n\n')
    return data

# Config

In [0]:
class Config:
    """Settings for encoding the data and for building the model."""

    def __init__(self, args=None):
        """Creates a default configuration.

        Keyword arguments:
        args -- a dict(str -> ?) containing values diverging from the default
        """
        # Encoding the data:
        self.MAX_SEQ_LEN = 35
        self.EMBED_DIM = 100
        self.N_CLASSES = 2
        self.ONLINE_SOURCES = True
        self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-data-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDOZHQ7BVKUTD7RZYJS6AOSVW'
        self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfull.tsv?token=AD7GEDN7YPFXYH5TSRBSNKS6AOSWK'
        self.EMBEDDING_PATH = 'gdrive/My Drive/colab_projects/data/glove.6B.100d.txt'

        # Building the model:
        self.BATCH_SIZE = 128
        self.EPOCHS = 10
        self.O_WEIGHT = 1.0
        self.I_WEIGHT = 6.5
        self.B_WEIGHT = 6.5
        self.LSTM_UNITS = 512
        self.DROPOUT = 0.25
        self.OPTIMIZER = 'adam'
        self.METRIC = 'categorical_accuracy'
        self.LOSS = 'categorical_crossentropy'

        # Overrides are applied last so they win over the defaults above.
        if args:
            for key in args:
                setattr(self, key, args[key])

    def pretty_str(self):
        """Returns a multi-line, human-readable summary of this config.

        Only this instance's own values are reported.
        """
        return 'max seq len: ' + str(self.MAX_SEQ_LEN) + '\n' + \
               'embedding depth: ' + str(self.EMBED_DIM) + '\n' + \
               'number of labels: ' + str(self.N_CLASSES) + '\n' + \
               'batch size: ' + str(self.BATCH_SIZE) + '\n' + \
               'epochs: ' + str(self.EPOCHS) + '\n' + \
               'O weight: ' + str(self.O_WEIGHT) + \
               ', I weight:' + str(self.I_WEIGHT) + \
               ', B weight: ' + str(self.B_WEIGHT) + '\n' + \
               'hidden units: ' + str(self.LSTM_UNITS) + '\n' + \
               'dropout rate: ' + str(self.DROPOUT) + '\n' + \
               'optimizer: ' + self.OPTIMIZER + '\n' + \
               'metric: ' + self.METRIC + '\n' + \
               'loss: ' + self.LOSS + '\n'

def run_config(config, file_prefix, data=None, repetitions=5, verbose=True):
    """Run the same configuration several times.

    Arguments:
    config -- the configuration to run
    file_prefix -- prepended to the names of all output files
    data -- previously encoded data to reuse, if any
    repetitions -- how many times the configuration is run
    verbose -- whether to print progress messages

    All repetitions share one timestamp as file stem and use the
    repetition number (starting at 0) as file suffix.
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    for repetition in range(repetitions):
        if verbose:
            print("Iteration {} of {}".format(repetition + 1, repetitions))
        data = run(config,
                   file_stem=timestamp,
                   file_suffix=str(repetition),
                   file_prefix=file_prefix,
                   data=data,
                   verbose=verbose)
    # Return data in case the next config only changes model features
    return data

In [0]:
file_prefix = '/content/gdrive/My Drive/semeval-predictions/'
data = None
# config = Config()
# Read the train/dev data from local TSV files instead of the default URLs.
local_overrides = {
    'TRAIN_URL': 'train-data-improved-sentiwordnet-arguingfull-pos.tsv',
    'DEV_URL': 'dev-improved-sentiwordnet-arguingfull-pos.tsv',
    'ONLINE_SOURCES': False,
}
config = Config(local_overrides)
data = run_config(config, file_prefix, data)