# Challenges in Computational Linguistics, WS 19/20

Blaschke Verena, ISCL MA<br/>
Korniyenko Maxim, ISCL MA<br/>
Tureski Sam, ML MA<br/>

-----
## SemEval2020-11: Propaganda Detection
### Task 1: Span identification
-----


In [0]:
# Mount Google Drive inside the Colab runtime at /content/gdrive. The file
# paths used further down in this notebook (embeddings, BERT files, saved
# data, prediction output) point into this mount.
from google.colab import drive
drive.mount('/content/gdrive')

# model.py

In [0]:
import pandas as pd
import numpy as np
from itertools import takewhile
import zipfile
import urllib.request
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, \
    TimeDistributed, Activation
from keras.models import Sequential
from sklearn import svm, preprocessing
from sklearn.linear_model import LinearRegression

########################
# Processing the input #
########################


# Helper method for prepare_data
def get_comments(filename, url=True):
    """Return the leading comment lines of a file.

    Reading stops at the first line that does not start with '#'.

    Arguments:
    filename -- a URL (if url is True) or a local path (if url is False)
    url -- whether filename is opened via urllib or as a local UTF-8 file

    Returns a list of str, one entry per leading comment line (line endings
    are kept as they appear in the file).
    """
    if not url:
        with open(filename, 'r', encoding='utf8') as local_file:
            return list(takewhile(lambda s: s.startswith('#'), local_file))

    header = []
    with urllib.request.urlopen(filename) as response:
        # The response yields bytes, so the prefix check is done on bytes
        # and only the kept lines are decoded.
        for raw_line in response:
            if not raw_line.startswith(b'#'):
                break
            header.append(raw_line.decode("utf-8"))
    return header


# Helper method for prepare_data
def get_cols(input_df, col):
    """Collect the values of one column per sentence.

    Rows are grouped by their 'sent_id' value. The result is a one-column
    dataframe (named after col) whose cells are the lists of that column's
    values within each group.
    """
    per_sentence = input_df.groupby('sent_id')[col]
    return per_sentence.apply(list).to_frame()


# Helper method for prepare_data
def add_sent_lens(input_df, col='token'):
    """Add an 'n_toks' column holding len() of each cell in col.

    The dataframe is modified in place and also returned.
    """
    lengths = input_df[col].apply(len)
    input_df['n_toks'] = lengths
    return input_df


# Helper method for prepare_data
def get_features(input_df, feature_cols):
    """Build the per-sentence input dataframe.

    Starts from the per-sentence token lists (plus their lengths in
    'n_toks') and merges in one list-valued column per entry of
    feature_cols, joining on 'sent_id'.
    """
    features = get_cols(input_df, 'token')
    features = add_sent_lens(features)
    for name in feature_cols:
        per_sentence = get_cols(input_df, name)
        features = pd.merge(left=features, right=per_sentence,
                            left_on='sent_id', right_on='sent_id')
    return features


# Helper method for encode_x_bert
def bert_embeddings_for_sent(bert_tokens, row, feature_header, embedding_matrix,
                             embed_dim, sent_idx, uncased):
    """Fill embedding_matrix[sent_idx - 1] for one sentence.

    Arguments:
    bert_tokens -- list of (BERT token, embedding) pairs for the sentence
    row -- the sentence's row (needs .token, .n_toks and the feature columns)
    feature_header -- names of the additional feature columns
    embedding_matrix -- the array that is written to in place
    embed_dim -- length of a BERT embedding
    sent_idx -- 1-based sentence index
    uncased -- if True, words are lower-cased before being compared

    Each word takes the embedding of the first BERT token it starts with;
    BERT tokens that do not match the current word are skipped.
    """
    if len(bert_tokens) < len(row.token):
        print('BERT', [pair[0] for pair in bert_tokens])
        print('X', row.token)

    pos = 0  # Index of the word that is waiting for its embedding.
    for piece, vector in bert_tokens:
        if pos == row.n_toks:
            break
        word = str(row.token[pos])
        if word == '\ufeff':  # Prints a warning, but is dealt with.
            pos += 1
            continue
        if uncased:
            word = word.lower()
        if not word.startswith(piece):
            # Covers BERT word continuation pieces (e.g. per ##pet ##uate)
            # and any other piece that does not begin the current word.
            continue
        # Either an exact match or the first subtoken of the word.
        slot = embedding_matrix[sent_idx - 1][pos]
        slot[:embed_dim] = vector
        for offset, feature in enumerate(feature_header):
            slot[embed_dim + offset] = getattr(row, feature)[pos]
        pos += 1


# Task 1: Token embeddings
def encode_x_bert(x, bert_file, feature_header, max_seq_len, embed_dim=768,
                  uncased=True):
    """Encode sentences token by token using precomputed BERT embeddings.

    Arguments:
    x -- a Pandas dataframe with one row per sentence
    bert_file -- path of a tab-separated file; the cells of each line are
                 read as sentence index, layer, token and embedding
    feature_header -- dataframe names of additional feature columns
    max_seq_len -- size of the token axis of the returned array
    embed_dim -- size reserved for the BERT embedding of each token
    uncased -- passed on to bert_embeddings_for_sent

    Returns an array of shape
    (len(x), max_seq_len, embed_dim + len(feature_header)).
    """
    # TODO this currently assumes that the BERT file only contains information
    # about a single layer. extend this to multiple layers?
    embedding_matrix = np.zeros([len(x), max_seq_len,
                                 embed_dim + len(feature_header)])
    prev_sent_idx = 1
    bert_tokens = []
    sentences = x.itertuples()
    with open(bert_file, encoding='utf8') as f:
        for line in f:
            cells = line.split('\t')
            sent_idx = int(cells[0])
            layer = int(cells[1])  # Parsed, but not used below.
            token = cells[2]
            # The first and last character of the embedding cell are dropped
            # before the comma-separated numbers are parsed.
            embedding = np.fromstring(cells[3][1:-1], sep=',')

            # A changed sentence index means that all tokens of the previous
            # sentence have been collected and can be written to the matrix.
            if sent_idx != prev_sent_idx:
                if sent_idx % 1000 == 0:
                    print("BERT embeddings for sentence", sent_idx)
                row = next(sentences)
                # The dataframe rows and the BERT file have to be in sync.
                assert row.Index == prev_sent_idx
                bert_embeddings_for_sent(bert_tokens, row, feature_header,
                                         embedding_matrix, embed_dim,
                                         prev_sent_idx, uncased)
                bert_tokens = []

            bert_tokens.append((token, embedding))
            prev_sent_idx = sent_idx

    # Last line:
    # The final sentence is never followed by an index change, so it is
    # flushed here.
    row = next(sentences)
    bert_embeddings_for_sent(bert_tokens, row, feature_header, embedding_matrix,
                             embed_dim, prev_sent_idx, uncased)
    return embedding_matrix


# Task 2: Sequence embeddings
def encode_x_seq(x, bert_file, feature_header, embed_dim=768, uncased=True,
                 n_bert_layers=1):
    """Encode whole sequences using precomputed BERT embeddings.

    Arguments:
    x -- a Pandas dataframe with one row per sequence, in the same order as
         the sequences in bert_file
    bert_file -- path of a tab-separated file whose fourth cell holds the
                 embedding; each sequence occupies n_bert_layers consecutive
                 lines (one per layer)
    feature_header -- dataframe names of additional (scalar) feature columns
    embed_dim -- length of the embedding of a single layer
    uncased -- unused; kept so that existing callers keep working
    n_bert_layers -- number of consecutive lines per sequence

    Returns an array of shape
    (len(x), embed_dim * n_bert_layers + len(feature_header)): the layer
    embeddings are concatenated, followed by the feature values.
    """
    bert_width = embed_dim * n_bert_layers
    embedding_matrix = np.zeros([len(x), bert_width + len(feature_header)])
    sequences = x.itertuples()
    with open(bert_file, encoding='utf8') as f:
        idx = 0
        for line in f:
            row = next(sequences)
            for bert_layer in range(n_bert_layers):
                if bert_layer > 0:
                    # The remaining layers of this sequence are on the
                    # following lines.
                    line = next(f)
                cells = line.split('\t')
                # The first and last character of the embedding cell are
                # dropped before the comma-separated numbers are parsed.
                embedding = np.fromstring(cells[3][1:-1], sep=',')
                start = embed_dim * bert_layer
                embedding_matrix[idx][start:start + embed_dim] = embedding
            for i, feature in enumerate(feature_header):
                embedding_matrix[idx][bert_width + i] = getattr(row, feature)
            idx += 1
    return embedding_matrix


def encode_x(x, word2embedding, feature_header, max_seq_len,
             embed_dim, uncased):
    """Encode the input data.

    Arguments:
    x -- a Pandas dataframe
    word2embedding -- a dict(str -> np.array) from tokens to embeddings
    feature_header -- dataframe names of additional feature columns
    max_seq_len -- the maximum number of tokens per sentence in x
    embed_dim -- the array length of the vectors in word2embedding
    uncased -- if True, tokens are lower-cased before the lookup

    Returns an array of shape
    (len(x), max_seq_len, embed_dim + len(feature_header)). Tokens without
    an entry in word2embedding get a random (standard normal) vector, drawn
    anew for every occurrence.
    """
    embedding_matrix = np.zeros([len(x), max_seq_len,
                                 embed_dim + len(feature_header)])
    for row in x.itertuples():
        sent_idx = row.Index - 1  # The dataframe index starts at 1.
        for tok_idx in range(row.n_toks):
            word = str(row.token[tok_idx])
            if uncased:
                word = word.lower()
            embedding = word2embedding.get(word)
            if embedding is None:
                # Only draw a random vector when it is actually needed
                # instead of building one for every token as the default
                # argument of dict.get().
                embedding = np.random.randn(embed_dim)
            embedding_matrix[sent_idx][tok_idx][:embed_dim] = embedding
            for i, feature in enumerate(feature_header):
                embedding_matrix[sent_idx][tok_idx][embed_dim + i] = \
                    getattr(row, feature)[tok_idx]
    return embedding_matrix


def encode_y(y, label2idx, max_seq_len, n_classes):
    """Encode labels (or per-label weights) as a numpy array.

    Arguments:
    y -- if max_seq_len > 1: a dataframe with a 'label' column holding one
         list of labels per sentence and an index starting at 1;
         otherwise: a series with one label per entry whose index values
         are used as positions in the result
    label2idx -- dict from label to its encoding (a number if n_classes is
                 1, else a list of length n_classes)
    max_seq_len -- the maximum number of tokens per sentence
    n_classes -- the size of a label encoding

    Returns an array of shape (len(y),), (len(y), max_seq_len) or
    (len(y), max_seq_len, n_classes), depending on the arguments.
    """
    if n_classes == 1:
        if max_seq_len > 1:
            labels = np.zeros([len(y), max_seq_len])
        else:
            labels = np.zeros(len(y))
    else:
        labels = np.zeros([len(y), max_seq_len, n_classes])

    if max_seq_len > 1:
        for row in y.itertuples():
            sent_idx = row.Index - 1
            for tok_idx, label in enumerate(row.label):
                labels[sent_idx][tok_idx] = label2idx[label]
    else:
        # Series.iteritems() was removed in pandas 2.0; items() yields the
        # same (index, value) pairs.
        for idx, label in y.items():
            labels[idx] = label2idx[label]
    return labels


def prepare_data(config, word2embedding, training):
    """Read one input file and encode it for the model.

    Arguments:
    config -- the configuration object (see the attributes read below)
    word2embedding -- a dict(str -> np.array); only used if config.USE_BERT
                      is False
    training -- if True, config.TRAIN_URL / config.TRAIN_BERT are read and
                the labels are encoded; otherwise config.DEV_URL /
                config.DEV_BERT are read and no labels are encoded

    Returns a tuple (df, x_raw, x_enc, y, sample_weight, comments,
    feature_cols); y and sample_weight are None unless training is True
    (sample_weight can also stay None for sequence-level training data
    without class weights).

    Raises a ValueError if token-level labels are requested for a
    config.N_CLASSES other than 2 or 3.
    """
    # We're getting the comments this way so we can:
    # - add them to the output
    # - parse lines that actually contain '#' as token
    infile = config.TRAIN_URL if training else config.DEV_URL
    comments = get_comments(infile, config.ONLINE_SOURCES)
    df = pd.read_csv(infile, sep='\t', skiprows=len(comments), quoting=3,
                     encoding='utf8')

    if config.TOKEN_LVL:
        std_cols = ['document_id', 'sent_id', 'token_start',
                    'token_end', 'token', 'label']
    else:
        std_cols = ['document_id', 'span_start', 'span_end', 'text', 'label']

    if config.FEATURES is None:  # Determine features based on file header
        feature_cols = [col for col in df.columns
                        if col not in std_cols
                        and col not in config.EXCLUDE_FEATURES]
    else:
        feature_cols = [col for col in df.columns if col in config.FEATURES]

    if config.TOKEN_LVL:
        x_raw = get_features(df, feature_cols)
    else:
        x_raw = df

    if config.USE_BERT:
        bert_file = config.TRAIN_BERT if training else config.DEV_BERT
        if config.TOKEN_LVL:
            x_enc = encode_x_bert(x_raw, bert_file, feature_cols,
                                  config.MAX_SEQ_LEN, config.EMBED_DIM,
                                  config.UNCASED)
        else:
            x_enc = encode_x_seq(x_raw, bert_file, feature_cols,
                                 config.EMBED_DIM, config.UNCASED,
                                 config.N_BERT_LAYERS)
    else:
        x_enc = encode_x(x_raw, word2embedding, feature_cols,
                         config.MAX_SEQ_LEN, config.EMBED_DIM, config.UNCASED)

    print(x_enc.shape)

    y = None
    sample_weight = None
    if training:
        if config.TOKEN_LVL:
            y_raw = get_cols(df, 'label')
            if config.N_CLASSES == 3:
                label2idx = {"O": [1, 0, 0], "B": [0, 0, 1], "I": [0, 1, 0]}
            elif config.N_CLASSES == 2:
                label2idx = {"O": [1, 0], "B": [0, 1], "I": [0, 1]}
            else:
                # Previously this fell through and failed later with an
                # unbound local variable.
                raise ValueError('N_CLASSES must be 2 or 3 for token-level '
                                 'labels, got ' + str(config.N_CLASSES))
            y = encode_y(y_raw, label2idx, config.MAX_SEQ_LEN,
                         config.N_CLASSES)
            sample_weight = encode_y(y_raw, config.CLASS_WEIGHTS,
                                     config.MAX_SEQ_LEN, n_classes=1)
        else:
            y = df.label
            if config.CLASS_WEIGHTS:
                sample_weight = encode_y(y, config.CLASS_WEIGHTS,
                                         config.MAX_SEQ_LEN, n_classes=1)

    return df, x_raw, x_enc, y, sample_weight, comments, feature_cols


def load_zipped_embeddings(infile):
    """Read word embeddings from the first file inside a zip archive.

    Each line of that file is split on whitespace: the first item is the
    word, the remaining items form its float32 vector. Progress is printed
    every 100000 lines.

    Returns a dict(str -> np.array).
    """
    word2embedding = {}
    with zipfile.ZipFile(infile) as archive:
        member = archive.filelist[0].filename
        with archive.open(member, 'r') as handle:
            for count, raw_line in enumerate(handle, start=1):
                values = raw_line.decode().rstrip().split()
                vector = np.asarray(values[1:], dtype='float32')
                word2embedding[values[0]] = vector
                if count % 100000 == 0:
                    print("Read " + str(count) + " embeddings")
    return word2embedding


def get_data(config, word2embedding=None):
    """Load the embeddings (if needed) and encode the train and dev data.

    Arguments:
    config -- the configuration object
    word2embedding -- an already loaded dict(str -> np.array); if it is
                      empty/None and config.USE_BERT is False, the
                      embeddings are read from config.EMBEDDING_PATH

    Returns a Data object holding the encoded training and development data.
    """
    if (not word2embedding) and (not config.USE_BERT):
        if config.EMBEDDING_PATH.endswith('.zip'):
            word2embedding = load_zipped_embeddings(config.EMBEDDING_PATH)
        else:
            word2embedding = {}
            # The context manager closes the file even if a line cannot be
            # parsed; the encoding matches the UTF-8 decoding used for
            # zipped embeddings.
            with open(config.EMBEDDING_PATH, encoding='utf8') as f:
                for line in f:
                    values = line.rstrip().split()
                    word2embedding[values[0]] = np.asarray(values[1:],
                                                           dtype='float32')

    _, _, train_x, train_y, sample_weight, comments, features = prepare_data(
        config, word2embedding, training=True)
    dev_df, dev_raw, dev_x, _, _, _, _ = prepare_data(config, word2embedding,
                                                      training=False)
    return Data(train_x, train_y, sample_weight, comments,
                dev_df, dev_raw, dev_x, features)


class Data:
    """Container for the encoded training and development data.

    The attributes can be passed in directly or, if path is given, loaded
    from files previously written by save().
    """

    def __init__(self,
                 # If initializing on the fly:
                 train_x=None, train_y=None, sample_weight=None,
                 comments=None, dev_df=None, dev_raw=None, dev_x=None,
                 features=None,
                 # If initializing from files:
                 path=None):
        self.train_x = train_x
        self.train_y = train_y
        self.sample_weight = sample_weight
        self.comments = comments
        self.features = features
        self.dev_df = dev_df
        self.dev_raw = dev_raw
        self.dev_x = dev_x
        if path:
            # Values loaded from files replace the ones passed in above.
            self.load(path)

    def save(self, path='gdrive/My Drive/colab_projects/data/data/'):
        """Write all attributes to files whose names start with path.

        The arrays are stored via np.save, the dataframes as CSV (including
        their index) and the comments/features as text files with one entry
        per line.
        """
        np.save(path + 'train_x', self.train_x)
        np.save(path + 'train_y', self.train_y)
        np.save(path + 'dev_x', self.dev_x)
        np.save(path + 'sample_weight', self.sample_weight)
        self.dev_raw.to_csv(path + 'dev_raw')
        self.dev_df.to_csv(path + 'dev_df')
        with open(path + 'comments.txt', 'w', encoding='utf8') as f:
            for comment in self.comments:
                f.write(comment + '\n')
        with open(path + 'features.txt', 'w', encoding='utf8') as f:
            for feature in self.features:
                f.write(feature + '\n')

    def load(self, path='gdrive/My Drive/colab_projects/data/data/'):
        """Read the attributes from the files written by save().

        Comments and features are stripped of surrounding whitespace and
        empty lines are skipped.
        """
        self.train_x = np.load(path + 'train_x.npy')
        self.train_y = np.load(path + 'train_y.npy')
        self.dev_x = np.load(path + 'dev_x.npy')
        self.sample_weight = np.load(path + 'sample_weight.npy')
        # save() writes the dataframe index as the first CSV column. Reading
        # it back as the index restores the original row labels (e.g. the
        # sentence IDs of dev_raw that the prediction code relies on)
        # instead of adding an extra column and renumbering the rows from 0.
        self.dev_raw = pd.read_csv(path + 'dev_raw', index_col=0)
        self.dev_df = pd.read_csv(path + 'dev_df', index_col=0)
        self.comments = self._read_nonempty_lines(path + 'comments.txt')
        self.features = self._read_nonempty_lines(path + 'features.txt')

    @staticmethod
    def _read_nonempty_lines(filename):
        """Return the stripped, non-empty lines of a UTF-8 text file."""
        with open(filename, 'r', encoding='utf8') as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]


######################
# Creating the model #
######################

def get_svm(train_x, train_y):
    """Fit a support vector classifier (one-vs-one decision function) on
    the training data and return it."""
    classifier = svm.SVC(decision_function_shape='ovo')
    classifier.fit(train_x, train_y)
    return classifier


def get_ffnn(config, train_x, train_y, sample_weight, single_layer=False):
    """Build and train a feed-forward network on one-hot encoded labels.

    Arguments:
    config -- the configuration object (N_CLASSES, HIDDEN, DROPOUT, LOSS,
              OPTIMIZER, METRIC, EPOCHS and BATCH_SIZE are read)
    train_x -- the encoded training input; its second dimension is used as
               the input size
    train_y -- the training labels (needs a to_numpy() method)
    sample_weight -- passed on to model.fit
    single_layer -- if True, the network consists of the softmax output
                    layer only; otherwise a ReLU hidden layer and dropout
                    come first

    Returns the model, the training history and the fitted label encoder.
    """
    y_encoder = preprocessing.OneHotEncoder()
    label_column = train_y.to_numpy().reshape(-1, 1)
    train_y_enc = y_encoder.fit_transform(label_column)

    n_inputs = train_x.shape[1]
    model = Sequential()
    if not single_layer:
        model.add(Dense(config.HIDDEN, activation='relu',
                        input_dim=n_inputs))
        model.add(Dropout(config.DROPOUT))
        model.add(Dense(config.N_CLASSES, activation='softmax'))
    else:
        model.add(Dense(config.N_CLASSES, activation='softmax',
                        input_dim=n_inputs))

    model.compile(loss=config.LOSS, optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC])
    history = model.fit(train_x, train_y_enc,
                        epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        sample_weight=sample_weight,
                        verbose=1)
    return model, history, y_encoder


def get_bilstm(config, train_x, train_y, sample_weight):
    """Build and train a bidirectional LSTM that labels every time step.

    The network consists of a bidirectional CuDNNLSTM returning the full
    sequence, dropout and a time-distributed softmax layer. It is compiled
    with temporal sample weights.

    Returns the model and the training history.
    """
    recurrent = CuDNNLSTM(config.LSTM_UNITS, return_sequences=True)
    output = Dense(config.N_CLASSES, activation='softmax')

    model = Sequential()
    model.add(Bidirectional(recurrent, input_shape=train_x.shape[1:]))
    model.add(Dropout(config.DROPOUT))
    model.add(TimeDistributed(output))
    model.compile(loss=config.LOSS, optimizer=config.OPTIMIZER,
                  metrics=[config.METRIC], sample_weight_mode='temporal')
    history = model.fit(train_x, train_y,
                        epochs=config.EPOCHS,
                        batch_size=config.BATCH_SIZE,
                        sample_weight=sample_weight,
                        verbose=1)
    return model, history


###############
# Predictions #
###############


def get_bio_predictions(model, x, x_raw, n_classes):
    """Predict one label per token.

    Arguments:
    model -- an object whose predict(x) returns one score per class for
             every (sentence, position) pair
    x -- the encoded input; its first two dimensions are (sentence, position)
    x_raw -- dataframe with an index starting at 1 and an 'n_toks' column
    n_classes -- the number of scores per position

    Returns a flat list of labels for the first n_toks positions of each
    sentence: class 0 is "O", class 1 is "I", anything else is "B".
    """
    scores = model.predict(x)
    class_ids = scores.reshape(-1, n_classes).argmax(axis=1)
    class_ids = class_ids.reshape(x.shape[:2])
    idx2label = {0: "O", 1: "I"}
    labels = []
    for row in x_raw.itertuples():
        sent_idx = row.Index - 1
        for tok_idx in range(row.n_toks):
            labels.append(idx2label.get(class_ids[sent_idx][tok_idx], "B"))
    return labels


def si_predictions_to_spans(label_df):
    """Turn token-level BIO predictions into spans.

    Arguments:
    label_df -- dataframe with the columns 'document_id', 'token_start',
                'token_end' and 'label_pred' (one row per token, in order)

    Returns a list of (document_id, span_start, span_end) tuples. An empty
    dataframe yields an empty list.
    """
    spans = []
    prev_label = 'O'
    prev_span_start = '-1'
    prev_span_end = '-1'
    prev_article = ''

    for row in label_df.itertuples():
        article = row.document_id
        span_start = row.token_start
        span_end = row.token_end
        label = row.label_pred

        span, prev_span_start = update_predicted_span(article, label,
                                                      span_start, span_end,
                                                      prev_article, prev_label,
                                                      prev_span_start,
                                                      prev_span_end)
        if span is not None:
            spans.append(span)

        prev_article = article
        prev_label = label
        prev_span_end = span_end

    # Make sure we get the last prediction: a span that is still open after
    # the final token has to be closed explicitly. (Re-running the update
    # helper with identical current and previous values would not end a
    # span whose last label is 'I'.)
    if prev_label != 'O':
        spans.append((prev_article, prev_span_start, prev_span_end))
    return spans


# Helper method for si_predictions_to_spans
def update_predicted_span(article, label, span_start, span_end, prev_article,
                          prev_label, prev_span_start, prev_span_end):
    """Process one token transition.

    Returns a tuple (span, cur_span_start): span is the
    (article, start, end) tuple of the span that ends before the current
    token (or None), cur_span_start is the start offset of the span the
    current token belongs to (unchanged if no new span starts here).
    """
    span = None
    cur_span_start = prev_span_start
    # Ending a span: I-O, B-O, I-B, B-B, new article
    if prev_label != 'O' and (label != 'I' or prev_article != article):
        span = (prev_article, prev_span_start, prev_span_end)

    # Starting a new span: O-B, O-I, I-B, B-B, new article
    if label == 'B' or (label == 'I' and prev_label == 'O') \
            or prev_article != article:
        # Update the start of the current label span
        cur_span_start = span_start
    return span, cur_span_start


def print_spans(spans, file_prefix, file_stem, file_suffix):
    """Write the spans to <file_prefix>spans_<file_stem>_<file_suffix>.txt.

    Each span becomes one line holding its first three entries, separated
    by tabs.
    """
    outfile = file_prefix + 'spans_' + file_stem + '_' + file_suffix + '.txt'
    with open(outfile, mode='w') as f:
        for span in spans:
            cells = (str(span[0]), str(span[1]), str(span[2]))
            f.write('\t'.join(cells) + '\n')


def predict_si(config, model, history, dev_df, dev_raw, dev_x, comments,
               file_prefix, file_stem, file_suffix, features,
               predict_spans=True):
    """Predict token labels for the dev data and write a log file.

    The log (<file_prefix>log_<file_stem>_<file_suffix>.txt) contains the
    preprocessing comments, the additional features, the configuration,
    the training history and the model summary. If predict_spans is True,
    the predicted spans are written to a file as well.

    Returns dev_df with an additional 'label_pred' column.
    """
    y_hat = get_bio_predictions(model, dev_x, dev_raw, config.N_CLASSES)
    result_df = pd.concat([dev_df, pd.DataFrame(y_hat, columns=['label_pred'])],
                          axis=1, sort=False)

    logfile = file_prefix + 'log_' + file_stem + '_' + file_suffix + '.txt'

    # The comments are read as UTF-8, so the log is written as UTF-8 too.
    with open(logfile, mode='w', encoding='utf8') as f:
        f.write('DATA PREPROCESSING\n\n')
        for comment in comments:
            # Each comma-separated field of a comment gets its own line.
            for field in comment.replace('#', '').split(','):
                f.write(field.strip() + '\n')
        f.write('Additional features:' + str(features) + '\n')
        f.write('\n\nCONFIG\n\n')
        f.write(config.pretty_str())
        f.write('\n\nMODEL HISTORY\n\n')
        f.write('Loss ' + config.LOSS + '\n')
        f.write(str(history.history['loss']) + '\n')
        f.write(config.METRIC + '\n')
        f.write(str(history.history[config.METRIC]) + '\n')
        f.write('\n\nMODEL SUMMARY\n\n')
        model.summary(print_fn=lambda x: f.write(x + '\n'))

    if predict_spans:
        spans = si_predictions_to_spans(result_df)
        print_spans(spans, file_prefix, file_stem, file_suffix)

    return result_df


def predict_tc(config, model, history, dev_df, dev_x, comments, file_prefix,
               file_stem, file_suffix, features, y_encoder=None):
    """Predict sequence labels for the dev data and write a log file.

    The log (<file_prefix>log_<file_stem>_<file_suffix>.txt) contains the
    preprocessing comments, the additional features and the configuration;
    the training history and model summary are added if history is given.
    If y_encoder is given, its inverse_transform is applied to the model
    output before the labels are written.

    Returns the dataframe produced by print_tc.
    """
    logfile = file_prefix + 'log_' + file_stem + '_' + file_suffix + '.txt'
    # The comments are read as UTF-8, so the log is written as UTF-8 too.
    with open(logfile, mode='w', encoding='utf8') as f:
        f.write('DATA PREPROCESSING\n\n')
        for comment in comments:
            # Each comma-separated field of a comment gets its own line.
            for field in comment.replace('#', '').split(','):
                f.write(field.strip() + '\n')
        f.write('Additional features:' + str(features) + '\n')
        f.write('\n\nCONFIG\n\n')
        f.write(config.pretty_str())
        if history:
            f.write('\n\nMODEL HISTORY\n\n')
            f.write('Loss ' + config.LOSS + '\n')
            f.write(str(history.history['loss']) + '\n')
            f.write(config.METRIC + '\n')
            f.write(str(history.history[config.METRIC]) + '\n')
            f.write('\n\nMODEL SUMMARY\n\n')
            model.summary(print_fn=lambda x: f.write(x + '\n'))

    y_hat = model.predict(dev_x)
    if y_encoder:
        y_hat = y_encoder.inverse_transform(y_hat)
    return print_tc(y_hat, dev_df, file_prefix, file_stem, file_suffix)


def print_tc(y_hat, dev_df, file_prefix, file_stem, file_suffix):
    """Write the predicted labels to
    <file_prefix>labels_<file_stem>_<file_suffix>.txt.

    The file is tab-separated without header or index and contains the
    columns document_id, label_pred, span_start and span_end.

    Returns the written dataframe.
    """
    outfile = file_prefix + 'labels_' + file_stem + '_' + file_suffix + '.txt'
    predicted = pd.DataFrame(y_hat, columns=['label_pred'])
    combined = pd.concat([dev_df, predicted], axis=1, sort=False)
    out_cols = ['document_id', 'label_pred', 'span_start', 'span_end']
    result_df = combined[out_cols]
    result_df.to_csv(outfile, sep='\t', index=False, header=False)
    return result_df



###########################
# Putting it all together #
###########################


def run(config, file_stem, file_suffix, verbose=True, predict_spans=True,
        data=None, word2embedding=None, file_prefix=''):
    """Get the data, train a model and predict the dev set labels.

    Arguments:
    config -- the configuration object
    file_stem, file_suffix, file_prefix -- parts of the output file names
    verbose -- if True, progress information is printed
    predict_spans -- task 1 only: whether the predicted spans are written
                     to a file
    data -- a Data object to reuse; if missing, the data is loaded from
            config.DATA_PATH (if config.LOAD_DATA) or encoded from scratch
    word2embedding -- already loaded embeddings, passed on to get_data

    Returns the Data object and the predictions.

    Raises a ValueError if config.MODEL is not a known model (task 2).
    """
    if verbose:
        print('Running with config:')
        print(config.pretty_str())
    if not data:
        if config.LOAD_DATA:
            print('Loading data from files')
            data = Data(path=config.DATA_PATH)
        else:
            if verbose:
                print('Encoding the data')
            data = get_data(config, word2embedding)
            if config.SAVE_DATA:
                # Save to the location that LOAD_DATA reads from.
                data.save(config.DATA_PATH)

    if verbose:
        print('Additional features:', data.features)
        print('Building the model')
    if config.TOKEN_LVL:
        model, history = get_bilstm(config, data.train_x, data.train_y,
                                    data.sample_weight)
    else:
        history = None
        y_encoder = None
        if config.MODEL == 'SVM':
            model = get_svm(data.train_x, data.train_y)
        elif config.MODEL.startswith('FFNN'):
            model, history, y_encoder = get_ffnn(
                config, data.train_x, data.train_y, data.sample_weight,
                single_layer=config.MODEL == 'FFNN-single')
        elif config.MODEL == 'LSTM':
            model, history = get_bilstm(config, data.train_x, data.train_y,
                                        data.sample_weight)
        else:
            # Fail right away instead of hitting an unbound `model` below.
            raise ValueError('Unknown MODEL: ' + str(config.MODEL))

    if verbose:
        print('Predicting the test data labels/spans')
    if config.TOKEN_LVL:
        labels = predict_si(config, model, history, data.dev_df, data.dev_raw,
                            data.dev_x, data.comments, file_prefix, file_stem,
                            file_suffix, data.features, predict_spans)
    else:
        labels = predict_tc(config, model, history, data.dev_df, data.dev_x,
                            data.comments, file_prefix, file_stem, file_suffix,
                            data.features, y_encoder)
    if verbose:
        print('Done!\n\n')

    return data, labels


# grid_search.py

In [0]:
# from model import run, si_predictions_to_spans, print_spans
from collections import Counter
import time


class Config:
    """Settings for encoding the data, building the model and predicting."""

    def __init__(self, args=None):
        """Creates a default configuration.

        Keyword arguments:
        args -- a dict(str -> ?) containing values diverging from the default
        """
        # Encoding the data:
        self.TOKEN_LVL = True  # True if task 1, False if task 2.
        if args and 'TOKEN_LVL' in args:
            # Needed early because the task-specific defaults depend on it.
            self.TOKEN_LVL = args['TOKEN_LVL']

        self.ONLINE_SOURCES = True  # Input is given via URLs, not local files.

        self.UNCASED = True  # If true, words are turned into lower case.
        self.FEATURES = None  # If None, the features are determined from the
                              # input file.
        self.EXCLUDE_FEATURES = []  # Only used if FEATURES is None
        self.SAVE_DATA = False  # If true, the following two values can be used
                                # for re-using the data next time.
        # In case the training & dev data were saved and can be reused:
        self.DATA_PATH = 'gdrive/My Drive/colab_projects/data/data/'
        self.LOAD_DATA = False

        # Building the model:
        self.BATCH_SIZE = 128
        self.LSTM_UNITS = 512
        self.DROPOUT = 0.25
        self.OPTIMIZER = 'adam'
        self.METRIC = 'categorical_accuracy'
        self.LOSS = 'categorical_crossentropy'

        # Making predictions:
        self.MAJORITY_VOTING = True

        # Task-specific options
        if self.TOKEN_LVL:
            # Task 1: Span identification
            # For using train+dev and test, see the end of this file.
            self.MODEL = 'LSTM'  # run() always trains the BiLSTM for task 1;
                                 # defined so that FLATTEN can be computed.
            self.N_CLASSES = 2
            self.MAX_SEQ_LEN = 35
            self.EMBED_DIM = 300
            self.EPOCHS = 10
            self.CLASS_WEIGHTS = {'O': 1.0, 'I': 6.5, 'B': 6.5}
            self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-train.tsv?token=AD7GEDPCTVL5SN46K6LG6EC6LP4BW'
            self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-dev.tsv?token=AD7GEDLCUDJD7YBIGBJVR526LP4BM'
            self.EMBEDDING_PATH = 'gdrive/My Drive/colab_projects/data/glove.42B.300d.zip'  # 'gdrive/My Drive/colab_projects/data/glove.6B.100d.zip'
            self.USE_BERT = False
            self.N_BERT_LAYERS = 1  # The token-level BERT encoding reads a
                                    # single layer; needed by pretty_str.
            self.TRAIN_BERT = 'gdrive/My Drive/colab_projects/data/train_bert-base-uncased.tsv'
            self.DEV_BERT = 'gdrive/My Drive/colab_projects/data/dev_bert-base-uncased.tsv'
        else:
            # Task 2: Technique classification
            self.MODEL = 'FFNN'  # Options: 'SVM', 'FFNN', 'FFNN-single', 'LSTM'
            self.HIDDEN = 128  # If MODEL == 'FFNN'
            self.EPOCHS = 15  # If MODEL == 'FFNN'
            self.CLASS_WEIGHTS = None
            self.TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-train.tsv?token=AD7GEDOJ3TVR5J4ALHKARP26NO6LO'
            self.DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-dev.tsv?token=AD7GEDL5VBEWFJB4IPZ4TXS6NO6LQ'
            self.USE_BERT = True  # Currently, we don't have an alternative to this.
            self.EMBED_DIM = 768
            self.N_BERT_LAYERS = 1
            self.TRAIN_BERT = 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased.tsv'
            self.DEV_BERT = 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased.tsv'
            self.N_CLASSES = 14
            self.MAX_SEQ_LEN = -1  # Value is irrelevant (fixed-size input)

        if args:
            for key in args:
                setattr(self, key, args[key])

        # Derived from the final TOKEN_LVL/MODEL values unless it was set
        # explicitly via args.
        if not (args and 'FLATTEN' in args):
            self.FLATTEN = (not self.TOKEN_LVL) or (self.MODEL != 'LSTM')

    def pretty_str(self):
        """Return a multi-line, human-readable summary of the settings."""
        return 'max seq len: ' + str(self.MAX_SEQ_LEN) + '\n' + \
               'embedding depth: ' + str(self.EMBED_DIM) + '\n' + \
               'BERT embeddings: ' + str(self.USE_BERT) + '\n' + \
               'BERT layers: ' + str(self.N_BERT_LAYERS) + '\n' + \
               'TRAIN_BERT: ' + str(self.TRAIN_BERT) + '\n' + \
               'DEV_BERT: ' + str(self.DEV_BERT) + '\n' + \
               'number of labels: ' + str(self.N_CLASSES) + '\n' + \
               'batch size: ' + str(self.BATCH_SIZE) + '\n' + \
               'epochs: ' + str(self.EPOCHS) + '\n' + \
               'class weights: ' + str(self.CLASS_WEIGHTS) + '\n' + \
               'hidden units: ' + str(self.LSTM_UNITS) + '\n' + \
               'dropout rate: ' + str(self.DROPOUT) + '\n' + \
               'optimizer: ' + self.OPTIMIZER + '\n' + \
               'metric: ' + self.METRIC + '\n' + \
               'loss: ' + self.LOSS + '\n'


def get_majority_vote(votes):
    """Return the most frequent entry of votes.

    In case of a tie, the entry that occurs first in votes wins.
    """
    # most_common() without an argument sorts all entries by count
    # (descending), keeping first-seen order among equal counts.
    ranked = Counter(votes).most_common()
    # Task 1: For our data, preferring specific labels in tie situations
    # doesn't make a difference.
    return ranked[0][0]


def run_config(config, file_prefix, data=None, repetitions=5, verbose=True):
    """Train and predict several times with the same configuration.

    Arguments:
    config -- the configuration object
    file_prefix -- prefix of all output file names
    data -- a Data object to reuse (passed on to run)
    repetitions -- the number of training/prediction rounds
    verbose -- if True, progress information is printed

    If config.MAJORITY_VOTING is set, the label predicted most often across
    the rounds is additionally written to files with the suffix 'majority'.

    Returns the Data object and the timestamp used as file stem.
    """
    now = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    predictions = None
    label_cols = []
    for i in range(repetitions):
        if verbose:
            print(f"Iteration {i + 1} of {repetitions}")
        data, labels = run(config, data=data, verbose=verbose,
                           file_prefix=file_prefix, file_stem=now,
                           file_suffix=str(i))
        if not config.MAJORITY_VOTING:
            continue
        round_col = 'label_' + str(i)
        if predictions is None:
            # The first round provides the dataframe that collects the
            # predictions of all rounds.
            predictions = labels.rename(columns={'label_pred': 'label_0'})
        else:
            predictions.insert(loc=len(predictions.columns),
                               column=round_col,
                               value=labels.label_pred)
        label_cols.append(round_col)

    if config.MAJORITY_VOTING:
        majority = [get_majority_vote([getattr(row, col)
                                       for col in label_cols])
                    for row in predictions.itertuples()]
        predictions['label_pred'] = majority
        if config.TOKEN_LVL:
            spans = si_predictions_to_spans(predictions)
            print_spans(spans, file_prefix, now, 'majority')
        else:
            print_tc(majority, data.dev_df, file_prefix, now, 'majority')

    # Return data in case the next config only changes model features
    return data, now


# All log and prediction files are written to paths starting with this prefix.
file_prefix = '/content/gdrive/My Drive/colab_projects/semeval-predictions/'
# No encoded data yet: the first run_config call creates it and returns it
# so that it can be passed to later calls.
data = None

### Hyperparameter tuning:
# for epochs in [5, 15, 20, 25]:
#     config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 'EPOCHS': epochs})
#     data, _ = run_config(config, file_prefix, data)

### You can change config values by passing a dictionary to the constructor:
# config = Config({'LOAD_DATA': True})
# config = Config({'USE_BERT': True})
# config = Config({'USE_BERT': True, 'TOKEN_LVL': False})
# Active configuration: task 2 with a feed-forward network, three additional
# features and the '_10' BERT files (EMBED_DIM is set to 768 * 11 for them).
config = Config({'USE_BERT': True, 'TOKEN_LVL': False, 
                #  'FEATURES': [],
                 'FEATURES': ['repetitions', 'length', 'question'],
                 'MODEL': 'FFNN',
                #  'CLASS_WEIGHTS': {'Loaded_Language': 1,
                #                    'Name_Calling,Labeling': 1,
                #                    'Repetition': 2,
                #                    'Doubt': 2,
                #                    'Exaggeration,Minimisation': 2,
                #                    'Appeal_to_fear-prejudice': 2,
                #                    'Flag-Waving': 1,
                #                    'Causal_Oversimplification': 1,
                #                    'Appeal_to_Authority': 1,
                #                    'Slogans': 1,
                #                    'Black-and-White_Fallacy': 1,
                #                    'Whataboutism,Straw_Men,Red_Herring': 1,
                #                    'Thought-terminating_Cliches': 1,
                #                    'Bandwagon,Reductio_ad_hitlerum': 1}
                 'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-base-uncased_10.tsv',
                 'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-base-uncased_10.tsv',
                 'EMBED_DIM': 768 * 11
                #  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/tc_train_bert-large-uncased.tsv',
                #  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/tc_dev_bert-large-uncased.tsv',
                #  'EMBED_DIM': 1024,
                #  'TRAIN_BERT': '/content/gdrive/My Drive/colab_projects/data/full_bert_train.tsv',
                #  'DEV_BERT': '/content/gdrive/My Drive/colab_projects/data/full_bert_dev.tsv',
                #  'EMBED_DIM': 14 + 768,
                #  'UNCASED': False
                 })
# `now` is the timestamp used as file stem for this run's output files; the
# next cell uses it to find the label files.
data, now = run_config(config, file_prefix, data)

### For predictions on the final test set (task 1):
# config = Config({'USE_BERT': True,
#                  'TRAIN_URL': 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-train%2Bdev.tsv?token=AD7GEDJ7GSTS3RSP5ZSXLZ26LP4BS',
#                  'DEV_URL': 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-test.tsv?token=AD7GEDM7A3GFIAEZHHESFO26LP4BQ',
#                  'TRAIN_BERT': 'gdrive/My Drive/colab_projects/data/train+dev_bert-base-uncased.tsv',
#                  'DEV_BERT': 'gdrive/My Drive/colab_projects/data/test_bert-base-uncased.tsv'
#                  })
# data, now = run_config(config, file_prefix, data)


In [0]:
# now = '20200307-151259'

# Print the label distribution (absolute counts and proportions) of each of
# the five runs' label files and of the majority-vote file.
for sfx in ['0', '1', '2', '3', '4', 'majority']:
    f = file_prefix + 'labels_' + now + '_' + sfx + '.txt'
    # The label files have no header; the second column holds the label.
    df = pd.read_csv(f, sep='\t', usecols=[1], names=['label'])
    df = df['label'].value_counts().rename_axis('labels').reset_index(name='counts')
    df['%'] = df['counts'] / df['counts'].sum()  # A fraction in [0, 1], not a percentage.
    print('labels_' + now + '_' + sfx)
    print(df)
    print('\n')