In [1]:
import gc
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from os import path
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from string import punctuation
from tensorflow.keras.layers import (BatchNormalization, Concatenate, Conv1D, Dense, Dropout, Embedding, 
                                     GlobalMaxPooling1D, Input, TimeDistributed)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook
from unidecode import unidecode

np.random.seed(42)
tf.compat.v1.random.set_random_seed(42)

# Data Ingestion

## Dataset constants

In [2]:
DIR_PATH = "../data/"
LANGUAGE = "spanish"
DROP_COLUMNS = ["split", "language"]
UNRELIABLE_SAMPLING = 0.1

## Dataset Loading

In [3]:
%%time
def load_data(base_path, language, drop_columns, unreliable_sampling):
    datasets = {}
    for ds in tqdm_notebook(["train_reliable", "train_unreliable", "dev", "test"]):
        if ds == "train_unreliable" and unreliable_sampling == 0:
            continue
        
        df = pd.read_parquet(
            path.join(base_path, f"{language}", f"{ds}.parquet")
        ).drop(drop_columns, axis=1, errors="ignore")
        
        if ds == "train_unreliable" and 0 < unreliable_sampling < 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(frac=unreliable_sampling)
            ).reset_index(drop=True)
        elif ds == "train_unreliable" and unreliable_sampling > 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(n=int(unreliable_sampling))
            ).reset_index(drop=True)
        
        if ds == "train_reliable":
            datasets["train"] = df
        elif ds == "train_unreliable":
            datasets["train"] = pd.concat([
                datasets["train"],
                df
            ], ignore_index=True)
        else:
            datasets[ds] = df
    
    w2v = KeyedVectors.load_word2vec_format(
        path.join(base_path, f"{language}", "word2vec.bin.gz"), 
        binary=True
    )
    
    return datasets, w2v

datasets, w2v = load_data(DIR_PATH, LANGUAGE, DROP_COLUMNS, UNRELIABLE_SAMPLING)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 1min 19s, sys: 20.6 s, total: 1min 40s
Wall time: 1min 20s


# Data Preprocessing

## Label Encoding

In [4]:
%%time
def label_encoder(*dfs):
    labels = pd.concat(dfs)["category"].tolist()
    lbl_enc = LabelEncoder().fit(labels)

    return lbl_enc

lbl_enc = label_encoder(datasets["train"], datasets["dev"])

for split in ["train", "dev"]:
    datasets[split]["target"] = lbl_enc.transform(datasets[split]["category"])
    datasets[split].drop(["category"], axis=1, inplace=True)

CPU times: user 8.86 s, sys: 1.27 s, total: 10.1 s
Wall time: 2.97 s


## Text curation

### Punctuation removal

In [84]:
%%time

def remove_punctuation(datasets, punctuation, tcolumn="tokens", pcolumn="pos"):
    for split in tqdm_notebook(datasets):
        token_pos = pd.Series(list(zip(datasets[split][tcolumn], datasets[split][pcolumn])))
        datasets[split][["non_punct_tokens", "non_punct_pos"]] = pd.DataFrame(
            token_pos.apply(
                lambda tw: list(zip(*[(tw[0][i], tw[1][i]) for i in range(len(tw[0])) if tw[0][i] not in punctuation]))
            ).tolist()
        )
        datasets[split] = datasets[split].dropna()
    return datasets

datasets = remove_punctuation(datasets, punctuation, "words", "pos")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

CPU times: user 18.2 s, sys: 709 ms, total: 18.9 s
Wall time: 14.1 s


### Stopwords removal

In [86]:
%%time

def remove_stopwords(datasets, stopwords, tcolumn="tokens", pcolumn="pos"):
    for split in tqdm_notebook(datasets):
        token_pos = pd.Series(list(zip(datasets[split][tcolumn], datasets[split][pcolumn])))
        datasets[split][["non_sw_tokens", "non_sw_pos"]] = pd.DataFrame(
            token_pos.apply(
                lambda tw: list(zip(*[(tw[0][i], tw[1][i]) 
                                      for i in range(len(tw[0])) if tw[0][i]]))
            ).tolist()
        )
        datasets[split] = datasets[split].dropna()
    return datasets

datasets = remove_stopwords(datasets, set(stopwords.words(LANGUAGE)), "non_punct_tokens", "non_punct_pos")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

CPU times: user 20.3 s, sys: 1.14 s, total: 21.5 s
Wall time: 13.8 s


### Word Vectorization

In [88]:
%%time

def word_with_vector(word, w2v, stemmer):
    if word in w2v:
        return word
    elif word.capitalize() in w2v:
        return word.capitalize()
    elif unidecode(word) in w2v:
        return unidecode(word)
    elif unidecode(word.capitalize()) in w2v:
        return unidecode(word.capitalize())
    elif stemmer.stem(word) in w2v:
        return stemmer.stem(word)
    elif word.isdigit():
        return "DIGITO"
    else:
        return "<UNK>"
    # TODO: Lemmatization? Other normalizations?

def word_vectorize(datasets, language, w2v, column="tokens"):
    stemmer = SnowballStemmer(language)
    for split in tqdm_notebook(datasets):
        datasets[split]["normalized_tokens"] = datasets[split][column].apply(
            lambda words: [word_with_vector(w, w2v, stemmer) for w in words]
        )
    return datasets

datasets = word_vectorize(datasets, LANGUAGE, w2v, "non_sw_tokens")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

CPU times: user 1min 26s, sys: 674 ms, total: 1min 26s
Wall time: 1min 26s


In [89]:
%%time
def words_to_idx(all_words, w2v, null_token="<NULL>",
                 unknown_token="<UNK>", num_token="DIGITO"):
    word_index = {word for words in all_words for word in words if word in w2v}
    word_index = {word: idx for idx, word in enumerate(sorted(word_index), start=1)}
    word_index[null_token] = 0
    if num_token not in word_index:
        word_index[num_token] = len(word_index)
    word_index[unknown_token] = len(word_index)

    return word_index

word_index = words_to_idx(pd.concat(list(datasets.values()), sort=False)["normalized_tokens"], w2v)

print(f"Vocab length: {len(word_index)}")

Vocab length: 100469
CPU times: user 11 s, sys: 598 ms, total: 11.6 s
Wall time: 7.17 s


## Characters Preprocessing

In [90]:
%%time
def chars_to_idx(titles, null_token="<NULL>", unknown_token="<UNK>"):
    char_index = {char for title in titles for char in title}
    char_index = {char: idx for idx, char in enumerate(sorted(char_index), start=1)}
    char_index[null_token] = 0
    char_index[unknown_token] = len(char_index)

    return char_index

char_index = chars_to_idx(
    pd.concat(
        list(datasets.values()), 
        ignore_index=True, 
        sort=False
    )["non_sw_tokens"].apply(lambda tokens: " ".join(tokens))
)

print(f"Char vocab length: {len(char_index)}")

Char vocab length: 135
CPU times: user 6.6 s, sys: 188 ms, total: 6.79 s
Wall time: 4.64 s


## POS Preprocessing

In [93]:
%%time
def pos_tags_to_idx(all_pos_tags, w2v, null_pos_tag="<NULL>", unknown_pos_tag="<UNK>"):
    pos_tag_index = {pos_tag for pos_tags in all_pos_tags for pos_tag in pos_tags}
    pos_tag_index = {pos_tag: idx for idx, pos_tag in enumerate(sorted(pos_tag_index), start=1)}
    pos_tag_index[null_pos_tag] = 0
    pos_tag_index[unknown_pos_tag] = len(pos_tag_index)

    return pos_tag_index

pos_tag_index = pos_tags_to_idx(pd.concat(list(datasets.values()), sort=False)["non_sw_pos"], w2v)

print(f"Pos TAG length: {len(pos_tag_index)}")

Pos TAG length: 19
CPU times: user 6.21 s, sys: 373 ms, total: 6.58 s
Wall time: 2.15 s


# Network Data Preparation

## Word and PoS Sequences

In [96]:
%%time

WORD_MAX_SEQUENCE_LEN = 15

def word_sequence_padding(series, word_index, max_len):
    return pad_sequences(
            series.apply(
                lambda words: [word_index.get(word, word_index["<UNK>"]) for word in words]
            ).tolist(), maxlen=max_len
        )

train_word_sequences = word_sequence_padding(
    datasets["train"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

dev_word_sequences = word_sequence_padding(
    datasets["dev"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

test_word_sequences = word_sequence_padding(
    datasets["test"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 10.8 s, sys: 228 ms, total: 11 s
Wall time: 11 s


In [97]:
%%time
def get_embedding_matrix(word_index, w2v):
    embedding_matrix = np.zeros((len(word_index), w2v.vector_size))

    for word, i in word_index.items():
        if word in w2v and word not in {"<NULL>", "<UNK>", "<NUM>"}:
            embedding_matrix[i] = w2v[word]
        elif word == "<UNK>" or word == "<NUM>":
            embedding_matrix[i] = np.random.normal(size=(w2v.vector_size,))

    return embedding_matrix

word_embedding_matrix = get_embedding_matrix(word_index, w2v)

CPU times: user 328 ms, sys: 88 ms, total: 416 ms
Wall time: 437 ms


In [98]:
%%time

def pos_tag_sequence_padding(series, pos_tag_index, max_len):
    return pad_sequences(
            series.apply(
                lambda pos_tags: [pos_tag_index.get(pos_tag, pos_tag_index["<UNK>"]) for pos_tag in pos_tags]
            ).tolist(), maxlen=max_len
        )

train_pos_tag_sequences = pos_tag_sequence_padding(
    datasets["train"]["non_sw_pos"], pos_tag_index, WORD_MAX_SEQUENCE_LEN
)

dev_pos_tag_sequences = pos_tag_sequence_padding(
    datasets["dev"]["non_sw_pos"], pos_tag_index, WORD_MAX_SEQUENCE_LEN
)

test_pos_tag_sequences = pos_tag_sequence_padding(
    datasets["test"]["non_sw_pos"], pos_tag_index, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 11.5 s, sys: 356 ms, total: 11.9 s
Wall time: 11.9 s


## Character Sequences

In [102]:
%%time

CHAR_MAX_SEQUENCE_LEN = 10

def char_sequence_padding(series, char_index, char_max_len, word_max_len):
    return pad_sequences(
        series.apply(
            lambda words: pad_sequences([
                [char_index.get(char, char_index["<UNK>"]) for char in word]
            for word in words], maxlen=char_max_len)
    ), maxlen=word_max_len, value=np.zeros(char_max_len))

train_char_sequences = char_sequence_padding(
    datasets["train"]["non_sw_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

dev_char_sequences = char_sequence_padding(
    datasets["dev"]["non_sw_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

test_char_sequences = char_sequence_padding(
    datasets["test"]["non_sw_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 1min 31s, sys: 762 ms, total: 1min 32s
Wall time: 1min 32s


## Targets

In [103]:
%%time

train_target = to_categorical(
    datasets["train"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

dev_target = to_categorical(
    datasets["dev"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

CPU times: user 273 ms, sys: 2.15 s, total: 2.42 s
Wall time: 2.42 s


# CNN Building

## Model Constants

In [104]:
WORD_FILTERS_LEN = [2, 3, 4, 5]
WORD_FILTER_COUNT = 128

CHAR_FILTERS_LEN = [2, 3, 4]
CHAR_FILTER_COUNT = 64
CHAR_VECTOR_SIZE = 32

POS_VECTOR_SIZE = 8

ACTIVATION = "relu"
PADDING = "same"

## Model Building

In [106]:
def build_model(word_vocab_size, word_vector_size, word_embedding_matrix,
                char_vocab_size, char_vector_size, 
                pos_vocab_size, pos_vector_size, output_size,
                word_max_sequence_len, char_max_sequence_len,
                word_filters_len, word_filter_count, 
                char_filters_len, char_filter_count,
                activation="relu", padding="same"):

    char_sequence_input = Input(shape=(word_max_sequence_len, char_max_sequence_len))
    word_sequence_input = Input(shape=(word_max_sequence_len,))
    pos_tag_sequence_input = Input(shape=(word_max_sequence_len,))
    
    char_embedded_sequences = TimeDistributed(
        Embedding(
            input_dim=char_vocab_size, 
            output_dim=char_vector_size,
            embeddings_initializer="truncated_normal",  # TODO: Change this?
            trainable=True
        ))(char_sequence_input)
#     char_embedded_sequences = Dropout(
#         rate=0.5,
#         noise_shape=(None, word_max_sequence_len, 1, char_vector_size),
#         seed=42
#     )(char_embedded_sequences)

    word_embedding_layer = Embedding(word_vocab_size, word_vector_size,
                                     weights=[word_embedding_matrix],
                                     input_length=word_max_sequence_len,
                                     trainable=False)
    word_embedded_sequences = word_embedding_layer(word_sequence_input)
    word_embedded_sequences = Dropout(
        rate=0.5,
        noise_shape=(None, 1, word_vector_size),
        seed=42
    )(word_embedded_sequences)
    
    pos_tag_embedded_sequences = Embedding(pos_vocab_size, pos_vector_size,
                                           embeddings_initializer="truncated_normal",
                                           trainable=True)(pos_tag_sequence_input)
    pos_tag_embedded_sequences = Dropout(
        rate=0.5,
        noise_shape=(None, 1, pos_vector_size),
        seed=42
    )(pos_tag_embedded_sequences)
    
    char_layers = []
    for filter_len in char_filters_len:
        char_layer = TimeDistributed(
            Conv1D(
                char_filter_count,
                filter_len,
                padding=padding
            )
        )(char_embedded_sequences)
        char_layer = TimeDistributed(
            Conv1D(
                char_filter_count, 
                filter_len, 
                padding=padding
            )
        )(char_layer)
        char_layer = BatchNormalization(momentum=0.0)(char_layer)
        char_layers.append(TimeDistributed(GlobalMaxPooling1D())(char_layer))
    
    word_layer = Concatenate()([word_embedded_sequences, pos_tag_embedded_sequences] + char_layers)

    layers = []
    for filter_len in word_filters_len:
        layer = Conv1D(
            word_filter_count,
            filter_len,
            activation=activation,
            padding=padding
        )(word_layer)
        layer = BatchNormalization(momentum=0.0)(layer)
        layers.append(GlobalMaxPooling1D()(layer))

    layer = Concatenate()(layers)
    preds = Dense(output_size, activation="softmax")(layer)
    model = Model(inputs=[word_sequence_input, pos_tag_sequence_input, char_sequence_input], outputs=[preds])

    return model

model = build_model(
    word_vocab_size=len(word_index),
    word_vector_size=w2v.vector_size, 
    word_embedding_matrix=word_embedding_matrix,
    char_vocab_size=len(char_index),
    char_vector_size=CHAR_VECTOR_SIZE,
    pos_vocab_size=len(pos_tag_index),
    pos_vector_size=POS_VECTOR_SIZE,
    output_size=lbl_enc.classes_.shape[0],
    word_max_sequence_len=WORD_MAX_SEQUENCE_LEN,
    char_max_sequence_len=CHAR_MAX_SEQUENCE_LEN,
    word_filters_len=WORD_FILTERS_LEN,
    word_filter_count=WORD_FILTER_COUNT,
    char_filters_len=CHAR_FILTERS_LEN,
    char_filter_count=CHAR_FILTER_COUNT,
    activation=ACTIVATION,
    padding=PADDING
)

model.compile(
    optimizer="nadam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 15, 10)]     0                                            
__________________________________________________________________________________________________
time_distributed_10 (TimeDistri (None, 15, 10, 32)   4320        input_4[0][0]                    
__________________________________________________________________________________________________
time_distributed_11 (TimeDistri (None, 15, 10, 64)   4160        time_distributed_10[0][0]        
__________________________________________________________________________________________________
time_distributed_14 (TimeDistri (None, 15, 10, 64)   6208        time_distributed_10[0][0]        
____________________________________________________________________________________________

# Fitting the CNN

In [55]:
gc.collect()

2211

In [107]:
model.fit(
    x=(train_word_sequences, train_pos_tag_sequences, train_char_sequences),
    y=train_target,
    batch_size=4096,
    epochs=10,
    validation_data=(
        (dev_word_sequences, dev_pos_tag_sequences, dev_char_sequences),
        dev_target
    ),
    validation_freq=1
)

Train on 1369984 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [16]:
model.fit(
    x=(train_word_sequences, train_char_sequences),
    y=train_target,
    batch_size=4096,
    epochs=10,
    validation_data=(
        (dev_word_sequences, dev_char_sequences),
        dev_target
    ),
    validation_freq=1
)

Train on 1369986 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f40823f4588>

In [16]:
model.fit(
    x=(train_word_sequences, train_char_sequences),
    y=train_target,
    batch_size=4096,
    epochs=10,
    validation_data=(
        (dev_word_sequences, dev_char_sequences),
        dev_target
    ),
    validation_freq=1
)

Train on 4983500 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdd30be7dd8>

In [17]:
datasets["dev"]["predictions"] = model.predict(
    (dev_word_sequences, dev_char_sequences), batch_size=1024, verbose=0
).argmax(axis=1)

In [None]:
balanced_accuracy_score(datasets["dev"]["target"], datasets["dev"]["predictions"])

In [None]:
balanced_accuracy_score(datasets["dev"]["target"], datasets["dev"]["predictions"])

In [16]:
model.fit(
    x=(train_word_sequences, train_char_sequences),
    y=train_target,
    batch_size=4096,
    epochs=10,
    validation_data=(
        (dev_word_sequences, dev_char_sequences),
        dev_target
    ),
    validation_freq=1
)

Train on 4983500 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdd30be7dd8>