In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from os import path
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from string import punctuation
from tensorflow.keras.layers import Concatenate, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook
from unidecode import unidecode

np.random.seed(42)
tf.compat.v1.random.set_random_seed(42)

# Data Ingestion

## Dataset constants

In [2]:
DIR_PATH = "../data/"
LANGUAGE = "spanish"
DROP_COLUMNS = ["split", "language"]
UNRELIABLE_SAMPLING = 0.25

## Dataset Loading

In [3]:
%%time
def load_data(base_path, language, drop_columns, unreliable_sampling):
    """Load the train/dev/test splits and the word2vec model for one language.

    Each split is read from ``<base_path>/<language>/<split>.parquet`` and the
    columns listed in ``drop_columns`` are removed (missing ones are ignored).
    The reliable and (optionally sub-sampled) unreliable training splits are
    concatenated under the ``"train"`` key.

    Parameters
    ----------
    base_path : str
        Directory that contains one sub-directory per language.
    language : str
        Name of the language sub-directory.
    drop_columns : list of str
        Columns to drop from every split after loading.
    unreliable_sampling : float
        Controls how much of ``train_unreliable`` is kept, per ``category``:
        ``0`` skips the split entirely; a value strictly between 0 and 1 keeps
        that fraction of every category; a value greater than 1 keeps at most
        ``int(unreliable_sampling)`` rows of every category; any other value
        (e.g. exactly ``1``) keeps the whole split.

    Returns
    -------
    tuple
        ``(datasets, w2v)`` where ``datasets`` maps ``"train"``, ``"dev"`` and
        ``"test"`` to DataFrames and ``w2v`` is the ``KeyedVectors`` loaded
        from ``word2vec.bin.gz`` in the same language directory.
    """
    datasets = {}
    for ds in tqdm_notebook(["train_reliable", "train_unreliable", "dev", "test"]):
        if ds == "train_unreliable" and unreliable_sampling == 0:
            continue

        df = pd.read_parquet(
            path.join(base_path, f"{language}", f"{ds}.parquet")
        ).drop(drop_columns, axis=1, errors="ignore")

        # No random_state is passed to ``sample``, so the draw comes from
        # NumPy's global RNG state.
        if ds == "train_unreliable" and 0 < unreliable_sampling < 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(frac=unreliable_sampling)
            ).reset_index(drop=True)
        elif ds == "train_unreliable" and unreliable_sampling > 1:
            per_category = int(unreliable_sampling)
            # Cap at the group size: sampling without replacement raises a
            # ValueError when a category has fewer rows than requested.
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(n=min(per_category, len(cat)))
            ).reset_index(drop=True)

        if ds == "train_reliable":
            datasets["train"] = df
        elif ds == "train_unreliable":
            datasets["train"] = pd.concat([
                datasets["train"],
                df
            ], ignore_index=True)
        else:
            datasets[ds] = df

    w2v = KeyedVectors.load_word2vec_format(
        path.join(base_path, f"{language}", "word2vec.bin.gz"),
        binary=True
    )

    return datasets, w2v

datasets, w2v = load_data(DIR_PATH, LANGUAGE, DROP_COLUMNS, UNRELIABLE_SAMPLING)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 1min 20s, sys: 17.4 s, total: 1min 37s
Wall time: 1min 17s


# Data Preprocessing

## Label Encoding

In [4]:
%%time
def label_encoder(*dfs):
    """Fit a ``LabelEncoder`` on the ``category`` column of all given frames.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames that each contain a ``category`` column; their labels are
        pooled before fitting.

    Returns
    -------
    LabelEncoder
        The fitted encoder.
    """
    pooled_categories = pd.concat(dfs)["category"]
    encoder = LabelEncoder()
    encoder.fit(pooled_categories.tolist())
    return encoder

lbl_enc = label_encoder(datasets["train"], datasets["dev"])

for split in ["train", "dev"]:
    datasets[split]["target"] = lbl_enc.transform(datasets[split]["category"])
    datasets[split].drop(["category"], axis=1, inplace=True)

CPU times: user 10.5 s, sys: 1.38 s, total: 11.9 s
Wall time: 4.97 s


## Text curation

### Punctuation removal

In [5]:
%%time

def remove_punctuation(datasets, punctuation, column="tokens"):
    """Add a ``non_punct_tokens`` column with punctuation tokens filtered out.

    A token is dropped when ``token in punctuation`` is true. Note that when
    ``punctuation`` is a ``str`` this is a substring test, so the empty string
    and multi-character runs that occur verbatim in it are dropped as well.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    punctuation : container
        Collection the tokens are tested against.
    column : str
        Name of the column holding the token lists to filter.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    def keep(token):
        return token not in punctuation

    for split_name in tqdm_notebook(datasets):
        frame = datasets[split_name]
        frame["non_punct_tokens"] = frame[column].apply(
            lambda tokens: list(filter(keep, tokens))
        )
    return datasets

datasets = remove_punctuation(datasets, punctuation, "words")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 11.9 s, sys: 720 ms, total: 12.6 s
Wall time: 11.2 s


### Stopwords removal

In [6]:
%%time

def remove_stopwords(datasets, stopwords, column="tokens"):
    """Add a ``non_sw_tokens`` column with stopword tokens filtered out.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    stopwords : container
        Collection of tokens to discard (tested with ``in``).
    column : str
        Name of the column holding the token lists to filter.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    def drop_stopwords(tokens):
        kept = []
        for token in tokens:
            if token not in stopwords:
                kept.append(token)
        return kept

    for split_name in tqdm_notebook(datasets):
        frame = datasets[split_name]
        frame["non_sw_tokens"] = frame[column].apply(drop_stopwords)
    return datasets

datasets = remove_stopwords(datasets, set(stopwords.words(LANGUAGE)), "non_punct_tokens")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 5.81 s, sys: 156 ms, total: 5.97 s
Wall time: 5.96 s


### Word Vectorization

In [17]:
%%time

def word_with_vector(word, w2v, stemmer):
    """Map ``word`` to the first of its variants that is present in ``w2v``.

    Variants are tried in this order: the word itself, its capitalized form,
    its ``unidecode`` transliteration, the transliteration of the capitalized
    form, and finally its stem. If none is found, ``"DIGITO"`` is returned for
    all-digit words and ``"<UNK>"`` otherwise.

    Parameters
    ----------
    word : str
        Token to normalise.
    w2v : container
        Anything supporting ``in`` for vocabulary lookup.
    stemmer : object
        Object exposing ``stem(word)``.

    Returns
    -------
    str
        The matching variant, ``"DIGITO"`` or ``"<UNK>"``.
    """
    # Thunks keep evaluation lazy, so later (costlier) variants are only
    # computed when the earlier ones miss.
    variant_builders = (
        lambda: word,
        lambda: word.capitalize(),
        lambda: unidecode(word),
        lambda: unidecode(word.capitalize()),
        lambda: stemmer.stem(word),
    )
    for build_variant in variant_builders:
        variant = build_variant()
        if variant in w2v:
            return variant

    return "DIGITO" if word.isdigit() else "<UNK>"
    # TODO: Lemmatization? Other normalizations?

def word_vectorize(datasets, language, w2v, column="tokens"):
    """Add a ``normalized_tokens`` column built with ``word_with_vector``.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    language : str
        Language name handed to ``SnowballStemmer``.
    w2v : container
        Word-vector vocabulary used for the lookups.
    column : str
        Name of the column holding the token lists to normalise.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    stemmer = SnowballStemmer(language)

    def normalize(tokens):
        return [word_with_vector(token, w2v, stemmer) for token in tokens]

    for split_name in tqdm_notebook(datasets):
        frame = datasets[split_name]
        frame["normalized_tokens"] = frame[column].apply(normalize)
    return datasets

datasets = word_vectorize(datasets, LANGUAGE, w2v, "non_sw_tokens")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 2min 25s, sys: 571 ms, total: 2min 26s
Wall time: 2min 26s


In [18]:
%%time
def words_to_idx(all_words, w2v, null_token="<NULL>",
                 unknown_token="<UNK>", num_token="DIGITO"):
    """Build the word -> integer index used to encode token sequences.

    Words found in ``w2v`` are numbered from 1 in sorted order. Index 0 is
    assigned to ``null_token``; ``num_token`` (if not already indexed) and
    then ``unknown_token`` are appended after the existing entries.

    Parameters
    ----------
    all_words : iterable of iterable of str
        Token lists to collect the vocabulary from.
    w2v : container
        Anything supporting ``in``; only words it contains are indexed.
    null_token, unknown_token, num_token : str
        Special tokens for padding, out-of-vocabulary words and numbers.

    Returns
    -------
    dict
        Mapping from token to integer index.
    """
    known_words = set()
    for tokens in all_words:
        known_words.update(token for token in tokens if token in w2v)

    vocab = {}
    for position, token in enumerate(sorted(known_words), start=1):
        vocab[token] = position

    vocab[null_token] = 0
    if num_token not in vocab:
        vocab[num_token] = len(vocab)
    vocab[unknown_token] = len(vocab)

    return vocab

word_index = words_to_idx(pd.concat(list(datasets.values()), sort=False)["normalized_tokens"], w2v)

print(f"Vocab length: {len(word_index)}")

Vocab length: 118201
CPU times: user 10.7 s, sys: 852 ms, total: 11.6 s
Wall time: 8.29 s


## Characters Preprocessing

In [23]:
%%time
def normalize_titles(datasets, column="tokens"):
    """Add a ``normalized_title`` column: the tokens joined by single spaces.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    column : str
        Name of the column holding the token lists to join.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    for frame in datasets.values():
        frame["normalized_title"] = frame[column].map(" ".join)
    return datasets

datasets = normalize_titles(datasets, "non_sw_tokens")

CPU times: user 1.66 s, sys: 170 µs, total: 1.66 s
Wall time: 1.66 s


In [42]:
%%time
def chars_to_idx(titles, null_token="<NULL>", unknown_token="<UNK>"):
    """Build the character -> integer index used to encode words.

    Every distinct character seen in ``titles`` is numbered from 1 in sorted
    order. Index 0 is assigned to ``null_token`` and ``unknown_token`` is
    appended after the existing entries.

    Parameters
    ----------
    titles : iterable of str
        Strings to collect the character vocabulary from.
    null_token, unknown_token : str
        Special tokens for padding and unseen characters.

    Returns
    -------
    dict
        Mapping from character (or special token) to integer index.
    """
    alphabet = set()
    for title in titles:
        alphabet.update(title)

    lookup = {}
    for position, symbol in enumerate(sorted(alphabet), start=1):
        lookup[symbol] = position

    lookup[null_token] = 0
    lookup[unknown_token] = len(lookup)

    return lookup

char_index = chars_to_idx(
    pd.concat(
        list(datasets.values()), 
        ignore_index=True, 
        sort=False
    )["normalized_tokens"].apply(lambda tokens: " ".join(tokens))
)

print(f"Char vocab length: {len(char_index)}")

Char vocab length: 135
CPU times: user 7.15 s, sys: 202 ms, total: 7.35 s
Wall time: 5.27 s


# Network Data Preparation

## Word Sequences

In [48]:
%%time

WORD_MAX_SEQUENCE_LEN = 15

def word_sequence_padding(series, word_index, max_len):
    """Encode token lists as index sequences of fixed length ``max_len``.

    Tokens missing from ``word_index`` are mapped to the ``"<UNK>"`` index.
    ``pad_sequences`` is called with only ``maxlen``, so its default padding
    and truncation settings apply.

    Parameters
    ----------
    series : pandas.Series
        Series whose elements are lists of tokens.
    word_index : dict
        Token -> index mapping; must contain ``"<UNK>"``.
    max_len : int
        Length every encoded sequence is padded/truncated to.

    Returns
    -------
    numpy.ndarray
        Result of ``pad_sequences`` over the encoded sequences.
    """
    def encode(tokens):
        return [word_index.get(token, word_index["<UNK>"]) for token in tokens]

    encoded_sequences = [encode(tokens) for tokens in series]
    return pad_sequences(encoded_sequences, maxlen=max_len)

train_word_sequences = word_sequence_padding(
    datasets["train"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

dev_word_sequences = word_sequence_padding(
    datasets["dev"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

test_word_sequences = word_sequence_padding(
    datasets["test"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 19.7 s, sys: 159 ms, total: 19.8 s
Wall time: 19.8 s


In [49]:
%%time
def get_embedding_matrix(word_index, w2v, null_token="<NULL>", unknown_token="<UNK>"):
    """Build the initial weight matrix for the word embedding layer.

    Row ``i`` holds the vector of the word whose index is ``i``:

    * ``null_token`` (padding) keeps an all-zero row;
    * ``unknown_token`` always gets a random normal row;
    * any other word gets its pretrained vector when ``w2v`` has one, and a
      random normal row otherwise. The latter covers the numeric placeholder
      (``"DIGITO"``) when the pretrained vocabulary lacks it, so it does not
      silently share the all-zero padding row.

    Parameters
    ----------
    word_index : dict
        Token -> row index mapping; indices must lie in
        ``[0, len(word_index))``.
    w2v : object
        Word vectors supporting ``in``, ``[]`` and a ``vector_size`` attribute.
    null_token : str
        Padding token whose row stays zero.
    unknown_token : str
        Out-of-vocabulary token, always randomly initialised.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index), w2v.vector_size)``.
    """
    embedding_matrix = np.zeros((len(word_index), w2v.vector_size))

    for word, i in word_index.items():
        if word == null_token:
            # Padding row stays all-zero.
            continue
        if word != unknown_token and word in w2v:
            embedding_matrix[i] = w2v[word]
        else:
            # Random draws use NumPy's global RNG, in dict iteration order.
            embedding_matrix[i] = np.random.normal(size=(w2v.vector_size,))

    return embedding_matrix

word_embedding_matrix = get_embedding_matrix(word_index, w2v)

CPU times: user 375 ms, sys: 60.1 ms, total: 435 ms
Wall time: 433 ms


## Character Sequences

In [93]:
%%time

CHAR_MAX_SEQUENCE_LEN = 10

def char_sequence_padding(series, char_index, char_max_len, word_max_len):
    """Encode token lists as padded sequences of padded character indices.

    Each word becomes a list of character indices (characters missing from
    ``char_index`` map to the ``"<UNK>"`` index) and is padded/truncated to
    ``char_max_len`` by the inner ``pad_sequences`` call. The outer call then
    pads/truncates every title to ``word_max_len`` words, filling missing
    words with an all-zero vector of length ``char_max_len``.

    Parameters
    ----------
    series : pandas.Series
        Series whose elements are lists of word strings.
    char_index : dict
        Character -> index mapping; must contain ``"<UNK>"``.
    char_max_len : int
        Number of character positions kept per word.
    word_max_len : int
        Number of word positions kept per title.

    Returns
    -------
    numpy.ndarray
        Presumably of shape ``(len(series), word_max_len, char_max_len)``,
        matching the model's character input -- TODO confirm.
    """
    return pad_sequences(
        series.apply(
            # Inner padding: one fixed-width row of character ids per word.
            lambda words: pad_sequences([
                [char_index.get(char, char_index["<UNK>"]) for char in word]
            for word in words], maxlen=char_max_len)
    ), maxlen=word_max_len, value=np.zeros(char_max_len))

train_char_sequences = char_sequence_padding(
    datasets["train"]["normalized_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

dev_char_sequences = char_sequence_padding(
    datasets["dev"]["normalized_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

test_char_sequences = char_sequence_padding(
    datasets["test"]["normalized_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 2min 50s, sys: 2.87 s, total: 2min 53s
Wall time: 2min 52s


## Targets

In [58]:
%%time

train_target = to_categorical(
    datasets["train"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

dev_target = to_categorical(
    datasets["dev"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

CPU times: user 616 ms, sys: 3.93 s, total: 4.54 s
Wall time: 4.54 s


# CNN Building

## Model Constants

In [116]:
WORD_FILTERS_LEN = [2, 3, 4, 5]
WORD_FILTER_COUNT = 128

CHAR_FILTERS_LEN = [3, 4]
CHAR_FILTER_COUNT = 32
CHAR_VECTOR_SIZE = 32

ACTIVATION = "relu"
PADDING = "same"

## Model Building

In [117]:
def build_model(word_vocab_size, word_vector_size, word_embedding_matrix,
                char_vocab_size, char_vector_size, output_size,
                word_max_sequence_len, char_max_sequence_len,
                word_filters_len, word_filter_count, 
                char_filters_len, char_filter_count,
                activation="relu", padding="same"):
    """Build the (uncompiled) two-input word + character CNN classifier.

    Character branch: each word's characters are embedded, convolved once per
    entry of ``char_filters_len`` and max-pooled, all wrapped in
    ``TimeDistributed`` so they are applied per word position. The pooled
    character features are concatenated with the frozen pretrained word
    embeddings, and that combined sequence feeds one ``Conv1D`` +
    ``GlobalMaxPooling1D`` per entry of ``word_filters_len``. The pooled
    outputs are concatenated and passed to a softmax ``Dense`` layer.

    Parameters
    ----------
    word_vocab_size : int
        Number of rows of the word embedding layer.
    word_vector_size : int
        Dimensionality of the word embeddings.
    word_embedding_matrix : numpy.ndarray
        Initial (frozen) weights of the word embedding layer.
    char_vocab_size : int
        Number of rows of the character embedding layer.
    char_vector_size : int
        Dimensionality of the trainable character embeddings.
    output_size : int
        Number of units of the softmax output layer.
    word_max_sequence_len : int
        Number of word positions per sample.
    char_max_sequence_len : int
        Number of character positions per word.
    word_filters_len : iterable of int
        Kernel sizes of the word-level convolutions.
    word_filter_count : int
        Number of filters of each word-level convolution.
    char_filters_len : iterable of int
        Kernel sizes of the character-level convolutions.
    char_filter_count : int
        Number of filters of each character-level convolution.
    activation : str
        Activation used by every convolution.
    padding : str
        Padding mode used by every convolution.

    Returns
    -------
    Model
        Keras model whose inputs are, in order, the word sequences and the
        character sequences.
    """

    # Inputs: characters are (words, chars-per-word); words are (words,).
    char_sequence_input = Input(shape=(word_max_sequence_len, char_max_sequence_len))
    word_sequence_input = Input(shape=(word_max_sequence_len,))
    
    # Trainable character embeddings, applied independently to every word.
    char_embedded_sequences = TimeDistributed(
        Embedding(
            input_dim=char_vocab_size, 
            output_dim=char_vector_size,
            embeddings_initializer="truncated_normal",  # TODO: Change this?
            trainable=True
        ))(char_sequence_input)

    # Word embeddings start from the supplied matrix and are not trained.
    word_embedding_layer = Embedding(word_vocab_size, word_vector_size,
                                     weights=[word_embedding_matrix],
                                     input_length=word_max_sequence_len,
                                     trainable=False)
    word_embedded_sequences = word_embedding_layer(word_sequence_input)

    # One per-word character convolution + max-pool for each kernel size.
    char_layers = []
    for filter_len in char_filters_len:
        char_layer = TimeDistributed(
            Conv1D(
                char_filter_count,
                filter_len,
                activation=activation, # TODO: No activation?
                padding=padding
            )
        )(char_embedded_sequences)
        char_layers.append(TimeDistributed(GlobalMaxPooling1D())(char_layer))
    
    # Each word position now carries its word vector plus its char features.
    word_layer = Concatenate()([word_embedded_sequences] + char_layers)
    
    # One word-level convolution + max-pool for each kernel size.
    layers = []
    for filter_len in word_filters_len:
        layer = Conv1D(
            word_filter_count,
            filter_len,
            activation=activation,
            padding=padding
        )(word_layer)
        layers.append(GlobalMaxPooling1D()(layer))

    layer = Concatenate()(layers)
    preds = Dense(output_size, activation="softmax")(layer)
    # Input order here is the order callers must use for fit/predict.
    model = Model(inputs=[word_sequence_input, char_sequence_input], outputs=[preds])

    return model

model = build_model(
    word_vocab_size=len(word_index),
    word_vector_size=w2v.vector_size, 
    word_embedding_matrix=word_embedding_matrix,
    char_vocab_size=len(char_index),
    char_vector_size=CHAR_VECTOR_SIZE,
    output_size=lbl_enc.classes_.shape[0],
    word_max_sequence_len=WORD_MAX_SEQUENCE_LEN,
    char_max_sequence_len=CHAR_MAX_SEQUENCE_LEN,
    word_filters_len=WORD_FILTERS_LEN,
    word_filter_count=WORD_FILTER_COUNT,
    char_filters_len=CHAR_FILTERS_LEN,
    char_filter_count=CHAR_FILTER_COUNT,
    activation=ACTIVATION,
    padding=PADDING
)

model.compile(
    optimizer="nadam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, 15, 10)]     0                                            
__________________________________________________________________________________________________
time_distributed_28 (TimeDistri (None, 15, 10, 32)   4320        input_16[0][0]                   
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, 15)]         0                                            
__________________________________________________________________________________________________
time_distributed_29 (TimeDistri (None, 15, 10, 32)   3104        time_distributed_28[0][0]        
____________________________________________________________________________________________

# Fitting the CNN

In [None]:
# Train on both inputs; the order (word sequences, char sequences) follows
# the order of `inputs` passed to `Model(...)` in `build_model`.
model.fit(
    x=(train_word_sequences, train_char_sequences),
    y=train_target,
    batch_size=4096,
    epochs=5,
    # Validation uses the dev split, with inputs in the same order as `x`.
    validation_data=(
        (dev_word_sequences, dev_char_sequences),
        dev_target
    ),
    validation_freq=1
)

Train on 2725070 samples, validate on 499625 samples
Epoch 1/5
Epoch 2/5

In [None]:
# The model has two inputs (word sequences, then character sequences), so
# both arrays must be supplied, in the same order used for training.
datasets["dev"]["predictions"] = model.predict(
    [dev_word_sequences, dev_char_sequences], batch_size=1024, verbose=0
).argmax(axis=1)

In [None]:
balanced_accuracy_score(datasets["dev"]["target"], datasets["dev"]["predictions"])