In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from os import path
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from string import punctuation
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook
from unidecode import unidecode

# Fix the NumPy and TensorFlow random seeds so sampling and weight
# initialisation start from the same state on every run.
np.random.seed(42)
tf.compat.v1.random.set_random_seed(42)

# Data Ingestion

## Dataset Constants

In [2]:
# Base directory; `load_data` reads from the `<DIR_PATH>/<LANGUAGE>/` sub-folder.
DIR_PATH = "../data/"
# Language sub-folder; also selects the NLTK stopword list and Snowball stemmer.
LANGUAGE = "spanish"
# Columns dropped from every split right after loading (missing ones are ignored).
DROP_COLUMNS = ["split", "language"]
# Amount of "train_unreliable" kept per category: 0 skips the file, a value in
# (0, 1) is a fraction, a value above 1 is a row count, exactly 1 keeps everything.
UNRELIABLE_SAMPLING = 0.25

## Dataset Loading

In [3]:
%%time
def load_data(base_path, language, drop_columns, unreliable_sampling):
    """Load the dataset splits and the word2vec vectors of one language.

    Reads ``train_reliable``, ``train_unreliable``, ``dev`` and ``test`` from
    ``<base_path>/<language>/<name>.parquet``. The two training files are merged
    into a single ``"train"`` entry; ``dev`` and ``test`` keep their names.

    Parameters
    ----------
    base_path : str
        Directory that contains one sub-folder per language.
    language : str
        Name of the sub-folder to read from.
    drop_columns : list of str
        Columns removed from every split; columns that do not exist are ignored.
    unreliable_sampling : int or float
        How much of ``train_unreliable`` is kept, per ``category``:
        ``0`` skips the file, a value in ``(0, 1)`` is the sampled fraction,
        a value above ``1`` is the number of rows sampled (capped at the size
        of the category), and exactly ``1`` keeps the file unsampled.

    Returns
    -------
    tuple
        ``(datasets, w2v)`` where ``datasets`` maps ``"train"``, ``"dev"`` and
        ``"test"`` to DataFrames and ``w2v`` is the loaded ``KeyedVectors``.
    """
    datasets = {}
    for ds in tqdm_notebook(["train_reliable", "train_unreliable", "dev", "test"]):
        is_unreliable = ds == "train_unreliable"
        if is_unreliable and unreliable_sampling == 0:
            continue

        df = pd.read_parquet(
            path.join(base_path, f"{language}", f"{ds}.parquet")
        ).drop(drop_columns, axis=1, errors="ignore")

        if is_unreliable and 0 < unreliable_sampling < 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(frac=unreliable_sampling)
            ).reset_index(drop=True)
        elif is_unreliable and unreliable_sampling > 1:
            rows_per_category = int(unreliable_sampling)
            # Sampling without replacement raises when a category has fewer
            # rows than requested, so take the whole category in that case.
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(n=min(rows_per_category, len(cat)))
            ).reset_index(drop=True)

        if ds == "train_reliable":
            datasets["train"] = df
        elif is_unreliable:
            # "train_reliable" is always processed first, so "train" exists here.
            datasets["train"] = pd.concat([
                datasets["train"],
                df
            ], ignore_index=True)
        else:
            datasets[ds] = df

    w2v = KeyedVectors.load_word2vec_format(
        path.join(base_path, f"{language}", "word2vec.bin.gz"),
        binary=True
    )

    return datasets, w2v

# Load every split plus the pretrained word vectors for the configured language.
datasets, w2v = load_data(
    base_path=DIR_PATH,
    language=LANGUAGE,
    drop_columns=DROP_COLUMNS,
    unreliable_sampling=UNRELIABLE_SAMPLING,
)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 1min 27s, sys: 19.1 s, total: 1min 46s
Wall time: 1min 26s


# Data Preprocessing

## Label Encoding

In [4]:
%%time
def label_encoder(*dfs):
    """Fit a ``LabelEncoder`` on the ``category`` column of all given frames.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames whose ``category`` values are concatenated, in the order given,
        to form the label list the encoder is fitted on.

    Returns
    -------
    The value returned by ``LabelEncoder().fit`` for those labels.
    """
    categories = pd.concat(dfs)["category"]
    encoder = LabelEncoder()
    return encoder.fit(categories.tolist())

# One encoder fitted on train + dev, so both splits share the same class ids.
lbl_enc = label_encoder(datasets["train"], datasets["dev"])

# Swap the string "category" column for its integer "target" in the labelled splits.
for split in ("train", "dev"):
    datasets[split]["target"] = lbl_enc.transform(datasets[split]["category"])
    datasets[split].drop(columns=["category"], inplace=True)

CPU times: user 10.1 s, sys: 1.54 s, total: 11.7 s
Wall time: 5.42 s


## Text Curation

### Punctuation Removal

In [5]:
%%time

def remove_punctuation(datasets, punctuation, column="tokens"):
    """Add a ``non_punct_tokens`` column holding the tokens that are not punctuation.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    punctuation : str or container
        When a string (e.g. ``string.punctuation``) it is treated as a set of
        punctuation characters and a token is dropped when every one of its
        characters belongs to that set, so multi-character tokens such as
        ``"..."`` are removed too. Any other container keeps exact-membership
        semantics: a token is dropped when ``token in punctuation``.
    column : str
        Name of the column that holds the token lists.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    if isinstance(punctuation, str):
        # `token in "<string>"` is a substring test: it only matched tokens that
        # happen to be a contiguous slice of the string and missed "...", "!!".
        punctuation_chars = frozenset(punctuation)

        def _is_punctuation(token):
            # all() is True for "", which the substring test also dropped.
            return all(char in punctuation_chars for char in token)
    else:
        def _is_punctuation(token):
            return token in punctuation

    for split in tqdm_notebook(datasets):
        datasets[split]["non_punct_tokens"] = datasets[split][column].apply(
            lambda words: [w for w in words if not _is_punctuation(w)]
        )
    return datasets

# The raw token lists live in the "words" column of every split.
datasets = remove_punctuation(datasets, punctuation=punctuation, column="words")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 13.4 s, sys: 786 ms, total: 14.2 s
Wall time: 12.8 s


### Stopword Removal

In [6]:
%%time

def remove_stopwords(datasets, stopwords, column="tokens"):
    """Add a ``non_sw_tokens`` column with the tokens that are not stopwords.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    stopwords : container
        Tokens to drop; membership is tested with ``in`` (exact match).
    column : str
        Name of the column that holds the token lists.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    def _keep_content_words(tokens):
        return [token for token in tokens if token not in stopwords]

    for name in tqdm_notebook(datasets):
        frame = datasets[name]
        frame["non_sw_tokens"] = frame[column].apply(_keep_content_words)

    return datasets

# Stopwords are filtered out of the punctuation-free tokens.
datasets = remove_stopwords(
    datasets,
    stopwords=set(stopwords.words(LANGUAGE)),
    column="non_punct_tokens",
)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 6.67 s, sys: 240 ms, total: 6.91 s
Wall time: 6.9 s


### Word Vectorization

In [7]:
%%time

def word_with_vector(word, w2v, stemmer):
    """Return the first variant of ``word`` that is present in ``w2v``.

    Variants are tried in this order: the word itself, its capitalized form,
    its ``unidecode`` transliteration, the transliteration of the capitalized
    form, and finally ``stemmer.stem(word)``. When none of them is in ``w2v``
    the result is ``"DIGITO"`` for digit-only words and ``"<UNK>"`` otherwise.
    """
    if word in w2v:
        return word

    capitalized = word.capitalize()
    if capitalized in w2v:
        return capitalized

    transliterated = unidecode(word)
    if transliterated in w2v:
        return transliterated

    transliterated_capitalized = unidecode(capitalized)
    if transliterated_capitalized in w2v:
        return transliterated_capitalized

    stem = stemmer.stem(word)
    if stem in w2v:
        return stem

    if word.isdigit():
        return "DIGITO"
    return "<UNK>"
    # TODO: Lemmatization? Other normalizations?

def word_vectorize(datasets, language, w2v, column="tokens"):
    """Add a ``normalized_title`` column with each token mapped by ``word_with_vector``.

    Parameters
    ----------
    datasets : dict
        Maps split names to DataFrames; each frame is modified in place.
    language : str
        Language passed to ``SnowballStemmer``.
    w2v : container
        Vocabulary the tokens are matched against.
    column : str
        Name of the column that holds the token lists.

    Returns
    -------
    dict
        The same ``datasets`` object that was passed in.
    """
    stemmer = SnowballStemmer(language)

    def _normalize(tokens):
        return [word_with_vector(token, w2v, stemmer) for token in tokens]

    for name in tqdm_notebook(datasets):
        frame = datasets[name]
        frame["normalized_title"] = frame[column].apply(_normalize)

    return datasets

# Normalization runs on the punctuation-free tokens (stopwords are kept).
datasets = word_vectorize(
    datasets,
    language=LANGUAGE,
    w2v=w2v,
    column="non_punct_tokens",
)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 2min 39s, sys: 1.49 s, total: 2min 41s
Wall time: 2min 41s


In [8]:
%%time
def words_to_idx(all_words, w2v, null_token="<NULL>",
                 unknown_token="<UNK>", num_token="DIGITO"):
    """Build a word -> integer id mapping for every word that has a vector.

    Ids are contiguous: ``null_token`` is 0, the vocabulary words follow in
    sorted order starting at 1, then ``num_token`` (unless it is already part
    of the vocabulary) and finally ``unknown_token``.

    Parameters
    ----------
    all_words : iterable of iterable of str
        Token sequences to collect the vocabulary from.
    w2v : container
        Only words for which ``word in w2v`` holds enter the vocabulary.
    null_token, unknown_token, num_token : str
        Reserved tokens for padding, out-of-vocabulary words and numbers.

    Returns
    -------
    dict
        Mapping from word to id with ids ``0 .. len(mapping) - 1``.
    """
    # The null and unknown tokens get fixed slots below. If they were also
    # enumerated as regular words, their id would be overwritten afterwards,
    # leaving a hole in the id range and an id equal to len(word_index).
    reserved = {null_token, unknown_token}
    vocabulary = {
        word
        for words in all_words
        for word in words
        if word in w2v and word not in reserved
    }
    word_index = {word: idx for idx, word in enumerate(sorted(vocabulary), start=1)}
    word_index[null_token] = 0
    if num_token not in word_index:
        word_index[num_token] = len(word_index)
    word_index[unknown_token] = len(word_index)

    return word_index

# The vocabulary is collected from the normalized titles of every split.
word_index = words_to_idx(
    all_words=pd.concat(list(datasets.values()), sort=False)["normalized_title"],
    w2v=w2v,
)

print(f"Vocab length: {len(word_index)}")

Vocab length: 118362
CPU times: user 15.8 s, sys: 1.03 s, total: 16.8 s
Wall time: 13.5 s


In [9]:
%%time

# Length of every id sequence fed to the model: passed as `maxlen` to
# `pad_sequences` and as the input length of the network.
MAX_SEQUENCE_LEN = 8

def sequence_padding(series, word_index, max_len):
    """Turn token lists into id sequences and pass them through ``pad_sequences``.

    Parameters
    ----------
    series : pandas.Series
        One list of tokens per row.
    word_index : dict
        Word -> id mapping; words missing from it get the id of ``"<UNK>"``.
    max_len : int
        Forwarded to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The result of ``pad_sequences`` for the id sequences.
    """
    def _to_ids(words):
        return [word_index.get(word, word_index["<UNK>"]) for word in words]

    id_sequences = series.apply(_to_ids).tolist()
    return pad_sequences(id_sequences, maxlen=max_len)

# Same conversion for the three splits, evaluated in train, dev, test order.
train_word_sequences, dev_word_sequences, test_word_sequences = (
    sequence_padding(datasets[name]["normalized_title"], word_index, MAX_SEQUENCE_LEN)
    for name in ("train", "dev", "test")
)

CPU times: user 24.1 s, sys: 676 ms, total: 24.7 s
Wall time: 24.5 s


In [10]:
%%time

# One-hot targets for the labelled splits, with one column per known category.
train_target, dev_target = (
    to_categorical(
        datasets[name]["target"].tolist(),
        num_classes=lbl_enc.classes_.shape[0]
    )
    for name in ("train", "dev")
)

CPU times: user 442 ms, sys: 3.79 s, total: 4.23 s
Wall time: 4.22 s


In [11]:
%%time
def get_embedding_matrix(word_index, w2v, num_token="DIGITO"):
    """Build the embedding weight matrix for the ids in ``word_index``.

    Row ``i`` holds the vector of the word whose id is ``i``:

    * words found in ``w2v`` get their pretrained vector, except ``"<NULL>"``,
      ``"<UNK>"`` and ``"<NUM>"``, which never use a pretrained vector;
    * ``"<UNK>"``, ``"<NUM>"`` and ``num_token`` (when it has no pretrained
      vector) get a random normal vector;
    * everything else, including ``"<NULL>"``, stays all-zero.

    Parameters
    ----------
    word_index : dict
        Word -> row id mapping; ids must be smaller than ``len(word_index)``.
    w2v : object
        Supports ``word in w2v``, ``w2v[word]`` and ``w2v.vector_size``.
    num_token : str
        Placeholder used for numbers. Defaults to ``"DIGITO"``, the default
        ``num_token`` of ``words_to_idx``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index), w2v.vector_size)``.
    """
    embedding_matrix = np.zeros((len(word_index), w2v.vector_size))

    never_pretrained = {"<NULL>", "<UNK>", "<NUM>"}
    # Without `num_token` here, a number placeholder that has no pretrained
    # vector would keep an all-zero row, identical to the padding row.
    randomly_initialised = {"<UNK>", "<NUM>", num_token}

    for word, i in word_index.items():
        if word not in never_pretrained and word in w2v:
            embedding_matrix[i] = w2v[word]
        elif word in randomly_initialised:
            embedding_matrix[i] = np.random.normal(size=(w2v.vector_size,))

    return embedding_matrix

# Weight matrix for the embedding layer, one row per id in `word_index`.
word_embedding_matrix = get_embedding_matrix(word_index=word_index, w2v=w2v)

CPU times: user 460 ms, sys: 160 ms, total: 620 ms
Wall time: 618 ms


# RNN Building

## Model Constants

In [12]:
# Units of each stacked LSTM layer, first to last (one layer per entry).
LSTM_UNITS = [128, 128]
# DROPOUT = [0.1, 0.1]
# When True, `build_model` wraps every LSTM layer in `Bidirectional`.
BIDIRECTIONAL = True

## Model Building

In [30]:
def build_model(word_vocab_size, word_vector_size, word_embedding_matrix, output_size, max_sequence_len,
                lstm_units, bidirectional):
    """Build the (optionally bidirectional) stacked-LSTM classifier.

    The network is: frozen embedding -> one LSTM layer per entry of
    ``lstm_units`` -> softmax ``Dense`` layer.

    Parameters
    ----------
    word_vocab_size : int
        Number of rows of the embedding layer.
    word_vector_size : int
        Dimension of each embedding vector.
    word_embedding_matrix : array-like
        Initial (non-trainable) weights of the embedding layer.
    output_size : int
        Number of units of the softmax output layer.
    max_sequence_len : int
        Length of the input id sequences.
    lstm_units : sequence of int
        Units of each LSTM layer, first to last. Must not be empty.
    bidirectional : bool
        Wrap every LSTM layer in ``Bidirectional`` when true.

    Returns
    -------
    Model
        The uncompiled model.

    Raises
    ------
    IndexError
        If ``lstm_units`` is empty.
    """
    def _recurrent_layer(units, return_sequences):
        lstm = LSTM(units, return_sequences=return_sequences)
        return Bidirectional(lstm) if bidirectional else lstm

    word_embedding_layer = Embedding(word_vocab_size, word_vector_size,
                                     weights=[word_embedding_matrix],
                                     input_length=max_sequence_len,
                                     trainable=False)

    word_sequence_input = Input(shape=(max_sequence_len,))
    layer = word_embedding_layer(word_sequence_input)

    # Every layer but the last must emit the full sequence so the next LSTM
    # receives a 3-D input. This was missing for the unidirectional case.
    for units in lstm_units[:-1]:
        layer = _recurrent_layer(units, True)(layer)

    # The last layer emits only its final state, which feeds the classifier.
    layer = _recurrent_layer(lstm_units[-1], False)(layer)

    preds = Dense(output_size, activation="softmax")(layer)
    model = Model(word_sequence_input, preds)

    return model

# Instantiate the network from the vocabulary, embeddings and model constants.
model = build_model(
    word_vocab_size=len(word_index),
    word_vector_size=w2v.vector_size, 
    word_embedding_matrix=word_embedding_matrix,
    output_size=lbl_enc.classes_.shape[0],
    max_sequence_len=MAX_SEQUENCE_LEN,
    lstm_units=LSTM_UNITS,
    bidirectional=BIDIRECTIONAL
)

# Categorical cross-entropy matches the one-hot targets built with `to_categorical`.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Print the layer-by-layer overview of the compiled model.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 8, 300)            35508600  
_________________________________________________________________
bidirectional_5 (Bidirection (None, 8, 256)            439296    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_1 (Dense)              (None, 1573)              404261    
Total params: 36,746,397
Trainable params: 1,237,797
Non-trainable params: 35,508,600
_________________________________________________________________


# Fitting the RNN

In [None]:
# Train on the padded train sequences for 15 epochs in batches of 4096,
# evaluating on the dev split after every epoch (validation_freq=1).
model.fit(
    x=train_word_sequences,
    y=train_target,
    batch_size=4096,
    epochs=15,
    validation_data=(dev_word_sequences, dev_target),
    validation_freq=1
)

Train on 2725070 samples, validate on 499625 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

In [None]:
# Predicted class id per dev row: the argmax over the model's output for each
# sequence, stored next to the encoded "target" column.
datasets["dev"]["predictions"] = model.predict(
    dev_word_sequences, batch_size=1024, verbose=0
).argmax(axis=1)

In [None]:
# Balanced accuracy of the dev predictions against the encoded targets.
balanced_accuracy_score(
    y_true=datasets["dev"]["target"],
    y_pred=datasets["dev"]["predictions"],
)