In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from gensim.models import KeyedVectors
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Concatenate, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from unidecode import unidecode

# Fix the global NumPy and TensorFlow random seeds (both 42) before any
# random draw is made further down the notebook.
np.random.seed(42)
tf.compat.v1.random.set_random_seed(42)

# Data Ingestion

In [None]:
def load_data(file_path, language, columns, sample=0):
    """Load the rows of one language from a parquet file, with ASCII-folded words.

    Parameters
    ----------
    file_path : str
        Path of the parquet file. It must provide a "language" column, a
        "words" column holding an iterable of strings per row and, when
        sampling is requested, a "category" column.
    language : str
        Value of the "language" column to keep.
    columns : list of str
        Columns of the returned frame, in this order.
    sample : float, optional
        0 (the default) returns every matching row with its original index.
        Any other value is forwarded as ``frac`` to a per-category
        ``DataFrame.sample`` and the resulting index is reset.

    Returns
    -------
    pandas.DataFrame
        The selected rows restricted to ``columns``; every entry of "words"
        has been passed through ``unidecode``.
    """
    df = pd.read_parquet(file_path)

    # Filter before transliterating: unidecode is the expensive step and the
    # rows of other languages are thrown away anyway. The explicit copy keeps
    # the column assignment below from writing into a slice of the full frame.
    df = df[df["language"] == language].copy()
    df["words"] = df["words"].apply(lambda words: [unidecode(w) for w in words])

    if sample == 0:
        return df[columns]

    # Stratified sub-sample: draw the same fraction from every category.
    return df.groupby("category").apply(
        lambda cat: cat.sample(frac=sample)
    ).reset_index(drop=True)[columns]

In [None]:
# --- Input files (paths are relative to the notebook's working directory) ---
train_data = "../data/meli/train_reliable.parquet"
trainu_data = "../data/meli/train_unreliable.parquet"
dev_data = "../data/meli/dev.parquet"
test_data = "../data/meli/test.parquet"
# Language used both to filter rows in load_data and to pick the word-vector file.
language = "spanish"
word_vectors = f"../data/{language}/{language}-word2vec.bin.gz"
# Columns kept for the train and dev frames.
columns = ["words", "pos", "category"]
# Length every word/PoS sequence is padded or truncated to (also the model input length).
max_sequence_len = 20
# Dimension of the PoS-tag embedding.
pos_vector_size = 3
# Conv1D kernel sizes; build_cnn creates one convolution branch per entry.
filters = [2, 3, 5]
# Number of filters in each Conv1D branch.
filter_count = 256
# Number of Dense layers between the pooled features and the softmax output
# (0 means the softmax layer sits directly on the pooled features).
network_size = 0
# Units of each of those Dense layers.
layer_size = 128
# Activation used by the Conv1D layers and the intermediate Dense layers.
activation = "relu"
# Padding mode passed to Conv1D.
padding = "valid"

In [None]:
%%time
# Training frame: every row of the "train_reliable" file for the selected
# language, plus a 25% per-category sample of the "train_unreliable" file.
# The concatenation keeps each part's own index labels.
train_df = pd.concat([
    load_data(train_data, language, columns), 
    load_data(trainu_data, language, columns, 0.25)
])
train_df.head()

In [None]:
%%time
# Development frame: all rows of the dev file for the selected language,
# with the same columns as the training frame.
dev_df = pd.concat([
    load_data(dev_data, language, columns)
])
dev_df.head()

In [None]:
%%time
# Test frame: "id", "words" and "pos" only; no "category" column is requested here.
test_df = pd.concat([
    load_data(test_data, language, ["id", "words", "pos"])
])
test_df.head()

In [None]:
%%time
# Load the word vectors for the selected language from the binary word2vec file.
w2v = KeyedVectors.load_word2vec_format(word_vectors, binary=True)

# Data Preprocessing

## Label Encoding

In [None]:
%%time
def label_encoder(*dfs):
    """Return a LabelEncoder fitted on the "category" values of all given frames.

    Every positional argument is a DataFrame with a "category" column; the
    frames are concatenated so the encoder sees the labels of all of them.
    """
    categories = pd.concat(dfs)["category"]
    encoder = LabelEncoder()
    encoder.fit(categories.tolist())
    return encoder

lbl_enc = label_encoder(train_df, dev_df)

# Replace the textual "category" column with its integer code in "target",
# in place, for both labelled frames.
for labelled_df in (train_df, dev_df):
    labelled_df["target"] = lbl_enc.transform(labelled_df["category"])
    labelled_df.drop(["category"], axis=1, inplace=True)

## Words/PoS to index

In [None]:
%%time
def words_to_idx(all_words, w2v, null_token="<NULL>", unknown_token="<UNK>"):
    """Map every word that has a vector in ``w2v`` to an integer index.

    ``all_words`` is an iterable of word lists. Known words are numbered from
    1 in sorted order, ``null_token`` gets index 0 and ``unknown_token`` gets
    the next free index after the known words.
    """
    known_words = set()
    for words in all_words:
        known_words.update(word for word in words if word in w2v)

    word_index = dict(zip(sorted(known_words), range(1, len(known_words) + 1)))
    word_index[null_token] = 0
    # The null token has been inserted already, so len() is the next free slot.
    word_index[unknown_token] = len(word_index)

    return word_index

def pos_to_idx(all_pos, null_pos="<NULL>"):
    """Map every PoS tag found in ``all_pos`` to an integer index.

    ``all_pos`` is an iterable of tag lists. Tags are numbered from 1 in
    sorted order and ``null_pos`` is assigned index 0.
    """
    tags = set()
    for pos_tags in all_pos:
        tags.update(pos_tags)

    pos_index = {}
    for position, tag in enumerate(sorted(tags), start=1):
        pos_index[tag] = position
    pos_index[null_pos] = 0

    return pos_index

# Both vocabularies are built over train, dev and test together.
all_frames = [train_df, dev_df, test_df]

word_index = words_to_idx(pd.concat(all_frames, sort=False)["words"], w2v)
pos_index = pos_to_idx(pd.concat(all_frames, sort=False)["pos"])

print(f"Vocab length: {len(word_index)} - PoS length: {len(pos_index)}")

In [None]:
%%time
def sequence_padding(df, word_index, pos_index, max_len,
                     unknown_token="<UNK>", null_pos="<NULL>"):
    """Turn the "words" and "pos" columns of ``df`` into padded index arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "words" column (list of words per row) and a "pos"
        column (list of PoS tags per row).
    word_index : dict
        Word -> index mapping; it must contain ``unknown_token``.
    pos_index : dict
        PoS tag -> index mapping.
    max_len : int
        ``maxlen`` handed to ``pad_sequences`` for both outputs.
    unknown_token : str, optional
        Key of ``word_index`` whose index stands in for words missing from
        the mapping. Defaults to "<UNK>".
    null_pos : str, optional
        Key of ``pos_index`` whose index stands in for tags missing from the
        mapping (0 is used if the key itself is absent). Defaults to "<NULL>".

    Returns
    -------
    tuple
        ``(word_sequences, pos_sequences)`` as returned by ``pad_sequences``.
    """
    # Resolve the fallback indices once instead of once per token.
    unknown_idx = word_index[unknown_token]
    # A plain pos_index.get(tag) would yield None for an unseen tag, which
    # only surfaces later as an obscure failure inside pad_sequences.
    null_pos_idx = pos_index.get(null_pos, 0)

    word_ids = [
        [word_index.get(word, unknown_idx) for word in words]
        for words in df["words"]
    ]
    pos_ids = [
        [pos_index.get(tag, null_pos_idx) for tag in tags]
        for tags in df["pos"]
    ]

    return (
        pad_sequences(word_ids, maxlen=max_len),
        pad_sequences(pos_ids, maxlen=max_len)
    )

# Pad the word and PoS sequences of each split (train, dev, test, in that
# order) with the shared vocabularies and the configured sequence length.
(
    (train_word_sequences, train_pos_sequences),
    (dev_word_sequences, dev_pos_sequences),
    (test_word_sequences, test_pos_sequences),
) = (
    sequence_padding(split_df, word_index, pos_index, max_sequence_len)
    for split_df in (train_df, dev_df, test_df)
)

In [None]:
%%time

# One-hot encode the integer targets; the width is the number of classes
# known to the label encoder.
num_classes = len(lbl_enc.classes_)

train_target = to_categorical(train_df["target"].tolist(), num_classes=num_classes)
dev_target = to_categorical(dev_df["target"].tolist(), num_classes=num_classes)

In [None]:
%%time
def get_embedding_matrix(word_index, w2v, null_token="<NULL>", unknown_token="<UNK>"):
    """Build the embedding matrix whose row ``i`` is the vector of the word indexed ``i``.

    Parameters
    ----------
    word_index : dict
        Word -> row index mapping; indices must be smaller than
        ``len(word_index)``.
    w2v : object
        Word-vector store supporting ``word in w2v``, ``w2v[word]`` and a
        ``vector_size`` attribute.
    null_token : str, optional
        Token whose row is left as zeros even if ``w2v`` has a vector for it.
        Defaults to "<NULL>".
    unknown_token : str, optional
        Token whose row is filled with a draw from ``np.random.normal``
        (NumPy's global generator). Defaults to "<UNK>".

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index), w2v.vector_size)``. Rows of words
        without a vector stay at zero.
    """
    embedding_matrix = np.zeros((len(word_index), w2v.vector_size))

    for word, idx in word_index.items():
        if word == unknown_token:
            # The unknown token never uses a stored vector, only a random one.
            embedding_matrix[idx] = np.random.normal(size=(w2v.vector_size,))
        elif word != null_token and word in w2v:
            embedding_matrix[idx] = w2v[word]

    return embedding_matrix

word_embedding_matrix = get_embedding_matrix(word_index, w2v)

# CNN Building

In [None]:
def build_cnn(word_vocab_size, word_vector_size, word_embedding_matrix,
              pos_vocab_size, pos_vector_size, output_size, max_sequence_len,
              filters, filter_count, network_size, layer_size,
              activation="relu", padding="valid"):
    """Build a two-input (words + PoS tags) convolutional text classifier.

    Parameters
    ----------
    word_vocab_size, word_vector_size : int
        Shape of the word embedding table.
    word_embedding_matrix : array-like
        Initial weights of the word embedding; the layer is not trainable.
    pos_vocab_size, pos_vector_size : int
        Shape of the PoS embedding table (trainable, randomly initialised).
    output_size : int
        Number of units of the softmax output layer.
    max_sequence_len : int
        Length of both input sequences.
    filters : sequence of int
        Conv1D kernel sizes; one convolution + global-max-pooling branch is
        created per entry. Must not be empty.
    filter_count : int
        Number of filters of every Conv1D layer.
    network_size : int
        Number of Dense layers inserted before the output layer (0 for none).
    layer_size : int
        Units of each of those Dense layers.
    activation : str, optional
        Activation of the Conv1D and intermediate Dense layers.
    padding : str, optional
        Padding mode of the Conv1D layers.

    Returns
    -------
    Model
        Uncompiled model taking ``[word_sequence, pos_sequence]`` as input.

    Raises
    ------
    ValueError
        If ``filters`` is empty.
    """
    if not filters:
        raise ValueError("`filters` must contain at least one kernel size")

    word_embedding_layer = Embedding(word_vocab_size, word_vector_size,
                                     weights=[word_embedding_matrix],
                                     input_length=max_sequence_len,
                                     trainable=False)
    pos_embedding_layer = Embedding(pos_vocab_size, pos_vector_size,
                                    embeddings_initializer="truncated_normal",
                                    input_length=max_sequence_len)

    word_sequence_input = Input(shape=(max_sequence_len,))
    word_embedded_sequences = word_embedding_layer(word_sequence_input)

    pos_sequence_input = Input(shape=(max_sequence_len,))
    pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

    # Each time step is represented by its word vector followed by its PoS vector.
    embedded_sequences = Concatenate()([word_embedded_sequences, pos_embedded_sequences])

    # One convolution branch per kernel size, each reduced to a single
    # vector by max-pooling over the sequence axis.
    layers = []
    for filter_size in filters:
        layer = Conv1D(
            filter_count,
            filter_size,
            activation=activation,
            padding=padding
        )(embedded_sequences)
        layer = GlobalMaxPooling1D()(layer)
        layers.append(layer)

    # Concatenate needs several inputs on some Keras releases, so a single
    # branch is used as-is instead of being wrapped in a one-element merge.
    layer = layers[0] if len(layers) == 1 else Concatenate()(layers)

    for _ in range(network_size):
        layer = Dense(layer_size, activation=activation)(layer)

    preds = Dense(output_size, activation="softmax")(layer)
    model = Model([word_sequence_input, pos_sequence_input], preds)

    return model

# Instantiate the network from the configuration cell; vocabulary sizes come
# from the index dictionaries and the output size from the label encoder.
model = build_cnn(
    word_vocab_size=len(word_index),
    word_vector_size=w2v.vector_size, 
    word_embedding_matrix=word_embedding_matrix,
    pos_vocab_size=len(pos_index),
    pos_vector_size=pos_vector_size,
    output_size=lbl_enc.classes_.shape[0],
    max_sequence_len=max_sequence_len,
    filters=filters,
    filter_count=filter_count,
    network_size=network_size,
    layer_size=layer_size,
    activation=activation,
    padding=padding
)

# Categorical cross-entropy matches the one-hot targets built with to_categorical.
model.compile(
    optimizer="nadam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# Fitting the CNN

In [None]:
# Train for 10 epochs in batches of 1024. The dev split is passed as
# validation data; validation_freq=5 asks Keras to evaluate it only every
# fifth epoch rather than after each one.
model.fit(
    x=(train_word_sequences, train_pos_sequences),
    y=train_target,
    batch_size=1024,
    epochs=10,
    validation_data=(
        (dev_word_sequences, dev_pos_sequences),
        dev_target
    ),
    validation_freq=5
)

In [None]:
# Predict on the dev split and keep, for each row, the index of the
# highest-scoring output unit; stored in a new "predictions" column.
dev_df["predictions"] = model.predict(
    (dev_word_sequences, dev_pos_sequences), batch_size=1024, verbose=0
).argmax(axis=1)

In [None]:
# Balanced accuracy of the dev predictions against the encoded targets
# (displayed as the cell output).
balanced_accuracy_score(dev_df["target"], dev_df["predictions"])