In [65]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from os import path
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from string import punctuation
from tensorflow.keras.layers import Concatenate, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook
from unidecode import unidecode

np.random.seed(42)
tf.compat.v1.random.set_random_seed(42)

# Data Ingestion

## Dataset constants

In [49]:
DIR_PATH = "../data/"
LANGUAGE = "spanish"
DROP_COLUMNS = ["label_quality", "split", "language", "words", "pos"]
UNRELIABLE_SAMPLING = 0.25

## Dataset Loading

In [3]:
%%time
def load_data(base_path, language, drop_columns, unreliable_sampling):
    datasets = {}
    for ds in tqdm_notebook(["train_reliable", "train_unreliable", "dev", "test"]):
        if ds == "train_unreliable" and unreliable_sampling == 0:
            continue
        
        df = pd.read_parquet(
            path.join(base_path, f"{language}", f"{ds}.parquet")
        ).drop(drop_columns, axis=1, errors="ignore")
        
        if ds == "train_unreliable" and 0 < unreliable_sampling < 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(frac=unreliable_sampling)
            ).reset_index(drop=True)
        elif ds == "train_unreliable" and unreliable_sampling > 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(n=int(unreliable_sampling))
            ).reset_index(drop=True)
        
        if ds == "train_reliable":
            datasets["train"] = df
        elif ds == "train_unreliable":
            datasets["train"] = pd.concat([
                datasets["train"],
                df
            ], ignore_index=True)
        else:
            datasets[ds] = df
    
    w2v = KeyedVectors.load_word2vec_format(
        path.join(base_path, f"{language}", "word2vec.bin.gz"), 
        binary=True
    )
    
    return datasets, w2v

datasets, w2v = load_data(DIR_PATH, LANGUAGE, DROP_COLUMNS, UNRELIABLE_SAMPLING)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


CPU times: user 1min 23s, sys: 21.2 s, total: 1min 45s
Wall time: 1min 23s


In [4]:
datasets["train"].head()

Unnamed: 0,title,category
0,Play Station 2 + Volante Hooligans.,GAME_CONSOLES
1,Pilas Energizer Max Aa X1 - Tira X 20 Pilas,CELL_BATTERIES
2,Afeitadora Electrica Philips Hq6904 + Envio Gr...,SHAVING_MACHINES
3,Estufa Calefactor Volcan 2500 Kcal/h 42512v Si...,HOME_HEATERS
4,Reloj Pared Vox Tronic Blanco Numeros 23cm Gar...,WALL_CLOCKS


In [5]:
datasets["dev"].head()

Unnamed: 0,title,category
0,Rosario Contador De Billetes Uv / Mg Detecta F...,BILL_COUNTERS
1,Portón De Chapa 3 Hojas Mtr 2.50 Sin Marco,GARAGE_DOORS
2,Base Simil Cemento - 30 Cm X 5 Mm,CAKE_TOPPERS
3,"Disfraz De General Grievous Para Adultos, Tall...",COSTUMES
4,Hermoso Árbol De Navidad En Madera De Pino,CHRISTMAS_TREES


In [6]:
datasets["test"].head()

Unnamed: 0,id,title
0,9,Disco Rigido Externo Western Digital Elements ...
1,10,Picadora De Carne Fineschi Legitima 32
2,14,Set Barreta Automotor Bremen X3 Unid. 6756 20 ...
3,15,Miel Organica X250gr. (sin Tacc)
4,19,Bandeja Giradiscos Omnitronic Bd1320


## Model constants

In [99]:
MAX_SEQUENCE_LEN = 10

# Data Preprocessing

## Label Encoding

In [7]:
%%time
def label_encoder(*dfs):
    labels = pd.concat(dfs)["category"].tolist()
    lbl_enc = LabelEncoder().fit(labels)

    return lbl_enc

lbl_enc = label_encoder(datasets["train"], datasets["dev"])

for split in ["train", "dev"]:
    datasets[split]["target"] = lbl_enc.transform(datasets[split]["category"])
    datasets[split].drop(["category"], axis=1, inplace=True)

CPU times: user 9.99 s, sys: 1.08 s, total: 11.1 s
Wall time: 4.07 s


## Text curation

### Capitalization

In [None]:
%%time

def lowercase_titles(datasets):
    for split in tqdm_notebook(datasets):
        datasets[split]["title"] = datasets[split]["title"].str.lower()
    return datasets

datasets = lowercase_titles(datasets)

### Tokenization

In [8]:
%%time

def tokenization(datasets, language):
    for split in tqdm_notebook(datasets):
        datasets[split]["title"] = datasets[split]["title"].apply(
            lambda title: word_tokenize(title, language=language)
        )
    return datasets

datasets = tokenization(datasets, LANGUAGE)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 5min 47s, sys: 5.89 s, total: 5min 53s
Wall time: 5min 51s


### Punctuation removal

In [23]:
%%time

def remove_punctuation(datasets, punctuation):
    for split in tqdm_notebook(datasets):
        datasets[split]["title"] = datasets[split]["title"].apply(
            lambda words: [w for w in words if w not in punctuation]
        )
    return datasets

datasets = remove_punctuation(datasets, punctuation)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 6.7 s, sys: 425 ms, total: 7.12 s
Wall time: 7.11 s


### Stopwords removal

In [50]:
%%time

def remove_stopwords(datasets, stopwords):
    for split in tqdm_notebook(datasets):
        datasets[split]["title"] = datasets[split]["title"].apply(
            lambda words: [w for w in words if w not in stopwords]
        )
    return datasets

datasets = remove_stopwords(datasets, set(stopwords.words(LANGUAGE)))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

CPU times: user 8.52 s, sys: 164 ms, total: 8.69 s
Wall time: 8.66 s


### Word Vectorization

In [76]:
%%time

def word_with_vector(word, w2v, stemmer):
    if word in w2v:
        return word
    elif word.capitalize() in w2v:
        return word.capitalize()
    elif word.upper() in w2v:
        return word.upper()
    elif unidecode(word) in w2v:
        return unidecode(word)
    elif unidecode(word.capitalize()) in w2v:
        return unidecode(word.capitalize())
    elif unidecode(word.upper()) in w2v:
        return unidecode(word.upper())
    elif stemmer.stem(word) in w2v:
        return stemmer.stem(word)
    elif re.search("\d+", word):
        return "<NUM>"
    else:
        return "<UNK>"
    # TODO: Lemmatization? Other normalizations?

def word_vectorize(datasets, language, w2v):
    stemmer = SnowballStemmer(language)
    for split in tqdm_notebook(datasets):
        datasets[split]["title"] = datasets[split]["title"].apply(
            lambda words: [word_with_vector(w, w2v, stemmer) for w in words]
        )
    return datasets

datasets = word_vectorize(datasets, LANGUAGE, w2v)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

CPU times: user 2min 38s, sys: 819 ms, total: 2min 39s
Wall time: 2min 39s


In [79]:
%%time
def words_to_idx(all_words, w2v, null_token="<NULL>",
                 unknown_token="<UNK>", num_token="<NUM>"):
    word_index = {word for words in all_words for word in words if word in w2v}
    word_index = {word: idx for idx, word in enumerate(sorted(word_index), start=1)}
    word_index[null_token] = 0
    word_index[num_token] = len(word_index)
    word_index[unknown_token] = len(word_index)

    return word_index

word_index = words_to_idx(pd.concat(list(datasets.values()), sort=False)["title"], w2v)

print(f"Vocab length: {len(word_index)}")

Vocab length: 121998
CPU times: user 11.3 s, sys: 476 ms, total: 11.8 s
Wall time: 7.57 s


In [84]:
%%time
def sequence_padding(series, word_index, max_len):
    return pad_sequences(
            series.apply(
                lambda words: [word_index.get(word, word_index["<UNK>"]) for word in words]
            ).tolist(), maxlen=max_len
        )

train_word_sequences = sequence_padding(
    datasets["train"]["title"], word_index, MAX_SEQUENCE_LEN
)

dev_word_sequences = sequence_padding(
    datasets["dev"]["title"], word_index, MAX_SEQUENCE_LEN
)

test_word_sequences = sequence_padding(
    datasets["test"]["title"], word_index, MAX_SEQUENCE_LEN
)

CPU times: user 24.9 s, sys: 264 ms, total: 25.1 s
Wall time: 25.1 s


In [85]:
%%time

train_target = to_categorical(
    datasets["train"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

dev_target = to_categorical(
    datasets["dev"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

CPU times: user 563 ms, sys: 3.84 s, total: 4.4 s
Wall time: 4.4 s


In [87]:
%%time
def get_embedding_matrix(word_index, w2v):
    embedding_matrix = np.zeros((len(word_index), w2v.vector_size))

    for word, i in word_index.items():
        if word in w2v and word not in {"<NULL>", "<UNK>", "<NUM>"}:
            embedding_matrix[i] = w2v[word]
        elif word == "<UNK>" or word == "<NUM>":
            embedding_matrix[i] = np.random.normal(size=(w2v.vector_size,))

    return embedding_matrix

word_embedding_matrix = get_embedding_matrix(word_index, w2v)

CPU times: user 474 ms, sys: 116 ms, total: 590 ms
Wall time: 588 ms


# CNN Building

In [100]:
FILTERS = [2, 3, 4, 5]
FILTER_COUNT = 128
ACTIVATION = "relu"
PADDING = "valid"

In [101]:
from tensorflow.keras.regularizers import l2

In [102]:
def build_cnn(word_vocab_size, word_vector_size, word_embedding_matrix, output_size, max_sequence_len,
              filters, filter_count, activation="relu", padding="valid"):
    word_embedding_layer = Embedding(word_vocab_size, word_vector_size,
                                     weights=[word_embedding_matrix],
                                     input_length=max_sequence_len,
                                     trainable=False)

    word_sequence_input = Input(shape=(max_sequence_len,))
    word_embedded_sequences = word_embedding_layer(word_sequence_input)

    layers = []
    for filter_size in filters:
        layer = Conv1D(
            filter_count,
            filter_size,
            activation=activation,
            padding=padding,
            kernel_regularizer=l2(0.01)
        )(word_embedded_sequences)
        layer = GlobalMaxPooling1D()(layer)
        layers.append(layer)

    layer = Concatenate()(layers)
    preds = Dense(output_size, activation="softmax")(layer)
    model = Model(word_sequence_input, preds)

    return model

model = build_cnn(
    word_vocab_size=len(word_index),
    word_vector_size=w2v.vector_size, 
    word_embedding_matrix=word_embedding_matrix,
    output_size=lbl_enc.classes_.shape[0],
    max_sequence_len=MAX_SEQUENCE_LEN,
    filters=FILTERS,
    filter_count=FILTER_COUNT,
    activation=ACTIVATION,
    padding=PADDING
)

model.compile(
    optimizer="nadam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 300)      36599400    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 9, 128)       76928       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 8, 128)       115328      embedding_1[0][0]                
____________________________________________________________________________________________

# Fitting the CNN

In [103]:
model.fit(
    x=train_word_sequences,
    y=train_target,
    batch_size=1024,
    epochs=10,
    validation_data=(dev_word_sequences, dev_target),
    validation_freq=5
)

Train on 2725070 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [97]:
datasets["dev"]["predictions"] = model.predict(
    dev_word_sequences, batch_size=1024, verbose=0
).argmax(axis=1)

In [98]:
balanced_accuracy_score(datasets["dev"]["target"], datasets["dev"]["predictions"])



0.7298081600204341