In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from os import path
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from string import punctuation
from tensorflow.keras.layers import Concatenate, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input, TimeDistributed
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook
from unidecode import unidecode

# Seed NumPy's global RNG and TensorFlow's graph-level RNG so the stochastic
# steps below (np.random.normal embedding rows, DataFrame.sample without an
# explicit random_state) repeat across runs.
np.random.seed(42)
tf.compat.v1.random.set_random_seed(42)

# Data Ingestion

## Dataset constants

In [2]:
# Root directory that holds one sub-directory per language.
DIR_PATH = "../data/"
# Language sub-directory to read; also passed to NLTK for stopwords and stemming.
LANGUAGE = "spanish"
# Columns dropped from every split right after loading (missing ones are ignored).
DROP_COLUMNS = ["split", "language"]
# How much of "train_unreliable" to use: 0 skips it, a value in (0, 1) samples
# that fraction per category, a value above 1 samples that many rows per
# category, and exactly 1 keeps every row.
UNRELIABLE_SAMPLING = 1
# Saved Keras model that is loaded further down and cut at an inner layer.
KERAS_MODEL = "../experiments/2019-09-11_07.00.43_spanish_model.h5"

## Dataset Loading

In [7]:
%%time
def load_data(base_path, language, drop_columns, unreliable_sampling, load_w2v=False):
    """Load the train/dev/test splits (and optionally word vectors) for a language.

    Reads ``<base_path>/<language>/<split>.parquet`` for the splits
    "train_reliable", "train_unreliable", "dev" and "test". The two training
    splits are concatenated under the key "train".

    Parameters
    ----------
    base_path : str
        Directory that contains one sub-directory per language.
    language : str
        Name of the language sub-directory.
    drop_columns : list of str
        Columns removed from every split; columns that do not exist are ignored.
    unreliable_sampling : float
        Controls how much of "train_unreliable" is used:
        ``0`` skips the split, a value in ``(0, 1)`` samples that fraction of
        every category, a value above ``1`` samples that many rows per
        category, and any other value (e.g. ``1``) keeps every row.
    load_w2v : bool, default False
        When True, also load ``<base_path>/<language>/word2vec.bin.gz`` as
        binary word2vec vectors. The default keeps the previous behaviour of
        returning ``None`` for the vectors.

    Returns
    -------
    (dict, KeyedVectors or None)
        Mapping of "train"/"dev"/"test" to DataFrames, and the word vectors
        (``None`` unless ``load_w2v`` is True).
    """
    datasets = {}
    for ds in tqdm_notebook(["train_reliable", "train_unreliable", "dev", "test"]):
        if ds == "train_unreliable" and unreliable_sampling == 0:
            continue

        df = pd.read_parquet(
            path.join(base_path, f"{language}", f"{ds}.parquet")
        ).drop(drop_columns, axis=1, errors="ignore")

        if ds == "train_unreliable" and 0 < unreliable_sampling < 1:
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(frac=unreliable_sampling)
            ).reset_index(drop=True)
        elif ds == "train_unreliable" and unreliable_sampling > 1:
            per_category = int(unreliable_sampling)
            # DataFrame.sample raises ValueError when n exceeds the group size,
            # so small categories contribute all of their rows instead.
            df = df.groupby(["category"]).apply(
                lambda cat: cat.sample(n=min(per_category, len(cat)))
            ).reset_index(drop=True)

        if ds == "train_reliable":
            datasets["train"] = df
        elif ds == "train_unreliable":
            datasets["train"] = pd.concat([
                datasets["train"],
                df
            ], ignore_index=True)
        else:
            datasets[ds] = df

    w2v = None
    if load_w2v:
        w2v = KeyedVectors.load_word2vec_format(
            path.join(base_path, f"{language}", "word2vec.bin.gz"),
            binary=True
        )

    return datasets, w2v

# Load every split. As called here, load_data returns None for w2v, so the cells
# below that look words up in `w2v` need real vectors to run.
datasets, w2v = load_data(DIR_PATH, LANGUAGE, DROP_COLUMNS, UNRELIABLE_SAMPLING)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

CPU times: user 40 s, sys: 24.6 s, total: 1min 4s
Wall time: 46 s


# Data Preprocessing

## Label Encoding

In [8]:
%%time
def label_encoder(*dfs):
    """Fit a LabelEncoder on the "category" column of all given frames.

    The frames are concatenated first, so a category that appears in any of
    them receives an id. Returns the fitted encoder.
    """
    categories = pd.concat(dfs)["category"]
    return LabelEncoder().fit(categories.tolist())

# Fit on train and dev together so a category present in either split gets an id.
lbl_enc = label_encoder(datasets["train"], datasets["dev"])

# Store the integer id in "target" and remove the string "category" column.
# NOTE(review): the in-place drop makes this cell non-idempotent; re-running it
# fails with KeyError because "category" no longer exists.
for split in ["train", "dev"]:
    datasets[split]["target"] = lbl_enc.transform(datasets[split]["category"])
    datasets[split].drop(["category"], axis=1, inplace=True)

CPU times: user 18.4 s, sys: 2.25 s, total: 20.7 s
Wall time: 13 s


In [10]:
# Persist the encoded targets, one single-column parquet file per split.
for split in ["train", "dev"]:
    pd.DataFrame(datasets[split]["target"])\
        .to_parquet(path.join(DIR_PATH, f"{LANGUAGE}", "metadata", f"{split}_target.parquet"))

# Save the class names one per line, in the encoder's classes_ order.
with open(path.join(DIR_PATH, LANGUAGE, "metadata", "classes.txt"), "w") as fh:
    fh.write("\n".join(lbl_enc.classes_.tolist()))

## Text curation

### Punctuation removal

In [5]:
%%time

def remove_punctuation(datasets, punctuation, column="tokens"):
    """Add a "non_punct_tokens" column holding the tokens not found in `punctuation`.

    Every frame in `datasets` is modified in place and the same dict is
    returned. `punctuation` is tested with ``in``: when it is a string (as in
    this notebook) that is a substring check, so a multi-character token is
    dropped only if it occurs contiguously inside that string.
    """
    def keep_non_punctuation(tokens):
        return [token for token in tokens if token not in punctuation]

    for split_name in tqdm_notebook(datasets):
        frame = datasets[split_name]
        frame["non_punct_tokens"] = frame[column].apply(keep_non_punctuation)
    return datasets

# Tokens are read from the "words" column; `punctuation` is string.punctuation.
datasets = remove_punctuation(datasets, punctuation, "words")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 31.2 s, sys: 1.51 s, total: 32.7 s
Wall time: 31.5 s


### Stopwords removal

In [6]:
%%time

def remove_stopwords(datasets, stopwords, column="tokens"):
    """Add a "non_sw_tokens" column holding the tokens not found in `stopwords`.

    Every frame in `datasets` is modified in place and the same dict is
    returned. Matching is an exact, case-sensitive ``in`` test against
    `stopwords`.
    """
    def keep_content_words(tokens):
        return [token for token in tokens if token not in stopwords]

    for split_name in tqdm_notebook(datasets):
        frame = datasets[split_name]
        frame["non_sw_tokens"] = frame[column].apply(keep_content_words)
    return datasets

# Filter the punctuation-free tokens against NLTK's stopword list for LANGUAGE;
# the list is turned into a set so each membership test is a hash lookup.
datasets = remove_stopwords(datasets, set(stopwords.words(LANGUAGE)), "non_punct_tokens")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 21.1 s, sys: 884 ms, total: 22 s
Wall time: 21.9 s


### Word Vectorization

In [7]:
%%time

def word_with_vector(word, w2v, stemmer):
    """Return the first variant of `word` that is present in `w2v`.

    Variants are tried in this order: the word itself, capitalized,
    unidecode-transliterated, capitalized then transliterated, and stemmed.
    When none is found the result is "DIGITO" for an all-digit word (without
    checking `w2v`) and "<UNK>" otherwise.
    """
    # Built lazily so a later variant is only computed if the earlier ones miss.
    variants = (
        lambda: word,
        lambda: word.capitalize(),
        lambda: unidecode(word),
        lambda: unidecode(word.capitalize()),
        lambda: stemmer.stem(word),
    )
    for make_variant in variants:
        candidate = make_variant()
        if candidate in w2v:
            return candidate

    # TODO: Lemmatization? Other normalizations?
    return "DIGITO" if word.isdigit() else "<UNK>"

def word_vectorize(datasets, language, w2v, column="tokens"):
    """Add a "normalized_tokens" column with each token mapped by word_with_vector.

    A single SnowballStemmer for `language` is shared by all lookups. Every
    frame in `datasets` is modified in place and the same dict is returned.
    """
    stemmer = SnowballStemmer(language)

    def normalize(tokens):
        return [word_with_vector(token, w2v, stemmer) for token in tokens]

    for split_name in tqdm_notebook(datasets):
        frame = datasets[split_name]
        frame["normalized_tokens"] = frame[column].apply(normalize)
    return datasets

# Normalize the stopword-free tokens against the embedding vocabulary.
# The lookups use `word in w2v`, so this needs w2v to hold real vectors.
datasets = word_vectorize(datasets, LANGUAGE, w2v, "non_sw_tokens")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 7min 24s, sys: 3.1 s, total: 7min 27s
Wall time: 7min 27s


In [8]:
%%time
def words_to_idx(all_words, w2v, null_token="<NULL>",
                 unknown_token="<UNK>", num_token="DIGITO"):
    """Build a word -> integer index over the words that have a vector.

    Parameters
    ----------
    all_words : iterable of iterable of str
        Token sequences to collect the vocabulary from.
    w2v : container
        Anything supporting ``word in w2v``; only words it contains are indexed.
    null_token, unknown_token, num_token : str
        Special tokens. `null_token` always gets index 0, `unknown_token`
        always gets the last index, and `num_token` is appended just before it
        unless it is already part of the vocabulary.

    Returns
    -------
    dict
        Mapping whose values are exactly ``0 .. len(mapping) - 1``.
    """
    # The padding and unknown tokens get fixed slots below. If they were also
    # enumerated as ordinary words, re-assigning them would leave a gap and
    # make another token collide with, or exceed, the last valid index.
    reserved = {null_token, unknown_token}
    vocabulary = {
        word
        for words in all_words
        for word in words
        if word in w2v and word not in reserved
    }

    word_index = {null_token: 0}
    word_index.update(
        (word, idx) for idx, word in enumerate(sorted(vocabulary), start=1)
    )
    if num_token not in word_index:
        word_index[num_token] = len(word_index)
    word_index[unknown_token] = len(word_index)

    return word_index

# Collect the vocabulary from the normalized tokens of every loaded split.
word_index = words_to_idx(pd.concat(list(datasets.values()), sort=False)["normalized_tokens"], w2v)

print(f"Vocab length: {len(word_index)}")

Vocab length: 161805
CPU times: user 30.3 s, sys: 1.61 s, total: 31.9 s
Wall time: 27.8 s


## Characters Preprocessing

In [9]:
%%time
def normalize_titles(datasets, column="tokens"):
    """Add a "normalized_title" column: the tokens of `column` joined by spaces.

    Every frame in `datasets` is modified in place and the same dict is returned.
    """
    for frame in datasets.values():
        frame["normalized_title"] = frame[column].apply(" ".join)
    return datasets

# NOTE(review): "normalized_title" is not read by any later visible cell; the
# character vocabulary and sequences are built from "normalized_tokens".
datasets = normalize_titles(datasets, "non_sw_tokens")

CPU times: user 4.22 s, sys: 524 ms, total: 4.75 s
Wall time: 4.74 s


In [10]:
%%time
def chars_to_idx(titles, null_token="<NULL>", unknown_token="<UNK>"):
    """Build a character -> integer index from an iterable of strings.

    Distinct characters are numbered from 1 in sorted order; `null_token` is
    mapped to 0 and `unknown_token` to the next free index.
    """
    alphabet = set()
    for title in titles:
        alphabet.update(title)

    char_index = dict(zip(sorted(alphabet), range(1, len(alphabet) + 1)))
    char_index[null_token] = 0
    char_index[unknown_token] = len(char_index)

    return char_index

# The character vocabulary comes from the space-joined "normalized_tokens" of
# every split, i.e. the same column that is later turned into char sequences.
char_index = chars_to_idx(
    pd.concat(
        list(datasets.values()), 
        ignore_index=True, 
        sort=False
    )["normalized_tokens"].apply(lambda tokens: " ".join(tokens))
)

print(f"Char vocab length: {len(char_index)}")

Char vocab length: 105
CPU times: user 24.4 s, sys: 1.62 s, total: 26 s
Wall time: 23.9 s


# Network Data Preparation

## Word Sequences

In [11]:
%%time

# Number of word positions per title after padding/truncation; the loaded
# model's summary shows a (None, 15) input, which this appears to match.
WORD_MAX_SEQUENCE_LEN = 15

def word_sequence_padding(series, word_index, max_len):
    """Turn a Series of token lists into a padded array of word ids.

    Tokens missing from `word_index` are replaced by the id of "<UNK>". The
    resulting id lists are handed to pad_sequences with ``maxlen=max_len``.
    """
    def to_ids(words):
        return [word_index.get(word, word_index["<UNK>"]) for word in words]

    id_sequences = series.apply(to_ids).tolist()
    return pad_sequences(id_sequences, maxlen=max_len)

# Encode train, dev and test with the same vocabulary and the same length.
train_word_sequences = word_sequence_padding(
    datasets["train"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

dev_word_sequences = word_sequence_padding(
    datasets["dev"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

test_word_sequences = word_sequence_padding(
    datasets["test"]["normalized_tokens"], word_index, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 1min 3s, sys: 1.3 s, total: 1min 5s
Wall time: 1min 5s


In [None]:
%%time
def get_embedding_matrix(word_index, w2v, null_token="<NULL>",
                         unknown_token="<UNK>", num_token="DIGITO"):
    """Build the initial embedding matrix for the words in `word_index`.

    Parameters
    ----------
    word_index : dict
        Word -> row index; rows are expected to be ``0 .. len(word_index) - 1``.
    w2v : word-vector container
        Must provide ``vector_size``, ``word in w2v`` and ``w2v[word]``.
    null_token, unknown_token, num_token : str
        Special tokens, with the same defaults as words_to_idx.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index), w2v.vector_size)``. The row of
        `null_token` stays all zeros, `unknown_token` (and the legacy "<NUM>")
        get a random normal row, words found in `w2v` get their vector, and
        `num_token` gets a random normal row when `w2v` has no vector for it.
        Any other word without a vector keeps a zero row.
    """
    # "<NUM>" was special-cased by the original code; it is still honoured so
    # an index that contains it produces the same matrix as before.
    always_random = {unknown_token, "<NUM>"}
    embedding_matrix = np.zeros((len(word_index), w2v.vector_size))

    for word, i in word_index.items():
        if word == null_token:
            # Padding row must stay at zero.
            continue
        if word in always_random:
            embedding_matrix[i] = np.random.normal(size=(w2v.vector_size,))
        elif word in w2v:
            embedding_matrix[i] = w2v[word]
        elif word == num_token:
            # The number token used by the rest of the notebook is "DIGITO",
            # not "<NUM>"; without a vector it would otherwise be left as an
            # all-zero row, indistinguishable from padding.
            embedding_matrix[i] = np.random.normal(size=(w2v.vector_size,))

    return embedding_matrix

# Needs real word vectors: the function reads w2v.vector_size, so it cannot run
# while w2v is None.
word_embedding_matrix = get_embedding_matrix(word_index, w2v)

## Character Sequences

In [14]:
%%time

# Number of character positions per word after padding/truncation; the loaded
# model's summary shows a (None, 15, 10) input, which this appears to match.
CHAR_MAX_SEQUENCE_LEN = 10

def char_sequence_padding(series, char_index, char_max_len, word_max_len):
    """Turn a Series of token lists into padded per-word character id sequences.

    Each word becomes a list of character ids (unknown characters map to the
    id of "<UNK>") and the words of a title are padded to `char_max_len` with
    pad_sequences. The per-title results are then padded to `word_max_len`,
    using a zero vector of length `char_max_len` as the filler for missing
    words.
    """
    def encode_title(words):
        char_ids = [
            [char_index.get(char, char_index["<UNK>"]) for char in word]
            for word in words
        ]
        return pad_sequences(char_ids, maxlen=char_max_len)

    per_title = series.apply(encode_title)
    return pad_sequences(per_title, maxlen=word_max_len, value=np.zeros(char_max_len))

# Character ids are built from the same "normalized_tokens" column as the word
# ids, so word position k in both arrays refers to the same token.
train_char_sequences = char_sequence_padding(
    datasets["train"]["normalized_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

dev_char_sequences = char_sequence_padding(
    datasets["dev"]["normalized_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

test_char_sequences = char_sequence_padding(
    datasets["test"]["normalized_tokens"], char_index, CHAR_MAX_SEQUENCE_LEN, WORD_MAX_SEQUENCE_LEN
)

CPU times: user 7min 27s, sys: 10.3 s, total: 7min 37s
Wall time: 7min 37s


## Targets

In [None]:
%%time

# One-hot encode the integer targets; the width is the number of classes known
# to the label encoder. Only train and dev have a "target" column.
train_target = to_categorical(
    datasets["train"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

dev_target = to_categorical(
    datasets["dev"]["target"].tolist(),
    num_classes=lbl_enc.classes_.shape[0]
)

# CNN Building

## Model Constants

In [128]:
# Hyper-parameters for building the CNN.
# NOTE(review): no visible cell reads these constants; the network is loaded
# from KERAS_MODEL rather than built here. Confirm whether they are still needed.
WORD_FILTERS_LEN = [2, 3, 4, 5]
WORD_FILTER_COUNT = 128

CHAR_FILTERS_LEN = [2, 3, 4]
CHAR_FILTER_COUNT = 64
CHAR_VECTOR_SIZE = 32

ACTIVATION = "relu"
PADDING = "same"

## Keras Model Loading

In [19]:
# Load the previously trained network from disk and print its layer table.
keras_model = load_model(KERAS_MODEL)
keras_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 15, 10)]     0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 15, 10, 32)   3360        input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 15, 10, 64)   4160        time_distributed[0][0]           
______________________________________________________________________________________________

In [44]:
# Reuse the loaded network's inputs but stop at the "concatenate_1" layer, so
# predict() returns that layer's output.
pretrained_model = Model(
    inputs=keras_model.inputs,
    outputs=[keras_model.get_layer("concatenate_1").output]
)
pretrained_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 15, 10)]     0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 15, 10, 32)   3360        input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 15, 10, 64)   4160        time_distributed[0][0]           
____________________________________________________________________________________________

# Getting the transformed data

In [46]:
%%time
# Run train and dev through the truncated network; inputs are passed as
# (word ids, character ids).
train_data = pretrained_model.predict((train_word_sequences, train_char_sequences), batch_size=4096, verbose=1)
dev_data = pretrained_model.predict((dev_word_sequences, dev_char_sequences), batch_size=4096, verbose=1)

CPU times: user 19min 59s, sys: 46min 14s, total: 1h 6min 14s
Wall time: 6min 9s


In [1]:
import pandas as pd
import xgboost as xgb

from sklearn.metrics import balanced_accuracy_score

In [2]:
%%time
# Features and labels are read back from parquet files under
# ../data/spanish/metadata/.
# NOTE(review): no visible cell writes train.parquet or dev.parquet; presumably
# they hold the features computed by the truncated network. Confirm where they
# are saved.
# NOTE(review): [:475] keeps only the first 475 training rows, which looks like
# a debugging subset. Confirm before relying on the metrics below.
train_data = xgb.DMatrix(
    pd.read_parquet("../data/spanish/metadata/train.parquet").values[:475],
    label=pd.read_parquet("../data/spanish/metadata/train_target.parquet")["target"].values[:475]
)

dev_data = xgb.DMatrix(
    pd.read_parquet("../data/spanish/metadata/dev.parquet").values,
    label=pd.read_parquet("../data/spanish/metadata/dev_target.parquet")["target"].values
)

# Class names, one per line; only their count is used below (num_class).
with open("../data/spanish/metadata/classes.txt") as fh:
    classes_ = [l.strip() for l in fh.readlines()]

CPU times: user 52.4 s, sys: 15.2 s, total: 1min 7s
Wall time: 8.98 s


In [3]:
def xgb_bac(preds, dtrain):
    """Custom XGBoost metric: balanced accuracy of `preds` against the labels.

    Returns the pair ``("bacc", score)``, where the labels are taken from
    ``dtrain.get_label()``.
    """
    y_true = dtrain.get_label()
    score = balanced_accuracy_score(y_true, preds)
    return "bacc", score

In [6]:
# Booster configuration for multi-class training.
# 'silent' was dropped: XGBoost deprecated it in favour of 'verbosity', which is
# already set here, so keeping both was redundant.
param = {
    'eta': 0.3,                    # learning rate
    'max_depth': 6,                # maximum depth of each tree
    'objective': 'multi:softmax',  # predict() yields class ids, as xgb_bac needs
    'num_class': len(classes_),
    'verbosity': 1
}

In [7]:
# xgb.train stores the per-round metrics of every entry in `evals` in this dict.
eval_results = {}
# Train for 10 boosting rounds, reporting the built-in metric and the custom
# balanced-accuracy metric on both the training and the dev matrix.
model = xgb.train(
    param, 
    train_data,
    feval=xgb_bac,
    num_boost_round=10,
    evals=[(train_data, 'train'), (dev_data, 'eval')],
    evals_result=eval_results,
    verbose_eval=True
)

[0]	train-merror:0.983158	eval-merror:0.998363	train-bacc:0.004016	eval-bacc:0.000637
[1]	train-merror:0.972632	eval-merror:0.997026	train-bacc:0.007172	eval-bacc:0.00114
[2]	train-merror:0.945263	eval-merror:0.995024	train-bacc:0.015146	eval-bacc:0.00184


KeyboardInterrupt: 

# Spark 

In [1]:
%%time
# Read the same parquet files as Spark DataFrames.
# NOTE(review): `spark` is not created in any visible cell; presumably the
# kernel provides a SparkSession. Confirm.
train_data = spark.read.parquet("../data/spanish/metadata/train.parquet")
train_target = spark.read.parquet("../data/spanish/metadata/train_target.parquet")

dev_data = spark.read.parquet("../data/spanish/metadata/dev.parquet")
dev_target = spark.read.parquet("../data/spanish/metadata/dev_target.parquet")

# with open("../data/spanish/metadata/classes.txt") as fh:
#     classes_ = [l.strip() for l in fh.readlines()]

CPU times: user 7.43 ms, sys: 397 µs, total: 7.82 ms
Wall time: 3.03 s


In [6]:
# Preview the first five rows of the dev targets.
# NOTE(review): the saved output of this cell is a Py4J connection error, so the
# preview was not rendered in the recorded run.
dev_target.show(5)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:41227)
Traceback (most recent call last):
  File "/home/ccardellino/.local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ccardellino/.local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41227)

# Validation

In [None]:
# Class predictions for dev come from the full Keras network loaded earlier.
# `model` is bound to the XGBoost Booster by the training cell, and
# Booster.predict takes a DMatrix and has no batch_size/verbose arguments, so
# the Keras model is referenced by its own name here.
datasets["dev"]["predictions"] = keras_model.predict(
    (dev_word_sequences, dev_char_sequences), batch_size=1024, verbose=0
).argmax(axis=1)

In [132]:
# Balanced accuracy of the dev predictions against the encoded targets.
balanced_accuracy_score(datasets["dev"]["target"], datasets["dev"]["predictions"])



0.7593969529732915

In [127]:
# NOTE(review): same expression as the previous cell. The two saved outputs
# differ and carry execution counts 132 and 127, so they come from different
# runs; confirm which score is current.
balanced_accuracy_score(datasets["dev"]["target"], datasets["dev"]["predictions"])



0.7561903973067962