In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from gensim.models import KeyedVectors
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
DATA_DIR = "../data"

In [3]:
train_df = pd.read_parquet(DATA_DIR + "/meli/train_reliable.parquet")
dev_df = pd.read_parquet(DATA_DIR + "/meli/dev.parquet")

In [4]:
es_train_df = train_df[train_df.language == "spanish"]
es_dev_df = dev_df[dev_df.language == "spanish"]

es_lbl_enc = LabelEncoder().fit(es_train_df.category.tolist() + es_dev_df.category.tolist())
es_train_df["target"] = es_lbl_enc.transform(es_train_df.category)
es_dev_df["target"] = es_lbl_enc.transform(es_dev_df.category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
pt_train_df = train_df[train_df.language == "portuguese"]
pt_dev_df = dev_df[dev_df.language == "portuguese"]

pt_lbl_enc = LabelEncoder().fit(pt_train_df.category.tolist() + pt_dev_df.category.tolist())
pt_train_df["target"] = pt_lbl_enc.transform(pt_train_df.category)
pt_dev_df["target"] = pt_lbl_enc.transform(pt_dev_df.category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
es_w2v = KeyedVectors.load_word2vec_format(DATA_DIR + "/spanish/spanish-word2vec.bin.gz", binary=True)
pt_w2v = KeyedVectors.load_word2vec_format(DATA_DIR + "/portuguese/portuguese-word2vec.bin.gz", binary=True)

In [7]:
def tokens_to_sequence(tokens, token_to_idx, default_value="UNK"):
    return [token_to_idx.get(token, token_to_idx[default_value]) for token in tokens]

In [8]:
MAX_SEQUENCE_LEN = 15

# Spanish

## Data Preparation

In [32]:
all_words = pd.concat([es_train_df.words, es_dev_df.words])

es_word_index = {word for words in all_words for word in words if word in es_w2v}
es_word_index = {word: idx for idx, word in enumerate(sorted(es_word_index), start=1)}
es_index_word = {idx: word for idx, word in es_word_index.items()}
es_word_index["NULL"] = 0
es_index_word[0] = "NULL"

es_word_index["UNK"] = len(es_word_index)
es_index_word[len(es_index_word)] = "UNK"

In [33]:
es_train_token_sequences = tf.keras.preprocessing.sequence.pad_sequences(es_train_df["words"].apply(
    lambda words: tokens_to_sequence(words, es_word_index)
).tolist(), maxlen=MAX_SEQUENCE_LEN)

In [34]:
es_dev_token_sequences = tf.keras.preprocessing.sequence.pad_sequences(es_dev_df["words"].apply(
    lambda words: tokens_to_sequence(words, es_word_index)
).tolist(), maxlen=MAX_SEQUENCE_LEN)

In [35]:
es_train_target = tf.keras.utils.to_categorical(es_train_df.target.tolist(),
                                                num_classes=es_lbl_enc.classes_.shape[0])

es_dev_target = tf.keras.utils.to_categorical(es_dev_df.target.tolist(),
                                                num_classes=es_lbl_enc.classes_.shape[0])

## DL Setup

In [36]:
embedding_matrix = np.zeros((len(es_word_index), es_w2v.vector_size))

for word, i in es_word_index.items():
    if word in es_w2v and word not in {"NULL", "UNK"}:
        embedding_matrix[i] = es_w2v[word]
    if word == "UNK":
        np.random.seed(42)
        embedding_matrix[i] = np.random.normal(size=(es_w2v.vector_size,))

In [37]:
embedding_layer = tf.keras.layers.Embedding(len(es_word_index),
                                            es_w2v.vector_size,
                                            weights=[embedding_matrix],
                                            input_length=MAX_SEQUENCE_LEN,
                                            trainable=False)

In [38]:
sequence_input = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LEN,))
embedded_sequences = embedding_layer(sequence_input)

### CNN Deep

In [39]:
layer = tf.keras.layers.Conv1D(128, 2, activation="relu")(embedded_sequences)
layer = tf.keras.layers.MaxPooling1D(2)(layer)
layer = tf.keras.layers.Conv1D(128, 2, activation="relu")(embedded_sequences)
layer = tf.keras.layers.MaxPooling1D(2)(layer)
layer = tf.keras.layers.Conv1D(128, 2, activation="relu")(embedded_sequences)
layer = tf.keras.layers.GlobalMaxPooling1D()(layer)
layer = tf.keras.layers.Dense(128, activation="relu")(layer)
preds = tf.keras.layers.Dense(es_lbl_enc.classes_.shape[0], activation="softmax")(layer)
model = tf.keras.models.Model(sequence_input, preds)

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 15, 300)           16699500  
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 14, 128)           76928     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 1571)              202659    
Total params: 16,995,599
Trainable params: 296,099
Non-trainable params: 16,699,500
_________________________________________

### CNN Wide

In [63]:
layer1 = tf.keras.layers.Conv1D(128, 2, activation="relu",
                                kernel_regularizer=tf.keras.regularizers.l2(0.01))(embedded_sequences)
layer1 = tf.keras.layers.GlobalMaxPooling1D()(layer1)

layer2 = tf.keras.layers.Conv1D(128, 3, activation="relu",
                                kernel_regularizer=tf.keras.regularizers.l2(0.01))(embedded_sequences)
layer2 = tf.keras.layers.GlobalMaxPooling1D()(layer2)

layer3 = tf.keras.layers.Conv1D(128, 4, activation="relu",
                                kernel_regularizer=tf.keras.regularizers.l2(0.01))(embedded_sequences)
layer3 = tf.keras.layers.GlobalMaxPooling1D()(layer3)

layer = tf.keras.layers.Concatenate()([layer1, layer2, layer3])

preds = tf.keras.layers.Dense(es_lbl_enc.classes_.shape[0], activation="softmax")(layer)
model = tf.keras.models.Model(sequence_input, preds)

model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 15, 300)      16699500    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_35 (Conv1D)              (None, 14, 128)      76928       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_36 (Conv1D)              (None, 13, 128)      115328      embedding_1[0][0]                
____________________________________________________________________________________________

### MLP

In [None]:
layer = tf.keras.layers.Flatten()(embedded_sequences)
layer = tf.keras.layers.Dense(1024, activation="relu")(layer)
layer = tf.keras.layers.Dropout(0.3)(layer)
layer = tf.keras.layers.Dense(1024, activation="relu")(layer)
layer = tf.keras.layers.Dropout(0.3)(layer)
preds = tf.keras.layers.Dense(es_lbl_enc.classes_.shape[0], activation="softmax")(layer)
model = tf.keras.models.Model(sequence_input, preds)

model.summary()

## DL Run

In [62]:
model.compile(loss='categorical_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])

model.fit(es_train_token_sequences, es_train_target, 
          validation_data=(es_dev_token_sequences, es_dev_target),
          validation_freq=[5], epochs=10, batch_size=1024)

Train on 466611 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6c2a6ecf98>

In [64]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(es_train_token_sequences, es_train_target, 
          validation_data=(es_dev_token_sequences, es_dev_target),
          validation_freq=[5], epochs=10, batch_size=1024)

Train on 466611 samples, validate on 499625 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6d53d217b8>

In [54]:
es_train_preds = model.predict(es_train_token_sequences, batch_size=1024, verbose=0)
es_dev_preds = model.predict(es_dev_token_sequences, batch_size=1024, verbose=0)

In [55]:
es_train_df["predictions"] = es_train_preds.argmax(axis=1)
es_dev_df["predictions"] = es_dev_preds.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [56]:
print(balanced_accuracy_score(es_train_df.target, es_train_df.predictions))
print(balanced_accuracy_score(es_dev_df.target, es_dev_df.predictions))

0.9653978780061945
0.3639198812381765


In [57]:
balanced_accuracy_score(es_dev_df[es_dev_df.label_quality=="reliable"].target,
                        es_dev_df[es_dev_df.label_quality=="reliable"].predictions)



0.7703692876333945