# Análisis y Preprocesamiento (continuación)

Experimento: se utilizarán 3 modelos distintos (LR - Linear SVM - MLP) utilizando los datos anotados de las muestras de vectores de palabras generados luego de aplicar las 4 estrategias de representación que se estudiaron en el trabajo de Iacobacci. Se evaluará la performance con el objetivo de seleccionar la mejor estrategia.

In [224]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, GlobalMaxPooling1D, Input, concatenate
from keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Activation
from keras import backend as K
from keras import optimizers, regularizers

### Cargamos los datos

In [196]:
word_vectors = np.load('./corpus_WiNER/word_vectors/wv_sample_exp_decay_W_5.npz')
entity_vector = np.load('./corpus_WiNER/entity_vectors/ev_sample_exp_decay_W_5.npz')

In [197]:
entities = list(entity_vector.items()[0][1])
print('# word vectors:', len(entities))
print('# non entities', entities.count('O'))
print(entities[0:10])

# word vectors: 518696
# non entities 273639
['MISC', 'MISC', 'O', 'O', 'ORG', 'ORG', 'MISC', 'MISC', 'O', 'MISC']


In [19]:
print(len(word_vectors.items()[0][1]))
print(len(entities))

518696
518696


### Dividimos los datos en train - dev - test

In [20]:
X_train, X_test, y_train, y_test = train_test_split(word_vectors.items()[0][1], entities,
                                                    test_size=0.10, 
                                                    random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.11111, 
                                                  random_state=42)
print('#train:', len(X_train))
print('#dev:', len(X_dev))
print('#test:', len(X_test))

#train: 414956
#dev: 51870
#test: 51870


In [6]:
param_grid = {
    'clf__random_state': [0],
    # parameter for LogisticRegression (smaller values -> stronger regularization)
    'clf__C': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1.0, 2.0, 5.0, 10.0]
}
params_list = list(ParameterGrid(param_grid))
len(params_list)

10

In [12]:
def evaluate(model, X, y_true):
    y_pred = model.predict(X)
    acc = metrics.accuracy_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred, average='macro')
    return {'acc': acc, 'f1': f1}

### Logistic Regression

In [8]:
pipeline = Pipeline([
    ('clf', LogisticRegression())
])
results = []
for params in params_list:
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    result = evaluate(pipeline, X_dev, y_dev)
    print(result)
    results.append({
        **result,
        **params,
    })

{'acc': 0.6209176788124157, 'f1': 0.4848253961688366}
{'acc': 0.6270676691729323, 'f1': 0.5034629240863471}
{'acc': 0.6274339695392327, 'f1': 0.5053202449733059}
{'acc': 0.627549643339117, 'f1': 0.5057058268923362}
{'acc': 0.6276267592057065, 'f1': 0.5059113812011955}
{'acc': 0.6277231540389435, 'f1': 0.5060280162201334}
{'acc': 0.6278002699055331, 'f1': 0.5061783535668727}
{'acc': 0.6278002699055331, 'f1': 0.5062461635037615}
{'acc': 0.6278002699055331, 'f1': 0.5062461635037615}
{'acc': 0.6278002699055331, 'f1': 0.5062461635037615}


In [9]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__random_state,f1
7,0.6278,2.0,0,0.506246
8,0.6278,5.0,0,0.506246
9,0.6278,10.0,0,0.506246
6,0.6278,1.0,0,0.506178
5,0.627723,0.5,0,0.506028
4,0.627627,0.3,0,0.505911
3,0.62755,0.1,0,0.505706
2,0.627434,0.05,0,0.50532
1,0.627068,0.01,0,0.503463
0,0.620918,0.001,0,0.484825


### LinearSVC

In [10]:
X_train_small, y_train_small = X_train[:8000], y_train[:8000]
X_dev_small, y_dev_small = X_dev[:1000], y_dev[:1000]
X_test_small, y_test_small = X_test[:1000], y_test[:1000]

In [13]:
model = LinearSVC()
model.fit(X_train_small, y_train_small)
results = evaluate(model, X_dev_small, y_dev_small)

In [17]:
print(results)

{'acc': 0.616, 'f1': 0.4821340726272302}


### Probemos con un Multi Layer Perceptron

In [310]:
X_train, X_test, y_train, y_test = train_test_split(word_vectors.items()[0][1], entities,
                                                    test_size=0.10, 
                                                    random_state=42)

In [311]:
print(len(X_train))
print(len(X_test))

466826
51870


In [312]:
y_train[:10]

['O', 'PER', 'O', 'LOC', 'MISC', 'O', 'ORG', 'O', 'O', 'ORG']

In [313]:
def tagToInt(tag):
    return {'O': 0, 'PER': 1, 'ORG': 2, 'LOC': 3, 'MISC': 4}[tag]    

In [314]:
y_train = [tagToInt(y) for y in y_train]
y_test = [tagToInt(y) for y in y_test]

In [315]:
y_train[:10] # this transformation is needed to apply to_categorical() keras method

[0, 1, 0, 3, 4, 0, 2, 0, 0, 2]

In [316]:
batch_size = 512  # For mini-batch gradient descent
num_classes = 5 # PER - LOC - ORG - MISC - O
epochs = 10
input_size = 300 # word vectors dimensionality
train_examples = 466826 # len(X_train)
test_examples = 51870 # len(X_test)

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [317]:
print(X_train.shape)
print(y_train.shape)

(466826, 300)
(466826, 5)


In [318]:
steps = 300 # ????
channels = 1
input_shape = (steps, channels) #3D tensor with shape: `(batch, steps, channels)`
# # Output shape
#     3D tensor with shape: `(batch, new_steps, filters)`
X_train.shape

(466826, 300)

In [319]:
X_train = np.expand_dims(X_train, 2)

In [320]:
X_train.shape

(466826, 300, 1)

### CNN Model TODO: cambiar y utilizar MLP

In [322]:
conv_filters = 10
pool_size = 3
inp = Input(shape=(X_train.shape[1],1), dtype='float64')
print(inp.shape)
# Specify each convolution layer and their kernel size i.e. n-grams 
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(inp)
btch1_1 = BatchNormalization()(conv1_1)
maxp1_1 = MaxPooling1D(pool_size=pool_size)(btch1_1)
flat1_1 = Flatten()(maxp1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(inp)
btch1_2 = BatchNormalization()(conv1_2)
maxp1_2 = MaxPooling1D(pool_size=pool_size)(btch1_2)
flat1_2 = Flatten()(maxp1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(inp)
btch1_3 = BatchNormalization()(conv1_3)
maxp1_3 = MaxPooling1D(pool_size=pool_size)(btch1_3)
flat1_3 = Flatten()(maxp1_3)

# Gather all convolution layers
cnct = concatenate([flat1_1, flat1_2, flat1_3], axis=1)
drp1 = Dropout(0)(cnct)

dns1  = Dense(128, activation='relu')(drp1)
out = Dense(num_classes, activation='softmax')(dns1)#(drp2)

model = Model(inputs=inp, outputs=out)

(?, 300, 1)


In [323]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_32 (InputLayer)           (None, 300, 1)       0                                            
__________________________________________________________________________________________________
conv1d_84 (Conv1D)              (None, 298, 10)      40          input_32[0][0]                   
__________________________________________________________________________________________________
conv1d_85 (Conv1D)              (None, 297, 10)      50          input_32[0][0]                   
__________________________________________________________________________________________________
conv1d_86 (Conv1D)              (None, 296, 10)      60          input_32[0][0]                   
__________________________________________________________________________________________________
batch_norm

In [324]:
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [325]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 420143 samples, validate on 46683 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [328]:
X_test = np.expand_dims(X_test, 2) 

### Estrategia: Decaimiento Exponencial

In [330]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.9030835823008888
Test accuracy: 0.674146905714362


### Estrategia: Decaimiento Fraccional

In [335]:
word_vectors = np.load('./corpus_WiNER/word_vectors/wv_sample_frac_decay_W_5.npz')
entity_vector = np.load('./corpus_WiNER/entity_vectors/ev_sample_frac_decay_W_5.npz')
word_vecs = word_vectors.items()[0][1]
entities = list(entity_vector.items()[0][1])

In [336]:
# Preprocessing
X_train, X_test, y_train, y_test = train_test_split(word_vecs, entities,
                                                    test_size=0.10, 
                                                    random_state=42)
y_train = [tagToInt(y) for y in y_train]
y_test = [tagToInt(y) for y in y_test]
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
X_train = np.expand_dims(X_train, 2)
X_test = np.expand_dims(X_test, 2)

In [337]:
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [338]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 420143 samples, validate on 46683 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [339]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 1.0672896337389648
Test accuracy: 0.6471563524126156


### Estrategia: Promedio

In [340]:
word_vectors = np.load('./corpus_WiNER/word_vectors/wv_sample_mean_W_5.npz')
entity_vector = np.load('./corpus_WiNER/entity_vectors/ev_sample_mean_W_5.npz')
word_vecs = word_vectors.items()[0][1]
entities = list(entity_vector.items()[0][1])

In [341]:
# Preprocessing
X_train, X_test, y_train, y_test = train_test_split(word_vecs, entities,
                                                    test_size=0.10, 
                                                    random_state=42)
y_train = [tagToInt(y) for y in y_train]
y_test = [tagToInt(y) for y in y_test]
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
X_train = np.expand_dims(X_train, 2)
X_test = np.expand_dims(X_test, 2)

In [342]:
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [343]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 420143 samples, validate on 46683 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [344]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: nan
Test accuracy: 0.5277038750860855


Conclusión: la estrategia de Decaimiento Exponencial parece ser la más apropiada.