In [1]:
import os
import ast
import itertools
import pandas as pd
import numpy as np
import keras
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, KeyedVectors, FastText
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, GlobalMaxPooling1D, Input, concatenate, Lambda
from keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Activation, Embedding
from keras import backend as K
from keras import optimizers, regularizers

Using TensorFlow backend.


# Preprocesamiento

In [None]:
def entityListFromSentence(senIdx, sen_length, art_entities_df):
    """Build the per-token entity tag list for one sentence of an article.

    Args:
        senIdx: index of the sentence inside its article; matched against the
            `senIdx` column of `art_entities_df`.
        sen_length: number of tokens in the sentence.
        art_entities_df: DataFrame with the entities of the article. It must
            provide the columns `senIdx`, `begin`, `end` and `entityType`;
            `begin`/`end` are treated as a half-open token span [begin, end).

    Returns:
        A list with exactly `sen_length` tags: the `entityType` of the span
        covering each token, or 'O' for tokens outside every span.
    """
    sen_length = int(sen_length)
    # Start from "no entity anywhere"; a sentence without entities keeps this.
    tags = ['O'] * sen_length
    sen_entities_df = art_entities_df[art_entities_df.senIdx == senIdx]
    if sen_entities_df.empty:
        return tags

    # Spans are consumed left to right, so order them explicitly instead of
    # relying on the row order of the incoming frame (stable sort keeps the
    # original order of spans that share the same `begin`).
    sen_entities_df = sen_entities_df.sort_values('begin', kind='mergesort')
    cursor = 0
    for begin, end, entity_type in zip(sen_entities_df['begin'],
                                       sen_entities_df['end'],
                                       sen_entities_df['entityType']):
        # A span never overwrites tokens already claimed by a previous span.
        start = max(cursor, int(begin))
        # Clamp to the sentence so the result always has `sen_length` tags.
        stop = min(int(end), sen_length)
        for pos in range(start, stop):
            tags[pos] = entity_type
        cursor = max(start, stop)
    return tags

### Cargamos los primeros 1500 documentos que se van a utilizar para la parte supervisada

In [None]:
# Take the first 1500 entries of each (sorted) directory listing. The two
# listings are paired position by position, so they are expected to line up.
doc_filenames = sorted(os.listdir('./corpus_WiNER/docs_df/'))[:1500]
coarseNE_filenames = sorted(os.listdir('./corpus_WiNER/coarseNE_df/'))[:1500]

# NOTE(review): read_pickle can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
docs, coarseNEs = [], []
for doc, ne in zip(doc_filenames, coarseNE_filenames):
    docs.append(pd.read_pickle('./corpus_WiNER/docs_df/' + doc))
    coarseNEs.append(pd.read_pickle('./corpus_WiNER/coarseNE_df/' + ne))

In [None]:
# Stack the per-file frames into a single documents frame and a single
# entities frame, discarding the per-file indices.
docs_df = pd.concat(docs, ignore_index=True)
coarseNE_df = pd.concat(coarseNEs, ignore_index=True)

In [None]:
# The row count of docs_df is reported as the number of sentences.
print('Cantidad de oraciones:', docs_df.shape[0])

In [None]:
# Difference between the distinct article IDs in the documents frame and the
# distinct article IDs in the entities frame.
print('Cantidad de artículos que no contienen ninguna entidad:', 
      len(docs_df.art_ID.unique()) - len(coarseNE_df.art_ID.unique()))

Nos quedamos con aquellos artículos que contienen al menos una entidad nombrada.

In [None]:
# Distinct IDs of the articles that appear in the entities frame.
art_IDs = coarseNE_df.art_ID.unique()

Extraemos una muestra aleatoria de 4000 artículos

In [None]:
# Draw a reproducible random sample of (at most) 4000 article IDs.
# np.random.permutation shuffles a copy, so `art_IDs` is left untouched and
# re-running this cell always yields the same sample; an in-place
# np.random.shuffle would reshuffle the already shuffled array on every re-run.
np.random.seed(42)
art_IDs_sample = np.random.permutation(art_IDs)[0:4000]
# Keep only the sentences and entities that belong to the sampled articles.
articles_df = docs_df[docs_df.art_ID.isin(art_IDs_sample)]
entities_df = coarseNE_df[coarseNE_df.art_ID.isin(art_IDs_sample)]
articles_df.shape

In [None]:
# Only articles with at least one entity are processed, which is why the loop
# runs over the IDs sampled from the coarseNE frame.
sentences = []
entities = []
for art_ID in tqdm(np.nditer(art_IDs_sample)):
    # reset_index makes the index values the ones handed to the entity
    # matching as senIdx, so it must happen before the mapping below.
    article_df = articles_df[articles_df.art_ID == art_ID].reset_index(drop=True)
    art_entities_df = entities_df[entities_df.art_ID == art_ID]
    article_df['sen_length'] = article_df['sentence'].map(len)

    def fun(senIdx):
        # Tag list for the sentence stored at index `senIdx` of this article.
        return entityListFromSentence(senIdx,
                                      article_df.loc[senIdx, 'sen_length'],
                                      art_entities_df)

    article_df['entities'] = article_df.index.map(fun)
    sentences.extend(article_df['sentence'])
    entities.extend(article_df['entities'])

In [None]:
# One row per sentence: the sentence next to its list of entity tags.
df = pd.DataFrame.from_dict({'sentence':sentences, 'entities':entities})
df.head()

In [None]:
def spread_words_entities(sentence, sen_entities, W, pad_token=''):
    """Expand a sentence into one fixed-size context window per token.

    Args:
        sentence: sequence of tokens.
        sen_entities: sequence with one entity tag per token of `sentence`.
        W: number of context tokens taken on each side of the target token.
        pad_token: filler used where the window runs past either end of the
            sentence. The same token is used on both sides so every window
            holds values of a single type.

    Returns:
        A list with one `(words, tag)` tuple per token, where `words` is a
        list of `2 * W + 1` items centred on that token and `tag` is the
        token's entry in `sen_entities`.
    """
    L = len(sentence)
    windows = []
    # I: index of the target word
    for I in range(L):
        # Number of positions of the window falling outside the sentence.
        left_pad = max(W - I, 0)
        right_pad = max(I + W + 1 - L, 0)
        words = [pad_token] * left_pad
        words.extend(sentence[max(I - W, 0):I + W + 1])
        words.extend([pad_token] * right_pad)
        windows.append((words, sen_entities[I]))
    return windows

In [None]:
# Flatten every sentence into its per-token windows (2 context words per side).
new_input = []
for idx, row in tqdm(df.iterrows()):
    sentence_windows = spread_words_entities(row['sentence'], row['entities'], W=2)
    new_input.extend(sentence_windows)

In [None]:
# One row per token: its window of words and its entity tag.
input_data = pd.DataFrame(new_input, columns=['words', 'entityType'])
print(input_data.shape)
input_data.head()

In [None]:
def drop_non_entities(df, frac):
    """Discard a random fraction of the non-entity rows.

    Args:
        df: DataFrame with an `entityType` column.
        frac: fraction (between 0 and 1) of the rows tagged 'O' to remove.

    Returns:
        A new DataFrame without the sampled 'O' rows. The sample uses a fixed
        random_state (77), so the same rows are chosen on every call.
    """
    is_non_entity = df['entityType'] == 'O'
    rows_to_remove = df.loc[is_non_entity].sample(frac=frac, random_state=77).index
    return df.drop(index=rows_to_remove)

In [None]:
# Discard 80% of the non-entity ('O') rows.
# NOTE(review): input_data is rebound, so re-running this cell removes a
# further 80% of the remaining 'O' rows.
input_data = drop_non_entities(input_data, 0.80)

In [None]:
# Persist the windows; list-valued cells are written as their string form.
input_data.to_csv('./corpus_WiNER/words_entity_cnn_test.csv', index=False)

## Cargamos los datos ya pre-procesados

In [2]:
# Reload the pre-processed windows and show how many rows each tag has.
input_data = pd.read_csv('./corpus_WiNER/words_entity_cnn_test.csv')
print(input_data.shape)
input_data['entityType'].value_counts()

(518696, 2)


O       273639
MISC     91899
LOC      58713
PER      57497
ORG      36948
Name: entityType, dtype: int64

In [3]:
# Preview: after the CSV round trip each 'words' cell is a string holding
# the list literal of the window.
input_data.head()

Unnamed: 0,words,entityType
0,"['', '', 'Pier', 'Francesco', ""d'Jacopo""]",PER
1,"['', 'Pier', 'Francesco', ""d'Jacopo"", 'di']",PER
2,"['Pier', 'Francesco', ""d'Jacopo"", 'di', 'Domen...",PER
3,"['Francesco', ""d'Jacopo"", 'di', 'Domenico', 'T...",PER
4,"[""d'Jacopo"", 'di', 'Domenico', 'Toschi', '-LRB-']",PER


### Dividimos los datos en train - test (el conjunto dev se separa luego con `validation_split` al entrenar)

In [4]:
# Hold out 10% of the rows as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(input_data['words'], input_data['entityType'],
                                                    test_size=0.10, 
                                                    random_state=42)
print('#train:', len(X_train))
print('#test:', len(X_test))

#train: 466826
#test: 51870


In [5]:
def tagToInt(tag):
    """Return the integer class id of an entity tag.

    'O' -> 0, 'PER' -> 1, 'ORG' -> 2, 'LOC' -> 3, 'MISC' -> 4.
    Raises KeyError for any other tag.
    """
    tag_ids = dict(zip(('O', 'PER', 'ORG', 'LOC', 'MISC'), range(5)))
    return tag_ids[tag]

In [6]:
# Replace every tag by its integer class id.
y_train = list(map(tagToInt, y_train))
y_test = list(map(tagToInt, y_test))

In [7]:
# Quick look at the first encoded labels.
y_train[:10] # this transformation is needed to apply to_categorical() keras method

[1, 0, 1, 0, 4, 0, 0, 0, 0, 4]

In [8]:
def transform_input(instances, mapping):
    """Turn serialized word windows into lists of vocabulary indices.

    Args:
        instances: iterable of strings, each one the literal of a list of
            words (parsed with ast.literal_eval).
        mapping: dictionary from word to an object exposing an `index`
            attribute.

    Returns:
        A list with one list of indices per instance. Words missing from
        `mapping` are encoded as 0.
    """
    def index_of(word):
        try:
            return mapping[word].index
        except KeyError:
            return 0  # index to '</s>' word vector

    return [[index_of(word) for word in ast.literal_eval(instance)]
            for instance in instances]

In [9]:
# Load the word vectors stored at ./models/word2vecGoogle.model.
w2v_model = KeyedVectors.load('./models/word2vecGoogle.model')

In [10]:
# Encode every training window as a list of word2vec vocabulary indices.
# NOTE(review): X_test is not encoded in this cell — confirm it goes through
# transform_input before it is fed to the model.
X_train = transform_input(X_train, w2v_model.vocab)

In [11]:
# Training hyper-parameters and dataset sizes.
batch_size = 512  # For mini-batch gradient descent
num_classes = 5 # PER - LOC - ORG - MISC - O
epochs = 10
len_words = 5
input_size = len_words # amount of words by row
train_examples = len(X_train)
test_examples = len(X_test)
# convert class vectors to binary class matrices
# NOTE(review): y_train / y_test are rebound here, so this cell is not safe
# to re-run without re-running the label encoding first.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [12]:
# Conv1D shape conventions, see:
# https://stackoverflow.com/questions/43396572/dimension-of-shape-in-conv1d
steps = 5 # number of words in each row (window)
channels = 1
input_shape = (steps, channels) # Conv1D input is a 3D tensor `(batch, steps, channels)`
# Conv1D output is a 3D tensor with shape: `(batch, new_steps, filters)`
# Turn the list of index lists into a 2D array.
X_train = np.asarray(X_train)

In [13]:
# Sanity check: (number of training rows, words per window).
X_train.shape

(466826, 5)

In [14]:
# First encoded windows; 0 marks padding or words missing from the vocabulary.
X_train[:5]

array([[     0,      0,  66867,  46891,     10],
       [   499,     11,   5221,   1393,      0],
       [     0,  36989, 246915,      0,     26],
       [   166,   1187,     19,    379,      2],
       [    11,   9867,    111,      0, 159050]])

### CNN Model

In [17]:
# Convolution settings and the model input: one row of word indices whose
# length is taken from the second dimension of X_train.
conv_filters = 10
pool_size = 2
inp = Input(shape=(X_train.shape[1],))
print(inp.shape)

(?, 5)


In [18]:
# Frozen embedding layer initialised with the loaded word vectors.
emb = Embedding(len(w2v_model.vocab),  # Vocabulary size
                w2v_model.vector_size, # Embedding size
                weights=[w2v_model.vectors], # Word vectors
                trainable=False  # This indicates the word vectors must not be changed
                                 # during training.
      )(inp)
print(emb.shape)
# The output here has shape (batch_size (?), words_per_window, embedding_size)

(?, 5, 300)


In [None]:
# lamb = Lambda(lambda xin: K.concatenate(xin, axis=1), name='embedding_concat')(emb)
# # print(lamb.shape)
# emb = K.squeeze(emb, axis=2)
# print(emb.shape)

In [19]:
# Conv1D consumes a 3D tensor `(batch, steps, channels)` and produces a 3D
# tensor `(batch, new_steps, filters)`; `new_steps` may differ from `steps`
# because of padding or strides.

def _conv_branch(embedded):
    """Conv1D (kernel size 3, ReLU) -> BatchNormalization -> MaxPooling1D -> Flatten.

    Returns the output tensor of each of the four layers, in that order.
    """
    conv = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(embedded)
    norm = BatchNormalization()(conv)
    pooled = MaxPooling1D(pool_size=pool_size)(norm)
    return conv, norm, pooled, Flatten()(pooled)

# Three parallel branches over the same embedding; all of them use
# kernel_size=3 (i.e. 3-grams).
conv1_1, btch1_1, maxp1_1, flat1_1 = _conv_branch(emb)
conv1_2, btch1_2, maxp1_2, flat1_2 = _conv_branch(emb)
conv1_3, btch1_3, maxp1_3, flat1_3 = _conv_branch(emb)

# Gather all convolution branches
cnct = concatenate([flat1_1, flat1_2, flat1_3], axis=1)
drp1 = Dropout(0)(cnct)  # rate 0: nothing is dropped

dns1 = Dense(128, activation='relu')(drp1)
out = Dense(num_classes, activation='softmax')(dns1)

In [20]:
# Assemble the model from the index input to the softmax output.
model = Model(inputs=inp, outputs=out)

In [21]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 5, 300)       900000000   input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 3, 10)        9010        embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 3, 10)        9010        embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [22]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# Adadelta is used with its default settings.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [23]:
# Train the model; validation_split=0.1 holds out 10% of the training rows
# for validation.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 420143 samples, validate on 46683 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
