In [248]:
import ml_utils as MLU
import numpy as np
import pandas as pd
import gensim
from gensim.models import KeyedVectors
from keras.models import Model, Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding
from keras.layers import Dropout, LSTM, Bidirectional, MaxPooling1D
from keras.layers import Input, concatenate, Concatenate, Flatten
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import datetime
import time
import keras.backend as K

##### Ignore Warnings

In [2]:
import warnings
warnings.filterwarnings("ignore")

#### Load PT Word2Vec model

In [4]:
%%time
pt_w2v_patch = 'C:\word_embedding\pt_cbow_s300.txt'
pt_w2v = KeyedVectors.load_word2vec_format(pt_w2v_patch, unicode_errors="ignore")
pt_model_w2v = {w: vec for w, vec in zip(pt_w2v.index2word, pt_w2v.syn0)}
del pt_w2v

Wall time: 4min 28s


#### Load EN Word2Vec model

In [None]:
%%time
en_w2v_patch = 'C:\word_embedding\GoogleNews-vectors-negative300.bin'
en_w2v = gensim.models.Word2Vec.load_word2vec_format(en_w2v_patch, binary=True)
en_model_w2v = {w: vec for w, vec in zip(en_w2v.index2word, en_w2v.syn0)}
del en_w2v

### Prepare Data

##### Load Harem

In [144]:
df = pd.read_csv('harem.csv')

##### Load Conll2003

In [None]:
df = pd.read_csv('conll2003.csv')

### Features dicitonary dic_feat

In [143]:
dic_feat = {}
SIZE_PAD = 20

##### PT Word2vec Feature

In [145]:
SIZE_PAD = 20
w2v_ = []
sentence = []
last_code = -1
for word, code in zip(df['Word'], df['sentence_code']):
    if last_code != code:
        last_code = code
        if len(sentence) > 0:
            w2v_.append(np.array(sentence))
        sentence = []
    if str(word).lower() in pt_model_w2v:
        sentence.append(pt_model_w2v[str(word).lower()])
    else:
        sentence.append(np.zeros(300, dtype="float32"))
if len(sentence) > 0:
    w2v_.append(np.array(sentence))
w2v_ = pad_sequences(w2v_, maxlen=SIZE_PAD, dtype='float32')
dic_feat['w2v'] = w2v_
del w2v_

In [146]:
dic_feat['w2v'].shape

(4385, 20, 300)

##### EN Word2Vec Feature

In [None]:
w2v_ = []
sentence = []
last_code = -1
for word, code in zip(df['Word'], df['sentence_code']):
    if last_code != code:
        last_code = code
        if len(sentence) > 0:
            w2v_.append(np.array(sentence))
        sentence = []
    if str(word).lower() in en_model_w2v:
        sentence.append(en_model_w2v[str(word).lower()])
    else:
        sentence.append(np.zeros(300, dtype="float32"))
if len(sentence) > 0:
    w2v_.append(np.array(sentence))
w2v_ = pad_sequences(w2v_, maxlen=SIZE_PAD, dtype='float32')
dic_feat['w2v'] = w2v_
del w2v_

##### POS Feature (To-Fix)

In [13]:
data_, pos2idx = MLU.myHotEncode([[p] for p in df['Pos']])
df['Pos'] = data_
del data_


pos_ = []
sentence = []
last_code = -1
for pos, code in zip(df['Pos'], df['sentence_code']):
    if last_code != code:
        last_code = code
        if len(sentence) > 0:
            w2v_.append(np.array(sentence))
        sentence = []
    if str(word).lower() in en_model_w2v:
        sentence.append(en_model_w2v[str(word).lower()])
    else:
        sentence.append(np.zeros(300, dtype="float32"))
if len(sentence) > 0:
    w2v_.append(np.array(sentence))
w2v_ = pad_sequences(w2v_, maxlen=SIZE_PAD, dtype='float32')
dic_feat['w2v'] = w2v_
del w2v_


##### Graphic Feature (To-Fix)

In [15]:
data_, graphic2idx = MLU.myHotEncode([[p] for p in df['Graphic']])
df['Graphic'] = data_
del data_

##### BIO Class

In [148]:
bio_classes = []
for bio, classe in zip(df['bio'], df['class']):
    if classe is None or type(classe) == float or classe.strip() == '':
        bio_classes.append('O')
    else:
        bio_classes.append(classe + '-' + bio)
data_, bio2idx = MLU.myHotEncode([[p] for p in bio_classes])
bio_classes = data_
del data_

bio_ = []
sentence = []
last_code = -1
for bio, code in zip(bio_classes, df['sentence_code']):
    if last_code != code:
        last_code = code
        if len(sentence) > 0:
            bio_.append(np.array(sentence))
        sentence = []
    sentence.append(bio)
if len(sentence) > 0:
    bio_.append(np.array(sentence))
bio_ = pad_sequences(bio_, maxlen=SIZE_PAD, dtype='float32')
dic_feat['bio'] = bio_
del bio_

##### BIOSE Class (To-Fix)

In [27]:
biose_classes = []
for biose, classe in zip(df['biose'], df['class']):
    if classe is None or type(classe) == float or classe.strip() == '':
        biose_classes.append('O')
    else:
        biose_classes.append(classe + '-' + biose)
data_, biose2idx = MLU.myHotEncode([[p] for p in biose_classes])
df['biose_class'] = data_
del biose_classes
del data_

### Keras Models

In [249]:
W2V_DIM = 300
OUTPUT_DIM = 21
def exemple_lstm_model():
    """
    model = Sequential()
    model.add(LSTM(64, input_dim=1000, name='LSTM0'))
    model.add(Dense(256, activation='relu', name='Dence1'))
    model.add(Dropout(0.5, name='Droupout2'))
    model.add(Dense(7, activation='sigmoid', name='Dense_out3'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy')
    """
    
    input_w2v = Input(shape=(None, W2V_DIM), name='input_w2v')
    w2v = LSTM(W2V_DIM, name='lstm_w2v', return_sequences=True)(input_w2v)
    #w2v2 = LSTM(W2V_DIM, name='lstm_w2v2', return_sequences=True)(w2v)
    
    #output = concatenate([w2v], name='concat_inputs')
    output = Dense(256, name='dense_concat')(w2v)
    output = Dense(OUTPUT_DIM, activation='softmax', name='dense_output')(output)
    model = Model(inputs=[input_w2v], outputs=[output])
    model.compile(loss='categorical_crossentropy', optimizer='nadam')
    #model.summary()
    return model

# ToDo (Felipe) - Diferentes funções que criam diferentes modelos que utilizam 
# features distintas, como CNN (morfológica), LSTM (Word2vec), LSTM (Graphics) ...
# Criar os modelos baseados nos modelos do trabalho CNN+LSTN+Bidirecional 

### Setting benchmark

In [263]:
def run_benchmark(run_id, model_create, dic_features, features, class_column, k=5, random_state=0, 
                  metric_average="macro", epochs=10, verbose=1):
    start_benchmark = time.time()
    accuracy = []
    precision = []
    recall = []
    f1 = []
    confusion = []
    execution_time = []
    test_time = []
    
    ss = ShuffleSplit(n_splits=k, random_state=0)
    k_count = 0
    for train_indexs, test_indexs in ss.split(dic_features[features[0]]):
        k_count += 1
        print(k_count, '/', k)
        x_train = []
        for feature in features:
            x_train.append(dic_features[feature][train_indexs])
        y_train = dic_features[class_column][train_indexs]
        
        x_test = []
        for feature in features:
            x_test.append(dic_features[feature][test_indexs])
        y_test = dic_features[class_column][test_indexs]
        
        model = model_create()
        start_time = time.time()
        model_ = model.fit(x_train, y_train, verbose=verbose, epochs=epochs)
        end_time = time.time() - start_time
        execution_time.append(end_time)
                
        start_time = time.time()
        result = model.predict(x_test)
        end_time = time.time() - start_time
        test_time.append(end_time)
    
        result = np.array([np.concatenate(t) for t in result]).round()
        y_test = np.array([np.concatenate(t) for t in y_test])
        
        accuracy.append(accuracy_score(result, y_test))
        precision.append(precision_score(result, y_test, average=metric_average))
        recall.append(recall_score(result, y_test, average=metric_average))
        f1.append(f1_score(result, y_test, average=metric_average))
        confusion.append(confusion_matrix(result.argmax(axis=1), y_test.argmax(axis=1)))
    
    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)
    
    results = {
        'run_id': run_id,
        'datetime': datetime.datetime.now(),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion': confusion,
        'train_time': execution_time,
        'test_time': test_time,
        'benchmark_time': aux
    }
    
    return pd.DataFrame(results)

### Run

In [264]:
# Example
df_result = run_benchmark('example_run', exemple_lstm_model, dic_feat, ['w2v'], 'bio', epochs=10)

1 / 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2 / 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
3 / 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
4 / 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
5 / 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Run time benchmark: 245.5698537826538


In [265]:
df_result

Unnamed: 0,accuracy,benchmark_time,confusion,datetime,f1,precision,recall,run_id,test_time,train_time
0,0.111617,245.569854,"[[137, 3, 3, 1, 0, 3, 3, 0, 2, 1, 1, 2, 1, 2, ...",2019-02-13 14:31:20.248050,0.322739,0.3195,0.369766,example_run,1.391279,46.006907
1,0.111617,245.569854,"[[127, 6, 1, 0, 0, 5, 2, 0, 0, 0, 0, 0, 1, 2, ...",2019-02-13 14:31:20.248050,0.33477,0.335441,0.375181,example_run,1.415215,46.65713
2,0.102506,245.569854,"[[120, 1, 2, 0, 1, 1, 1, 3, 1, 0, 1, 0, 2, 8, ...",2019-02-13 14:31:20.248050,0.303423,0.319078,0.334228,example_run,1.497045,47.407165
3,0.116173,245.569854,"[[133, 3, 2, 2, 0, 3, 2, 0, 1, 0, 0, 1, 1, 0, ...",2019-02-13 14:31:20.248050,0.286153,0.294326,0.312007,example_run,1.711406,47.547488
4,0.09795,245.569854,"[[119, 2, 2, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",2019-02-13 14:31:20.248050,0.352037,0.366727,0.381677,example_run,1.54387,47.669465


### Save Results

#### CUIDADO PRA NÃO SALVAR EM CIMA DE UM ARQUIVO COM RESULTADOS

In [266]:
# Sempre muda o nome do arquivo pra não salvar em cima !
df_result.to_csv('results/exemplo_DATA_E_HORA_AQUI.csv', index=False)