In [6]:
import datetime
import time

import gensim
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding
from keras.layers import Dropout, LSTM, Bidirectional, MaxPooling1D
from keras.layers import Input, concatenate, Concatenate, Flatten
from keras.models import Model, Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score)
from sklearn.model_selection import GroupKFold

import ml_utils as MLU

##### Ignore Warnings

In [2]:
import warnings
# Install a blanket "ignore" filter so no warning is printed in later cells.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

#### Load PT Word2Vec model

In [None]:
%%time
pt_w2v_patch = 'C:\word_embedding\pt_cbow_s300.txt'
pt_w2v = KeyedVectors.load_word2vec_format(pt_w2v_patch, unicode_errors="ignore")
pt_model_w2v = {w: vec for w, vec in zip(pt_w2v.index2word, pt_w2v.syn0)}
del pt_w2v

#### Load EN Word2Vec model

In [None]:
%%time
en_w2v_patch = 'C:\word_embedding\GoogleNews-vectors-negative300.bin'
en_w2v = gensim.models.Word2Vec.load_word2vec_format(en_w2v_patch, binary=True)
en_model_w2v = {w: vec for w, vec in zip(en_w2v.index2word, en_w2v.syn0)}
del en_w2v

### Prepare Data

##### Load Harem

In [None]:
# Read harem.csv into the working DataFrame `df`
# (pandas exposes `read_csv`; there is no `open_csv`).
df = pd.read_csv('harem.csv')

##### Load Conll2003

In [None]:
# Read conll2003.csv into the working DataFrame `df`
# (pandas exposes `read_csv`; there is no `open_csv`).
# Note: this rebinds the same `df` name used by the Harem cell.
df = pd.read_csv('conll2003.csv')

##### PT Word2vec Feature

In [None]:
# Look up every token in the {word: vector} dict built by the PT loading cell;
# the KeyedVectors object `pt_w2v` is deleted there, so it cannot be indexed here.
# A token that is missing from the dict raises KeyError.
df['w2v'] = [pt_model_w2v[word] for word in df['word']]

##### EN Word2Vec Feature

In [None]:
# Look up every token in the {word: vector} dict built by the EN loading cell;
# the KeyedVectors object `en_w2v` is deleted there, so it cannot be indexed here.
# A token that is missing from the dict raises KeyError.
df['w2v'] = [en_model_w2v[word] for word in df['word']]

##### POS Feature

In [None]:
# Encode the 'pos' column with the project helper; it is given a one-element
# list of columns and returns the encoded columns plus the value -> index map.
pos_encoded, pos2idx = MLU.myHotEncode([df['pos']])
# Store the (only) encoded column under 'Pos'.
df['Pos'] = pos_encoded[0]
# Release the temporary holder.
del pos_encoded

##### Graphic Feature

In [None]:
# Encode the 'Graphic' column with the project helper; it is given a one-element
# list of columns and returns the encoded columns plus the value -> index map.
graphic_encoded, graphic2idx = MLU.myHotEncode([df['Graphic']])
# Overwrite 'Graphic' with its (only) encoded column.
df['Graphic'] = graphic_encoded[0]
# Release the temporary holder.
del graphic_encoded

##### BIO Class

In [None]:
# Build the combined label "<class>-<bio>" for every row; rows without an
# entity class get the outside tag 'O'.
# pd.isna covers both None and the NaN that read_csv produces for empty cells
# (calling .strip() on a NaN float would raise AttributeError).
df['bio_class'] = [
    'O' if pd.isna(classe) or classe.strip() == '' else classe + '-' + bio
    for bio, classe in zip(df['bio'], df['class'])
]

##### BIOSE Class

In [None]:
# Build the combined label "<class>-<biose>" for every row; rows without an
# entity class get the outside tag 'O'.
# pd.isna covers both None and the NaN that read_csv produces for empty cells
# (calling .strip() on a NaN float would raise AttributeError).
df['biose_class'] = [
    'O' if pd.isna(classe) or classe.strip() == '' else classe + '-' + biose
    for biose, classe in zip(df['biose'], df['class'])
]

### Keras Models

In [None]:
def exemple_lstm_model():
    """Build and compile a small example LSTM classifier.

    Architecture: LSTM(64) -> Dense(256, relu) -> Dropout(0.5) ->
    Dense(7, softmax), compiled with the Adam optimizer and categorical
    cross-entropy loss.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # NOTE(review): input_dim=1000 is hard-coded - confirm it matches the
    # dimensionality of the features actually fed to this model.
    model.add(LSTM(64, input_dim=1000, name='LSTM0'))
    model.add(Dense(256, activation='relu', name='Dence1'))
    model.add(Dropout(0.5, name='Droupout2'))
    # Softmax yields a probability distribution over the 7 output classes,
    # which is what categorical cross-entropy expects (the previous sigmoid
    # scored each class independently).
    model.add(Dense(7, activation='softmax', name='Dense_out3'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy')
    return model

# TODO (Felipe) - Add separate builder functions for models that use different
# features, such as CNN (morphological), LSTM (Word2vec), LSTM (Graphics) ...
# Base these models on the ones from the CNN+LSTM+Bidirectional work.

### Setting benchmark

In [None]:
def run_benchmark(run_id, model, df, features, class_column, k=5, random_state=0,
                  metric_average="macro", epochs=10, verbose=1):
    """Cross-validate ``model`` with sentence-grouped K-fold and collect metrics.

    Rows are split with ``GroupKFold`` using ``df['sentence_code']`` as the
    group, so all rows sharing a sentence code land in the same fold. For every
    fold the model is fitted on the training rows, asked to predict the test
    rows, and scored with accuracy, precision, recall, F1 and a confusion matrix.

    Args:
        run_id: Identifier copied into every row of the result.
        model: Object exposing ``fit(x, y, verbose=..., epochs=...)`` and
            ``predict(x)``. If it also exposes ``get_weights``/``set_weights``,
            its initial weights are restored before each fold.
        df: DataFrame holding the feature columns, ``class_column`` and
            ``'sentence_code'``.
        features: Column names; one input per feature is passed to the model.
        class_column: Name of the target column.
        k: Number of folds.
        random_state: Currently unused; kept for interface compatibility.
        metric_average: ``average`` argument for precision, recall and F1.
        epochs: Forwarded to ``model.fit``.
        verbose: Forwarded to ``model.fit``.

    Returns:
        pandas.DataFrame with one row per fold: per-fold metrics, train/test
        times in seconds, plus run id, timestamp and total benchmark time.

    NOTE(review): the metrics compare ``model.predict`` output directly with
    ``df[class_column]``; this assumes predict returns class labels in the same
    encoding as that column - confirm for the model being benchmarked.
    """
    start_benchmark = time.time()
    accuracy = []
    precision = []
    recall = []
    f1 = []
    confusion = []
    execution_time = []
    test_time = []

    # Snapshot the starting weights (when the model exposes them) so every fold
    # trains from the same state instead of continuing from the previous fold,
    # which would leak earlier folds' training rows into later test folds.
    # Optimizer state is not reset by this.
    initial_weights = model.get_weights() if hasattr(model, 'get_weights') else None

    gp = GroupKFold(n_splits=k)
    # split() only needs the number of rows of X, so the frame itself is passed
    # rather than relying on one particular column name.
    for train_indexs, test_indexs in gp.split(df, groups=df['sentence_code']):
        if initial_weights:
            model.set_weights(initial_weights)

        # GroupKFold yields positional indices, hence .iloc (label-based []
        # would pick wrong rows on a non-default index).
        x_train = [df[feature].iloc[train_indexs] for feature in features]
        y_train = df[class_column].iloc[train_indexs]

        x_test = [df[feature].iloc[test_indexs] for feature in features]
        y_test = df[class_column].iloc[test_indexs]

        start_time = time.time()
        # Keras' fit() returns a History object, not the model, so the model
        # itself is used for prediction below.
        model.fit(x_train, y_train, verbose=verbose, epochs=epochs)
        execution_time.append(time.time() - start_time)

        start_time = time.time()
        result = model.predict(x_test)
        test_time.append(time.time() - start_time)

        # scikit-learn metrics take (y_true, y_pred); the reversed order swaps
        # precision with recall and transposes the confusion matrix.
        accuracy.append(accuracy_score(y_test, result))
        precision.append(precision_score(y_test, result, average=metric_average))
        recall.append(recall_score(y_test, result, average=metric_average))
        f1.append(f1_score(y_test, result, average=metric_average))
        confusion.append(confusion_matrix(y_test, result))

    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)

    results = {
        'run_id': run_id,
        'datetime': datetime.datetime.now(),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion': confusion,
        'train_time': execution_time,
        'test_time': test_time,
        'benchmark_time': aux
    }

    return pd.DataFrame(results)

### Run

In [None]:
# Example: benchmark the sample LSTM model on the 'w2v' feature against the
# 'bio_class' target, leaving every other option at its default.
df_result = run_benchmark(
    run_id='example_run',
    model=exemple_lstm_model(),
    df=df,
    features=['w2v'],
    class_column='bio_class',
)

### Save Results

#### CUIDADO PRA NÃO SALVAR EM CIMA DE UM ARQUIVO COM RESULTADOS

In [None]:
# Always change the file name so existing results are never overwritten!
# mode='x' (exclusive creation) makes the write fail with FileExistsError
# instead of silently replacing a results file that is already there.
df_result.to_csv('results/exemplo_070219.csv', index=False, mode='x')