In [126]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import datetime
import nltk
import time
import io

patch_wordembedding_dt = '/home/eduardo/word_embedding/wiki.multi.nl.vec'
patch_wordembedding_en = '/home/eduardo/word_embedding/wiki.multi.en.vec'
patch_wordembedding_es = '/home/eduardo/word_embedding/wiki.multi.es.vec'
patch_wordembedding_it = '/home/eduardo/word_embedding/wiki.multi.it.vec'
patch_wordembedding_pt = '/home/eduardo/word_embedding/wiki.multi.pt.vec'

embedding_dt = None
embedding_en = None
embedding_es = None
embedding_it = None
embedding_pt = None

### Add features

In [127]:
def create_features(features, df, embedding=None):
    for feature in features:
        create_feature(feature, df, embedding)
        
def create_feature(feature, df, embedding=None):
    if feature == 'bow':
        model = CountVectorizer(analyzer='word', strip_accents=None, 
                                ngram_range=(1, 1), lowercase=True, 
                                max_features=5000)
        model.fit(df['question'])
        ret = model.transform(df['question']).toarray()
        df['bow'] = [x for x in ret]
    
    if feature == 'embedding':
        if embedding is None:
            print('embedding None')
            return
        embds = []
        for question in df['question']:
            tokens = nltk.word_tokenize(question)
            embed = []
            for token in tokens:
                if token.lower() in embedding:
                    embed.append(embedding[token.lower()])
                else:
                    embed.append(np.zeros(300))
            embds.append(embed)
        df['embedding'] = embds

### Create supervised models

In [154]:
def create_models(models):
    ret = []
    for model in models:
        m = create_model(model)
        if m is not None:
            ret.append({'name': model, 'model': m})
    return ret
    
def create_model(model):
    if model == 'svm_linear':
        return svm_linear
    if model == 'lstm':
        return lstm_default
    return None

def svm_linear():
    return LinearSVC()

def lstm_default(in_dim=300, out_dim=7, drop=0.2):
    model = Sequential()
    model.add(LSTM(512, input_dim=in_dim, name='0_LSTM'))
    model.add(Dense(1024, activation='relu', name='1_Dense'))
    model.add(Dropout(drop, name='2_Droupout'))
    model.add(Dense(out_dim, activation='sigmoid', name='3_Dense'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy')
    return model

### UTILS

In [159]:
import warnings
warnings.filterwarnings("ignore")

In [160]:
def load_embedding(emb_path, nmax=50000):
    embedding = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in embedding, 'word found twice'
            embedding[word] = vect
            if len(embedding) == nmax:
                break
    return embedding

##### Nearst Neighbors

In [161]:
def get_nn(position, embedding, K=5):
    rank = []
    for word in embedding:
        rank.append({'word': word, 'score': np.linalg.norm(position-embedding[word])})
    list.sort(rank, key= lambda x: x['score'], reverse=True)
    return rank[-K:][::-1]

## UIUC Benchmark

In [162]:
embedding = load_embedding(patch_wordembedding_en)  # Select language
dataset_train, dataset_test = load_uiuc('en')  # Select language
create_features(['embedding'], dataset_train, embedding)
create_features(['embedding'], dataset_test, embedding)
models = create_models(['lstm'])
X_train = np.array([list(x) for x in dataset_train['embedding'].values])
X_test = np.array([list(x) for x in dataset_test['embedding'].values])
X_train = pad_sequences(X_train, maxlen=12, dtype='float', padding='post', truncating='post', value=0.0)
X_test = pad_sequences(X_test, maxlen=12, dtype='float', padding='post', truncating='post', value=0.0)
y_train = dataset_train['class'].values
y_test = dataset_test['class'].values
ohe = OneHotEncoder()
y_train = ohe.fit_transform([[y_] for y_ in y_train]).toarray()
y_test = ohe.fit_transform([[y_] for y_ in y_test]).toarray()

In [None]:
run_benchmark(models, X_train, y_train, X_test, y_test, sizes_train=[1000, 2000, 3000, 4000, 5500], save='results/UIUC_en_lstm_embedding_2.csv', onehot=ohe)

  lstm .

#### Load dataset

In [158]:
def load_uiuc(language):
    # language: 'en', 'pt', 'es'
    return pd.read_csv('datasets/UIUC_' + language + '/train.csv'), pd.read_csv('datasets/UIUC_' + language + '/test.csv')

#### Benchmark

In [157]:
def run_benchmark(models, X, y, x_test, y_test, save='default.csv', 
                  sizes_train=[], start_results=None, metric_average="macro", onehot=None):
    start_benchmark = time.time()
    last_model = None
    last_train_size = None
    results = pd.DataFrame()
    if start_results is not None:
        last_model = start_results['model'].tail(1)
        last_size_train = start_results['size_train'].tail(1)
        results = start_results
    for model in models:
            
        print(' ', model['name'], end=' ')
            
        for size_train in sizes_train:
            
            if start_results is not None:
                if model in start_results['model'].unique() and last_size_train >= size_train:
                        continue
            start_results = None
            
            print('.', end='')

            x_train = X[:size_train]
            y_train = y[:size_train]

            if 'lstm' in model['name']:
                m = model['model'](out_dim=6)
                start_time = time.time()
                m.fit(x_train, y_train, verbose=0, epochs=200)
                train_time = time.time() - start_time

                start_time = time.time()
                result = m.predict(x_test)
                test_time = time.time() - start_time
                result = onehot.inverse_transform(result)
                y_test_ = onehot.inverse_transform(y_test)
            else:
                m = model['model']()
                start_time = time.time()
                m.fit(x_train, y_train)
                train_time = time.time() - start_time

                start_time = time.time()
                result = m.predict(x_test)
                test_time = time.time() - start_time
                y_test_ = y_test
            
            data = {'datetime': datetime.datetime.now(),
                          'model': model['name'],
                          'accuracy': accuracy_score(result, y_test_),
                          'precision': precision_score(result, y_test_, average=metric_average),
                          'recall': recall_score(result, y_test_, average=metric_average),
                          'f1': f1_score(result, y_test_, average=metric_average),
                          'confusion': confusion_matrix(result, y_test_),
                          'train_size': size_train,
                          'execution_time': train_time,
                          'test_time': test_time}
            results = results.append([data])
            results.to_csv(save)
    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)
    return pd.DataFrame(results)

## DISEQuA Benchmark

In [123]:
embedding = load_embedding(patch_wordembedding_es)  # Select language
dataset = load_disequa('SPA')  # Select language
create_features(['embedding'], dataset, embedding)
models = create_models(['lstm'])
X = np.array([list(x) for x in dataset['embedding'].values])
X = pad_sequences(X, maxlen=12, dtype='float', padding='post', truncating='post', value=0.0)
y = dataset['class'].values
ohe = OneHotEncoder()
y = ohe.fit_transform([[y_] for y_ in y]).toarray()

In [124]:
run_benchmark(models, X, y, sizes_train=[100,200,300,400], 
              save='results/DISEQuA_SPA_lstm_embedding.csv', onehot=ohe)

  lstm |..........|..........|..........|..........
Run time benchmark: 1553.6894555091858


Unnamed: 0,accuracy,confusion,datetime,execution_time,f1,fold,model,precision,recall,test_time,train_size
0,0.585714,"[[42, 1, 2, 0, 1, 2, 0], [6, 52, 18, 2, 12, 18...",2019-04-03 13:01:30.996886,21.10707,0.529203,0,lstm,0.544917,0.59196,7.06993,100
0,0.637143,"[[43, 1, 0, 0, 0, 0, 0], [0, 57, 7, 2, 4, 7, 7...",2019-04-03 13:02:00.374723,22.245998,0.553703,1,lstm,0.548604,0.72121,6.932064,100
0,0.665714,"[[38, 2, 1, 1, 2, 2, 1], [4, 47, 8, 6, 8, 7, 5...",2019-04-03 13:02:28.328092,20.4776,0.535619,2,lstm,0.544453,0.542451,7.281572,100
0,0.591429,"[[33, 1, 1, 0, 0, 0, 1], [4, 33, 1, 0, 2, 6, 1...",2019-04-03 13:02:57.714364,21.325567,0.500572,3,lstm,0.501752,0.535183,7.867481,100
0,0.611429,"[[34, 9, 3, 1, 2, 1, 0], [3, 39, 7, 0, 2, 1, 1...",2019-04-03 13:03:26.863669,21.1389,0.506312,4,lstm,0.519128,0.504406,7.812283,100
0,0.68,"[[42, 2, 0, 0, 0, 2, 1], [1, 51, 9, 2, 7, 4, 1...",2019-04-03 13:03:56.228102,21.692858,0.540616,5,lstm,0.561474,0.55142,7.474119,100
0,0.602857,"[[34, 8, 1, 2, 4, 4, 5], [8, 36, 5, 1, 11, 12,...",2019-04-03 13:04:25.970121,21.998751,0.452042,6,lstm,0.473417,0.490151,7.547056,100
0,0.511429,"[[33, 13, 9, 5, 5, 16, 5], [3, 38, 18, 1, 4, 4...",2019-04-03 13:04:56.561685,22.548464,0.367395,7,lstm,0.403865,0.357146,7.844875,100
0,0.674286,"[[44, 4, 0, 0, 0, 4, 0], [5, 53, 6, 2, 6, 5, 2...",2019-04-03 13:05:27.134062,22.623508,0.583353,8,lstm,0.580711,0.597111,7.756882,100
0,0.631429,"[[46, 6, 12, 1, 1, 6, 0], [3, 47, 19, 4, 6, 9,...",2019-04-03 13:05:58.129057,22.945976,0.51572,9,lstm,0.539671,0.541721,7.852409,100


In [None]:
run_benchmark(models, X, y, sizes_train=[100,200,300,400], save='results/DISEQuA_ENG_lstm_embedding.csv', onehot=ohe)

In [None]:
run_benchmark(models, X, y, sizes_train=[100,200,300,400], save='results/DISEQuA_ITA_lstm_embedding.csv', onehot=ohe)

In [None]:
run_benchmark(models, X, y, sizes_train=[100,200,300,400], save='results/DISEQuA_SPA_lstm_embedding.csv', onehot=ohe)

#### Load dataset

In [121]:
def load_disequa(language):
    df = pd.read_csv('datasets/DISEQuA/disequa.csv')
    return df[df['language'] == language]

#### Benchmark

In [136]:
def run_benchmark(models, X, y, folds=10, seed=0, save='default.csv', sizes_train=[], 
                  start_results=None, metric_average="macro", onehot=None):
    
    start_benchmark = time.time()
    
    last_model = None
    last_train_size = None
    last_fold = None
    results = pd.DataFrame()
    if start_results is not None:
        last_model = start_results['model'].tail(1)
        last_size_train = start_results['size_train'].tail(1)
        last_fold = start_results['fold'].tail(1)
        results = start_results
    
    
    
    for model in models:
            
        print(' ', model['name'], end=' ')
            
        for size_train in sizes_train:
            print('|', end='')
            size_test = len(X) - size_train
            rs = ShuffleSplit(n_splits=folds, train_size=size_train, test_size=size_test, 
                          random_state=seed)
            fold = 0
            for train_indexs, test_indexs in rs.split(X):
            
                if start_results is not None:
                    if model in start_results['model'].unique() and last_size_train >= size_train and last_fold >= fold:
                        continue
                start_results = None

                print('.', end='')

                x_train = X[train_indexs]
                y_train = y[train_indexs]
                x_test = X[test_indexs]
                y_test = y[test_indexs]

                
                if 'lstm' in model['name']:
                    m = model['model']()
                    start_time = time.time()
                    m.fit(x_train, y_train, verbose=0, epochs=100)
                    train_time = time.time() - start_time

                    start_time = time.time()
                    result = m.predict(x_test)
                    test_time = time.time() - start_time
                    result = onehot.inverse_transform(result)
                    y_test = onehot.inverse_transform(y_test)
                    
                else:
                    m = model['model']()
                    start_time = time.time()
                    m.fit(x_train, y_train)
                    train_time = time.time() - start_time

                    start_time = time.time()
                    result = m.predict(x_test)
                    test_time = time.time() - start_time

                data = {'datetime': datetime.datetime.now(),
                        'model': model['name'],
                        'accuracy': accuracy_score(result, y_test),
                        'precision': precision_score(result, y_test, average=metric_average),
                        'recall': recall_score(result, y_test, average=metric_average),
                        'f1': f1_score(result, y_test, average=metric_average),
                        'confusion': confusion_matrix(result, y_test),
                        'train_size': size_train,
                        'fold': fold,
                        'execution_time': train_time,
                        'test_time': test_time}
                results = results.append([data])
                results.to_csv(save)
                fold += 1
    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)
    return pd.DataFrame(results)