In [10]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import matthews_corrcoef
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC, SVR
from keras.models import Sequential
import tensorflow as tf
import pandas as pd
import numpy as np
import datetime
import random
import keras
import nltk
import time
import io
import os

# Absolute paths to the per-language word-embedding text files (".vec").
# NOTE(review): these point into one user's home directory, so they only
# resolve on that machine. Nothing in the visible part of the notebook reads
# these names; the benchmark cells rebuild the same path inline.
patch_wordembedding_dt = '/home/eduardo/word_embedding/wiki.multi.nl.vec'  # "nl" file (presumably Dutch)
patch_wordembedding_en = '/home/eduardo/word_embedding/wiki.multi.en.vec'
patch_wordembedding_es = '/home/eduardo/word_embedding/wiki.multi.es.vec'
patch_wordembedding_it = '/home/eduardo/word_embedding/wiki.multi.it.vec'
patch_wordembedding_pt = '/home/eduardo/word_embedding/wiki.multi.pt.vec'

# Placeholders, one per language; all start unset (None).
# presumably meant to cache the loaded embeddings — verify against later cells.
embedding_dt = None
embedding_en = None
embedding_es = None
embedding_it = None
embedding_pt = None

### Add features

In [11]:
def _embedding_dim(embedding, default=300):
    """Return the length of one vector stored in ``embedding``, or ``default`` if it cannot be read."""
    try:
        return len(next(iter(embedding.values())))
    except (AttributeError, StopIteration, TypeError):
        # Not a dict-like with values(), or empty: keep the historical width.
        return default


def _vectorize_questions(vectorizer, df_fit, df_out, column):
    """Fit ``vectorizer`` on ``df_fit['question']`` and store one dense row per question in ``df_out[column]``."""
    vectorizer.fit(df_fit['question'])
    dense = vectorizer.transform(df_out['question']).toarray()
    df_out[column] = list(dense)


def create_feature(feature, df, df_2, embedding=None):
    """Add a feature column to ``df_2`` in place, computed from its 'question' column.

    Args:
        feature: one of
            'bow'          -> column 'bow', word counts, at most 5000 features;
            'tfidf'        -> column 'tfidf', TF-IDF, at most 5000 features;
            'tfidf_3gram'  -> column 'tfidf' (same column as 'tfidf'), TF-IDF
                              with ngram_range=(1, 2), at most 80000 features;
            'embedding'    -> column 'embedding', one list of word vectors
                              per question.
            Any other value leaves ``df_2`` untouched.
        df: frame whose 'question' column the vectorizer is fitted on
            (unused for 'embedding').
        df_2: frame that is transformed and receives the new column.
        embedding: word -> vector mapping, required only for 'embedding'.

    Returns:
        None. If ``feature == 'embedding'`` and ``embedding`` is None, an
        error message is printed and nothing is added.
    """
    if feature == 'bow':
        _vectorize_questions(
            CountVectorizer(analyzer='word', strip_accents=None,
                            ngram_range=(1, 1), lowercase=True,
                            max_features=5000),
            df, df_2, 'bow')

    elif feature == 'tfidf':
        _vectorize_questions(
            TfidfVectorizer(analyzer='word', strip_accents=None,
                            ngram_range=(1, 1), lowercase=True,
                            max_features=5000),
            df, df_2, 'tfidf')

    elif feature == 'tfidf_3gram':
        # NOTE(review): despite the name, the range is (1, 2), i.e. unigrams
        # and bigrams. Left unchanged so existing results stay comparable.
        _vectorize_questions(
            TfidfVectorizer(analyzer='word', strip_accents=None,
                            ngram_range=(1, 2), lowercase=True,
                            max_features=80000),
            df, df_2, 'tfidf')

    elif feature == 'embedding':
        if embedding is None:
            print('Error: embedding None')
            return
        # Out-of-vocabulary tokens get a zero vector of the same width as the
        # real vectors (previously hard-coded to 300).
        dim = _embedding_dim(embedding)
        rows = []
        for question in df_2['question']:
            vectors = []
            for token in nltk.word_tokenize(question):
                key = token.lower()
                if key in embedding:
                    vectors.append(embedding[key])
                else:
                    vectors.append(np.zeros(dim))
            rows.append(vectors)
        df_2['embedding'] = rows

### Create supervised models

In [None]:
def svm_linear():
    """Build a new, unfitted ``LinearSVC`` with its default settings."""
    classifier = LinearSVC()
    return classifier

def lstm_default(in_dim=300, out_dim=7, drop=0.2):
    """Build and compile the LSTM classifier.

    Stack: LSTM(256) -> Dropout -> Dense(128, relu) -> Dropout ->
    Dense(out_dim, softmax), compiled with Adam (lr=0.01) and categorical
    cross-entropy.

    Args:
        in_dim: size of each input vector fed to the LSTM.
        out_dim: number of output units of the softmax layer.
        drop: dropout rate used by both Dropout layers.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Layer names are kept exactly as they were (including their spelling).
    stack = [
        LSTM(256, input_dim=in_dim, name='0_LSTM'),
        Dropout(drop, name='1_Droupout'),
        Dense(128, activation='relu', name='2_Dense'),
        Dropout(drop, name='3_Droupout'),
        Dense(out_dim, activation='softmax', name='4_Dense'),
    ]
    network = Sequential()
    for layer in stack:
        network.add(layer)
    # A decay of 0.0001 was noted next to this optimizer but is not applied.
    network.compile(optimizer=keras.optimizers.Adam(lr=0.01),
                    loss='categorical_crossentropy')
    return network

def random_forest():
    """Build a new, unfitted ``RandomForestClassifier`` with 500 trees."""
    forest = RandomForestClassifier(n_estimators=500)
    return forest

def mlp(in_dim=5000, out_dim=7, drop=0.65):
    """Build and compile the feed-forward classifier.

    Stack: Dense(128, relu) -> Dropout -> Dense(64, relu) -> Dropout ->
    Dense(out_dim, softmax), compiled with the 'Adam' optimizer and
    categorical cross-entropy.

    Args:
        in_dim: number of input features.
        out_dim: number of output units of the softmax layer.
        drop: dropout rate used by both Dropout layers.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    stack = [
        Dense(128, input_dim=in_dim, activation='relu'),
        Dropout(drop),
        Dense(64, activation='relu'),
        Dropout(drop),
        Dense(out_dim, activation='softmax'),
    ]
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(loss='categorical_crossentropy', optimizer='Adam')
    return network

### UTILS

In [13]:
# Silence every Python warning for the rest of the session.
# NOTE(review): with no category given this hides all warning types,
# deprecation warnings included — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [14]:
def load_embedding(emb_path, nmax=50000):
    """Load word vectors from a text embedding file into a dict.

    The first line of the file is skipped as a header. Every following
    non-blank line is read as ``<word> <v1> <v2> ...`` (space separated).

    Args:
        emb_path: path of the embedding file (read as UTF-8, undecodable
            bytes are ignored).
        nmax: stop once this many words have been loaded. A value that is
            never reached (including ``None``) loads the whole file.

    Returns:
        dict mapping each word to a 1-D numpy array. Empty for an empty file.

    Raises:
        AssertionError: if the same word appears twice.
        ValueError: if a non-blank line has no vector after the word.
    """
    embedding = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the header; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word, vect = line.split(' ', 1)
            assert word not in embedding, 'word found twice'
            embedding[word] = np.fromstring(vect, sep=' ')
            if len(embedding) == nmax:
                break
    return embedding

##### Nearest Neighbors

In [15]:
def get_nn(position, embedding, K=5):
    """Return the ``K`` words of ``embedding`` closest to ``position``.

    Distance is the Euclidean norm of ``position - embedding[word]``.

    Args:
        position: query vector.
        embedding: word -> vector mapping.
        K: number of neighbours wanted. ``K <= 0`` yields an empty list
            (``K=0`` used to return every word because ``rank[-0:]`` is the
            whole list).

    Returns:
        List of ``{'word': ..., 'score': ...}`` dicts, nearest first. Holds
        fewer than ``K`` entries when the embedding is smaller than ``K``.
    """
    if K <= 0:
        return []
    rank = [{'word': word, 'score': np.linalg.norm(position - embedding[word])}
            for word in embedding]
    # Farthest first, then take the tail reversed: nearest first. Kept in
    # this form so ties come out in the same order as before.
    rank.sort(key=lambda entry: entry['score'], reverse=True)
    return rank[-K:][::-1]

## Benchmark UIUC

In [31]:
def run_benchmark(model, X, y, x_test, y_test, sizes_train=(), runs=30, save='default.csv',
                  metric_average="macro", onehot=None, out_dim=6, epochs=10):
    """Train and evaluate ``model`` on growing prefixes of the training data.

    For every ``size_train`` the first ``size_train`` rows of ``X``/``y`` are
    used for training, ``runs`` times, and each run is scored on the fixed
    test set. The accumulated results are rewritten to ``save`` after every
    run, so a partial file exists if the benchmark is interrupted.

    NOTE(review): a later cell of this notebook defines another function
    with the same name, which replaces this one once that cell has run.

    Args:
        model: dict with 'name' (str) and 'model' (zero-argument factory, or
            a factory accepting ``out_dim`` when the name contains 'lstm' or
            'mlp').
        X, y: training features and labels; sliced with ``[:size_train]``.
        x_test, y_test: test features and labels.
        sizes_train: iterable of training-set sizes.
        runs: repetitions per size. The training slice is identical in every
            run; only the model's own randomness differs.
        save: CSV path the results are written to.
        metric_average: ``average`` argument for precision/recall/F1.
        onehot: fitted one-hot encoder; required for 'lstm'/'mlp' models,
            whose labels are one-hot and whose predictions are decoded with
            ``onehot.inverse_transform``.
        out_dim: unused; kept for backward compatibility (the output size is
            taken from ``onehot.categories_``).
        epochs: training epochs for 'lstm'/'mlp' models.

    Returns:
        ``pandas.DataFrame`` with one row per run (index 0..n-1).

    Raises:
        ValueError: if an 'lstm'/'mlp' model is run without ``onehot``.
    """
    start_benchmark = time.time()
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    records = []
    is_neural = 'lstm' in model['name'] or 'mlp' in model['name']

    for size_train in sizes_train:

        print('\n'+str(size_train), end='|')
        x_train = X[:size_train]
        y_train = y[:size_train]

        for run in range(runs):
            print('.', end='')

            if is_neural:
                if onehot is None:
                    raise ValueError("onehot encoder is required for 'lstm'/'mlp' models")
                m = model['model'](out_dim=len(onehot.categories_[0]))
                start_time = time.time()
                m.fit(x_train, y_train, verbose=0, epochs=epochs)
                train_time = time.time() - start_time

                start_time = time.time()
                result = m.predict(x_test)
                test_time = time.time() - start_time
                # Replace NaN/inf scores before decoding them to class labels.
                result = np.nan_to_num(result)
                result = onehot.inverse_transform(result)
                y_true = onehot.inverse_transform(y_test)
            else:
                m = model['model']()
                start_time = time.time()
                m.fit(x_train, y_train)
                train_time = time.time() - start_time

                start_time = time.time()
                result = m.predict(x_test)
                test_time = time.time() - start_time
                y_true = y_test

            # scikit-learn metrics take (y_true, y_pred). The arguments used
            # to be swapped, which exchanged precision with recall and
            # transposed the confusion matrix.
            records.append({'datetime': datetime.datetime.now(),
                            'model': model['name'],
                            'accuracy': accuracy_score(y_true, result),
                            'precision': precision_score(y_true, result, average=metric_average),
                            'recall': recall_score(y_true, result, average=metric_average),
                            'f1': f1_score(y_true, result, average=metric_average),
                            'mcc': matthews_corrcoef(y_true, result),
                            'confusion': confusion_matrix(y_true, result),
                            'run': run + 1,
                            'train_size': size_train,
                            'execution_time': train_time,
                            'test_time': test_time})
            # Checkpoint after every run.
            pd.DataFrame(records).to_csv(save)
    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)
    return pd.DataFrame(records)

#### Load dataset

In [17]:
def load_uiuc(language):
    """Read the UIUC train and test CSV files for ``language``.

    Args:
        language: language code used in the directory name
            ('en', 'pt' or 'es').

    Returns:
        Tuple ``(train, test)`` of DataFrames read from
        ``datasets/UIUC_<language>/train.csv`` and ``.../test.csv``.
    """
    base_dir = 'datasets/UIUC_' + language
    train_df = pd.read_csv(base_dir + '/train.csv')
    test_df = pd.read_csv(base_dir + '/test.csv')
    return train_df, test_df

## Run UIUC Benchmark

#### LSTM + WordEmbedding

In [None]:
for language in ['pt']:
    print('\n\nLanguage: ', language)
    embedding = load_embedding('/home/eduardo/word_embedding/wiki.multi.' + language + '.vec')
    dataset_train, dataset_test = load_uiuc(language)
    create_feature('embedding', dataset_train, dataset_train, embedding)
    create_feature('embedding', dataset_train, dataset_test, embedding)
    model = {'name': 'lstm', 'model': lstm_default}
    # Questions have different token counts, so the per-question vector lists
    # are ragged. They go to pad_sequences as plain lists: wrapping them in
    # np.array first raises on NumPy >= 1.24 (inhomogeneous shape).
    # Every question is cut or zero-padded to 12 token vectors.
    X_train = pad_sequences(dataset_train['embedding'].tolist(), maxlen=12, dtype='float',
                            padding='post', truncating='post', value=0.0)
    X_test = pad_sequences(dataset_test['embedding'].tolist(), maxlen=12, dtype='float',
                           padding='post', truncating='post', value=0.0)
    y_train = dataset_train['class'].values
    y_test = dataset_test['class'].values
    # Only the coarse 'class' label is benchmarked here; an earlier
    # commented-out experiment on 'sub_class' was removed from this cell.
    # The encoder is fitted on the training labels only.
    ohe = OneHotEncoder()
    y_train = ohe.fit_transform(y_train.reshape(-1, 1)).toarray()
    y_test = ohe.transform(y_test.reshape(-1, 1)).toarray()
    run_benchmark(model, X_train, y_train, X_test, y_test, sizes_train=[1000, 2000, 3000, 4000, 5500],
                  runs=30, save='results/UIUC_lstm_embedding_' + language + '.csv', epochs=100, onehot=ohe)

#### SVM + TF-IDF

In [None]:
for language in ['en', 'es', 'pt']:
    print('\n\nLanguage: ', language)
    dataset_train, dataset_test = load_uiuc(language)
    # The TF-IDF vocabulary is fitted on the training questions and applied
    # to both splits. No embedding is passed: 'tfidf' never reads it, and the
    # global `embedding` only exists if the LSTM cell was run first.
    create_feature('tfidf', dataset_train, dataset_train)
    create_feature('tfidf', dataset_train, dataset_test)
    model = {'name': 'svm', 'model': svm_linear}
    X_train = np.array([list(x) for x in dataset_train['tfidf'].values])
    X_test = np.array([list(x) for x in dataset_test['tfidf'].values])
    y_train = dataset_train['class'].values
    y_test = dataset_test['class'].values
    run_benchmark(model, X_train, y_train, X_test, y_test, sizes_train=[1000, 2000, 3000, 4000, 5500],
                  save='results/UIUC_svm_tfidf_' + language + '.csv')

#### RFC + TF-IDF

In [None]:
for language in ['en', 'es', 'pt']:
    print('\n\nLanguage: ', language)
    dataset_train, dataset_test = load_uiuc(language)
    # The TF-IDF vocabulary is fitted on the training questions and applied
    # to both splits. No embedding is passed: 'tfidf' never reads it, and the
    # global `embedding` only exists if the LSTM cell was run first.
    create_feature('tfidf', dataset_train, dataset_train)
    create_feature('tfidf', dataset_train, dataset_test)
    model = {'name': 'rfc', 'model': random_forest}
    X_train = np.array([list(x) for x in dataset_train['tfidf'].values])
    X_test = np.array([list(x) for x in dataset_test['tfidf'].values])
    y_train = dataset_train['class'].values
    y_test = dataset_test['class'].values
    run_benchmark(model, X_train, y_train, X_test, y_test, sizes_train=[1000, 2000, 3000, 4000, 5500],
                  save='results/UIUC_rfc_tfidf_' + language + '.csv')

#### SVM + TFIDF_3gram + SKB

In [38]:
for language in ['en', 'es', 'pt']:
    print('\n\nLanguage: ', language)
    dataset_train, dataset_test = load_uiuc(language)
    # Vocabulary fitted on the training questions, applied to both splits.
    create_feature('tfidf_3gram', dataset_train, dataset_train)
    create_feature('tfidf_3gram', dataset_train, dataset_test)
    model = {'name': 'svm_skb', 'model': svm_linear}
    X_train = np.array(dataset_train['tfidf'].tolist())
    X_test = np.array(dataset_test['tfidf'].tolist())
    y_train = dataset_train['class'].values
    y_test = dataset_test['class'].values

    # Integer code per class, numbered in order of first appearance.
    classes = list(dataset_train['class'].unique())
    y_train_ = list(map(classes.index, y_train))

    # Keep the 5000 features with the highest chi-squared score.
    # NOTE(review): the selector sees the whole training set, including rows
    # beyond the smaller `sizes_train` prefixes benchmarked below.
    print('SKB Start')
    skb = SelectKBest(chi2, k=5000)
    skb.fit(X_train, y_train_)
    X_train = skb.transform(X_train)
    X_test = skb.transform(X_test)
    print('SKB End')

    run_benchmark(model, X_train, y_train, X_test, y_test, sizes_train=[1000, 2000, 3000, 4000, 5500],
                  runs=1, save='results/UIUC_svm_skb_tfidf3gram_' + language + '.csv')



Language:  en
SKB Start
SKB End

1000|.
2000|.
3000|.
4000|.
5500|.
Run time benchmark: 1.2348272800445557


Language:  es
SKB Start
SKB End

1000|.
2000|.
3000|.
4000|.
5500|.
Run time benchmark: 1.5204951763153076


Language:  pt
SKB Start
SKB End

1000|.
2000|.
3000|.
4000|.
5500|.
Run time benchmark: 1.346195936203003


## DISEQuA Benchmark

#### Benchmark

In [39]:
def run_benchmark(model, X, y, folds=10, save='default.csv', sizes_train=(),
                  start_results=None, metric_average="macro", onehot=None,
                  epochs=100, random_state=None):
    """Benchmark ``model`` with repeated random train/test splits of ``X``/``y``.

    For every ``size_train``, ``ShuffleSplit`` draws ``folds`` random splits
    with ``size_train`` training rows and all remaining rows as test set. The
    accumulated results are rewritten to ``save`` after every fold.

    NOTE(review): this definition has the same name as the UIUC benchmark
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        model: dict with 'name' (str) and 'model' (zero-argument factory, or
            a factory accepting ``out_dim`` when the name contains 'lstm' or
            'mlp').
        X, y: features and labels; must support indexing by index arrays.
        folds: number of random splits per training size.
        save: CSV path the results are written to.
        sizes_train: iterable of training-set sizes.
        start_results: unused; kept for backward compatibility.
        metric_average: ``average`` argument for precision/recall/F1.
        onehot: fitted one-hot encoder; required for 'lstm'/'mlp' models.
        epochs: training epochs for 'lstm'/'mlp' models (previously fixed
            at 100).
        random_state: forwarded to ``ShuffleSplit`` so splits can be made
            reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns:
        ``pandas.DataFrame`` with one row per fold (index 0..n-1).

    Raises:
        ValueError: if an 'lstm'/'mlp' model is run without ``onehot``.
    """
    start_benchmark = time.time()
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    records = []
    is_neural = 'lstm' in model['name'] or 'mlp' in model['name']

    for size_train in sizes_train:
        print('\n'+str(size_train)+'|', end='')
        size_test = len(X) - size_train
        rs = ShuffleSplit(n_splits=folds, train_size=size_train, test_size=size_test,
                          random_state=random_state)
        for fold, (train_indexs, test_indexs) in enumerate(rs.split(X)):
            print('.', end='')
            x_train = X[train_indexs]
            y_train = y[train_indexs]
            x_test = X[test_indexs]
            y_test = y[test_indexs]

            if is_neural:
                if onehot is None:
                    raise ValueError("onehot encoder is required for 'lstm'/'mlp' models")
                # Size the output layer from the encoder instead of relying
                # on the factory's default number of classes.
                m = model['model'](out_dim=len(onehot.categories_[0]))
                start_time = time.time()
                m.fit(x_train, y_train, verbose=0, epochs=epochs)
                train_time = time.time() - start_time

                start_time = time.time()
                result = m.predict(x_test)
                test_time = time.time() - start_time
                result = onehot.inverse_transform(result)
                y_true = onehot.inverse_transform(y_test)
            else:
                m = model['model']()
                start_time = time.time()
                m.fit(x_train, y_train)
                train_time = time.time() - start_time

                start_time = time.time()
                result = m.predict(x_test)
                test_time = time.time() - start_time
                y_true = y_test

            # scikit-learn metrics take (y_true, y_pred). The arguments used
            # to be swapped, which exchanged precision with recall and
            # transposed the confusion matrix.
            records.append({'datetime': datetime.datetime.now(),
                            'accuracy': accuracy_score(y_true, result),
                            'precision': precision_score(y_true, result, average=metric_average),
                            'recall': recall_score(y_true, result, average=metric_average),
                            'f1': f1_score(y_true, result, average=metric_average),
                            'confusion': confusion_matrix(y_true, result),
                            'train_size': size_train,
                            'fold': fold,
                            'execution_time': train_time,
                            'test_time': test_time})
            # Checkpoint after every fold.
            pd.DataFrame(records).to_csv(save)
    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)
    return pd.DataFrame(records)

#### Load dataset

In [40]:
def load_disequa(language):
    """Read the DISEQuA CSV and return the rows of one language.

    Args:
        language: value matched against the 'language' column
            (the benchmark cells use 'DUT', 'ENG', 'ITA' and 'SPA').

    Returns:
        An independent DataFrame holding the matching rows, with their
        original row index. It is a copy because callers add feature columns
        to it in place; writing into a filtered selection is a
        chained-assignment hazard (SettingWithCopyWarning).
    """
    df = pd.read_csv('datasets/DISEQuA/disequa.csv')
    return df[df['language'] == language].copy()

### RUN DISEQuA Benchmark

##### SVM + TFIDF

In [None]:
for language in ['DUT', 'ENG', 'ITA', 'SPA']:
    print('\n\nLanguage: ', language)
    dataset = load_disequa(language)
    # No embedding is passed: 'tfidf' never reads it, and the global
    # `embedding` only exists if an embedding cell was run first.
    # NOTE(review): the vocabulary is fitted on every row, before
    # run_benchmark splits the data into train and test.
    create_feature('tfidf', dataset, dataset)
    model = {'name': 'svm', 'model': svm_linear}
    X = np.array([list(x) for x in dataset['tfidf'].values])
    y = dataset['class'].values
    run_benchmark(model, X, y, sizes_train=[100,200,300,400],
                  save='results/DISEQuA_svm_tfidf_' + language + '.csv')

##### RFC + TFIDF

In [None]:
for language in ['DUT', 'ENG', 'ITA', 'SPA']:
    print('\n\nLanguage: ', language)
    dataset = load_disequa(language)
    # No embedding is passed: 'tfidf' never reads it, and the global
    # `embedding` only exists if an embedding cell was run first.
    # NOTE(review): the vocabulary is fitted on every row, before
    # run_benchmark splits the data into train and test.
    create_feature('tfidf', dataset, dataset)
    model = {'name': 'rfc', 'model': random_forest}
    X = np.array([list(x) for x in dataset['tfidf'].values])
    y = dataset['class'].values
    run_benchmark(model, X, y, sizes_train=[100,200,300,400],
                  save='results/DISEQuA_rfc_tfidf_' + language + '.csv')

##### SVM + TFIDF_3gram + SKB

In [None]:
for language in ['DUT', 'ENG', 'ITA', 'SPA']:
    print('\n\nLanguage: ', language)
    dataset = load_disequa(language)
    create_feature('tfidf_3gram', dataset, dataset)
    model = {'name': 'svm', 'model': svm_linear}
    X = np.array(dataset['tfidf'].tolist())
    y = dataset['class'].values
    # Keep the 2000 features with the highest chi-squared score.
    # NOTE(review): the selector is fitted on every row and label, before
    # run_benchmark splits the data, so test labels influence the selection.
    skb = SelectKBest(chi2, k=2000)
    skb.fit(X, y)
    X = skb.transform(X)
    run_benchmark(model, X, y, sizes_train=[100,200,300,400],
                  save='results/DISEQuA_svm_tfidf_3gram_' + language + '.csv')



Language:  DUT

100|..........
200|..........
300|..........
400|..........
Run time benchmark: 1.1172964572906494


Language:  ENG

100|..........
200|..........
300|..........
400|..........
Run time benchmark: 1.2064998149871826


Language:  ITA

100|..........
200|..........
300|..........
400|..........

##### LSTM + Embedding

In [None]:
# Each DISEQuA language code is paired with the code used in the embedding
# file name.
for language, embd_l in zip(['SPA'], ['es']):
    print('\n\nLanguage: ', language)
    embedding = load_embedding('/home/eduardo/word_embedding/wiki.multi.' + embd_l + '.vec')
    dataset = load_disequa(language)
    create_feature('embedding', dataset, dataset, embedding)
    model = {'name': 'lstm', 'model': lstm_default}
    y = dataset['class'].values
    # Questions have different token counts, so the per-question vector lists
    # are ragged. They go to pad_sequences as plain lists: wrapping them in
    # np.array first raises on NumPy >= 1.24 (inhomogeneous shape).
    # Every question is cut or zero-padded to 12 token vectors.
    X = pad_sequences(dataset['embedding'].tolist(), maxlen=12, dtype='float',
                      padding='post', truncating='post', value=0.0)
    ohe = OneHotEncoder()
    y = ohe.fit_transform(y.reshape(-1, 1)).toarray()
    run_benchmark(model, X, y, sizes_train=[100,200,300,400], onehot=ohe,
                  save='results/DISEQuA_lstm_embedding_' + language + '.csv')