In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import time
import datetime

## UIUC Benchmark

In [15]:
dataset = load_dataset('pt')
create_features(['bow'], dataset['train'], dataset['test'])
models = create_models(['svm_linear'])
X = np.array([list(x) for x in dataset['train']['bow'].values])
y = dataset['train']['class'].values
X_ = np.array([list(x) for x in dataset['test']['bow'].values])
y_ = dataset['test']['class'].values

In [16]:
run_benchmark(models, X, y, X_, y_, sizes_train=[1000, 2000, 3000, 4000, 5500], save='results/pt_svm_bow.csv')

  svm_linear .....
Run time benchmark: 0.7749648094177246


Unnamed: 0,accuracy,confusion,datetime,execution_time,f1,model,precision,recall,test_time,train_size
0,0.776,"[[3, 2, 0, 0, 0, 0], [6, 131, 25, 4, 15, 15], ...",2019-03-28 16:10:05.544153,0.031757,0.721988,svm_linear,0.698136,0.77423,0.005424,1000
0,0.816,"[[3, 0, 0, 0, 0, 0], [6, 130, 21, 4, 13, 6], [...",2019-03-28 16:10:05.643786,0.082604,0.762654,svm_linear,0.732438,0.858675,0.004861,2000
0,0.834,"[[3, 2, 0, 0, 0, 0], [6, 129, 16, 2, 8, 4], [0...",2019-03-28 16:10:05.781466,0.118417,0.769136,svm_linear,0.751979,0.803127,0.004499,3000
0,0.85,"[[4, 2, 0, 0, 0, 0], [5, 130, 18, 3, 6, 5], [0...",2019-03-28 16:10:05.989715,0.188642,0.80091,svm_linear,0.782601,0.831609,0.004568,4000
0,0.852,"[[5, 2, 0, 0, 0, 0], [4, 128, 16, 2, 7, 4], [0...",2019-03-28 16:10:06.266016,0.255941,0.815029,svm_linear,0.801578,0.835882,0.004606,5500


In [52]:
y.shape

(5452, 15)

#### Open dataset

In [5]:
def load_dataset(language):
    # language: 'en', 'pt', 'es'
    dataset = {
        'train': pd.read_csv('datasets/UIUC_' + language + '/train.csv'),
        'test': pd.read_csv('datasets/UIUC_' + language + '/test.csv'),
    }
    return dataset

#### Add Features

In [6]:
 # features: ['bow', ]
def create_features(features, df_train_, df_test_):
    for feature in features:
        create_feature(feature, df_train_, df_test_)
        
def create_feature(feature, df_train_, df_test_):
    if feature == 'bow':
        model = CountVectorizer(analyzer='word', strip_accents=None, 
                                ngram_range=(1, 1), lowercase=True, 
                                max_features=5000)
        model.fit(pd.concat([df_train_['question'], df_test_['question']]))
        ret = model.transform(df_train_['question']).toarray()
        df_train_['bow'] = [x for x in ret]
        ret = model.transform(df_test_['question']).toarray()
        df_test_['bow'] = [x for x in ret]    

#### Create supervised models

In [7]:
def create_models(models):
    ret = []
    for model in models:
        m = create_model(model)
        if m is not None:
            ret.append({'name': model, 'model': m})
    return ret
    
def create_model(model):
    if model == 'svm_linear':
        return svm_linear
    return None

def svm_linear():

    return LinearSVC()

### Benchmark

In [8]:
def run_benchmark(models, X, y, x_test, y_test, save='default.csv', sizes_train=[], start_results=None, metric_average="macro"):
    
    start_benchmark = time.time()
    
    last_model = None
    last_train_size = None
    results = pd.DataFrame()
    if start_results is not None:
        last_model = start_results['model'].tail(1)
        last_size_train = start_results['size_train'].tail(1)
        results = start_results
    for model in models:
            
        print(' ', model['name'], end=' ')
            
        for size_train in sizes_train:
            
            if start_results is not None:
                if model in start_results['model'].unique() and last_size_train >= size_train:
                        continue
            start_results = None
            
            print('.', end='')

            x_train = X[:size_train]
            y_train = y[:size_train]

            m = model['model']()
            start_time = time.time()
            m.fit(x_train, y_train)
            train_time = time.time() - start_time

            start_time = time.time()
            result = m.predict(x_test)
            test_time = time.time() - start_time
            
            data = {'datetime': datetime.datetime.now(),
                          'model': model['name'],
                          'accuracy': accuracy_score(result, y_test),
                          'precision': precision_score(result, y_test, average=metric_average),
                          'recall': recall_score(result, y_test, average=metric_average),
                          'f1': f1_score(result, y_test, average=metric_average),
                          'confusion': confusion_matrix(result, y_test),
                          'train_size': size_train,
                          'execution_time': train_time,
                          'test_time': test_time}
            results = results.append([data])
            results.to_csv(save)
    print('')
    aux = time.time() - start_benchmark
    print('Run time benchmark:', aux)
    return pd.DataFrame(results)