# Meta-Sampling: Estratégia de seleção de instâncias baseada em meta-aprendizado

## Elaboração de meta-base 

In [1]:
# Directory (relative to this notebook) that is scanned for the per-dataset CSV files.
DATA_DIR = "../../metabase/"

In [2]:
%%time
import os
import pandas as pd

# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the concatenation order identical across runs and machines. Matching on '.csv'
# (with the dot) avoids picking up names that merely end in the letters "csv".
csv_files = sorted(file for file in os.listdir(DATA_DIR) if file.endswith('.csv'))

# One DataFrame per CSV file, in the same order as `csv_files`.
df_list = [pd.read_csv(os.path.join(DATA_DIR, file), index_col=False) for file in csv_files]

CPU times: user 5.28 s, sys: 1.9 s, total: 7.19 s
Wall time: 5.83 s


Juntando dataframes:

In [3]:
# Stack the per-file frames under their file name, drop every column whose name
# starts with "Unnamed", and name the two index levels.
df = (
    pd.concat(df_list, keys=csv_files)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .rename_axis(index=['dataset', 'index'])
)

Adicionando o índice da query (`query_index`), isto é, a posição de cada linha dentro do seu grupo `(dataset, estimator, query-strategy)`:

In [58]:
def add_query_index(x):
    """Return a copy of ``x`` with a ``query_index`` column numbering its rows.

    Rows are numbered 0, 1, ..., len(x) - 1 in their current order. The input
    frame is left untouched: a new frame is returned instead of adding the
    column in place, because functions passed to ``groupby(...).apply`` should
    not mutate the group they receive.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (typically one group of a groupby) to be numbered.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``x`` plus the ``query_index`` column.
    """
    return x.assign(query_index=range(len(x)))
# Apply add_query_index to every (dataset, estimator, query-strategy) group;
# group_keys=False keeps the group keys out of the resulting index.
df = (
    df.groupby(['dataset', 'estimator', 'query-strategy'], group_keys=False)
    .apply(add_query_index)
)

Rotulando qual é a melhor estratégia para cada tripla `(dataset, estimator, query_index)`, com base na acurácia:

In [68]:
def _best_strategy_row(group):
    """Return the row of ``group`` with the highest ``accuracy`` value."""
    return group.loc[group['accuracy'].idxmax()]


# One row per (dataset, estimator, query_index): the row with the best accuracy.
# Its 'query-strategy' value is the meta-label. The performance metrics and the
# columns duplicated by the group keys are dropped afterwards.
meta_base = (
    df.groupby(['dataset', 'estimator', 'query_index'], group_keys=True)
    .apply(_best_strategy_row)
    .drop(columns=['estimator', 'accuracy', 'f1-micro', 'f1-macro', 'f1-weighted', 'query_index'])
)

Salvando a meta-base em um arquivo CSV:

In [69]:
# Write the meta-base, including its index, to the current working directory.
meta_base.to_csv(path_or_buf='metabase.csv')

## Treinamento de Meta-Modelo

In [72]:
# Reload the saved meta-base, rebuilding the three-level index from its columns.
meta_base = pd.read_csv(
    'metabase.csv',
    index_col=['dataset', 'estimator', 'query_index'],
)

Substituindo valores infinitos por `np.nan`

In [211]:
# numpy is imported in this cell because it is first needed here; the original
# import sits in a later cell, so a fresh "Restart & Run All" raised NameError.
import numpy as np

# Turn +/-inf into NaN so that the NaN-based mean imputation used by the model
# pipeline also covers them. A new frame is bound instead of mutating in place,
# which keeps the cell safe to re-run.
meta_base = meta_base.replace([np.inf, -np.inf], np.nan)

Separando a última base para teste e as demais para treinamento (uma única partição no estilo leave-one-out, LOO, por dataset)

In [212]:
# Level 0 of the index holds the dataset names: every dataset except the last
# is used for training, and the last one is held out for testing.
train_bases, test_base = (
    meta_base.index.levels[0][:-1],
    meta_base.index.levels[0][-1:],
)

(len(train_bases), len(test_base))

(87, 1)

Filtrando as meta-instâncias do classificador SVM (`SVC`) nas bases de treino e de teste

In [214]:
# For each split, keep only the rows of the "SVC" estimator and turn the
# remaining index levels back into ordinary columns.
train_data, test_data = (
    meta_base.loc[bases].xs("SVC", level='estimator').reset_index()
    for bases in (train_bases, test_base)
)

Separando features (`X`) e rótulos (`y`) dos conjuntos de treino e de teste

In [216]:
# Labels: the query strategy selected as best for each row.
y_train = train_data['query-strategy']
y_test = test_data['query-strategy']

# Features: every remaining column except the dataset name and the label.
X_train = train_data.drop(['dataset', 'query-strategy'], axis=1)
X_test = test_data.drop(['dataset', 'query-strategy'], axis=1)

# Sanity check of the resulting shapes (train on the first line, test on the second).
print(f"{X_train.shape} {y_train.shape}\n{X_test.shape} {y_test.shape}")

(6508, 180) (6508,)
(100, 180) (100,)


Induzindo modelo

In [210]:
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



# Fixed seed for the random forest: without it every run fits a different model
# and the predictions shown below cannot be reproduced.
RANDOM_STATE = 42

# Meta-model pipeline: NaN values are replaced by the column mean, then a
# random forest predicts the best query strategy from the meta-features.
clf = Pipeline([
    ('mean_inputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('meta-model', RandomForestClassifier(random_state=RANDOM_STATE)),
])

clf.fit(X_train, y_train)

In [218]:
# Predicted best query strategy for every row of the held-out dataset.
clf.predict(X=X_test)

array(['margin_sampling', 'margin_sampling', 'margin_sampling',
       'uncertainty_sampling', 'margin_sampling', 'uncertainty_sampling',
       'margin_sampling', 'uncertainty_sampling', 'uncertainty_sampling',
       'margin_sampling', 'uncertainty_sampling', 'margin_sampling',
       'uncertainty_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'uncertainty_batch_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'uncertainty_batch_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'margin_sampling', 'margin_sampling',
       'margin_sampling', 'consensus_entropy_samplin

In [207]:
# Debug check: after filling NaN with 0, is every feature of the last training
# row finite?
X_train.fillna(0).iloc[-1].pipe(np.isfinite)

query_index                 True
attr_conc.mean              True
attr_conc.sd                True
attr_ent.mean               True
attr_ent.sd                 True
                            ... 
wg_dist.sd                  True
worst_node.mean             True
worst_node.mean.relative    True
worst_node.sd               True
worst_node.sd.relative      True
Name: 6507, Length: 180, dtype: bool