In [None]:
from notebook_prelude import *

In [None]:
from experiments import get_all_tasks
import sklearn.model_selection
from transformers.pipelines.classifiers import get_classifier_params

In [None]:
# Criteria identifying the single experiment task inspected in this notebook.
task_type = 'graph_combined'
dataset = 'ng20'
graph_type = 'concept-map'
version = 'v2'

all_tasks = get_all_tasks()
# Keep tasks of the requested type whose name contains every name fragment.
filtered_tasks = [
    candidate
    for candidate in all_tasks
    if candidate.type == task_type
    and all(fragment in candidate.name for fragment in (dataset, graph_type, version))
]

# The criteria are expected to pin down exactly one task.
assert len(filtered_tasks) == 1

In [None]:
# Run the selected task's fn(); it yields the samples, the labels, the estimator
# and the task-specific parameter grid.
task = filtered_tasks[0]
X, Y, estimator, param_grid = task.fn()
# Convert samples and labels to numpy arrays.
X = np.array(X)
Y = np.array(Y)

In [None]:
# Merge the classifier parameters with the task's grid; on a key clash the
# task-specific entry wins because it is passed as keyword overrides.
classifier_params = get_classifier_params()
param_grid = dict(classifier_params, **param_grid)
# Expand the grid into the list of all concrete parameter combinations.
params = list(sklearn.model_selection.ParameterGrid(param_grid))
# Use the first combination whose binary text-vectorizer setting is truthy.
param = [
    combination
    for combination in params
    if combination['features__text__vectorizer__vectorizer__binary']
][0]

In [None]:
# Stratified hold-out split: 20% of the samples go to the test set, fixed seed.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Apply the chosen parameter combination, then train on the training split.
estimator.set_params(**param)
# By scikit-learn convention fit() returns the fitted estimator, not predictions,
# so fit first and predict separately; Y_pred_train then holds what its name says.
estimator.fit(X_train, Y_train)
Y_pred_train = estimator.predict(X_train)

In [None]:
# Predict labels for the held-out test split.
Y_pred_test = estimator.predict(X_test)

In [None]:
# Macro-averaged F1 of the test-split predictions (displayed as the cell output).
sklearn.metrics.f1_score(y_true=Y_test, y_pred=Y_pred_test, average='macro')

In [None]:
# The 'classifier' step of the fitted estimator.
clf = estimator.named_steps['classifier']
# one-vs-rest: collapse coef_ along axis 0 into a single summed weight per column
# (presumably one row per class -- confirm against the classifier in use).
coefs = clf.coef_.sum(axis=0)
# Column indices ordered by ascending summed weight.
coefs_idx = coefs.argsort()

def get_fast_wl_vectorizer(estimator):
    """Return the 'fast_wl' step nested inside the 'fast_wl_pipeline' transformer.

    Follows the same lookup chain the notebook uses inline for ``trans_fast_wl``:
    the 'fast_wl_pipeline' entry of the 'features' step, then its
    'feature_extraction' step, that step's own 'feature_extraction' step, and
    finally the 'fast_wl' step.

    Args:
        estimator: object exposing ``named_steps['features'].transformer_list``.

    Returns:
        The object registered under the 'fast_wl' step name.

    Raises:
        IndexError: if no 'fast_wl_pipeline' transformer is registered.
        KeyError: if one of the nested step names is missing.
    """
    fast_wl_pipeline = get_feature_transformer(estimator, 'fast_wl_pipeline')
    outer_extraction = fast_wl_pipeline.named_steps['feature_extraction']
    inner_extraction = outer_extraction.named_steps['feature_extraction']
    return inner_extraction.named_steps['fast_wl']

def get_feature_transformer(estimator, transformer_name):
    """Look up a transformer by name in the estimator's 'features' step.

    Args:
        estimator: object exposing ``named_steps['features'].transformer_list``,
            an iterable of ``(name, transformer)`` pairs.
        transformer_name: name of the transformer to return.

    Returns:
        The first transformer whose registered name equals ``transformer_name``.

    Raises:
        IndexError: if no transformer is registered under that name.
    """
    feature_step = estimator.named_steps['features']
    matches = [
        transformer
        for step_name, transformer in feature_step.transformer_list
        if step_name == transformer_name
    ]
    # Indexing the empty list is what raises IndexError for an unknown name.
    return matches[0]

def get_text_vectorizer(pipeline):
    """Return the innermost 'vectorizer' step of the 'text' feature transformer.

    The lookup uses the ``pipeline`` argument itself rather than the notebook's
    global ``estimator``, so the helper works for whichever pipeline is passed in.

    Args:
        pipeline: object exposing ``named_steps['features'].transformer_list``
            with an entry named 'text'.

    Returns:
        The object registered as the 'vectorizer' step inside the 'text'
        transformer's own 'vectorizer' step.
    """
    text_transformer = get_feature_transformer(pipeline, 'text')
    return text_transformer.named_steps['vectorizer'].named_steps['vectorizer']

def get_text_features(pipeline):
    """Return the ``vocabulary_`` attribute of the pipeline's text vectorizer.

    Args:
        pipeline: passed through unchanged to ``get_text_vectorizer``.

    Returns:
        Whatever the text vectorizer stores in ``vocabulary_``.
    """
    text_vectorizer = get_text_vectorizer(pipeline)
    return text_vectorizer.vocabulary_

# vocabulary_ of the fitted text vectorizer.
text_features = get_text_features(estimator)
# Walk down the nested pipelines of the 'fast_wl_pipeline' transformer to its 'fast_wl' step.
trans_fast_wl = (
    get_feature_transformer(estimator, 'fast_wl_pipeline')
    .named_steps['feature_extraction']
    .named_steps['feature_extraction']
    .named_steps['fast_wl']
)

# Sizes used to relate coefficient positions to their feature source.
len_features_combined = coefs.shape[0]
len_text_features = len(text_features)
# Column count of the last entry in the fast_wl step's phi_list.
len_graph_features_simple = trans_fast_wl.phi_list[-1].shape[1]


# Walk the features from largest to smallest summed coefficient and print the
# rank of every feature whose index lies past the text vocabulary.
# Assumes the text features occupy the first len_text_features columns of the
# combined feature matrix -- TODO confirm the transformer order.
# Text indices run 0..len_text_features - 1, so the first non-text index is
# len_text_features itself; hence `>=` rather than `>`.
for rank, feature_idx in enumerate(reversed(coefs_idx)):
    if feature_idx >= len_text_features:
        print(rank)


In [None]:
# Number of features to show from each end of the coefficient ranking.
top = 100
# coefs_idx is sorted ascending, so the last `top` entries are the largest and
# the first `top` the smallest. The upper slice must start at -top; starting at
# -top - 1 would select top + 1 entries.
highest = coefs_idx[-top:]
lowest = coefs_idx[:top]
highest_vals = coefs[highest]
lowest_vals = coefs[lowest]
fig, ax = plt.subplots()
# One bar per selected coefficient, ordered by value.
pd.DataFrame(list(highest_vals) + list(lowest_vals)).sort_values(0)[0].plot(kind = 'bar', ax = ax)
ax.set_title('{} highest and {} lowest summed coefficients'.format(len(highest_vals), len(lowest_vals)))
ax.set_ylabel('summed coefficient')
# Axes.grid expects a boolean; the non-empty string 'off' is truthy and would
# switch the grid on in current matplotlib instead of hiding it.
ax.grid(False)