In [None]:
from notebook_prelude import *

In [None]:
import experiments
import experiments.task_runner
import sklearn.model_selection
from transformers.pipelines.classifiers import get_classifier_params

In [None]:
# Experiment selection: which registered task to load.
task_type = 'graph_combined'
dataset = 'ng20'
graph_type = 'concept-map'  # NOTE(review): not read by any visible cell
version = 'v2'

# Filter by the configured dataset. This used to pass the literal 'ng20', so
# changing `dataset` above silently had no effect on which task was loaded.
tasks = experiments.get_filtered_tasks(task_type=task_type, dataset=dataset)
# Keep only the tasks whose name contains the version string.
filtered_tasks = [t for t in tasks if version in t.name]
assert len(filtered_tasks) == 1, 'expected exactly one matching task, got: {}'.format([t.name for t in filtered_tasks])

task = filtered_tasks[0]
# task.fn() returns the samples, the labels, the estimator and its parameter grid.
X, Y, estimator, param_grid = task.fn()
X, Y = np.array(X), np.array(Y)

In [None]:
# Expand the task's grid into concrete parameter combinations and take the
# first combination whose text vectorizer runs in binary mode.
# NOTE(review): this cell rebinds param_grid from itself, so re-running it
# applies add_classifier_to_params a second time.
param_grid = add_classifier_to_params(param_grid)
params = list(sklearn.model_selection.ParameterGrid(param_grid))
binary_key = 'features__text__vectorizer__vectorizer__binary'
param = [candidate for candidate in params if candidate[binary_key]][0]

In [None]:
# Stratified hold-out split (20 % test) with a fixed seed.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Apply the selected parameter combination and fit the pipeline itself.
# cross_val_score only fits clones of the estimator, so without this fit the
# later cells that read coef_, vocabulary_ and phi_list from `estimator` have
# nothing fitted to inspect on a fresh kernel.
estimator.set_params(**param)
estimator.fit(X_train, Y_train)

In [None]:
# 4-fold stratified cross-validation (shuffled, fixed seed) scored with macro F1;
# the cell displays the mean score over the folds.
cv = sklearn.model_selection.StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
result = sklearn.model_selection.cross_val_score(
    estimator,
    X,
    y=Y,
    scoring='f1_macro',
    cv=cv,
)
result.mean()

In [None]:
# The 'classifier' step of the pipeline holds the coefficients analysed below.
clf = estimator.named_steps['classifier']
# one-vs-rest: sum coef_ along axis 0 (presumably one row per class), leaving
# one aggregated value per feature.
coefs = clf.coef_.sum(axis=0)
# Feature indices ordered from the smallest to the largest aggregated value.
coefs_idx = coefs.argsort()

def get_fast_wl_vectorizer(estimator):
    """Return the 'fast_wl' step nested inside the 'fast_wl_pipeline' branch.

    The branch is looked up via get_feature_transformer, then two nested
    'feature_extraction' pipelines are descended to reach 'fast_wl'.
    """
    branch = get_feature_transformer(estimator, 'fast_wl_pipeline')
    outer_extraction = branch.named_steps['feature_extraction']
    inner_extraction = outer_extraction.named_steps['feature_extraction']
    return inner_extraction.named_steps['fast_wl']

def get_feature_transformer(estimator, transformer_name):
    """Return the first transformer named `transformer_name` in the 'features' step.

    Looks through `estimator.named_steps['features'].transformer_list`, a
    sequence of (name, transformer) pairs. Raises IndexError when no entry
    carries the requested name.
    """
    matches = []
    for name, pipe in estimator.named_steps['features'].transformer_list:
        if name == transformer_name:
            matches.append(pipe)
    return matches[0]

def get_text_vectorizer(pipeline):
    """Return the innermost 'vectorizer' step of the pipeline's 'text' branch.

    The lookup runs on the `pipeline` argument; previously the argument was
    ignored and the notebook-global `estimator` was inspected instead.
    """
    text_branch = get_feature_transformer(pipeline, 'text')
    return text_branch.named_steps['vectorizer'].named_steps['vectorizer']

def get_text_features(pipeline):
    """Return the `vocabulary_` attribute of the pipeline's text vectorizer."""
    vectorizer = get_text_vectorizer(pipeline)
    return vectorizer.vocabulary_

# Pull the text vocabulary and the fast_wl transformer out of the pipeline.
trans_fast_wl = get_fast_wl_vectorizer(estimator)
text_features = get_text_features(estimator)

# Sizes used below to split the coefficient vector into text and graph parts.
len_text_features = len(text_features)
len_features_combined = coefs.shape[0]
phi_list = trans_fast_wl.phi_list
# The last phi_list entry supplies the per-iteration graph feature count;
# the number of iterations is one less than the number of phi_list entries.
len_graph_fast_wl_iterations = len(phi_list) - 1
len_graph_features_simple = phi_list[-1].shape[1]

In [None]:
# Inverse vocabulary lookup: feature index -> term.
idx_2_text = dict(zip(text_features.values(), text_features.keys()))

In [None]:
# Bar chart of the `top` highest and `top` lowest aggregated coefficients.
top = 100
# coefs_idx is sorted ascending, so the last `top` entries are the largest
# coefficients (the former `-top - 1` slice picked up one extra entry).
highest = coefs_idx[-top:]
lowest = coefs_idx[:top]
highest_vals = coefs[highest]
lowest_vals = coefs[lowest]
fig, ax = plt.subplots()
labels = np.concatenate((highest, lowest))
# Indices 0 .. len(text_features) - 1 are looked up in the vocabulary; every
# index from len(text_features) upwards is treated as a graph feature ('g').
# `>=` is required here: with `>` the first graph index was looked up in
# idx_2_text and raised a KeyError.
labels = ['g' if x >= len(text_features) else idx_2_text[x] for x in labels]
pd.DataFrame(dict(idx=labels, vals=list(highest_vals) + list(lowest_vals))).set_index('idx').sort_values('vals').vals.plot(kind='bar', ax=ax)
# Pass a real boolean: the string 'off' is truthy and does not hide the grid.
ax.grid(False)

In [None]:
# Colormap names to try when rendering the coefficient heatmap.
cmaps = [
    'viridis', 'plasma', 'inferno', 'magma',
    'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
    'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
    'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',
    'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',
    'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',
    'hot', 'afmhot', 'gist_heat', 'copper',
]

def plot_coefs(coefs, log=False, lines = [], cmap = None, fig = None, ax = None):
    """Render a 1-D coefficient vector as a square heatmap.

    The vector is zero-padded up to the next square size and reshaped
    row-major into a (new_size, new_size) grid before being drawn with imshow.

    Parameters:
        coefs: 1-D array of coefficient values.
        log: when True, the natural log of the grid is plotted instead
            (non-positive entries, including the zero padding, become
            nan / -inf).
        lines: feature indices at which a horizontal marker is drawn; each
            index is divided by the grid width to get its row position.
            The default list is never mutated.
        cmap: colormap name handed to plt.get_cmap (None = default colormap).
        fig: accepted for backward compatibility but ignored; the figure is
            always taken from `ax`.
        ax: axes to draw on; a new figure/axes pair is created when None.

    Returns:
        The axes that were drawn on.
    """
    size = coefs.shape[0]
    new_size = int(np.floor(np.sqrt(size))) + 1
    added = new_size ** 2 - size
    # np.append returns a new array, so the caller's vector is left untouched.
    _coefs = np.append(coefs, [0] * added).reshape(new_size, -1)

    if log:
        _coefs = np.log(_coefs)

    if ax is None:
        _, ax = plt.subplots()
    fig = ax.get_figure()

    img = ax.imshow(_coefs, cmap=plt.get_cmap(cmap))
    # Pass a real boolean: the string 'off' is truthy and does not hide the grid.
    ax.grid(False)

    for line_y in lines:
        ax.axhline(line_y / new_size)

    # Bind the colorbar to the axes that was actually drawn on.
    fig.colorbar(img, ax=ax)

    return ax

# Feature-index boundaries: the end of the text block, followed by the end of
# each graph block (one block of len_graph_features_simple per iteration).
lines = [len_text_features]
lines.extend(
    len_text_features + ((i + 1) * len_graph_features_simple)
    for i in range(len_graph_fast_wl_iterations)
)

# Draw the same heatmap once per colormap, titled with the colormap's name.
for cmap in cmaps:
    ax = plot_coefs(coefs, cmap=cmap, lines=lines)
    ax.set_title(cmap)
    ax.get_figure().tight_layout()
    plt.show()


In [None]:
# Long-format table with one row per coefficient, labelled by feature group.
# Consecutive entries of features_lens delimit one group: the first is the
# text block, every following one is a graph block.
features_lens = [0] + lines
feature_frames = []
for idx, (start, end) in enumerate(zip(features_lens[:-1], features_lens[1:])):
    label = 'text' if idx == 0 else 'graph'  # alternative: 'graph_{}'.format(idx)
    els = coefs[start:end]
    feature_frames.append(pd.DataFrame(dict(label=[label] * len(els), coef=els)))
# Concatenate once instead of growing a frame inside the loop: this avoids
# quadratic copying and keeps `coef` numeric, since no empty object-dtype
# frame takes part in the concat any more.
df_features = pd.concat(feature_frames, ignore_index=True)

# Compute the bin edges once over all coefficients so both feature groups are
# drawn on the same bins.
hist, bin_edges = np.histogram(df_features.coef, bins=300)
fig, ax = plt.subplots(figsize=(EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG - 2))
# One semi-transparent histogram per feature group, with a log-scaled y axis.
for feature_label, df_ in df_features.groupby('label'):
    df_['coef'].plot.hist(
        ax=ax,
        bins=bin_edges,
        label=feature_label,
        logy=True,
        alpha=0.7,
        legend=True,
        stacked=True,
    )
ax.set_xlabel('SVM coefficient value')
#ax.set_title('Histogram of SVM coefficients trained with combined features (text + graph)\n(Dataset: {})'.format(dataset))
fig.tight_layout()

In [None]:
def get_sum_of_coefs(indices):
    """Sum the positive and the negative coefficients at `indices`.

    Reads the notebook-global `coefs`. Returns a tuple
    (sum of values > 0, sum of values < 0); zeros contribute to neither.
    """
    selected = coefs[indices]
    positive_total = np.sum(selected[selected > 0])
    negative_total = np.sum(selected[selected < 0])
    return positive_total, negative_total

# Text features occupy the first len_text_features positions of `coefs`;
# everything after them is attributed to the graph features.
vals_plus_t, vals_minus_t = get_sum_of_coefs(range(len_text_features))
vals_plus_g, vals_minus_g = get_sum_of_coefs(range(len_text_features, len(coefs)))

# One summary row per feature group.
records = [
    dict(label = 'text', plus = vals_plus_t, minus = vals_minus_t, num_features = len_text_features),
    dict(label = 'graph', plus = vals_plus_g, minus = vals_minus_g, num_features = len_features_combined - len_text_features),
]
df_vals = pd.DataFrame(records).set_index('label')
# Total magnitude per group, and the same normalised by the group's size.
df_vals['absolute'] = df_vals['minus'].abs() + df_vals['plus']
df_vals['val_per_feature'] = df_vals['absolute'] / df_vals['num_features']
df_vals

In [None]:
# Summary statistics of the coefficients, one row per feature group.
df_features.groupby('label')['coef'].describe()

In [None]:
def get_steps(e):
    """Return the child (name, step) pairs of a pipeline-like object.

    `e.steps` is preferred, `e.transformer_list` is the fallback, and an
    empty list is returned when the object has neither attribute.
    """
    if hasattr(e, 'steps'):
        return e.steps
    if hasattr(e, 'transformer_list'):
        return e.transformer_list
    return []

def print_pipeline(pipeline, depth = 0, delim = ' ' * 3, print_type = True):
    """Recursively print the nested steps of a pipeline, one line per step.

    Parameters:
        pipeline: object whose children are discovered through get_steps.
        depth: current nesting level; controls how often `delim` is repeated.
        delim: indentation unit.
        print_type: when True, the step's class name is appended in parentheses.
    """
    for name, step in get_steps(pipeline):
        type_label = '({})'.format(type(step).__name__) if print_type else ''
        print('{} {:40} {}'.format(delim * (depth + 1), name, type_label))
        # Forward print_type as well: it used to be dropped here, so nested
        # levels always printed the type regardless of the caller's choice.
        print_pipeline(step, depth = depth + 1, delim = delim, print_type = print_type)

# Print the nested step structure of the estimator (step name and class per line).
print_pipeline(estimator)