# Combined graph- and text features

Here we train a _Perceptron_ on the combined graph and text features and inspect the trained weights to gain insight into the importance of the individual features.

In [None]:
from notebook_prelude import *

In [None]:
import experiments
import experiments.task_runner
from experiments import task_runner, task_helper
import sklearn.model_selection
from transformers.pipelines.classifiers import get_classifier_params
from transformers.pipelines import pipeline_helper

In [None]:
# Select the single experiment task that matches the settings below and
# materialize its data, base estimator and parameter grid.
task_type = 'graph_combined'
dataset = 'ng20'
graph_type = 'concept-map'
version = 'v3'

tasks = experiments.get_filtered_tasks(
    task_type=task_type,
    dataset=dataset,
    task_name_filter=version,
)
# Keep only tasks whose name mentions the version; exactly one is expected.
filtered_tasks = list(filter(lambda t: version in t.name, tasks))
assert len(filtered_tasks) == 1

task = filtered_tasks[0]
X, Y, estimator, param_grid_ = task.fn()
X = np.array(X)
Y = np.array(Y)

In [None]:
# Stratified 80/20 split with a fixed seed so the runs are reproducible.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X,
    Y,
    stratify=Y,
    test_size=0.2,
    random_state=42,
)

# Start from the task's grid (with classifier entries added) and override the
# entries relevant for this analysis: a Perceptron with both penalties and a
# fixed feature-extraction configuration.
param_grid = dict(task_helper.add_classifier_to_params(param_grid_))
param_grid.update({
    'classifier': [sklearn.linear_model.Perceptron()],
    'classifier__penalty': ['l1', 'l2'],
    'features__text__vectorizer__vectorizer__binary': [True],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__fast_wl__ignore_label_order': [True],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__fast_wl__use_early_stopping': [False],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__phi_picker__use_zeroth': [True],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__fast_wl__h': [5],
})

# The `C` entry is dropped from the grid before it is applied to the Perceptron.
del param_grid['classifier__C']

# Print the resulting grid, sorted by parameter name.
printable_grid = pipeline_helper.remove_complex_types(param_grid)
for k in sorted(printable_grid):
    v = printable_grid[k]
    print('{:100} {}'.format(k, v))

In [None]:
# Fit one clone of the base pipeline per parameter combination and keep, for
# each of them, the parameters, the fitted pipeline and a copy of the
# classifier weights. The analysis cells below read from `estimators`.
estimators = []

for params in sklearn.model_selection.ParameterGrid(param_grid):
    penalty = params['classifier__penalty']

    # The format string needs a placeholder; without it the penalty was
    # silently dropped from the output.
    print('Regularization: {}'.format(penalty.upper()))
    clf = sklearn.base.clone(estimator)
    clf.set_params(**params)

    print('\tFitting')
    clf.fit(X_train, Y_train)

    print('\tPredicting')
    for name, y_true, x in [('train', Y_train, X_train), ('test', Y_test, X_test)]:
        y_pred = clf.predict(x)
        f1_score = sklearn.metrics.f1_score(y_true, y_pred, average='macro')
        print('\t\tf1_macro {:6} {:.3f}'.format(name, f1_score))

    # Store a copy so the saved weights are independent of the classifier object.
    coefs = np.copy(clf.named_steps['classifier'].coef_)
    estimators.append((params, clf, coefs))

## Retrieve feature lengths

In [None]:
def get_fast_wl_vectorizer(pipeline):
    """Return the 'fast_wl' step nested inside the 'fast_wl_pipeline' feature transformer.

    The transformer is looked up by name in the pipeline's feature union and
    then descended through its nested `named_steps`.
    """
    step = get_feature_transformer(pipeline, 'fast_wl_pipeline')
    for step_name in ('feature_extraction', 'feature_extraction', 'fast_wl'):
        step = step.named_steps[step_name]
    return step

def get_feature_transformer(pipeline, transformer_name):
    """Return the first transformer called `transformer_name` from the 'features' step.

    The 'features' step must expose a `transformer_list` of (name, transformer)
    pairs. Raises IndexError when no entry has the requested name.
    """
    feature_step = pipeline.named_steps['features']
    matching = [
        transformer
        for step_name, transformer in feature_step.transformer_list
        if step_name == transformer_name
    ]
    return matching[0]

def get_text_vectorizer(pipeline):
    """Return the inner 'vectorizer' step of the 'text' feature transformer."""
    text_pipeline = get_feature_transformer(pipeline, 'text')
    outer_vectorizer = text_pipeline.named_steps['vectorizer']
    return outer_vectorizer.named_steps['vectorizer']

def get_text_features(pipeline):
    """Return the `vocabulary_` attribute of the pipeline's text vectorizer.

    Callers in this notebook treat it as a mapping from term to column index.
    """
    vectorizer = get_text_vectorizer(pipeline)
    return vectorizer.vocabulary_

## Create combined graph/text vector

In [None]:
# Use the first fitted pipeline from the training loop for the analysis below.
estimator = estimators[0][1]
# Only transform here. Calling fit_transform would refit the already trained
# feature extractors on the test split, so the vocabulary and the feature
# columns would no longer line up with the stored classifier coefficients.
features = estimator.named_steps['features'].transform(X_test)

In [None]:
# Work out how the combined feature vector is split into the text part and
# one equally sized part per entry of the fast-WL `phi_list`.
text_features = get_text_features(estimator)
trans_fast_wl = get_fast_wl_vectorizer(estimator)

len_text_features = len(text_features)
len_features_combined = features.shape[1]
# Everything that is not a text column is attributed to the graph features.
len_graph_features = len_features_combined - len_text_features
len_graph_features_simple = trans_fast_wl.phi_list[-1].shape[1]
# Do not use_zeroth? Then -1
len_graph_fast_wl_iterations = len(trans_fast_wl.phi_list)
# Sanity checks: the per-iteration blocks must add up to the graph part.
assert (len_graph_fast_wl_iterations * len_graph_features_simple) == len_graph_features
assert len_graph_features + len_text_features == len_features_combined

# Column index at which each feature group ends: first the text block,
# then one entry per fast-WL iteration.
lines = [len_text_features] + [
    len_text_features + (iteration * len_graph_features_simple)
    for iteration in range(1, len_graph_fast_wl_iterations + 1)
]

### Sparsity of graph and text features

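The ratio between the summed graph feature values and the summed text feature values shows how strongly each feature type is represented in the data. Read together with the trained coefficients, it gives a rough measure of the relative importance of the two feature types.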

In [None]:
# Sum every feature column over all samples and flatten the result to 1-d.
column_totals = np.sum(features, axis=0)
feature_sum = np.squeeze(np.asarray(column_totals))

# The first `len_text_features` columns are text, the remainder is graph.
sum_text_features = np.sum(feature_sum[:len_text_features])
sum_graph_features = np.sum(feature_sum[len_text_features:])

print('Sum of all features\n\tgraph/text={:.3f}'.format(sum_graph_features / sum_text_features))

## Perceptron coefficients histogram

In [None]:
# Entries of `estimators` are (params, fitted pipeline, coefficient copy).
used_estimator = estimators[0]
used_coefs = used_estimator[2]
# Sum up all coefs
coefs = used_coefs.sum(axis=0)
# Reverse lookup: column index -> term.
idx_2_text = dict(zip(text_features.values(), text_features.keys()))

In [None]:
# Boundaries of the feature groups inside the coefficient vector:
# [0, end of text, end of graph block 1, ...].
features_lens = [0] + lines

# Build one frame per feature group and concatenate once at the end. Growing
# a frame with pd.concat inside the loop copies all previous rows on every
# iteration and starts from an empty, object-typed placeholder frame.
frames = []
for idx, (start, end) in enumerate(zip(features_lens[:-1], features_lens[1:])):
    # The first group holds the text features, every later group is graph.
    label = 'text' if idx == 0 else 'graph'
    els = coefs[start:end]
    frames.append(pd.DataFrame(dict(label = [label] * len(els), coef = els)))
df_features = pd.concat(frames)

# Compute common bin edges over all coefficients so both groups share the same bins.
hist, bin_edges = np.histogram(df_features.coef, bins = 100)
fig, ax = plt.subplots(figsize = (EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG - 2))
for (feature_label, df_) in df_features.groupby('label'):
    df_.coef.plot(kind='hist', ax = ax, label = feature_label, logy = True, alpha = 0.7, bins = bin_edges, legend = True, stacked = True)
ax.set_xlabel('Perceptron coefficient value')
fig.tight_layout()

In [None]:
def get_sum_of_coefs(coefs, indices):
    """Sum the positive and the negative coefficients at `indices` separately.

    Only the first row of the 2-d `coefs` array is read.
    NOTE(review): for a multi-class model this ignores all rows but row 0;
    confirm that this is intended.

    Returns:
        (sum of positive values, sum of negative values); zeros count for neither.
    """
    selected = coefs[0, indices]
    positive_total = np.sum(selected[selected > 0])
    negative_total = np.sum(selected[selected < 0])
    return positive_total, negative_total

# (name, number of columns) for every feature group, in column order:
# the text block first, then one block per fast-WL iteration.
feature_range = [('text', len_text_features)] + [
    ('graph_{}'.format(i + 1), len_graph_features_simple)
    for i in range(len_graph_fast_wl_iterations)
]

data = collections.defaultdict(list)
# The loop variable is deliberately not named `coefs`: that would overwrite
# the module-level 1-d `coefs` vector that the plotting cells further down use.
for params, clf, estimator_coefs in estimators:
    start = 0
    for name, num_features in feature_range:
        # Half-open range [start, end) covers exactly `num_features` columns.
        # No extra "- 1" is needed; it would drop one column per group and
        # shift the boundaries of every following group.
        end = start + num_features
        val_plus, val_minus = get_sum_of_coefs(estimator_coefs, list(range(start, end)))

        data['type'].append(params['classifier__penalty'])
        data['label'].append(name)
        data['plus'].append(val_plus)
        data['minus'].append(val_minus)
        data['start'].append(start)
        data['end'].append(end)
        data['num_features'].append(num_features)
        start = end

df_vals = pd.DataFrame(data).set_index('label')
# Total magnitude per group, and the same value normalized by group size.
df_vals['absolute'] = df_vals.minus.abs() + df_vals.plus
df_vals['val_per_feature'] = df_vals.absolute / df_vals.num_features
df_vals

### Plot

In [None]:
# One horizontal bar chart per regularization type, showing the summed
# absolute coefficient values of each feature group on a log scale.
fig, axes = plt.subplots(figsize=(11, 6), nrows=2, sharex=True)

for ax, (title, df_) in zip(axes, df_vals.groupby('type')):
    # Skip groups that already contain the aggregated 'graph_all' row.
    if (df_.index == 'graph_all').any():
        continue
    df_ = df_.sort_index()
    # Add a row holding the column sums over all graph feature groups.
    graph_features = df_[df_.index.str.contains('graph')]
    sum_ = graph_features.sum().to_frame().T
    sum_.index = ['graph_all']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df_ = pd.concat([df_, sum_]).sort_index()
    # Turn the technical labels into display labels, e.g. 'graph_1' -> 'Graph 1'.
    df_.set_index(df_.index.map(lambda x: x.replace('graph_', 'Graph ').replace('all', '(All)').title()), inplace=True)
    df_.absolute.plot(kind='barh', ax = ax, log=True, title='Regularization: {}'.format(title.upper()))
    # Write the value into each bar, right-aligned just inside the bar end.
    for idx, (label, df__) in enumerate(df_.iterrows()):
        val = df__.absolute
        ax.text(val * 0.9, idx, '{:.0f}'.format(val), fontdict=dict(horizontalalignment='right', verticalalignment='center', weight='bold'), color='white')

for ax in axes:
    ax.set_xlabel('sum of absolute coefficient values (log)')
    ax.grid(False)

fig.tight_layout()
save_fig(fig, 'combined_coefs_l1_l2_regularization')

## Cross-validation (unused)

In [None]:
# Disabled on purpose: the grid search below never runs because of the
# constant-false guard and is kept for reference only.
if False:
    cv = sklearn.model_selection.StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42,
    )
    gscv = sklearn.model_selection.GridSearchCV(
        estimator,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=cv,
        verbose=2,
    )
    gscv_result = gscv.fit(X_train, Y_train)

## Coef heatmap

In [None]:
def plot_coefs_(coefs, top=100):
    """Bar-plot the `top` smallest and the `top` largest coefficients.

    Every bar is labelled 'T' when its column index lies in the text part of
    the combined feature vector and 'G' otherwise. The number of text columns
    is read from the module-level `text_features`; text columns come first.

    Args:
        coefs: 1-d array-like of coefficient values.
        top: how many coefficients to take from each end of the sorted order.

    Returns:
        The (fig, ax) pair of the created plot.
    """
    coefs = np.asarray(coefs)
    order = np.argsort(coefs)
    lowest = order[:top]
    # Take exactly `top` entries from the upper end (the previous slice took
    # top + 1) and never start before the end of `lowest`, so a short input
    # is not plotted twice.
    highest = order[max(len(order) - top, len(lowest)):]
    selected = np.concatenate((lowest, highest))

    # Text columns are 0 .. num_text - 1, so index num_text is already a graph feature.
    num_text = len(text_features)
    labels = ['G' if idx >= num_text else 'T' for idx in selected]

    fig, ax = plt.subplots()
    frame = pd.DataFrame(dict(idx=labels, vals=coefs[selected]))
    frame.set_index('idx').sort_values('vals').vals.plot(kind = 'bar', ax = ax)
    # Pass a real boolean: the string 'off' is truthy and does not reliably hide the grid.
    ax.grid(False)
    return fig, ax

plot_coefs_(coefs)

In [None]:
# Colormap names to try for the coefficient heatmap, one figure per entry.
cmaps = [
    'viridis', 'plasma', 'inferno', 'magma',
    'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
    'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
    'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',
    'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',
    'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',
    'hot', 'afmhot', 'gist_heat', 'copper',
]

def plot_coefs_heatmap(coefs, log=False, lines=(), cmap=None, fig=None, ax=None):
    """Show a coefficient vector as a square heatmap.

    The values are flattened, padded with zeros up to the next square size and
    reshaped row by row into a `new_size` x `new_size` grid. The input array is
    not modified.

    Args:
        coefs: array-like of coefficient values; any shape, it is flattened.
        log: when true, `np.log` is applied element-wise before plotting.
            Non-positive entries (including the zero padding) then become
            nan or -inf.
        lines: flat feature indices at which to draw a horizontal marker line;
            each index is converted to its (fractional) row in the grid.
        cmap: colormap name passed to `plt.get_cmap`; None selects the default.
        fig: ignored; the figure is always taken from `ax`. Kept so existing
            calls that pass it keep working.
        ax: axes to draw into; a new figure and axes are created when None.

    Returns:
        The axes the heatmap was drawn on.
    """
    flat = np.ravel(np.asarray(coefs))

    size = flat.shape[0]
    new_size = int(np.floor(np.sqrt(size))) + 1
    # Number of zeros needed to fill the square grid completely.
    added = new_size ** 2 - size
    grid = np.append(flat, [0] * added).reshape(new_size, -1)

    if log:
        grid = np.log(grid)

    if ax is None:
        fig, ax = plt.subplots()

    fig = ax.get_figure()
    img = ax.imshow(grid, cmap=plt.get_cmap(cmap))
    # Pass a real boolean: the string 'off' is truthy and does not reliably hide the grid.
    ax.grid(False)

    for line_y in lines:
        ax.axhline(line_y / new_size)

    fig.colorbar(img)

    return ax

# Draw the same coefficient heatmap once per candidate colormap, titled with
# the colormap name, to compare them visually.
for cmap in cmaps:
    ax = plot_coefs_heatmap(coefs, cmap=cmap, lines=lines)
    ax.set_title(cmap)
    heatmap_fig = ax.get_figure()
    heatmap_fig.tight_layout()
    plt.show()
