In [None]:
from notebook_prelude import *

In [None]:
import experiments
import experiments.task_runner
from experiments import task_runner, task_helper
import sklearn.model_selection
from transformers.pipelines.classifiers import get_classifier_params

In [None]:
# Selection criteria for the experiment task.
task_type = 'graph_combined'
dataset = 'ng20'
graph_type = 'concept-map'
version = 'v2'

# Exactly one task is expected to survive the additional name check.
tasks = experiments.get_filtered_tasks(
    task_type=task_type,
    dataset=dataset,
    task_name_filter=version,
)
filtered_tasks = list(filter(lambda candidate: version in candidate.name, tasks))
assert len(filtered_tasks) == 1

task = filtered_tasks[0]
# task.fn() returns the samples, labels, estimator and its parameter grid.
X, Y, estimator, param_grid = task.fn()
X = np.array(X)
Y = np.array(Y)

In [None]:
# Hold out 20% of the samples, stratified by label, with a fixed seed.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X, Y,
    stratify=Y,
    test_size=0.2,
    random_state=42,
)

# Values pinned for this experiment; they replace same-named entries of the task's grid.
param_overrides = {
    'classifier': [sklearn.linear_model.Perceptron()],
    'classifier__penalty': ['l1', 'l2'],
    'features__text__vectorizer__vectorizer__binary': [True],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__fast_wl__ignore_label_order': [True],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__fast_wl__use_early_stopping': [False],
    'features__fast_wl_pipeline__feature_extraction__feature_extraction__fast_wl__h': [5],
}
param_grid = dict(task_helper.add_classifier_to_params(param_grid))
param_grid.update(param_overrides)

# The final grid must not carry a 'classifier__C' entry, wherever it came from.
param_grid.pop('classifier__C', None)
param_grid

In [None]:
# 3-fold stratified cross-validation (shuffled, fixed seed), scored by macro F1.
cv = sklearn.model_selection.StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)
gscv = sklearn.model_selection.GridSearchCV(
    estimator,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    verbose=2,
)
gscv_result = gscv.fit(X_train, Y_train)

In [None]:
from transformers.pipelines import pipeline_helper

# One (params, fitted pipeline, copy of classifier coef_) tuple per grid point.
estimators = []
# Number of training samples to use; lower it for a quick smoke run.
NUM_ELEMENTS = len(X_train)
#NUM_ELEMENTS = 100

for params in sklearn.model_selection.ParameterGrid(param_grid):
    print('Starting classification: ')
    pprint(pipeline_helper.remove_complex_types_simple(params))

    # Fit a fresh clone so the template estimator is not touched.
    clf = sklearn.base.clone(estimator)
    clf.set_params(**params)
    clf.fit(X_train[:NUM_ELEMENTS], Y_train[:NUM_ELEMENTS])

    print('Predicting')
    Y_test_pred = clf.predict(X_test)
    f1_score = sklearn.metrics.f1_score(Y_test, Y_test_pred, average='macro')
    print(params['classifier__penalty'], f1_score)

    # Copy the coefficients so the stored array is independent of the classifier object.
    fitted_coefs = np.copy(clf.named_steps['classifier'].coef_)
    estimators.append((params, clf, fitted_coefs))

In [None]:
# Work with the second entry of `estimators`: its fitted pipeline and stored coefficients.
used_estimator, used_coefs = estimators[1][1:3]
# one-vs-rest: sum the coefficient rows (axis 0) into a single value per feature
coefs = used_coefs.sum(axis=0)
coefs_idx = coefs.argsort()

def get_fast_wl_vectorizer(pipeline):
    """Return the 'fast_wl' step nested inside the 'fast_wl_pipeline' feature branch."""
    branch = get_feature_transformer(pipeline, 'fast_wl_pipeline')
    # The branch wraps a 'feature_extraction' step that itself holds a 'feature_extraction' step.
    extraction = branch.named_steps['feature_extraction'].named_steps['feature_extraction']
    return extraction.named_steps['fast_wl']

def get_feature_transformer(pipeline, transformer_name):
    """Return the first transformer called ``transformer_name`` in the 'features' step.

    Args:
        pipeline: object whose ``named_steps['features']`` exposes a
            ``transformer_list`` of ``(name, transformer)`` pairs.
        transformer_name: name of the entry to look up.

    Returns:
        The transformer of the first entry whose name matches.

    Raises:
        IndexError: if no entry has that name. The exception type is the same as
            before; only the message is now descriptive.
    """
    transformer_list = pipeline.named_steps['features'].transformer_list
    for name, pipe in transformer_list:
        if name == transformer_name:
            return pipe
    raise IndexError('no transformer named {!r} in features step; available: {}'.format(
        transformer_name, [name for name, _ in transformer_list]))

def get_text_vectorizer(pipeline):
    """Return the inner 'vectorizer' step of the 'text' feature branch."""
    text_branch = get_feature_transformer(pipeline, 'text')
    # The outer 'vectorizer' step wraps another step with the same name.
    outer_vectorizer = text_branch.named_steps['vectorizer']
    return outer_vectorizer.named_steps['vectorizer']

def get_text_features(pipeline):
    """Return the ``vocabulary_`` attribute of the pipeline's text vectorizer."""
    vectorizer = get_text_vectorizer(pipeline)
    return vectorizer.vocabulary_

# Fitted components of the selected pipeline.
text_features = get_text_features(used_estimator)
trans_fast_wl = get_fast_wl_vectorizer(used_estimator)

# Feature-count bookkeeping: text features come first, the remainder is attributed to the graph.
len_text_features = len(text_features)
len_features_combined = coefs.shape[0]
len_graph_features = len_features_combined - len_text_features
# Presumably phi_list holds one feature matrix per WL iteration plus an initial one — TODO confirm.
len_graph_fast_wl_iterations = len(trans_fast_wl.phi_list) - 1
len_graph_features_simple = trans_fast_wl.phi_list[-1].shape[1]

# Sanity checks: the graph part must split evenly into the per-iteration blocks.
assert len_graph_features == (len_graph_fast_wl_iterations * len_graph_features_simple)
assert len_features_combined == len_graph_features + len_text_features

# Reverse lookup: feature index -> vocabulary term.
idx_2_text = dict((idx, text) for text, idx in text_features.items())

In [None]:
def plot_coefs_(coefs, top=100, num_text_features=None):
    """Bar-plot the ``top`` smallest and ``top`` largest coefficients.

    Bars are ordered by ascending value and labelled 'T' (text feature) or
    'G' (graph feature) depending on the feature index.

    Args:
        coefs: 1-D array-like of coefficient values.
        top: number of coefficients to show from each end. If the vector has
            at most ``2 * top`` entries, every coefficient is shown exactly once.
        num_text_features: indices below this value are labelled 'T', all
            others 'G'. Defaults to ``len(text_features)`` from the notebook
            namespace.

    Returns:
        ``(fig, ax)`` of the created plot.
    """
    if num_text_features is None:
        num_text_features = len(text_features)

    coefs = np.asarray(coefs)
    order = np.argsort(coefs)
    n = order.shape[0]
    if n <= 2 * top:
        # Too short for two disjoint windows: show everything once.
        selected = order
    else:
        # Exactly `top` entries from each end (n - top avoids the [-0:] pitfall).
        selected = np.concatenate((order[:top], order[n - top:]))

    # Index num_text_features is the first graph feature, hence >=.
    labels = ['G' if i >= num_text_features else 'T' for i in selected]

    fig, ax = plt.subplots()
    # `selected` is already in ascending value order, so no extra sort is needed.
    series = pd.Series(coefs[selected], index=pd.Index(labels, name='idx'), name='vals')
    series.plot(kind = 'bar', ax = ax)
    # A boolean is required: the string 'off' is truthy and would enable the grid.
    ax.grid(False)
    return fig, ax

# Bar chart of the extreme values of the summed coefficient vector.
plot_coefs_(coefs)

In [None]:
# Colormap names; the loop further down renders the coefficient heatmap once per entry.
cmaps = ['viridis', 'plasma', 'inferno', 'magma', 'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds', 'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu', 'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',  'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink', 'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia', 'hot', 'afmhot', 'gist_heat', 'copper']

def plot_coefs_heatmap(coefs, log=False, lines = (), cmap = None, fig = None, ax = None):
    """Draw a coefficient vector as a square heatmap.

    The vector is zero-padded to ``new_size ** 2`` entries and reshaped
    row-major into a ``new_size x new_size`` grid.

    Args:
        coefs: 1-D array-like of coefficient values.
        log: if True, apply a sign-preserving log scale,
            ``sign(x) * log1p(|x|)``. A plain ``log`` cannot be used here:
            the padding zeros map to -inf and negative coefficients to NaN.
        lines: flat feature indices at which to draw a horizontal marker; each
            is drawn at ``index / new_size`` on the row axis.
        cmap: colormap name or None for matplotlib's default.
        fig: accepted for backward compatibility only; the figure is always
            taken from ``ax``.
        ax: axes to draw on; a new figure and axes are created when None.

    Returns:
        The axes the heatmap was drawn on.
    """
    flat = np.asarray(coefs, dtype=float).ravel()

    size = flat.shape[0]
    new_size = int(np.floor(np.sqrt(size))) + 1
    # Pad at the end so the vector fills the square grid.
    grid = np.pad(flat, (0, new_size ** 2 - size), mode='constant').reshape(new_size, new_size)

    if log:
        grid = np.sign(grid) * np.log1p(np.abs(grid))

    if ax is None:
        _, ax = plt.subplots()

    fig = ax.get_figure()
    img = ax.imshow(grid, cmap=plt.get_cmap(cmap))
    # A boolean is required: the string 'off' is truthy and would enable the grid.
    ax.grid(False)

    for line_y in lines:
        ax.axhline(line_y / new_size)

    fig.colorbar(img, ax=ax)

    return ax

# Flat indices where one feature group ends: first the text block,
# then one boundary after each graph block.
lines = [len_text_features]
lines += [len_text_features + ((i + 1) * len_graph_features_simple) for i in range(len_graph_fast_wl_iterations)]

for cmap in cmaps:
    ax = plot_coefs_heatmap(coefs, lines = lines, cmap=cmap)
    ax.set_title(cmap)
    ax.get_figure().tight_layout()
    plt.show()
    # One figure per colormap: close it explicitly so dozens of figures do not pile up.
    plt.close(ax.get_figure())


In [None]:
# Group boundaries: 0, end of the text block, then the end of each graph block.
features_lens = [0] + lines

# Collect one frame per feature group and concatenate once, rather than
# growing a DataFrame inside the loop.
feature_frames = []
for idx, (start, end) in enumerate(zip(features_lens[:-1], features_lens[1:])):
    label = 'text' if idx == 0 else 'graph'#'graph_{}'.format(idx)
    els = coefs[start:end]
    feature_frames.append(pd.DataFrame(dict(label = [label] * len(els), coef = els), columns = ['label', 'coef']))
df_features = pd.concat(feature_frames, ignore_index = True)

# Shared bin edges so both groups are drawn on the same binning.
hist, bin_edges = np.histogram(df_features.coef, bins = 300)
fig, ax = plt.subplots(figsize = (EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG - 2))
for (feature_label, df_) in df_features.groupby('label'):
    df_.coef.plot(kind='hist', ax = ax, label = feature_label, logy = True, alpha = 0.7, bins = bin_edges, legend = True, stacked = True)
# NOTE(review): the label says 'SVM' although this notebook configures a Perceptron — confirm wording.
ax.set_xlabel('SVM coefficient value')
fig.tight_layout()

In [None]:
def get_sum_of_coefs(indices, coef_vector=None):
    """Sum the positive and the negative coefficients of a selection separately.

    Args:
        indices: slice, range or index array selecting entries of the vector.
        coef_vector: 1-D coefficient vector; defaults to the notebook-level
            ``coefs`` for backward compatibility.

    Returns:
        ``(sum of positive values, sum of negative values)``.
    """
    if coef_vector is None:
        coef_vector = coefs
    vals = np.asarray(coef_vector)[indices]
    return np.sum(vals[vals > 0]), np.sum(vals[vals < 0])

# (group name, number of features) in the order the features appear in the coefficient vector.
feature_range = [('text', len_text_features)] + [('graph_{}'.format(i + 1), len_graph_features_simple) for i in range(len_graph_fast_wl_iterations)]

# Rows are accumulated across ALL estimators; the 'type' column tells them apart.
data = collections.defaultdict(list)
for params, clf, class_coefs in estimators:
    # The stored coef_ copy is 2-D; sum over axis 0 to get one value per feature,
    # the same reduction used for the notebook-level `coefs`.
    summed_coefs = np.atleast_2d(class_coefs).sum(axis=0)

    current = 0
    for name, num_features in feature_range:
        # Half-open range [current, end): groups are contiguous and none is skipped.
        end = current + num_features
        val_plus, val_minus = get_sum_of_coefs(slice(current, end), summed_coefs)
        data['type'].append(params['classifier__penalty'])
        data['label'].append(name)
        data['plus'].append(val_plus)
        data['minus'].append(val_minus)
        data['start'].append(current)
        data['end'].append(end)
        data['num_features'].append(num_features)
        current = end

df_vals = pd.DataFrame(data).set_index('label')
df_vals['absolute'] = df_vals['minus'].abs() + df_vals['plus']
df_vals['val_per_feature'] = df_vals['absolute'] / df_vals['num_features']

# Per-penalty views. If the grid varies more than the penalty, a label occurs
# once per matching estimator in these frames.
df_vals_l1 = df_vals[df_vals['type'] == 'l1']
df_vals_l2 = df_vals[df_vals['type'] == 'l2']
df_vals

In [None]:
fig, axes = plt.subplots(nrows=2)

# df_vals_l1 / df_vals_l2 must already exist when this cell runs: one summary
# frame per penalty, indexed by feature-group label, with an 'absolute' column.
for ax, title, df_ in zip(axes, ['l1 regularization', 'l2 regularization'], [df_vals_l1, df_vals_l2]):
    # Frames that already contain the aggregate row are skipped.
    if (df_.index == 'graph_all').any():
        continue
    graph_features = df_[df_.index.str.contains('graph')]
    # numeric_only keeps string columns out of the sum; they would otherwise
    # turn the concatenated 'absolute' column into object dtype.
    sum_ = graph_features.sum(numeric_only=True).to_frame().T
    sum_.index = ['graph_all']
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_ = pd.concat([df_, sum_]).sort_index()
    df_['absolute'].plot(kind='barh', ax = ax, log=True, title=title)
    # Write each bar's value just to the right of the bar.
    for idx, (label, row) in enumerate(df_.iterrows()):
        val = row['absolute']
        ax.text(val * 1.2, idx, '{:.2f}'.format(val), fontdict=dict(horizontalalignment='left', verticalalignment='center'))

for ax in axes:
    ax.set_xlabel('sum of absolute coefs values (log)')

fig.suptitle('Sum of absolute coef values per feature type (Classifier: Perceptron)')

fig.tight_layout()
fig.subplots_adjust(top=0.87, hspace=0.6)
save_fig(fig, 'combined_coefs_l1_l2_regularization')

In [None]:
# NOTE(review): X_text / Y_text are not used by any cell visible here — confirm they are still needed.
X_text, Y_text = dataset_helper.get_dataset('ling-spam')

# Toy problem: four one-hot samples, one class each.
features = [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
labels = [1, 2, 3, 4]
# penalty='l1' with the default squared-hinge loss needs the primal
# formulation; with dual=True scikit-learn rejects the combination.
clf = sklearn.svm.LinearSVC(penalty='l1', dual=False)
clf.fit(features, labels)

In [None]:
# Summary statistics of the coefficient values, per feature group.
df_features.groupby('label')['coef'].describe()

In [None]:
def get_steps(e):
    """Return ``e.steps`` if present, else ``e.transformer_list``, else an empty list."""
    # Lazily probe the two attribute names in priority order.
    candidates = (getattr(e, attr) for attr in ('steps', 'transformer_list') if hasattr(e, attr))
    return next(candidates, [])

def print_pipeline(pipeline, depth = 0, delim = ' ' * 3, print_type = True):
    """Recursively print the named steps of a nested pipeline.

    Args:
        pipeline: object whose children are discovered through ``get_steps``.
        depth: current nesting level; each level indents by one more ``delim``.
        delim: indentation unit.
        print_type: when True, append each step's class name in parentheses.
            The flag is now forwarded to nested levels as well; previously the
            recursion dropped it, so nested steps always showed their type.
    """
    for name, step in get_steps(pipeline):
        type_suffix = '({})'.format(type(step).__name__) if print_type else ''
        print('{} {:40} {}'.format(delim * (depth + 1), name, type_suffix))
        print_pipeline(step, depth = depth + 1, delim = delim, print_type = print_type)

# Print the nested step hierarchy of `estimator`.
print_pipeline(estimator)