In [None]:
%%javascript
// Replace Notebook.prototype.scroll_to_bottom with a no-op function.
// NOTE(review): presumably this keeps the page from jumping to the bottom while
// long outputs are produced further down — confirm against the notebook frontend in use.
require("notebook/js/notebook").Notebook.prototype.scroll_to_bottom = function () {}

In [None]:
from notebook_prelude import *

In [None]:
# Number of parallel jobs and verbosity level handed to GridSearchCV.
N_JOBS = 2
VERBOSE = 11

# Dataset to evaluate: keep exactly one line active. Previously 'ling-spam' was
# assigned and then immediately overwritten, which made the active choice unclear.
#DATASET = 'ling-spam'
DATASET = 'ng20'
#DATASET = 'reuters-21578'

In [None]:
from classification.classifiers import get_classifiers

# Candidate estimators for the 'classifier' pipeline step, together with the
# hyper-parameter values searched for them.
classifiers = get_classifiers()
classifier_param_grid = {
    'classifier': classifiers,
    'classifier__max_iter': [2000],
    'classifier__C': [2e-2, 1e-2],
    'classifier__tol': [1e-4],
    # Disabled alternatives:
    #   additional C values: 1e-1, 1
    #   'classifier__dual': [False]
    #   'classifier__penalty': ['l1']
}

## Combined graph and text

In [None]:
def get_cv(X, Y, use_cv = False):
    """Build the value passed as ``cv`` to the grid search.

    Args:
        X: Samples; only used for the single hold-out split.
        Y: Labels; only used for the single hold-out split.
        use_cv: When truthy, return a shuffled 3-fold ``StratifiedKFold`` seeded
            with 42. Otherwise return a one-element list holding a single
            (train, test) pair.

    Returns:
        Either the ``StratifiedKFold`` splitter or ``[(train, test)]``, where the
        pair is the last two of the six values returned by the project's
        ``train_test_split`` (presumably index arrays — TODO confirm).
    """
    if not use_cv:
        split = train_test_split(X, Y, test_size=0.33, is_precomputed=False)
        # Only the fifth and sixth returned values are needed here.
        _, _, _, _, train_part, test_part = split
        return [(train_part, test_part)]

    return sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
def process(X, Y, estimator, param_grid, use_cv = False):
    """Run a macro-F1 grid search over ``param_grid`` and return the fitted search.

    Args:
        X: Samples passed to ``GridSearchCV.fit``.
        Y: Labels passed to ``GridSearchCV.fit``.
        estimator: Estimator (typically a pipeline) to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        use_cv: Forwarded to ``get_cv``. The default ``False`` keeps the previous
            behaviour (one hold-out split); ``True`` selects the stratified
            3-fold splitter that was previously unreachable from this function.

    Returns:
        The fitted ``GridSearchCV`` instance (``fit`` returns the object itself).
    """
    gscv = sklearn.model_selection.GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=get_cv(X, Y, use_cv=use_cv),
        scoring='f1_macro',
        n_jobs=N_JOBS,
        verbose=VERBOSE,
        refit='f1_macro',
    )
    return gscv.fit(X, Y)

def get_combined_graph_datasets(graph_cache_file_filter = None):
    """Yield the cached 'concept' graph datasets as combined (graph, text) samples.

    A cache file is skipped when ``graph_cache_file_filter`` is truthy and is not
    contained in the file name, or when the file name does not contain 'concept'.

    Yields:
        Tuples ``(graph_cache_file, X, Y, num_vertices)`` where ``X`` is a list of
        ``(graph, text)`` pairs and ``num_vertices`` is the total node count over
        all graphs plus one for every graph that has no nodes or no edges.
    """
    for cache_file in dataset_helper.get_all_cached_graph_datasets():
        filtered_out = bool(graph_cache_file_filter) and graph_cache_file_filter not in cache_file
        if filtered_out or 'concept' not in cache_file:
            continue

        samples, labels = graph_helper.get_filtered_text_graph_dataset(cache_file)
        graphs = [graph for (graph, _, _) in samples]

        node_total = 0
        degenerate_graphs = 0
        for graph in graphs:
            node_total += nx.number_of_nodes(graph)
            if nx.number_of_nodes(graph) == 0 or nx.number_of_edges(graph) == 0:
                degenerate_graphs += 1

        # Drop the third element of every sample; only graph and text are kept.
        graph_text_pairs = [(graph, text) for (graph, text, _) in samples]
        yield cache_file, graph_text_pairs, labels, node_total + degenerate_graphs

In [None]:
from transformers.pipelines import graph_pipeline
from transformers import fast_wl_graph_kernel_transformer

# Combined graph + text pipeline and the parameter grid that comes with it.
pipeline, param_grid = graph_pipeline.get_combined_pipeline()

# Values that override / extend the grid returned above.
param_grid_values = {
    # Graph
    'features__fast_wl_pipeline__feature_extraction__fast_wl__h': [7],
    'features__fast_wl_pipeline__feature_extraction__fast_wl__round_to_decimals': [-1],
    'features__fast_wl_pipeline__feature_extraction__fast_wl__node_weight_function': [
        None,
        fast_wl_graph_kernel_transformer.degrees_metric,
        fast_wl_graph_kernel_transformer.pagerank_metric,
    ],
    'features__fast_wl_pipeline__feature_extraction__phi_picker__return_iteration': ['stacked'],
    'features__fast_wl_pipeline__feature_extraction__normalizer': [sklearn.preprocessing.MaxAbsScaler()],
    # Text
    'features__text__vectorizer__vectorizer__binary': [True],
    'features__text__vectorizer__vectorizer__ngram_range': [(1, 1)],
    'features__text__vectorizer__vectorizer__stop_words': ['english'],
}

# On key collisions the later mapping wins: explicit values override the
# pipeline's grid, and the feature grid overrides the classifier grid.
combined_param_grid = dict(param_grid, **param_grid_values)
estimator_param_grid = {**classifier_param_grid, **combined_param_grid}
estimator_param_grid

In [None]:
from classification.classification_tasks import train_test_split

# Grid-search the combined pipeline on the first cached graph dataset whose
# file name contains DATASET (the generator uses its argument as a substring filter).
for graph_cache_file, X_combined, Y_combined, num_vertices in get_combined_graph_datasets(DATASET):
    print(graph_cache_file)
    # phi_dim is dataset dependent, so it is filled in per dataset.
    # NOTE(review): this writes into the shared estimator_param_grid in place, so the
    # grid keeps the last dataset's value after this cell has run.
    estimator_param_grid['features__fast_wl_pipeline__feature_extraction__fast_wl__phi_dim'] = [num_vertices]
    gscv_result = process(X_combined, Y_combined, pipeline, estimator_param_grid)
    print(gscv_result)
    # Stop after the first matching dataset; gscv_result stays undefined if none matched.
    break

In [None]:
# Tabulate the combined grid-search results (cv_results_ of the fitted search).
df = pd.DataFrame(gscv_result.cv_results_)
df

## Text-only

In [None]:
# Take the 'vectorizer' step out of the first transformer of the combined
# pipeline's 'features' step and wrap it in a text-only pipeline.
text_vectorizer = pipeline.named_steps['features'].transformer_list[0][1].named_steps['vectorizer']
text_only_pipeline = sklearn.pipeline.Pipeline([
    ('preprocessing', None),
    ('vectorizer', text_vectorizer),
    ('classifier', None),
])

# Keep only the classifier and text parameters, dropping the
# 'features__text__vectorizer__' prefix from their names.
text_only_param_grid = {
    name.replace('features__text__vectorizer__', ''): values
    for name, values in estimator_param_grid.items()
    if name.startswith(('classifier', 'features__text'))
}
text_only_param_grid

In [None]:
# Grid-search on the plain-text version of DATASET only.
for dataset in dataset_helper.get_all_available_dataset_names():
    if dataset != DATASET:
        continue
    X, Y = dataset_helper.get_dataset(dataset)
    # Use a dedicated name instead of rebinding the global `pipeline`: the cell that
    # derives text_vectorizer reads pipeline.named_steps['features'] and would fail
    # on re-run if `pipeline` pointed at the vectorizer step.
    # NOTE(review): the searched estimator is the 'vectorizer' step rather than
    # text_only_pipeline itself — confirm this is intended.
    text_only_estimator = text_only_pipeline.named_steps['vectorizer']
    gscv_result_text_only = process(X, Y, text_only_estimator, text_only_param_grid)
    break

In [None]:
# Collect both result tables and tag every row with the experiment it came from.
df = pd.DataFrame(gscv_result.cv_results_)
df_text = pd.DataFrame(gscv_result_text_only.cv_results_)

df_text['type'] = 'text'
df['type'] = 'combined'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows the same way (original indices kept, union of columns).
df_all = pd.concat([df_text, df])

In [None]:
def get_node_weight_function_name(x):
    """Return a printable label for a node-weight-function grid value.

    Args:
        x: A grid value: a callable, ``None``, or anything else (e.g. the NaN
            that fills this column for rows without the parameter).

    Returns:
        ``'None'`` for ``None``; the callable's name for callables; ``''`` for
        every other value. Callables without ``__name__`` no longer raise
        ``AttributeError``: a ``functools.partial`` is labelled with the name of
        the function it wraps, any other callable object with its class name.
    """
    if x is None:
        return 'None'
    if not callable(x):
        return ''

    name = getattr(x, '__name__', None)
    if name is None:
        # functools.partial exposes the wrapped callable as ``.func``.
        name = getattr(getattr(x, 'func', None), '__name__', None)
    if name is None:
        name = type(x).__name__
    return name
    

# Label every row with the name of its node weight function, then show the best
# mean test score per experiment type and node weight function.
df_all['node_weight_function'] = df_all[
    'param_features__fast_wl_pipeline__feature_extraction__fast_wl__node_weight_function'
].apply(get_node_weight_function_name)
df_all.groupby(['type', 'node_weight_function'])['mean_test_score'].max().to_frame()

In [None]:
# Debug inspection: shows the attribute dictionary of the combined grid-search object.
# NOTE(review): looks like leftover debugging output — consider removing before sharing.
vars(gscv_result)