In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import gc
import json
import os
import pickle
import tempfile

import numpy as np
import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from transformers.wl_graph_kernel_transformer import WLGraphKernelTransformer
from transformers.fast_wl_graph_kernel_transformer import FastWLGraphKernelTransformer
from transformers.phi_picker_transformer import PhiPickerTransformer
import graph_helper
import dataset_helper
import wl

# Run a full garbage-collection pass before any dataset is loaded below.
gc.collect()

# File names (without directory) of the cached graph datasets to evaluate.
# The loop below skips every cache file whose base name is not listed here.
TARGETS = [
    'dataset_graph_cooccurrence_1_only-nouns_ling-spam.npy',
    'dataset_graph_cooccurrence_1_all_ling-spam.npy'
]

# Grid-search a WL graph-kernel classification pipeline on the targeted cached
# graph datasets and pickle the resulting ``cv_results_`` per dataset.
#
# For each cache file returned by ``dataset_helper`` the loop:
#   1. skips it unless its base name is listed in TARGETS,
#   2. skips it if a result file already exists or the cache file is missing,
#   3. loads X/Y, fits a GridSearchCV over the pipeline and stores the results.
#
# ``gscv_result`` is left bound to the fitted GridSearchCV so that the next
# notebook cell can inspect it.
for cache_file in dataset_helper.get_all_cached_graph_datasets():
    # os.path.basename instead of split('/') so the lookup does not depend on
    # the path separator used by the platform.
    graph_dataset_cache_file = os.path.basename(cache_file)

    if graph_dataset_cache_file not in TARGETS:
        continue

    result_file = 'data/results/{}.results.npy'.format(graph_dataset_cache_file)
    print('{}\tDataset File: {}'.format('#' * 10, graph_dataset_cache_file))

    if os.path.exists(result_file):
        print('\tAlready calculated result: {}'.format(result_file))
        continue

    if not os.path.exists(cache_file):
        print('\tCould not find cachefile: "{}". Skipping.'.format(cache_file))
        continue

    # Make sure the output directory exists *before* the expensive fit, so a
    # missing folder cannot cause the finished grid search to be lost when the
    # result file is opened for writing.
    os.makedirs(os.path.dirname(result_file), exist_ok=True)

    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X, Y = np.array(X), np.array(Y)

    # 'scaler' and 'clf' are placeholders; the concrete estimators are injected
    # through the parameter grid below. Step caching via Pipeline(memory=...)
    # is currently not enabled.
    p = Pipeline([
        ('wl_transformer', FastWLGraphKernelTransformer()),
        ('phi_picker', PhiPickerTransformer()),
        ('scaler', None),
        ('clf', None),
    ])

    param_grid = {
        'wl_transformer__h': [2],
        'phi_picker__return_iteration': [-1, 0, 1],
        'scaler': [sklearn.preprocessing.Normalizer(norm="l1")],
        'clf': [sklearn.linear_model.PassiveAggressiveClassifier(max_iter=1000)],
        # Hyper-parameters applied to whichever estimator is set as 'clf'.
        'clf__max_iter': [1000],
        'clf__tol': [1e-3],
        'clf__class_weight': ['balanced'],
    }

    cv = GridSearchCV(
        estimator=p,
        param_grid=param_grid,
        cv=3,
        scoring='f1_macro',
        n_jobs=1,
        verbose=11,
    )

    gscv_result = cv.fit(X, Y)

    # NOTE: despite the '.npy' suffix the file holds a pickle of the
    # ``cv_results_`` dict, not a NumPy array file; read it with pickle.load.
    with open(result_file, 'wb') as f:
        pickle.dump(gscv_result.cv_results_, f)

    # Stop after the first dataset that was actually fitted. Datasets whose
    # result file already exists are skipped above, so re-running the cell
    # moves on to the next outstanding target.
    break

In [None]:
# Display the best score and the parameter combination that achieved it for
# the grid search fitted in the previous cell (scored with 'f1_macro').
# NOTE: gscv_result is only bound if the previous cell actually fitted a
# dataset; if every target was skipped this raises a NameError.
gscv_result.best_score_, gscv_result.best_params_