# Baselines

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
import pandas as pd
import helper
import matplotlib
import matplotlib.pyplot as plt
import w2v_d2v
import dataset_helper
import preprocessing
import classifier_baseline
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from transformers.wl_graph_kernel_transformer import WLGraphKernelTransformer
from transformers.preprocessing_transformer import PreProcessingTransformer
import graph_helper
import dataset_helper
import wl
import os
import pickle

for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name != 'ling-spam': continue
            
    result_file = 'data/results/text_{}.results.npy'.format(dataset_name)
    
    if os.path.exists(result_file): continue
    
    X, Y = dataset_helper.get_dataset(dataset_name, use_cached= True)
    
    p = Pipeline([
        ('preprocessing', None),
        ('count_vectorizer', sklearn.feature_extraction.text.CountVectorizer()),
        ('TfidfTransformer', sklearn.feature_extraction.text.TfidfTransformer()),
        ('clf', None)
    ])
    
    param_grid = dict(
        preprocessing = [None, PreProcessingTransformer(only_nouns = True)],
        count_vectorizer__stop_words = ['english'],
        clf = [sklearn.linear_model.PassiveAggressiveClassifier(max_iter = 1000)],
        clf__class_weight = ['balanced']
    )

    cv = sklearn.model_selection.StratifiedKFold(n_splits = 3, random_state= 42, shuffle= True)
    gscv = GridSearchCV(estimator = p, param_grid=param_grid, cv=cv, scoring = 'f1_macro', n_jobs=1, verbose = 11)
    gscv_result = gscv.fit(X, Y)
    
    with open(result_file, 'wb') as f:
        pickle.dump(gscv_result.cv_results_, f)
    print('Best score:\t{:.5f}\nBest params:\t{}'.format(gscv_result.best_score_, gscv_result.best_params_))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] clf__max_iter=1000, clf__class_weight=balanced, preprocessing=None, count_vectorizer__stop_words=english 
[CV]  clf__max_iter=1000, clf__class_weight=balanced, preprocessing=None, count_vectorizer__stop_words=english, score=0.9831845402322218, total=   3.1s
[CV] clf__max_iter=1000, clf__class_weight=balanced, preprocessing=None, count_vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__class_weight=balanced, preprocessing=None, count_vectorizer__stop_words=english, score=0.9981218498609896, total=   3.0s
[CV] clf__max_iter=1000, clf__class_weight=balanced, preprocessing=None, count_vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.3s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__class_weight=balanced, preprocessing=None, count_vectorizer__stop_words=english, score=0.988759328358209, total=   2.9s
[CV] clf__max_iter=1000, clf__class_weight=balanced, preprocessing=PreProcessingTransformer(only_nouns=True, return_text=True), count_vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.4s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__class_weight=balanced, preprocessing=PreProcessingTransformer(only_nouns=True, return_text=True), count_vectorizer__stop_words=english, score=0.9831001208385142, total=  20.5s
[CV] clf__max_iter=1000, clf__class_weight=balanced, preprocessing=PreProcessingTransformer(only_nouns=True, return_text=True), count_vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   45.1s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__class_weight=balanced, preprocessing=PreProcessingTransformer(only_nouns=True, return_text=True), count_vectorizer__stop_words=english, score=0.988759328358209, total=  19.0s
[CV] clf__max_iter=1000, clf__class_weight=balanced, preprocessing=PreProcessingTransformer(only_nouns=True, return_text=True), count_vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min remaining:    0.0s


[CV]  clf__max_iter=1000, clf__class_weight=balanced, preprocessing=PreProcessingTransformer(only_nouns=True, return_text=True), count_vectorizer__stop_words=english, score=0.9795453449456517, total=  20.8s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.9min finished


Best score:	0.99002
Best params:	{'clf__max_iter': 1000, 'clf__class_weight': 'balanced', 'preprocessing': None, 'count_vectorizer__stop_words': 'english'}


## Graphs

In [None]:
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from transformers.wl_graph_kernel_transformer import WLGraphKernelTransformer
from transformers.fast_wl_graph_kernel_transformer import FastWLGraphKernelTransformer
from transformers.phi_picker_transformer import PhiPickerTransformer
import graph_helper
import dataset_helper
import wl
import os
import pickle
import json
import tempfile
import gc

gc.collect()

TARGETS = [
    'dataset_graph_cooccurrence_1_only-nouns_ling-spam.npy',
    'dataset_graph_cooccurrence_2_only-nouns_ling-spam.npy',
    'dataset_graph_cooccurrence_3_only-nouns_ling-spam.npy',
    'dataset_graph_cooccurrence_4_only-nouns_ling-spam.npy',
    'dataset_graph_cooccurrence_1_all_ling-spam.npy',
    'dataset_graph_gml_ling-spam-single.npy'
]

for cache_file in dataset_helper.get_all_cached_graph_datasets():
    graph_dataset_cache_file = cache_file.split('/')[-1]
    
    if graph_dataset_cache_file not in TARGETS: continue
        
    result_file = 'data/results/{}.results.npy'.format(graph_dataset_cache_file)
    print('{}\tDataset File: {}'.format('#' * 10, graph_dataset_cache_file))

    if os.path.exists(result_file):
        print('\tAlready calculated result: {}'.format(result_file))
        continue

    if not os.path.exists(cache_file):
        print('\tCould not find cachefile: "{}". Skipping.'.format(cache_file))
        continue
        
    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X, Y = np.array(X), np.array(Y)
    
    p = Pipeline([
        ('wl_transformer', FastWLGraphKernelTransformer()),
        ('phi_picker', PhiPickerTransformer()),
        ('scaler', None),
        ('clf', None)
    ])

    param_grid = dict(
        wl_transformer__h=[2],
        phi_picker__return_iteration=[0, 1, 2],
        scaler = [None, sklearn.preprocessing.Normalizer(norm="l1", copy = False)],
        clf = [sklearn.linear_model.PassiveAggressiveClassifier(max_iter = 1000)],
        clf__max_iter=[1000],
        clf__tol = [1e-3],
        clf__class_weight=['balanced']
    )

    cv = sklearn.model_selection.StratifiedKFold(n_splits = 3, random_state= 42, shuffle= True)
    gscv = GridSearchCV(
        estimator = p,
        param_grid=param_grid,
        cv=cv,
        scoring = 'f1_macro',
        n_jobs=1,
        verbose = 11
    )
    
    try:
        gscv_result = gscv.fit(X, Y)
    except Exception as e:
        print('Error occured during fitting: {}'.format(e))
        continue
    with open(result_file, 'wb') as f:
        pickle.dump(gscv_result.cv_results_, f)


##########	Dataset File: dataset_graph_cooccurrence_1_all_ling-spam.npy
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced 
FastWLGraphKernelTransformer.fit: len(X)=1928, H=2
FastWLGraphKernelTransformer.fit: Found empty graphs in training set: 1
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
FastWLGraphKernelTransformer.transform: len(X)=965, H=2
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
            

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.3s remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.9962717449316616, total=  22.8s
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_we

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.9774053673971641, total=  23.0s
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=Fal

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.6min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
FastWLGraphKernelTransformer.transform: len(X)=965, H=2
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=False, norm='l1'), clf__class_weight=balanced, score=0.985090077562498, total=  21.2s
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False)

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.1min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=False, norm='l1'), clf__class_weight=balanced, score=0.9907481165123087, total=  22.2s
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.6min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=False, norm='l1'), clf__class_weight=balanced, score=0.9868529490269273, total=  22.8s
[CV] phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.1min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
FastWLGraphKernelTransformer.transform: len(X)=965, H=2
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
[CV]  phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.879621775360511, total=  20.9s
[CV] phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_wei

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.6min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.9062743064557947, total=  23.1s
[CV] phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_we

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.2min remaining:    0.0s


FastWLGraphKernelTransformer.fit: Found empty graphs in training set: 0
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.8679942878381814, total=  24.6s
[CV] phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.7min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
FastWLGraphKernelTransformer.transform: len(X)=965, H=2
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
[CV]  phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=False, norm='l1'), clf__class_weight=balanced, score=0.8259082619363134, total=  22.0s
[CV] phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  5.2min remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=False, norm='l1'), clf__class_weight=balanced, score=0.7146960995940753, total=  21.5s
[CV] phi_picker__return_iteration=1, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False

FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=2, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=False, norm='l1'), clf__class_weight=balanced, score=0.7673118878522025, total=  22.0s
FastWLGraphKernelTransformer.fit: len(X)=2893, H=2
FastWLGraphKernelTransformer.fit: Found empty graphs in training set: 0


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.9min finished


FastWLGraphKernelTransformer.transform: len(X)=2893, H=2
##########	Dataset File: dataset_graph_cooccurrence_1_only-nouns_ling-spam.npy
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced 
FastWLGraphKernelTransformer.fit: len(X)=1928, H=2
FastWLGraphKernelTransformer.fit: Found empty graphs in training set: 1
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
FastWLGraphKernelTransformer.transform: len(X)=965, H=2
FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressive

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.5s remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.9888152347949847, total=  12.4s
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_we

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   36.8s remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
FastWLGraphKernelTransformer.transform: len(X)=964, H=2
FastWLGraphKernelTransformer.transform: len(X)=1929, H=2
[CV]  phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, score=0.9698738231962187, total=  13.4s
[CV] phi_picker__return_iteration=0, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight='balanced',
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=0.001,
              verbose=0, warm_start=False), scaler=Normalizer(copy=Fal

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   54.6s remaining:    0.0s


FastWLGraphKernelTransformer.transform: len(X)=1928, H=2
FastWLGraphKernelTransformer.transform: len(X)=965, H=2


## Appendix

### Pre-process docs for d2v and w2v

In [None]:
w2v_data_train = [w2v_d2v.w2v_preproess(doc) for doc in data_train_X]
w2v_data_test = [w2v_d2v.w2v_preproess(doc) for doc in data_test_X]

### Train Doc2Vec

In [None]:
model_d2v = w2v_d2v.train_d2v(w2v_data_train, data_train_Y, iterations = 5)

In [None]:
scores = w2v_d2v.score_d2v(clfs, data_train_Y, data_test_Y, model_d2v, w2v_data_train, w2v_data_test, steps = 1)