# Baselines

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import helper
import matplotlib
import matplotlib.pyplot as plt
import w2v_d2v
import dataset_helper
import preprocessing
import classifier_baseline
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from transformers.wl_graph_kernel_transformer import WLGraphKernelTransformer
from transformers.preprocessing_transformer import PreProcessingTransformer
import graph_helper
import dataset_helper
import wl
import os
import pickle

# Text baseline: for each selected dataset, grid-search a
# CountVectorizer -> TfidfTransformer -> classifier pipeline with 3-fold
# stratified cross-validation and pickle the resulting ``cv_results_``.

# ``import sklearn`` alone does not guarantee that the
# ``feature_extraction.text`` submodule is loaded; import it explicitly instead
# of relying on another module having imported it as a side effect.
import sklearn.feature_extraction.text

# Only the datasets listed here are evaluated; all others are skipped.
TEXT_DATASETS_TO_EVALUATE = ('ling-spam',)

for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in TEXT_DATASETS_TO_EVALUATE:
        continue

    result_file = 'data/results/text_{}.results.npy'.format(dataset_name)

    # An existing result file means this dataset was already evaluated.
    if os.path.exists(result_file):
        continue

    # Create the output folder *before* the expensive grid search, so that a
    # missing directory cannot make the final write fail and discard the run.
    os.makedirs(os.path.dirname(result_file), exist_ok=True)

    X, Y = dataset_helper.get_dataset(dataset_name, use_cached=True)

    # 'preprocessing' and 'clf' are placeholders; the parameter grid below
    # substitutes the actual candidates for them.
    pipeline = Pipeline([
        ('preprocessing', None),
        ('count_vectorizer', sklearn.feature_extraction.text.CountVectorizer()),
        ('TfidfTransformer', sklearn.feature_extraction.text.TfidfTransformer()),
        ('clf', None),
    ])

    param_grid = dict(
        preprocessing=[None, PreProcessingTransformer(only_nouns=True)],
        count_vectorizer__stop_words=['english'],
        clf=[sklearn.linear_model.PassiveAggressiveClassifier(max_iter=1000)],
        clf__class_weight=['balanced'],
    )

    # Fixed seed so the fold assignment is reproducible across runs.
    cv = sklearn.model_selection.StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
    gscv = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring='f1_macro',
        n_jobs=1,
        verbose=11,
    )
    gscv_result = gscv.fit(X, Y)

    # NOTE: despite the ``.npy`` suffix the file is a pickle of ``cv_results_``
    # and must be read back with ``pickle.load``.
    with open(result_file, 'wb') as f:
        pickle.dump(gscv_result.cv_results_, f)
    print('Best score:\t{:.5f}\nBest params:\t{}'.format(gscv_result.best_score_, gscv_result.best_params_))

## Graphs

In [2]:
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from transformers.wl_graph_kernel_transformer import WLGraphKernelTransformer
from transformers.fast_wl_graph_kernel_transformer import FastWLGraphKernelTransformer
from transformers.phi_picker_transformer import PhiPickerTransformer
import graph_helper
import dataset_helper
import wl
import os
import pickle
import json
import tempfile
import gc
import traceback
import logging

# Graph baseline: for each cached graph dataset, grid-search a
# FastWLGraphKernelTransformer -> PhiPickerTransformer -> (scaler) -> classifier
# pipeline with 3-fold stratified cross-validation and pickle ``cv_results_``.

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

gc.collect()

for cache_file in dataset_helper.get_all_cached_graph_datasets():
    # File name without its directory; used to derive the result file name.
    graph_dataset_cache_file = os.path.basename(cache_file)

    result_file = 'data/results/{}.results.npy'.format(graph_dataset_cache_file)
    logger.info('{}\tDataset File: {}'.format('#' * 10, graph_dataset_cache_file))

    # An existing result file means this dataset was already evaluated.
    if os.path.exists(result_file):
        logger.warning('\tAlready calculated result: {}'.format(result_file))
        continue

    if not os.path.exists(cache_file):
        logger.warning('\tCould not find cachefile: "{}". Skipping.'.format(cache_file))
        continue

    # Create the output folder *before* the expensive grid search, so that a
    # missing directory cannot make the final write fail and discard the run.
    os.makedirs(os.path.dirname(result_file), exist_ok=True)

    # Run a collection pass before loading the next dataset.
    gc.collect()

    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X, Y = np.array(X), np.array(Y)

    # 'scaler' and 'clf' are placeholders; the parameter grid below substitutes
    # the actual candidates for them.
    pipeline = Pipeline([
        ('wl_transformer', FastWLGraphKernelTransformer()),
        ('phi_picker', PhiPickerTransformer()),
        ('scaler', None),
        ('clf', None),
    ])

    # ``clf__max_iter`` repeats the value already passed to the classifier's
    # constructor; it is kept so the keys of ``cv_results_`` stay unchanged.
    param_grid = dict(
        wl_transformer__h=[2],
        phi_picker__return_iteration=[0, 1, 2],
        scaler=[None, sklearn.preprocessing.Normalizer(norm="l1", copy=False)],
        clf=[sklearn.linear_model.PassiveAggressiveClassifier(max_iter=1000)],
        clf__max_iter=[1000],
        clf__tol=[1e-3],
        clf__class_weight=['balanced'],
    )

    # Fixed seed so the fold assignment is reproducible across runs.
    cv = sklearn.model_selection.StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
    gscv = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring='f1_macro',
        n_jobs=1,
        verbose=11,
    )

    # A failure on one dataset is logged (with traceback) and must not abort
    # the remaining datasets.
    try:
        gscv_result = gscv.fit(X, Y)
    except Exception as e:
        logger.exception(e)
        continue

    # NOTE: despite the ``.npy`` suffix the file is a pickle of ``cv_results_``
    # and must be read back with ``pickle.load``.
    with open(result_file, 'wb') as f:
        pickle.dump(gscv_result.cv_results_, f)

INFO:root:##########	Dataset File: dataset_graph_cooccurrence_1_all_cade-ana.npy


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] phi_picker__return_iteration=0, clf=PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=1000, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False), scaler=None, clf__class_weight=balanced, wl_transformer__h=2, clf__tol=0.001, clf__max_iter=1000 
FastWLGraphKernelTransformer.fit: len(X)=27317, H=2
FastWLGraphKernelTransformer.fit: Found empty graphs in training set: 3


KeyboardInterrupt: 

## Appendix

### Pre-process docs for d2v and w2v

In [None]:
# Apply the shared w2v/d2v preprocessing function to every training and test
# document.
# NOTE(review): `data_train_X` / `data_test_X` are not defined in the visible
# cells; they must already exist in the notebook session.
w2v_data_train = list(map(w2v_d2v.w2v_preproess, data_train_X))
w2v_data_test = list(map(w2v_d2v.w2v_preproess, data_test_X))

### Train Doc2Vec

In [None]:
# Train the Doc2Vec model from the preprocessed training documents and their
# labels.
# NOTE(review): `data_train_Y` is not defined in the visible cells; it must
# already exist in the notebook session.
model_d2v = w2v_d2v.train_d2v(
    w2v_data_train,
    data_train_Y,
    iterations=5,
)

In [None]:
# Score the trained Doc2Vec model on the train/test split.
# NOTE(review): `clfs`, `data_train_Y` and `data_test_Y` are not defined in the
# visible cells; they must already exist in the notebook session.
scores = w2v_d2v.score_d2v(
    clfs,
    data_train_Y,
    data_test_Y,
    model_d2v,
    w2v_data_train,
    w2v_data_test,
    steps=1,
)