In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Figure settings: a DPI value plus a regular and a big (width, height) pair.
# The scalar width/height names and the tuple names describe the same values.
EXPORT_DPI = 100
EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT = 8, 4
EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG = 10, 7
EXPORT_FIG_SIZE = (EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT)
EXPORT_FIG_SIZE_BIG = (EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG)

import pandas as pd

# Show up to 80 rows and (effectively) every column when a frame is displayed.
pd.options.display.max_rows = 80
pd.options.display.max_columns = 999
# Never truncate cell contents. `None` is the supported spelling for
# "unlimited"; the former sentinel -1 was deprecated in pandas 1.0 and is
# rejected by current releases. Very old pandas only accepts integers, so fall
# back to -1 there.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1

import seaborn as sns
import matplotlib.pyplot as plt
# Seaborn theme spelled out by keyword: context, style and colour palette.
sns.set(context='notebook', style='whitegrid', palette='deep')
# Figures created without an explicit size use the big figure size.
plt.rcParams['figure.figsize'] = EXPORT_FIG_SIZE_BIG

In [None]:
import collections

import networkx as nx
import numpy as np
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
from sklearn import pipeline
from sklearn import svm
from sklearn import preprocessing

from utils import graph_helper, dataset_helper
from transformers import fast_wl_pipeline, text_pipeline
from transformers.tuple_selector import TupleSelector



def get_classifier():
    """Assemble the combined text + graph pipeline and its parameter-grid skeleton.

    Returns:
        tuple: ``(pipeline, grid_params)``.

        ``pipeline`` is an sklearn ``Pipeline`` with three steps: ``features``
        (a ``FeatureUnion`` of a ``tfidf`` branch fed by
        ``TupleSelector(tuple_index=1)`` and a ``fast_wl_pipeline`` branch fed
        by ``TupleSelector(tuple_index=0, v_stack=False)``, both weighted 1),
        ``scaler`` (``MaxAbsScaler``) and ``classifier`` (left as ``None``).

        ``grid_params`` maps pipeline parameter names to lists of candidate
        values. The ``classifier`` entry and the fast-WL ``phi_dim`` entry are
        empty lists, i.e. placeholders that have to be filled in before the
        grid can produce any parameter combination.
    """
    fast_wl_prefix = 'features__fast_wl_pipeline__feature_extraction__'
    fast_wl_candidates = {
        'fast_wl__h': [5],
        'fast_wl__phi_dim': [None],
        'fast_wl__round_to_decimals': [10],
        'phi_picker__return_iteration': ['stacked'],
        'phi_picker__use_zeroth': [False]
    }

    grid_params_combined = {'classifier': []}
    for param_name, candidates in fast_wl_candidates.items():
        grid_params_combined[fast_wl_prefix + param_name] = candidates
    # The [None] candidate for phi_dim is replaced by an empty placeholder list.
    grid_params_combined[fast_wl_prefix + 'fast_wl__phi_dim'] = []

    text_branch = sklearn.pipeline.Pipeline([
        ('selector', TupleSelector(tuple_index=1)),
        ('tfidf', text_pipeline.get_pipeline()),
    ])
    graph_branch = sklearn.pipeline.Pipeline([
        ('selector', TupleSelector(tuple_index=0, v_stack=False)),
        ('feature_extraction', fast_wl_pipeline.get_pipeline())
    ])
    combined_features = sklearn.pipeline.FeatureUnion(
        [('tfidf', text_branch), ('fast_wl_pipeline', graph_branch)],
        transformer_weights={'tfidf': 1, 'fast_wl_pipeline': 1}
    )

    full_pipeline = sklearn.pipeline.Pipeline([
        ('features', combined_features),
        ('scaler', sklearn.preprocessing.MaxAbsScaler()),
        ('classifier', None)
    ])

    return full_pipeline, grid_params_combined

# Stratified 3-fold split; shuffling uses a fixed seed so folds are repeatable.
cv = sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Per-fold record: the true test labels and a list of prediction arrays
# (one entry is appended per evaluated parameter setting).
Result = collections.namedtuple('Result', 'y_true y_preds')

# Dataset whose cached graph files are evaluated.
# Alternatives used so far: 'ling-spam', None.
dataset = 'ng20'

for graph_cache_file in dataset_helper.get_all_cached_graph_datasets(dataset):
    # Only cache files whose name contains both 'concept' and 'v2' are evaluated.
    if 'concept' not in graph_cache_file or 'v2' not in graph_cache_file:
        continue
    #if 'cooccurrence' not in graph_cache_file: continue
    print(graph_cache_file)
    X_combined, Y_combined = graph_helper.get_filtered_text_graph_dataset(graph_cache_file)

    # Each element of X_combined is a 3-tuple whose first two entries are the
    # graph and the text; the third entry is not used here.
    graphs = [g for (g, _, _) in X_combined]
    empty_graphs = sum(1 for g in graphs if nx.number_of_nodes(g) == 0 or nx.number_of_edges(g) == 0)
    # Total node count plus one extra slot per empty graph; used as phi_dim below.
    num_vertices = sum(nx.number_of_nodes(g) for g in graphs) + empty_graphs
    # NOTE(review): the return value is discarded, so this presumably converts
    # `graphs` in place. X_combined below is rebuilt from the original tuples;
    # confirm the conversion actually reaches the data that is fitted.
    fast_wl_pipeline.convert_graphs_to_tuples(graphs)
    X_combined = [(graph, text) for (graph, text, _) in X_combined]

    clfs, params = get_classifier()
    # Fixed random_state: the classifier shuffles the training data, so without
    # a seed the compared predictions differ from run to run.
    clf = sklearn.linear_model.PassiveAggressiveClassifier(class_weight='balanced', max_iter=10000, verbose=0, tol=1e-5, random_state=42)
    #clf = sklearn.svm.LinearSVC(class_weight = 'balanced', max_iter = 10000, verbose = 1, tol = 1e-6)
    params['classifier'] = [clf]
    #params['features__transformer_weights'] = [{'tfidf': 1, 'fast_wl_pipeline': 1}, {'tfidf': 1, 'fast_wl_pipeline': 0}]
    #params['features__fast_wl_pipeline__feature_extraction__fast_wl__h'] = [1, 10]
    # The two settings that are compared against each other.
    params['features__fast_wl_pipeline__feature_extraction__phi_picker__use_zeroth'] = [True, False]
    params['features__fast_wl_pipeline__feature_extraction__fast_wl__phi_dim'] = [num_vertices]

    grid = sklearn.model_selection.ParameterGrid(params)

    # Later cells unpack exactly two prediction arrays per fold.
    assert len(grid) == 2, 'expected exactly two parameter settings, got {}'.format(len(grid))

    # Build the arrays once instead of four times per fold.
    X_all, Y_all = np.array(X_combined), np.array(Y_combined)

    results = []
    for train, test in cv.split(X_combined, Y_combined):
        X_train, Y_train, X_test, Y_test = X_all[train], Y_all[train], X_all[test], Y_all[test]
        result = Result(Y_test, [])
        for params_ in grid:
            print('set_params', params_)
            clfs.set_params(**params_)
            clfs.fit(X_train, Y_train)
            Y_pred = clfs.predict(X_test)
            # Predictions are stored in the grid's iteration order.
            result.y_preds.append(Y_pred)
        results.append(result)
        # Only the first fold is evaluated.
        break
    # Only the first matching cache file is evaluated.
    break
else:
    # Reached only when no file passed the filter above; without this the
    # following cells fail with a NameError on `results` / `clfs`.
    raise RuntimeError(
        "no cached graph dataset for {!r} has both 'concept' and 'v2' in its file name".format(dataset)
    )
        
        
#f1 = sklearn.metrics.f1_score(y_true=Y_test, y_pred=Y_pred, average='macro')
#clf_ = clfs.named_steps['classifier']
#num_text_features = len(clfs.named_steps['features'].transformer_list[0][1].named_steps['tfidf'].named_steps['TfidfTransformer'].vocabulary_.keys())
#num_graph_features = coefs.shape[1] - num_text_features

In [None]:
# Macro-averaged F1 of the two compared predictions, one output line per fold.
for result in results:
    y_true = result.y_true
    y_pred_a, y_pred_b = result.y_preds
    f1, f1_ = (
        sklearn.metrics.f1_score(y_true, y_pred, average='macro')
        for y_pred in (y_pred_a, y_pred_b)
    )
    print(f1, f1_)

In [None]:
import numpy as np

def get_transformed_results(result):
    """Encode the labels of one fold result as integer codes.

    The codes index the sorted set of labels that occur in ``y_true`` or in
    either prediction array. Building the code table from all three arrays
    (rather than from ``y_true`` alone) means a predicted label that never
    occurs among the true labels is encoded too, instead of raising.
    Whenever every predicted label also occurs in ``y_true`` the codes are the
    positions in the sorted true labels.

    Args:
        result: object with a ``y_true`` attribute (sequence of labels) and a
            ``y_preds`` attribute holding exactly two prediction sequences.

    Returns:
        tuple: ``(y_true, y_pred_a, y_pred_b)`` as integer numpy arrays.

    Raises:
        ValueError: if ``result.y_preds`` does not hold exactly two entries.
    """
    y_true = np.asarray(result.y_true)
    y_pred_a, y_pred_b = (np.asarray(y_pred) for y_pred in result.y_preds)

    # One shared code table for all three arrays keeps the codes comparable.
    stacked = np.concatenate([y_true, y_pred_a, y_pred_b])
    _, codes = np.unique(stacked, return_inverse=True)
    codes = np.asarray(codes).ravel()

    end_true = len(y_true)
    end_a = end_true + len(y_pred_a)
    return codes[:end_true].copy(), codes[end_true:end_a].copy(), codes[end_a:].copy()

def randomization_test(y_true, y_pred_a, y_pred_b, metric=None, num_trails=1000, random_state=None):
    """Sample metric differences under random swapping of two prediction sets.

    In every trial each position independently keeps its original assignment
    or has the predictions of A and B swapped (probability 1/2 each). The
    metric is computed for both shuffled prediction arrays and the difference
    ``metric(shuffled_a) - metric(shuffled_b)`` is recorded.

    Args:
        y_true: true labels.
        y_pred_a: predictions of the first model, same length as ``y_true``.
        y_pred_b: predictions of the second model, same length as ``y_true``.
        metric: callable ``metric(y_true, y_pred) -> float``. ``None`` selects
            ``sklearn.metrics.f1_score`` with its default arguments, looked up
            at call time.
        num_trails: number of random trials (the parameter name is kept as is
            for compatibility with existing callers).
        random_state: optional seed for a private ``numpy.random.RandomState``.
            ``None`` draws from numpy's global random state.

    Returns:
        numpy.ndarray: float64 array of length ``num_trails`` holding the
        per-trial metric differences.
    """
    if metric is None:
        # Imported lazily so that defining this function does not require
        # sklearn.metrics to be loaded already.
        import sklearn.metrics
        metric = sklearn.metrics.f1_score

    y_true = np.asarray(y_true)
    y_pred_a = np.asarray(y_pred_a)
    y_pred_b = np.asarray(y_pred_b)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    metrics = np.empty(num_trails, dtype=np.float64)
    for i in range(num_trails):
        swap = rng.randint(0, 2, size=len(y_true)).astype(bool)
        # np.where keeps the label dtype, so labels are not forced into uint16.
        y_shuffled_a = np.where(swap, y_pred_b, y_pred_a)
        y_shuffled_b = np.where(swap, y_pred_a, y_pred_b)
        metrics[i] = metric(y_true, y_shuffled_a) - metric(y_true, y_shuffled_b)

    return metrics

def f1_metric(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` with respect to ``y_true``."""
    macro_f1 = sklearn.metrics.f1_score(y_true, y_pred, average='macro')
    return macro_f1

metric = f1_metric
# Seed numpy's global random state so the sampled histogram (and the p-value)
# is the same on every run of this cell.
np.random.seed(42)
for result in results:
    y_true, y_pred_a, y_pred_b = get_transformed_results(result)
    metric_a, metric_b = metric(y_true, y_pred_a), metric(y_true, y_pred_b)
    # Observed difference between the two prediction sets.
    diff = metric_a - metric_b
    res = randomization_test(y_true, y_pred_a, y_pred_b, metric = metric, num_trails = 2000)
    # Two-sided p-value: share of sampled differences at least as extreme as
    # the observed one.
    p_value = np.mean(np.abs(res) >= np.abs(diff))
    df = pd.DataFrame({'f1': res})
    fig, ax = plt.subplots()
    df.f1.plot(kind = 'hist', bins = 100, ax = ax)
    # Red lines mark +/- the observed difference.
    ax.axvline(diff, color = 'red')
    ax.axvline(-diff, color = 'red')
    ax.set_xlabel('difference in macro F1 (A - B) under random swapping')
    ax.set_ylabel('number of trials')
    ax.set_title('Randomization test: observed diff = {:.4f}, p = {:.4f}'.format(diff, p_value))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_coefficients(classifier, feature_names, top_features=20, n_text_features=None, class_index=None):
    """Plot the most negative and most positive classifier coefficients.

    Two bar charts are drawn: one restricted to the coefficients at positions
    ``>= n_text_features`` (titled 'Graph features') and one over all
    coefficients (titled 'All features'). Negative bars are red, the others
    blue; each chart is coloured from the values it actually shows.

    Args:
        classifier: fitted estimator exposing ``coef_``.
        feature_names: names for the leading coefficients, or ``None`` to skip
            tick labels. Positions beyond ``len(feature_names)`` are labelled
            ``'..'``.
        top_features: number of coefficients taken from each end of the ranking.
        n_text_features: position at which the graph coefficients start.
            ``None`` falls back to the notebook-level ``num_text_features``.
        class_index: row of ``coef_`` to plot. ``None`` flattens ``coef_``
            as a whole, which only matches the text/graph position split when
            ``coef_`` has a single row; pass an index for multi-class models.
    """
    coef = np.asarray(classifier.coef_)
    if class_index is not None:
        coef = coef[class_index]
    coef = coef.ravel()
    if n_text_features is None:
        n_text_features = num_text_features

    def _top_indices(values):
        # Positions of the `top_features` smallest, then largest, entries.
        order = np.argsort(values)
        return np.hstack([order[:top_features], order[-top_features:]])

    def _bar_colors(values):
        return ['r' if c < 0 else 'b' for c in values]

    top_coefficients = _top_indices(coef)
    # Shift back so the indices address the full coefficient vector.
    top_graph_coefs = _top_indices(coef[n_text_features:]) + n_text_features

    graph_values = coef[top_graph_coefs]
    fig_graph, ax_graph = plt.subplots(figsize=(15, 5))
    # The x-range follows the data so fewer than 2 * top_features bars also work.
    ax_graph.bar(np.arange(len(graph_values)), graph_values, color=_bar_colors(graph_values))
    ax_graph.set_title('Graph features')

    all_values = coef[top_coefficients]
    fig_all, ax_all = plt.subplots(figsize=(15, 5))
    ax_all.bar(np.arange(len(all_values)), all_values, color=_bar_colors(all_values))
    ax_all.set_title('All features')
    if feature_names is not None:
        feature_names = np.array(feature_names)
        tick_labels = [feature_names[x] if x < len(feature_names) else '..' for x in top_coefficients]
        ax_all.set_xticks(np.arange(len(tick_labels)))
        ax_all.set_xticklabels(tick_labels, rotation=60, ha='right')
    plt.show()


# Everything below is derived from the pipeline fitted in the experiment cell,
# so this cell does not depend on variables that only exist in commented-out code.
feature_union = clfs.named_steps['features']
clf_ = clfs.named_steps['classifier']

# transformer_list holds (name, transformer) pairs in the order they were
# declared: index 0 is 'tfidf', index 1 is 'fast_wl_pipeline'.
tfidf_transformer = feature_union.transformer_list[0][1].named_steps['tfidf'].named_steps['TfidfTransformer']
fast_wl_transformer = feature_union.transformer_list[1][1]

# NOTE(review): assumes the 'TfidfTransformer' step exposes `vocabulary_` and
# that every vocabulary entry becomes exactly one text feature - confirm
# against text_pipeline.get_pipeline().
num_text_features = len(tfidf_transformer.vocabulary_)
# The text block comes first in the feature union; the remaining columns are graph features.
num_graph_features = clf_.coef_.shape[1] - num_text_features

print('#text features: {}, #graph features: {}'.format(num_text_features, num_graph_features))

# NOTE(review): ravel() concatenates the per-class rows of coef_, so the
# position-based split below is only exact when coef_ has a single row.
coefs = clf_.coef_.ravel()
text_features = coefs[:num_text_features]
graph_features = coefs[num_text_features:]

# get_feature_names() was removed from scikit-learn in 1.2; prefer the
# replacement when it exists and fall back for older versions.
if hasattr(tfidf_transformer, 'get_feature_names_out'):
    text_feature_names = tfidf_transformer.get_feature_names_out()
else:
    text_feature_names = tfidf_transformer.get_feature_names()

plot_coefficients(clf_, text_feature_names, top_features=50)
#clfs.named_steps['features'].transformer_list[1][1].named_steps['feature_extraction'].named_steps['phi_picker'].shape


In [None]:
import pandas as pd
# One row per coefficient; positions below num_text_features are text
# features, the rest are graph features.
df = pd.DataFrame({'coefs': coefs})
df_graphs = df[df.index >= num_text_features]
df_texts = df[df.index < num_text_features]
df['type'] = df.index.map(lambda x: 'text' if x < num_text_features else 'graph')

# Overlay the coefficient distributions of both feature types; counts are
# shown on a log axis.
fig, ax = plt.subplots()
for type_, df_ in df.groupby('type'):
    df_.coefs.plot(kind = 'hist', logy = True, bins = 100, ax = ax, label = type_, alpha = 0.8)
ax.set_xlabel('coefficient value')
ax.set_ylabel('number of features (log scale)')
ax.set_title('Classifier coefficients by feature type')
ax.legend();
#df_graphs.plot(kind = 'hist', logy = True, bins = 100)
#df_texts.plot(kind = 'hist', logy = True, bins = 100)

In [None]:
# Per feature type: average of the positive and of the non-positive
# coefficients (despite the `_sum` names, both values are sum / count).
for type_, df_ in df.groupby('type'):
    positive = df_.coefs[df_.coefs > 0]
    non_positive = df_.coefs[df_.coefs <= 0]
    pos_sum = positive.sum() / len(positive)
    neg_sum = non_positive.sum() / len(non_positive)
    print(type_, '\t', pos_sum, neg_sum)