In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Figure export settings shared by the plotting cells.
EXPORT_DPI = 100

# Width/height (as passed to matplotlib's figsize) for the regular and the big layout.
EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT = 8, 4
EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG = 10, 7

# The same dimensions packed as (width, height) tuples.
EXPORT_FIG_SIZE = (EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT)
EXPORT_FIG_SIZE_BIG = (EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG)

import pandas as pd

# Widen pandas' notebook display limits so large frames are not truncated.
pd.options.display.max_rows = 80
pd.options.display.max_columns = 999
# Do not truncate cell contents. Newer pandas releases expect ``None`` for
# "unlimited" (``-1`` is deprecated/rejected there), while older releases only
# accept an integer and use ``-1`` for the same purpose.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1

import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide plot styling: seaborn "notebook" context on a white grid with the
# "deep" palette, and the big export size as the default figure size.
sns.set(context='notebook', style='whitegrid', palette='deep')
plt.rc('figure', figsize=EXPORT_FIG_SIZE_BIG)

In [None]:
import networkx as nx
import numpy as np
import sklearn
from sklearn import metrics
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import svm

from transformers import fast_wl_pipeline, text_pipeline
from transformers.tuple_selector import TupleSelector
from utils import graph_helper, dataset_helper


def get_classifier():
    """Assemble the combined text + graph pipeline and its parameter-grid template.

    The feature stage is a ``FeatureUnion`` of two equally weighted branches:

    * ``'tfidf'``: selects element 1 of every input tuple and feeds it to
      ``text_pipeline.get_pipeline()``.
    * ``'fast_wl_pipeline'``: selects element 0 of every input tuple (without
      v-stacking), feeds it to ``fast_wl_pipeline.get_pipeline()`` and rescales
      the result with ``MaxAbsScaler``.

    Returns:
        tuple: ``(pipeline, grid_params_combined)``. ``pipeline`` is a
        ``sklearn.pipeline.Pipeline`` with the steps ``'features'`` and
        ``'classifier'`` (the latter is ``None``). ``grid_params_combined`` maps
        parameter names to candidate lists; the ``'classifier'`` entry and the
        fast-WL ``phi_dim`` entry are empty lists that the caller has to fill.
    """
    extraction_prefix = 'features__fast_wl_pipeline__feature_extraction__'

    fast_wl_settings = {
        'fast_wl__h': [1],
        'fast_wl__phi_dim': [None],
        'fast_wl__round_to_decimals': [10],
        'phi_picker__return_iteration': ['stacked'],
        'phi_picker__use_zeroth': [False]
    }

    # Prefix every fast-WL setting so it addresses the nested pipeline step.
    grid_params_combined = {'classifier': []}
    for setting_name, candidates in fast_wl_settings.items():
        grid_params_combined[extraction_prefix + setting_name] = candidates
    # phi_dim is deliberately reset to an empty list here.
    grid_params_combined[extraction_prefix + 'fast_wl__phi_dim'] = []

    text_branch = sklearn.pipeline.Pipeline([
        ('selector', TupleSelector(tuple_index=1)),
        ('tfidf', text_pipeline.get_pipeline()),
    ])
    graph_branch = sklearn.pipeline.Pipeline([
        ('selector', TupleSelector(tuple_index=0, v_stack=False)),
        ('feature_extraction', fast_wl_pipeline.get_pipeline()),
        ('scaler', sklearn.preprocessing.MaxAbsScaler())
    ])

    combined_features = sklearn.pipeline.FeatureUnion(
        [
            ('tfidf', text_branch),
            ('fast_wl_pipeline', graph_branch)
        ],
        transformer_weights={'tfidf': 1, 'fast_wl_pipeline': 1}
    )

    combined_pipeline = sklearn.pipeline.Pipeline([
        ('features', combined_features),
        ('classifier', None)
    ])

    return combined_pipeline, grid_params_combined

# Stratified 3-fold cross-validation; shuffling is seeded so the folds are reproducible.
cv = sklearn.model_selection.StratifiedKFold(
    n_splits=3,
    random_state=42,
    shuffle=True
)

for graph_cache_file in dataset_helper.get_all_cached_graph_datasets('ng20'):
    # Only consider cache files whose name contains both 'concept' and 'v2'.
    if 'concept' not in graph_cache_file or 'v2' not in graph_cache_file:
        continue
    print(graph_cache_file)
    X_combined, Y_combined = graph_helper.get_filtered_text_graph_dataset(graph_cache_file)

    graphs = [g for (g, _, _) in X_combined]
    empty_graphs = sum(1 for g in graphs if nx.number_of_nodes(g) == 0 or nx.number_of_edges(g) == 0)
    # Total node count over all graphs plus one per empty graph; used as phi_dim below.
    # NOTE(review): the "+ empty_graphs" presumably reserves a slot for a placeholder
    # node that is substituted for empty graphs downstream - confirm.
    num_vertices = sum(nx.number_of_nodes(g) for g in graphs) + empty_graphs
    # NOTE(review): the return value is discarded and X_combined is rebuilt from the
    # original items below, so this call only matters if it mutates the graph
    # objects themselves - confirm.
    fast_wl_pipeline.convert_graphs_to_tuples(graphs)
    # Drop the third tuple element; the pipeline selects graph (index 0) and text (index 1).
    X_combined = [(graph, text) for (graph, text, _) in X_combined]

    clfs, params = get_classifier()
    #clf = sklearn.linear_model.PassiveAggressiveClassifier(class_weight = 'balanced', max_iter = 10000, verbose = 1, tol = 1e-6)
    clf = sklearn.svm.LinearSVC(class_weight = 'balanced', max_iter = 10000, verbose = 1, tol = 1e-6)
    params['classifier'] = [clf]
    params['features__fast_wl_pipeline__feature_extraction__fast_wl__phi_dim'] = [num_vertices]
    grid = sklearn.model_selection.ParameterGrid(params)
    # Every combination is applied in turn, so the pipeline ends up configured with
    # the last one. With the single-valued lists used here the grid has one entry.
    for params_ in grid:
        # Report the combination that is actually being applied (not the grid definition).
        print('Fitting: {}'.format(params_))
        clfs.set_params(**params_)

    # Convert once instead of rebuilding the arrays for every fold.
    X_array, Y_array = np.array(X_combined), np.array(Y_combined)

    scores = []
    for train, test in cv.split(X_combined, Y_combined):
        X_train, Y_train = X_array[train], Y_array[train]
        X_test, Y_test = X_array[test], Y_array[test]

        clfs.fit(X_train, Y_train)
        clf_ = clfs.named_steps['classifier']

        Y_pred = clfs.predict(X_test)
        f1 = sklearn.metrics.f1_score(y_true=Y_test, y_pred=Y_pred, average='macro')
        scores.append(f1)
        print(f1)
        # Keep the coefficients of the most recently fitted fold for the cells below.
        coefs = clf_.coef_
    print('mean', np.array(scores).mean())
    # The text branch comes first in the FeatureUnion, so its vocabulary size is the
    # number of leading text columns; the remaining columns are graph features.
    num_text_features = len(clfs.named_steps['features'].transformer_list[0][1].named_steps['tfidf'].named_steps['TfidfTransformer'].vocabulary_)
    num_graph_features = coefs.shape[1] - num_text_features
    # Stop after the first matching dataset; the following cells inspect this fit.
    break


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_coefficients(classifier, feature_names, top_features=20, text_feature_count=None):
    """Plot the most negative and most positive coefficients of a linear model.

    Two bar charts are drawn: one restricted to the graph features (all columns
    from ``text_feature_count`` onwards) and one over all features. Negative bars
    are red, non-negative bars are blue, and each chart is coloured from its own
    values.

    Args:
        classifier: Fitted estimator exposing ``coef_``.
        feature_names: Sequence of names used as x tick labels of the
            "All features" chart, or ``None`` for no labels. Indices beyond the
            end of this sequence are labelled ``'..'``.
        top_features: Number of coefficients shown at each end of the ranking.
        text_feature_count: Number of leading text-feature columns. Defaults to
            the notebook-level ``num_text_features`` when omitted.
    """
    if text_feature_count is None:
        # Backward-compatible fallback to the notebook-level global.
        text_feature_count = num_text_features

    # NOTE(review): ravel() concatenates all rows of coef_. If coef_ has more than
    # one row (one per class), indices past the first row no longer map onto single
    # feature columns - confirm this is intended.
    coef = classifier.coef_.ravel()

    ranking = np.argsort(coef)
    top_coefficients = np.hstack([ranking[:top_features], ranking[-top_features:]])

    # Rank the graph part separately, then shift back to positions in ``coef``.
    graph_ranking = np.argsort(coef[text_feature_count:])
    top_graph_coefs = np.hstack([graph_ranking[:top_features], graph_ranking[-top_features:]])

    bar_positions = np.arange(2 * top_features)

    graph_values = coef[top_graph_coefs + text_feature_count]
    plt.figure(figsize=(15, 5))
    plt.bar(bar_positions, graph_values, color=['r' if c < 0 else 'b' for c in graph_values])
    plt.title('Graph features')

    all_values = coef[top_coefficients]
    plt.figure(figsize=(15, 5))
    plt.bar(bar_positions, all_values, color=['r' if c < 0 else 'b' for c in all_values])
    plt.title('All features')
    if feature_names is not None:
        feature_names = np.array(feature_names)
        tick_labels = [feature_names[x] if x < len(feature_names) else '..' for x in top_coefficients]
        plt.xticks(bar_positions, tick_labels, rotation=60, ha='right')
    plt.show()


print('#text features: {}, #graph features: {}'.format(num_text_features, num_graph_features))

# Flattened coefficients of the last fitted classifier, split at the text/graph boundary.
coefs = clf_.coef_.ravel()
text_features = coefs[:num_text_features]
graph_features = coefs[num_text_features:]

# (name, transformer) pairs of the FeatureUnion: index 0 is the tf-idf branch,
# index 1 the fast-WL branch (see get_classifier).
union_branches = clfs.named_steps['features'].transformer_list
tfidf_transformer = union_branches[0][1].named_steps['tfidf'].named_steps['TfidfTransformer']
fast_wl_transformer = union_branches[1][1]

# Newer scikit-learn releases replace get_feature_names() with get_feature_names_out().
if hasattr(tfidf_transformer, 'get_feature_names_out'):
    text_feature_names = list(tfidf_transformer.get_feature_names_out())
else:
    text_feature_names = tfidf_transformer.get_feature_names()

plot_coefficients(clf_, text_feature_names, top_features=50)
#clfs.named_steps['features'].transformer_list[1][1].named_steps['feature_extraction'].named_steps['phi_picker'].shape


In [None]:
import pandas as pd
# One row per coefficient; rows below num_text_features are text features, the rest graph features.
df = pd.DataFrame({'coefs': coefs})
is_text_row = df.index < num_text_features
df_graphs = df[~is_text_row]
df_texts = df[is_text_row]
df['type'] = ['text' if position < num_text_features else 'graph' for position in df.index]

# Overlay the coefficient histograms of both feature types on a log-scaled count axis.
fig, ax = plt.subplots()
for type_, df_ in df.groupby('type'):
    df_['coefs'].plot.hist(logy=True, bins=100, ax=ax, label=type_, alpha=0.8)
ax.legend()
#df_graphs.plot(kind = 'hist', logy = True, bins = 100)
#df_texts.plot(kind = 'hist', logy = True, bins = 100)

In [None]:
# Per feature type, print the average of the strictly positive and of the
# non-positive coefficients. The variables keep their historical "*_sum" names
# although they hold means.
for type_, df_ in df.groupby('type'):
    # Series.mean() gives NaN for an empty selection instead of dividing by zero.
    pos_sum = df_.coefs[df_.coefs > 0].mean()
    neg_sum = df_.coefs[df_.coefs <= 0].mean()
    print(type_,'\t', pos_sum, neg_sum)