In [None]:
from notebook_prelude import *

In [None]:
def get_prediction_data(prediction_file):
    """Load the stored results of a single prediction file into a DataFrame.

    Args:
        prediction_file: File name handed to ``results_helper.get_predictions``
            as the only entry of its ``filenames`` filter.

    Returns:
        ``pd.DataFrame`` built from the ``'results'`` entry of the one
        matching prediction.

    Raises:
        AssertionError: If the lookup does not yield exactly one prediction.
    """
    matches = list(results_helper.get_predictions(filenames=[prediction_file]))
    assert len(matches) == 1, 'Could not find prediction file: {}'.format(prediction_file)
    # Each match is a (filename, payload) pair; only the payload is needed.
    _matched_name, payload = matches[0]
    return pd.DataFrame(payload['results'])

def get_graph_prediction_data(file):
    """Load graph predictions and attach per-graph size/structure columns.

    Every entry of ``X_test`` is passed to networkx helpers and must expose
    ``edges(data=True)`` with a ``'name'`` attribute on each edge.

    Added columns, in order: ``X_test_undirected``, ``edge_labels``,
    ``node_labels``, ``num_nodes``, ``num_edges``, ``num_nodes_plus_edges``,
    ``num_nodes_div_num_edges``, ``num_connected_components``,
    ``num_nodes_div_num_connected_components`` and ``accuracy`` (1 when
    ``Y_pred == Y_real`` for the row, else 0).

    Args:
        file: Prediction file name, forwarded to ``get_prediction_data``.

    Returns:
        The loaded DataFrame with the columns above added.
    """
    def _edge_names(graph):
        return [attrs['name'] for _, _, attrs in graph.edges(data=True)]

    def _is_correct(row):
        if row.Y_pred == row.Y_real:
            return 1
        return 0

    frame = get_prediction_data(file)
    graphs = frame['X_test']

    # Connected components are counted on the undirected view built here.
    frame['X_test_undirected'] = graphs.apply(nx.Graph)
    frame['edge_labels'] = graphs.apply(_edge_names)
    frame['node_labels'] = graphs.apply(nx.nodes)

    frame['num_nodes'] = graphs.apply(nx.number_of_nodes)
    frame['num_edges'] = graphs.apply(nx.number_of_edges)
    frame['num_nodes_plus_edges'] = frame['num_nodes'] + frame['num_edges']
    frame['num_nodes_div_num_edges'] = frame['num_nodes'] / frame['num_edges']

    frame['num_connected_components'] = frame['X_test_undirected'].apply(nx.number_connected_components)
    frame['num_nodes_div_num_connected_components'] = frame['num_nodes'] / frame['num_connected_components']

    frame['accuracy'] = frame.apply(_is_correct, axis=1)
    return frame

def get_text_prediction_data(file):
    """Load text predictions and attach length features plus per-row accuracy.

    Added columns:
        ``num_words``: number of whitespace-separated tokens in ``X_test``.
        ``doc_length``: ``len`` of the ``X_test`` entry.
        ``accuracy``: 1 when ``Y_pred == Y_real`` for the row, else 0.

    Args:
        file: Prediction file name, forwarded to ``get_prediction_data``.

    Returns:
        The loaded DataFrame with the columns above added.
    """
    df_text = get_prediction_data(file)
    df_text['num_words'] = df_text.X_test.apply(lambda x: len(x.split()))
    df_text['doc_length'] = df_text.X_test.apply(len)
    # Score the text frame's own rows. This used to read a global `df` (the
    # graph frame in this notebook), so the column silently held the graph
    # model's correctness, or raised NameError on a fresh kernel.
    df_text['accuracy'] = df_text.apply(lambda x: 1 if x.Y_pred == x.Y_real else 0, axis=1)
    return df_text

def get_corr(df, columns, method='spearman'):
    """Return the Spearman and Pearson correlation matrices of ``df[columns]``.

    Rows and columns that are entirely NaN are removed from each matrix
    (columns first, then rows).

    Args:
        df: Source DataFrame.
        columns: Column names to correlate.
        method: Accepted but not used; both matrices are always computed.

    Returns:
        Tuple ``(spearman_matrix, pearson_matrix)``.
    """
    selected = df[columns]

    def _trimmed(kind):
        full = selected.corr(method=kind)
        without_empty_columns = full.dropna(axis=1, how='all')
        return without_empty_columns.dropna(axis=0, how='all')

    return _trimmed('spearman'), _trimmed('pearson')


# For every concept-map dataset, correlate input size with per-sample accuracy
# for the graph run (num_nodes) and the text run (num_words), using both
# Spearman and Pearson. `data` ends up with one list entry per dataset and key.
data = collections.defaultdict(list)
for DATASET in dataset_helper.get_dataset_names_with_concept_map():
    GRAPH_FILE = 'result___experiment_graphs_plain__{}__graph__dataset_graph_concept_map_{}-v3.npy'.format(DATASET, DATASET)
    TEXT_FILE = 'result___experiment_text_plain__{}__text__text_{}.npy'.format(DATASET, DATASET)

    # `df` and `df_text` stay bound to the last dataset after the loop ends;
    # the heat-map cells further down read `df`.
    df = get_graph_prediction_data(GRAPH_FILE)
    df['dataset'] = DATASET
    df_text = get_text_prediction_data(TEXT_FILE)
    df_text['dataset'] = DATASET

    mat, mat_pearson = get_corr(df, columns=['num_nodes', 'num_connected_components', 'accuracy'])
    mat_text, mat_text_pearson = get_corr(df_text, columns=['num_words', 'accuracy'])

    # Look up every value first, then append, so a failed lookup cannot leave
    # the per-key lists with different lengths.
    row = [
        ('dataset', DATASET),
        ('correlation_graph', mat.loc['num_nodes', 'accuracy']),
        ('correlation_text', mat_text.loc['num_words', 'accuracy']),
        ('correlation_graph_pearson', mat_pearson.loc['num_nodes', 'accuracy']),
        ('correlation_text_pearson', mat_text_pearson.loc['num_words', 'accuracy']),
    ]
    for key, value in row:
        data[key].append(value)

In [None]:
# One row per dataset, one column per correlation value held in `data`.
df_corr = pd.DataFrame(data)
df_corr = df_corr.set_index('dataset')
# The LaTeX source goes to stdout; the bare expression below is the cell output.
print(df_corr.to_latex())
df_corr

In [None]:
CMAP = 'inferno_r'

# Spearman correlation heat map for whatever graph frame `df` currently holds
# (after the loop above, that is the last dataset processed).
columns = ['num_nodes', 'num_connected_components', 'accuracy']
mat = df[columns].corr(method='spearman').dropna(axis=1, how='all').dropna(axis=0, how='all')
fig, ax = plt.subplots(figsize=(5, 5))
im = ax.matshow(mat, cmap=plt.get_cmap(CMAP))

# Derive tick labels from the columns that survived dropna, so labels and
# matrix cells cannot drift apart when an all-NaN row/column is removed.
pretty_names = {
    'num_nodes': '# nodes',
    'num_connected_components': '# c. components',
    'accuracy': 'accuracy',
}
label_lookup = [pretty_names.get(name, name) for name in mat.columns]

tick_positions = list(range(len(label_lookup)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(label_lookup, rotation=0)
ax.set_yticks(tick_positions)
ax.set_yticklabels(label_lookup, rotation=0)
# Pass a real boolean: the string 'off' is truthy and switches the grid ON in
# current matplotlib releases.
ax.grid(False)
fig.colorbar(im, fraction=0.0456, pad=0.04)
fig.tight_layout()

In [None]:
# Show the Spearman matrix behind the heat map as a table.
# The last column name was misspelled ('acurracy'), which raised KeyError; the
# column created by get_graph_prediction_data is 'accuracy'.
columns = ['num_nodes', 'num_connected_components', 'accuracy']
mat = df[columns].corr(method='spearman').dropna(axis=1, how='all').dropna(axis=0, how='all')
mat

In [None]:
def plot_score_per_quantile(df_, attr, q=10, title=None, ax = None):
    """Bar-plot the per-bin scores of a frame that has an ``interval_mean`` column.

    Args:
        df_: Frame with one row per bin. Its ``interval_mean`` column supplies
            the x tick labels; every other column is drawn as bars.
        attr: Name of the binned attribute; used as the title when ``title``
            is None.
        q: Accepted for signature compatibility; not used here.
        title: Optional plot title.
        ax: Axes to draw on; a new 11x4 figure is created when None.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize = (11, 4))
    # interval_mean only labels the bars. Plotting the whole frame drew it as a
    # second bar series on the 'F1 macro' axis, so it is excluded here.
    score_columns = [name for name in df_.columns if name != 'interval_mean']
    df_[score_columns].plot(kind='bar', ax = ax, legend=False)
    labels = df_.interval_mean.values
    ax.set_xticklabels(['{:.0f}'.format(x) for x in labels])
    ax.set_xlabel('Decile')
    ax.set_ylabel('F1 macro')
    ax.set_title(attr if title is None else title)
    ax.get_figure().tight_layout()
    return ax

def get_score_per_quantile(df, attr, q=10, title=None, ax = None):
    """Score the rows of ``df`` separately for each quantile bin of ``attr``.

    Side effect: a column named ``attr + '_bins'`` holding the ``pd.qcut``
    bins (duplicate edges dropped) is written into ``df``.

    Args:
        df: Frame containing ``attr`` and whatever ``get_score`` reads.
        attr: Column to bin.
        q: Passed to ``pd.qcut`` as the number of quantiles.
        title: Accepted but not used.
        ax: Accepted but not used.

    Returns:
        DataFrame indexed by bin (index name ``'interval'``) with columns
        ``'f1'`` (the ``get_score`` result for that bin) and
        ``'interval_mean'`` (the ``.mid`` of the bin).
    """
    bin_column = attr + '_bins'
    df[bin_column] = pd.qcut(df[attr], q=q, duplicates='drop')

    collected = {}
    for bin_interval, bin_rows in df.groupby(bin_column):
        collected.setdefault('interval', []).append(bin_interval)
        collected.setdefault('f1', []).append(get_score(bin_rows))

    per_bin = pd.DataFrame(collected).set_index('interval')
    per_bin['interval_mean'] = per_bin.index.map(lambda bin_: bin_.mid)
    return per_bin

def get_score(df_):
    """Return the macro-averaged F1 of ``Y_pred`` against ``Y_real`` in ``df_``.

    Args:
        df_: Frame with ``Y_real`` and ``Y_pred`` columns.

    Returns:
        Whatever ``sklearn.metrics.f1_score(..., average='macro')`` returns
        for the two columns' underlying arrays.
    """
    truth = df_['Y_real'].values
    predicted = df_['Y_pred'].values
    return sklearn.metrics.f1_score(truth, predicted, average='macro')

# Graph attributes that can be binned. NOTE: this full list is overridden by
# the assignment on the next line, so only 'num_nodes' is actually used.
ATTRS = ['num_edges', 'num_nodes', 'num_nodes_div_num_connected_components', 'num_nodes_div_num_edges', 'num_connected_components', 'num_nodes_plus_edges']
ATTRS = ['num_nodes']

# Text attributes to bin for the text predictions.
TEXT_ATTRS = ['num_words']

# Number of quantile bins, passed as `q` to get_score_per_quantile below.
Q = 4

def print_latex(df, float_format='%.3f'):
    """Return ``df`` rendered as LaTeX with ``N.999`` opening bin edges shown as ``N``.

    Despite the name, nothing is printed; the caller prints the result.

    The clean-up targets interval labels such as ``(0.999, 4.5]`` (presumably
    the lowest ``pd.qcut`` edge, which is nudged just below the minimum) and
    turns them into ``(0, 4.5]``. Only a ``.999`` that directly follows an
    opening parenthesis and an integer is removed, so score cells that happen
    to format as ``0.999`` are left intact. The previous blanket
    ``replace('.999', '')`` turned such a score into ``0``.

    Args:
        df: Object providing ``to_latex(float_format=...)``.
        float_format: Format string forwarded to ``to_latex``.

    Returns:
        The LaTeX source as a string.
    """
    import re

    text = df.to_latex(float_format=float_format)
    return re.sub(r'\((-?\d+)\.999,', r'(\1,', text)

# For every concept-map dataset, compute the macro-F1 per quantile bin of each
# graph attribute in ATTRS and each text attribute in TEXT_ATTRS. The bin
# labels and scores are stored in `data`, and a LaTeX row is printed per
# attribute.
data = collections.defaultdict(list)
for DATASET in dataset_helper.get_dataset_names_with_concept_map():
    GRAPH_FILE = 'result___experiment_graphs_plain__{}__graph__dataset_graph_concept_map_{}-v3.npy'.format(DATASET, DATASET)
    TEXT_FILE = 'result___experiment_text_plain__{}__text__text_{}.npy'.format(DATASET, DATASET)

    df = get_graph_prediction_data(GRAPH_FILE)
    df['dataset'] = DATASET
    df_text = get_text_prediction_data(TEXT_FILE)
    df_text['dataset'] = DATASET

    print(DATASET)
    data['dataset'].append(DATASET)

    # Graph side: the printed row has every backslash stripped.
    for attr in ATTRS:
        df_ = get_score_per_quantile(df, attr, q=Q).T.loc['f1']
        data['graph_bins'].append(df_.index.values)
        data['graph_vals'].append(df_.values)
        graph_latex = print_latex(df_.to_frame().T)
        print(graph_latex.replace('\\', ''))

    # Text side: the LaTeX is printed as returned.
    for attr in TEXT_ATTRS:
        df_ = get_score_per_quantile(df_text, attr, q=Q).T.loc['f1']
        data['text_bins'].append(df_.index.values)
        data['text_vals'].append(df_.values)
        text_latex = print_latex(df_.to_frame().T)
        print(text_latex)

In [None]:
# Tabulate the per-dataset quantile bins and scores held in `data`, indexed by dataset.
pd.DataFrame(data).set_index('dataset')

In [None]:
'
'