In [None]:
from notebook_prelude import *

In [None]:
# Print every dataset that has a concept-map variant as a bullet list.
# Each name gets its own '- ' prefix; joining with '\n- ' alone would leave
# the first entry without a bullet.
print('\n'.join('- ' + name for name in sorted(dataset_helper.get_dataset_names_with_concept_map())))
# Disabled one-off call, kept for reference:
#experiment_helper.save_experiment_params_as_experiment_config()

In [None]:
# Placeholder graph; `g` is rebound to a dataset graph further down.
g = nx.Graph()

# Load the first cached concept-map dataset, keep its graphs in `X` and report
# whether they are directed. Only one dataset is needed, hence the `break`.
for file in dataset_helper.get_all_cached_graph_datasets(graph_type=TYPE_CONCEPT_MAP):
    dataset = filename_utils.get_dataset_from_filename(file)
    X, Y = dataset_helper.get_dataset_cached(file)
    X = graph_helper.get_graphs_only(X)
    directed = X[0].is_directed()
    print('{:30} directed: {}'.format(dataset, directed))
    break
else:
    # The loop body never ran: without this, `X` would stay unbound (or stale
    # from an earlier run) and later code using `X` would fail confusingly.
    raise RuntimeError('no cached concept-map graph datasets found')

In [None]:
# Take the first graph in `X` and convert it with the project helper; the
# result is unpacked as (adj, labels) - presumably an adjacency matrix and the
# node labels, verify against graph_helper.
g = X[0]
adj, labels = graph_helper.convert_graphs_to_adjs_tuples([g], copy=True)[0]
# Disabled symmetry check, kept for reference:
#np.array_equiv(adj, adj.T), np.array_equal(adj, adj.T)
# Symmetrise by taking the element-wise maximum of the matrix and its transpose.
# NOTE(review): `adj` exposes `.todense()`, so it is presumably a scipy sparse
# matrix - confirm that np.maximum handles that type as intended here (sparse
# matrices also offer their own `.maximum` method).
adj = np.maximum(adj, adj.T)
# Last expression of the cell: show the dense form as the cell output.
adj.todense()

In [None]:
def get_prediction_data(prediction_file):
    """Load one stored prediction result file into a DataFrame.

    Args:
        prediction_file: name of the result file, passed to
            ``results_helper.get_predictions`` as the only entry of ``filenames``.

    Returns:
        ``pd.DataFrame`` built from the ``['results']['results']`` entry of the
        single matching prediction.

    Raises:
        AssertionError: if the lookup does not yield exactly one prediction.
    """
    matches = list(results_helper.get_predictions(filenames=[prediction_file]))
    # Exactly one result is expected for a single file name.
    assert len(matches) == 1
    _, payload = matches[0]
    return pd.DataFrame(payload['results']['results'])

def get_graph_prediction_data(file):
    """Load graph predictions and add per-graph size statistics.

    Args:
        file: name of the prediction result file (see ``get_prediction_data``).

    Returns:
        The prediction DataFrame with these extra columns derived from
        ``X_test``: ``X_test_undirected``, ``edge_labels``, ``node_labels``,
        ``num_nodes``, ``num_edges``, ``num_nodes_plus_edges``,
        ``num_nodes_div_num_edges``, ``num_connected_components`` and
        ``num_nodes_div_num_connected_components``.
    """
    def _edge_names(graph):
        # Collect the 'name' attribute of every edge.
        return [attrs['name'] for _, _, attrs in graph.edges(data=True)]

    frame = get_prediction_data(file)
    graphs = frame.X_test

    # Per-graph derived objects.
    frame['X_test_undirected'] = graphs.apply(nx.Graph)
    frame['edge_labels'] = graphs.apply(_edge_names)
    frame['node_labels'] = graphs.apply(nx.nodes)

    # Basic size counts and their combinations. The ratios are plain divisions,
    # so a graph with zero edges (or zero components) is not special-cased.
    frame['num_nodes'] = graphs.apply(nx.number_of_nodes)
    frame['num_edges'] = graphs.apply(nx.number_of_edges)
    frame['num_nodes_plus_edges'] = frame.num_nodes + frame.num_edges
    frame['num_nodes_div_num_edges'] = frame.num_nodes / frame.num_edges

    # Component counts are taken on the undirected version of each graph.
    undirected = frame.X_test_undirected
    frame['num_connected_components'] = undirected.apply(nx.number_connected_components)
    frame['num_nodes_div_num_connected_components'] = frame.num_nodes / frame.num_connected_components
    return frame

def get_text_prediction_data(file):
    """Load text predictions and add a whitespace-token count per document.

    Args:
        file: name of the prediction result file (see ``get_prediction_data``).

    Returns:
        The prediction DataFrame with an extra ``num_words`` column holding
        ``len(text.split())`` for every entry of ``X_test``.
    """
    def _count_words(text):
        return len(text.split())

    frame = get_prediction_data(file)
    frame['num_words'] = frame.X_test.apply(_count_words)
    return frame

# Dataset whose stored prediction results are analysed below.
DATASET = 'ng20'

# Graph (concept-map) predictions, tagged with the dataset name.
GRAPH_FILE = 'result___{}__graph__dataset_graph_concept_map_{}-v3.npy'.format(DATASET, DATASET)
df = get_graph_prediction_data(GRAPH_FILE)
df['dataset'] = DATASET

# Text predictions for the same dataset, tagged the same way.
TEXT_FILE = 'result___{}__text.npy'.format(DATASET)
df_text = get_text_prediction_data(TEXT_FILE)
df_text['dataset'] = DATASET

In [None]:
# Peek at the first prediction row only: keep its dataset name and test graph
# around for interactive inspection, then stop iterating.
for idx, df_ in df.iterrows():
    dataset, X = df_.dataset, df_.X_test
    break

In [None]:
def _distinct_tick_labels(values, max_decimals=3):
    """Format numbers with the fewest decimals that keeps the labels distinct.

    Starts with zero decimals and adds one at a time, up to ``max_decimals``,
    until no two labels are identical. Values that are already distinct when
    rounded to integers are therefore formatted exactly like ``'{:.0f}'``.
    """
    values = list(values)
    labels = []
    for decimals in range(max_decimals + 1):
        labels = ['{:.{}f}'.format(value, decimals) for value in values]
        if len(set(labels)) == len(labels):
            break
    return labels


def get_score_per_quantile(df, attr, q=10, ax = None):
    """Plot the macro F1 score per quantile bin of ``attr`` as a bar chart.

    Args:
        df: prediction DataFrame containing ``attr`` plus the ``Y_real`` and
            ``Y_pred`` columns read by ``get_score``. NOTE: a column named
            ``attr + '_bins'`` holding the quantile bin of every row is
            added to ``df`` in place.
        attr: name of the numeric column to bin.
        q: passed to ``pd.qcut``; duplicate bin edges are dropped, so fewer
            bins than requested may result.
        ax: matplotlib axes to draw on; a new 11x4 figure is created when
            omitted.

    Returns:
        The axes the bars were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(11, 4))

    bin_column = attr + '_bins'
    df[bin_column] = pd.qcut(df[attr], q=q, duplicates='drop')

    # One F1 score per bin.
    intervals, f1_scores = [], []
    for interval, group in df.groupby(bin_column):
        intervals.append(interval)
        f1_scores.append(get_score(group))

    scores = pd.DataFrame({'interval': intervals, 'f1': f1_scores}).set_index('interval')
    # Bars are positioned and labelled by the midpoint of their bin.
    scores['interval_mean'] = scores.index.map(lambda interval: interval.mid)
    scores.set_index('interval_mean').plot(kind='bar', ax=ax, legend=False)

    # Integer labels where that is unambiguous; ratio-like attributes whose
    # midpoints would all round to the same integer get extra decimals.
    ax.set_xticklabels(_distinct_tick_labels(scores.interval_mean.values))
    #ax.set_xticklabels(labels, rotation=0)
    # The axis label is fixed to 'Decile' regardless of ``q``.
    ax.set_xlabel('Decile')
    ax.set_ylabel('F1 macro')
    ax.set_title(attr)
    ax.get_figure().tight_layout()
    return ax

def get_score(df_):
    """Return the macro-averaged F1 score of a prediction frame.

    Args:
        df_: DataFrame with the true labels in ``Y_real`` and the predicted
            labels in ``Y_pred``.

    Returns:
        The result of ``sklearn.metrics.f1_score`` with ``average='macro'``.
    """
    return sklearn.metrics.f1_score(
        df_.Y_real.values,
        df_.Y_pred.values,
        average='macro',
    )

# Plot F1 per quantile bin for every size attribute and save each figure:
# first the graph statistics, then the text statistic.
for frame, name_template, attrs in [
    (
        df,
        'graph_binning_{}',
        ['num_edges', 'num_nodes', 'num_nodes_div_num_connected_components', 'num_nodes_div_num_edges', 'num_connected_components', 'num_nodes_plus_edges'],
    ),
    (
        df_text,
        'text_binning_{}',
        ['num_words'],
    ),
]:
    for attr in attrs:
        ax = get_score_per_quantile(frame, attr)
        fig = ax.get_figure()
        save_fig(fig, name_template.format(attr), folder='tmp/graph_sizes')