In [None]:
%%javascript
// Replace Notebook.prototype.scroll_to_bottom with an empty function so that
// calls to it become no-ops.
// NOTE(review): presumably this stops the notebook from jumping to the bottom
// while cells produce output — confirm.
require("notebook/js/notebook").Notebook.prototype.scroll_to_bottom = function () {}

In [None]:
from notebook_prelude import *

## Datasets statistics (Text and Graph)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Graph transformer instance; it is applied to every loaded graph dataset
# when the graph statistics are collected.
trans = NxGraphToTupleTransformer()

# Set LIMIT_DATASET to a list of dataset names to restrict the run, e.g.:
#LIMIT_DATASET = ['ng20']
#LIMIT_DATASET = ['ling-spam']
LIMIT_DATASET = None

# Collect (dataset, concept-map cache file, co-occurrence cache file) triples
# for every dataset that has at least one matching cache file of each kind.
filtered_datasets = []
for dataset in dataset_helper.get_all_available_dataset_names():
    if LIMIT_DATASET and dataset not in LIMIT_DATASET:
        continue

    # Only concept maps whose cache filename contains 'v2' or 'v3' are considered.
    concept_maps = [
        cache_file
        for cache_file in dataset_helper.get_all_cached_graph_datasets(dataset, graph_type=TYPE_CONCEPT_MAP)
        if 'v2' in cache_file or 'v3' in cache_file
    ]
    # Only co-occurrence graphs whose cache filename contains '_cooccurrence_1'.
    cooccurence_graphs = [
        cache_file
        for cache_file in dataset_helper.get_all_cached_graph_datasets(dataset, graph_type=TYPE_COOCCURRENCE)
        if '_cooccurrence_1' in cache_file
    ]

    # The first match of each kind represents the dataset.
    if concept_maps and cooccurence_graphs:
        filtered_datasets.append((dataset, concept_maps[0], cooccurence_graphs[0]))
# Column-oriented accumulator: one list per statistic, one entry per dataset.
data = collections.defaultdict(list)
for dataset, concept_map, cooccurence_graph in helper.log_progress(filtered_datasets):
    print('Dataset: {}'.format(dataset))

    X, Y = dataset_helper.get_dataset(dataset)

    # General dataset statistics
    data['dataset'].append(dataset)
    data['num_documents'].append(len(X))
    data['num_classes'].append(len(set(Y)))

    # Text statistics, based on a bag-of-words count matrix of the documents
    count_vec = CountVectorizer()
    doc_vecs = count_vec.fit_transform(X)
    all_words = set(count_vec.vocabulary_)

    # len(x) is the length of each element of X (characters, if X holds strings).
    doc_lengths = [len(x) for x in X]
    # Per-document sum over the count matrix, flattened to a plain array.
    words_per_doc = np.squeeze(np.asarray(doc_vecs.sum(axis=1)))

    data['document_lengths'].append(doc_lengths)
    data['num_words'].append(doc_vecs.sum())
    data['num_unique_words'].append(len(all_words))
    data['median_doc_length'].append(np.median(doc_lengths))
    data['median_words_per_doc'].append(np.median(words_per_doc))

    # Graph statistics, once per graph type
    graph_sources = (
        (TYPE_CONCEPT_MAP, concept_map),
        (TYPE_COOCCURRENCE, cooccurence_graph),
    )
    for graph_type, graph_cache_file in graph_sources:
        X_graph, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        X_graph = graph_helper.get_graphs_only(X_graph)
        # NOTE(review): the return value is discarded, so the code below relies
        # on transform() converting X_graph in place — confirm.
        trans.transform(X_graph)

        # Each element of X_graph is unpacked as (adjacency, node labels).
        all_labels = [label for _, graph_labels in X_graph for label in graph_labels]
        # Edges are counted as non-zero entries of each adjacency structure.
        num_edges = sum(len(adj.nonzero()[0]) for adj, _ in X_graph)

        data['num_nodes_{}'.format(graph_type)].append(len(all_labels))
        data['num_unique_nodes_labels_{}'.format(graph_type)].append(len(set(all_labels)))
        data['num_edges_{}'.format(graph_type)].append(num_edges)

## Graph statistics, e.g. ratio_edges_to_nodes

In [None]:

# One row per dataset, built from the statistics collected above.
df = pd.DataFrame(data).set_index('dataset')

# Per graph type: edges per node, and nodes per word of the underlying text.
for graph_type in [TYPE_CONCEPT_MAP, TYPE_COOCCURRENCE]:
    num_nodes = df['num_nodes_{}'.format(graph_type)]
    df['ratio_edges_to_nodes_{}'.format(graph_type)] = df['num_edges_{}'.format(graph_type)] / num_nodes
    df['ratio_nodes_to_words_{}'.format(graph_type)] = num_nodes / df.num_words

# Unique vocabulary size relative to the total node count of the co-occurrence
# graphs (this column was previously assigned twice with the same expression).
df['ratio_unique_words'] = df.num_unique_words / df.num_nodes_cooccurrence
# How much denser (edges per node) concept maps are than co-occurrence graphs.
df['ratio_ratio_ratio_edges_to_nodes_cmap_to_coo'] = df['ratio_edges_to_nodes_concept_map'] / df['ratio_edges_to_nodes_cooccurrence']

# Medians are stored as integers; astype(int) truncates any fractional part.
df = df.astype({col: int for col in ['median_doc_length', 'median_words_per_doc']})

# Alphabetical column order keeps related statistics next to each other.
df = df[sorted(df.columns)]
df

### Nodes per graph type

In [None]:
# Average number of nodes per graph: total node count divided by the number of
# documents. The original divided the *edge* count here, which contradicts the
# column name and the '#nodes/graph' table label.
# NOTE(review): assumes one graph per document for every graph type — confirm.
for t in GRAPH_TYPES:
    df['nodes_per_graph_{}'.format(t)] = df['num_nodes_{}'.format(t)] / df.num_documents
df

### Create LaTeX tables for text and graphs

In [None]:
# Maps DataFrame column names to the column headers used in the LaTeX tables.
# One table is printed per top-level key; columns appear in the order listed.
RENAME_COLS = {
    'dataset': {
        'num_classes': '# classes', 
        'num_documents': '# docs', 
        'median_words_per_doc': 'median #words/doc',
        # NOTE(review): 'ratio_unique_words' is computed as
        # num_unique_words / num_nodes_cooccurrence, not / num_words — confirm
        # that this header describes the intended quantity.
        'ratio_unique_words': '#uniq. words/#words' 
    },
    'graphs': {
        'ratio_nodes_to_words_concept_map': '#nodes/#words cmap',
        'ratio_nodes_to_words_cooccurrence': '#nodes/#words coo',
        'ratio_edges_to_nodes_concept_map': '#edges/#nodes cmap', 
        'ratio_edges_to_nodes_cooccurrence': '#edges/#nodes coo', 
        # The 'coo' / 'cmap' headers of the next two entries were swapped.
        'nodes_per_graph_cooccurrence': '#nodes/graph coo',
        'nodes_per_graph_concept_map': '#nodes/graph cmap',
    }
}

# Append a 'mean' row in place. numeric_only=True skips non-numeric columns
# such as the list-valued 'document_lengths' (they get NaN in the new row);
# without it, recent pandas versions raise a TypeError on such columns.
# Re-running this cell folds the existing 'mean' row into the new mean.
df.loc['mean'] = df.mean(numeric_only=True)

for rename_cols in RENAME_COLS.values():
    df_stats = df[list(rename_cols.keys())].rename(columns = rename_cols)
    print(df_stats.to_latex(float_format = '%.2f'))

In [None]:
# Persist the statistics frame. The file is a pickle despite its '.npy'
# extension, and the 'tmp' directory must already exist.
with open('tmp/dataset_statistics.npy', 'wb') as out_file:
    out_file.write(pickle.dumps(df))

In [None]:
# Restore the statistics frame written by the cell above, replacing `df`.
# Unpickling can execute arbitrary code: only load files you created yourself.
with open('tmp/dataset_statistics.npy', 'rb') as in_file:
    df = pickle.loads(in_file.read())

## Co-Occurrence graph statistics

### All-words vs. only-nouns compression

In [None]:
# Only co-occurrence graphs built with this window size are loaded.
WINDOW_SIZE = 1
# dataset name -> {word selection parsed from the filename -> node labels}
datasets_coo = collections.defaultdict(dict)

cooccurrence_graph_files = [
    cache_file
    for cache_file in dataset_helper.get_all_cached_graph_datasets()
    if 'cooccurrence' in cache_file
]

for graph_cf in cooccurrence_graph_files:
    dataset = filename_utils.get_dataset_from_filename(graph_cf)
    window_size = filename_utils.get_cooccurrence_window_size_from_filename(graph_cf)
    words = filename_utils.get_cooccurrence_words_from_filename(graph_cf)

    # Skip other window sizes and (dataset, words) combinations already loaded;
    # .get() avoids creating an empty defaultdict entry for skipped datasets.
    if window_size != WINDOW_SIZE or words in datasets_coo.get(dataset, {}):
        continue

    print(graph_cf)
    X, Y = dataset_helper.get_dataset_cached(graph_cf)
    all_labels = graph_helper.get_all_node_labels(X)
    datasets_coo[dataset][words] = all_labels

In [None]:
# Long-format rows: (dataset, word selection, number of node labels).
data = collections.defaultdict(list)
for dataset, words in datasets_coo.items():
    for word, labels in words.items():
        # Turn the word selection into an identifier-like column name:
        # '-' becomes '_' and 'all' becomes 'all_words'.
        column_name = word.replace('-', '_').replace('all', 'all_words')
        data['dataset'].append(dataset)
        data['word'].append(column_name)
        data['label_count'].append(len(labels))

# Wide format: one row per dataset, one column per word selection.
df = pd.DataFrame(data).pivot(index='dataset', columns='word', values='label_count')
# Share of labels that remain when only nouns are kept.
df['word_ratio'] = df.only_nouns / df.all_words
df

## Concept map

### Recurring concepts

In [None]:
# dataset name -> {graph type -> list with one label collection per graph}
all_labels = collections.defaultdict(dict)

# Set to a dataset name to restrict loading to that dataset, e.g.:
#LIMIT_TO_DATASET = 'ling-spam'
LIMIT_TO_DATASET = None

# All graph types are loaded; get_all_cached_graph_datasets also accepts
# graph_type=TYPE_CONCEPT_MAP to restrict the listing to concept maps.
for gcf in dataset_helper.get_all_cached_graph_datasets(LIMIT_TO_DATASET):
    dataset = filename_utils.get_dataset_from_filename(gcf)
    graph_type = graph_helper.get_graph_type_from_filename(gcf)

    # For concept maps, only cache files containing 'v2' are used.
    if graph_type == TYPE_CONCEPT_MAP and 'v2' not in gcf:
        continue
    # Keep only the first cache file seen per (dataset, graph type).
    if graph_type in all_labels[dataset]:
        continue

    print('{:30} {}'.format(dataset, graph_type))
    X, Y = dataset_helper.get_dataset_cached(gcf)
    X = graph_helper.convert_graphs_to_adjs_tuples(X, copy = True)
    # Each converted graph is unpacked as (adjacency, labels); keep the labels.
    all_labels[dataset][graph_type] = [labels for _, labels in X]

In [None]:
# Long-format table with one row per (dataset, graph type, distinct label),
# holding how often that label occurs over all graphs of that dataset/type.
data = collections.defaultdict(list)
for dataset, graph_types in all_labels.items():
    for graph_type, labels in graph_types.items():
        # `labels` has one entry per graph; count labels across all graphs.
        c = collections.Counter(helper.flatten_array(labels))
        num_distinct_labels = len(c)
        data['dataset'].extend([dataset] * num_distinct_labels)
        data['label'].extend(c.keys())
        data['occurrences'].extend(c.values())
        # Number of graphs the counts were taken from, repeated per row.
        data['num_docs'].extend([len(labels)] * num_distinct_labels)
        data['type'].extend([graph_type] * num_distinct_labels)
df_occs = pd.DataFrame(data)

In [None]:
# Number of labels that occur at most once, per dataset and graph type.
occurring_once = df_occs[df_occs.occurrences <= 1]
once_counts = occurring_once.groupby(['dataset', 'type']).occurrences.value_counts()

# Total label occurrences per dataset.
# NOTE(review): this denominator is grouped by dataset only, so it sums over
# all graph types of a dataset — confirm that this is intended rather than a
# per-(dataset, type) total.
total_occurrences = df_occs.groupby('dataset').occurrences.sum()

# Ratio per (dataset, type), reshaped to one column per graph type.
ratio_once = (once_counts / total_occurrences).groupby(['dataset', 'type']).sum()
df_occ_only_once = ratio_once.unstack()#.plot(kind = 'barh')

# Keep only datasets that have a value in the 'concept-map' column.
df_occ_only_once = df_occ_only_once[df_occ_only_once['concept-map'].notna()]
print(df_occ_only_once.to_latex())
df_occ_only_once

In [None]:
# Overlaid histograms (one per dataset) of how often each label occurs.
fig, ax = plt.subplots()
# Labels occurring `last` times or more are cut off to keep the x-range readable.
last = 200#df_occs.occurrences.quantile(q=0.98)
# Plot each dataset separately so that its legend entry carries the dataset
# name; previously every histogram was labelled with the literal 'dataset'.
for dataset_name, df_dataset in df_occs[df_occs.occurrences < last].groupby('dataset'):
    df_dataset.occurrences.hist(bins = 120, ax = ax, alpha = 0.2, label = dataset_name)
ax.set_yscale('log')
ax.set_xlabel('occurrences per label')
ax.set_ylabel('number of labels')
ax.legend();