# Analysis of hit-graph datasets

The purpose of this notebook is to analyze the preprocessed hit-graph datasets, looking at how the number of nodes and edges per graph is distributed in each dataset.

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
def get_sample_size(filename):
    """Report how many nodes and edges one saved hit-graph sample holds.

    The file is opened with ``np.load``. The node count is the length of the
    first axis of the array stored under 'X'; the edge count is the length of
    the first axis of the array stored under 'y'.

    Returns a ``(n_nodes, n_edges)`` tuple.
    """
    with np.load(filename) as sample:
        # Both arrays are read while the archive is still open.
        node_count, edge_count = (sample[key].shape[0] for key in ('X', 'y'))
    return node_count, edge_count

def process_dataset(dir, n_files=None):
    """Collect the node and edge counts of the samples stored in a directory.

    Parameters
    ----------
    dir : str
        Directory whose entries are each passed to ``get_sample_size``.
        (The name shadows the ``dir`` builtin; it is kept so that existing
        keyword callers continue to work.)
    n_files : int, optional
        If given, only the first ``n_files`` entries, in sorted name order,
        are processed. ``None`` processes every entry.

    Returns
    -------
    n_nodes, n_edges : numpy.ndarray
        1-D integer arrays with one element per processed file.
    """
    # Read from the ``dir`` argument rather than a notebook-level ``data_dir``
    # global, so the function works for whatever directory the caller passes.
    # os.listdir returns entries in arbitrary order; sorting makes the
    # ``n_files`` subset reproducible between runs.
    files = [os.path.join(dir, f) for f in sorted(os.listdir(dir))]
    print('%i total files' % len(files))
    if n_files is not None:
        files = files[:n_files]
    # reshape keeps the array 2-D even when no file was selected, so the
    # column slicing below does not raise IndexError on an empty selection.
    shapes = np.array([get_sample_size(f) for f in files], dtype=int).reshape(-1, 2)
    n_nodes, n_edges = shapes[:, 0], shapes[:, 1]
    return n_nodes, n_edges

## Example event

In [4]:
# Directory to scan and the maximum number of its files to process.
# NOTE(review): the saved output of the following cell shows np.load failing
# with "Cannot load file containing pickled data" -- presumably this directory
# holds files that are not .npz hit-graph archives; confirm the path.
data_dir = '../data/example_event/raw'
n_files = 1

In [5]:
# Per-file node and edge counts for the configured directory.
n_nodes, n_edges = process_dataset(data_dir, n_files=n_files)

5 total files


ValueError: Cannot load file containing pickled data when allow_pickle=False

In [None]:
# 2D histogram of graph size: node count on x, edge count on y.
fig, ax = plt.subplots(figsize=(8, 6))
# hist2d returns (counts, xedges, yedges, image); the image feeds the colorbar.
hist_image = ax.hist2d(n_nodes, n_edges)[-1]
ax.set_xlabel('Number of graph nodes')
ax.set_ylabel('Number of graph edges')
fig.colorbar(hist_image, ax=ax);

## Medium dataset

In [None]:
# Directory to scan and the maximum number of its files to process.
# NOTE(review): hard-coded absolute path that appears to be site/user
# specific -- adjust for the environment the notebook runs in.
data_dir = '/global/cscratch1/sd/sfarrell/heptrkx/data/hitgraphs_med_002'
n_files = 1000

In [None]:
# Per-file node and edge counts for the configured directory.
n_nodes, n_edges = process_dataset(data_dir, n_files=n_files)

In [None]:
# 2D histogram of graph size: node count on x, edge count on y.
fig, ax = plt.subplots(figsize=(8, 6))
# hist2d returns (counts, xedges, yedges, image); the image feeds the colorbar.
hist_image = ax.hist2d(n_nodes, n_edges)[-1]
ax.set_xlabel('Number of graph nodes')
ax.set_ylabel('Number of graph edges')
fig.colorbar(hist_image, ax=ax);

## Big dataset

In [None]:
# Directory to scan and the maximum number of its files to process.
# NOTE(review): hard-coded absolute path that appears to be site/user
# specific -- adjust for the environment the notebook runs in.
data_dir = '/global/cscratch1/sd/sfarrell/heptrkx/data/hitgraphs_big_000'
n_files = 1000

In [None]:
# Per-file node and edge counts for the configured directory.
n_nodes, n_edges = process_dataset(data_dir, n_files=n_files)

In [None]:
# 2D histogram of graph size: node count on x, edge count on y.
fig, ax = plt.subplots(figsize=(8, 6))
# hist2d returns (counts, xedges, yedges, image); the image feeds the colorbar.
hist_image = ax.hist2d(n_nodes, n_edges)[-1]
ax.set_xlabel('Number of graph nodes')
ax.set_ylabel('Number of graph edges')
fig.colorbar(hist_image, ax=ax);

## Test

In [None]:
def get_sample_results(filename):
    """Summarize one saved hit-graph sample.

    Parameters
    ----------
    filename : str
        Path handed to ``np.load``; the loaded archive must expose arrays
        under the keys 'X' and 'y'.

    Returns
    -------
    tuple
        ``(n_nodes, n_edges, purity)``: the length of the first axis of 'X',
        the length of the first axis of 'y', and the mean of 'y'.
        NOTE(review): "purity" is presumably the fraction of true edges,
        assuming 'y' holds 0/1 edge labels -- confirm against the
        preprocessing code.
    """
    with np.load(filename) as f:
        n_nodes = f['X'].shape[0]
        # Look 'y' up once: each key access on the archive reads the array
        # from the file again, so reuse the loaded array for both statistics.
        y = f['y']
        n_edges = y.shape[0]
        purity = y.mean()
    return n_nodes, n_edges, purity

In [None]:
# Directory to scan and the maximum number of its files to process.
# NOTE(review): hard-coded absolute path that appears to be site/user
# specific -- adjust for the environment the notebook runs in.
data_dir = '/global/cscratch1/sd/sfarrell/heptrkx/data/hitgraphs_small_000'
n_files = 100

In [None]:
# List the dataset directory. os.listdir returns entries in arbitrary order,
# so sort them to make the "first n_files" subset reproducible between runs.
files = [os.path.join(data_dir, f) for f in sorted(os.listdir(data_dir))]
print('%i total files' % len(files))
if n_files is not None:
    files = files[:n_files]
# One (n_nodes, n_edges, purity) tuple per file.
sample_results = [get_sample_results(f) for f in files]
# Transpose into three per-quantity tuples. zip(*[]) yields nothing to
# unpack, so fall back to three empty tuples when no file was selected.
n_nodes, n_edges, purity = zip(*sample_results) if sample_results else ((), (), ())