# MDAnalysis and Pilot-In-Memory (Pairwise Distance)


The main performance bottleneck of the current MDAnalysis implementation is the construction of the graph with NetworkX, which takes ~78% of the overall runtime.


**Beckstein Profiling:**

    47        10           33      3.3      0.0      if adj is None:
    48        10        66544   6654.4      0.0          x = atoms.positions

    54        10     58689221 5868922.1     18.8          adj = (MDAnalysis.core.parallel.distances.distance_array(x, x, box=box) < cutoff)
    
    58        10           78      7.8      0.0      adjk = adj if Nmax is None else adj[:Nmax, :Nmax] 
    59        10    243009076 24300907.6   77.9      graph = nx.Graph(adjk)
    60        10      4346636 434663.6      1.4      subgraphs = nx.connected_components(graph)
    61        49        83597   1706.1      0.0      indices = [np.sort(g) for g in subgraphs]
    62        49      5694698 116218.3      1.8      return [atoms[group].residues for group in indices]



## 1. LeafletFinder NetworkX Implementation Profiling

see https://code.google.com/p/mdanalysis/

Profile default implementation based on [NetworkX](https://networkx.github.io/)

In [None]:
FILENAME="../data/mdanalysis/small/graph_edges_95_215.csv"
!head -n 5 {FILENAME}

In [None]:
%matplotlib inline
%time
import networkx as NX
import time
import datetime
import sys

start = time.time()
nxg = NX.read_edgelist(FILENAME, delimiter=",")
end_read = time.time()
NX.draw(nxg, pos=NX.spring_layout(nxg))

In [None]:
import matplotlib.pyplot as plt
# Degree of every node, ordered from the highest to the lowest degree.
# dict(...) accepts both the plain dict returned by NetworkX 1.x and the
# DegreeView returned by NetworkX >= 2.0 (which has no .values() method).
degree_sequence = sorted(dict(NX.degree(nxg)).values(), reverse=True)
print("Degree sequence %s" % degree_sequence)
print("Length: %d" % len(degree_sequence))

dmax = max(degree_sequence)

# Log-log rank plot: x is the node rank, y is the degree of that node.
plt.loglog(degree_sequence, 'b-', marker='o')
plt.title("Degree Histogram")
plt.ylabel("Degree")
plt.xlabel("Node")

In [None]:
# Time the connected-component search on the graph loaded above.
# NX.connected_components may return a lazy generator, so the components are
# materialised inside the timed region; creating the generator alone does no
# work and was previously mis-reported as "Graph Creation Runtime".
start = time.time()
components = list(NX.connected_components(nxg))
end_connected = time.time()

# Printing happens after the clock has stopped so that console output does
# not distort the measurement.
for component in components:
    print(str(sorted(component)))
count = len(components)

print("Number of Nodes: " + str(NX.number_of_nodes(nxg)))
print("Number of Edges: " + str(NX.number_of_edges(nxg)))
print("Connected Components: " + str(count))
print("Runtime: " + str((end_connected - start)))
print("Connected Components Runtime: " + str((end_connected - start)))

In [None]:
import os
from pilot_hadoop import PilotComputeService
from IPython.display import HTML

os.environ["SAGA_VERBOSE"]="100"

## 2. Pilot-Spark and Pilot-InMemory Implementation

Set up a Spark cluster on the local machine or on an HPC resource. Execute **exactly one** of 2.1.1, 2.1.2 or 2.1.3.

### 2.1.1 Start Spark Cluster using Pilot-Spark (Stampede)

see https://github.com/drelu/saga-hadoop

In [None]:
pilot_compute_description = {
                            "resource_url":"fork://localhost",
                            "number_cores": 1,
                            "cores_per_node":1,
                            "type":"spark"
                            }
pilot = PilotComputeService.create_pilot(pilot_compute_description);

# print out details of Pilot-Spark
details = pilot.get_details()
HTML("<a target='blank' href='%s'>Spark Web UI</a>"%details["web_ui_url"])

In [None]:
pilot_compute_description = {
                            "resource_url":"spark://129.114.58.102:7077",
                            "type":"spark"
                            }
pilot = PilotComputeService.create_pilot(pilot_compute_description);

# print out details of Pilot-Spark
details = pilot.get_details()
HTML("<a target='blank' href='%s'>Spark Web UI</a>"%details["web_ui_url"])

### 2.1.2 Start Spark Cluster inside YARN (Chameleon)

In [None]:
%run util/init_spark.py

NUMBER_EXECUTORS=10

from pilot_hadoop import PilotComputeService as PilotSparkComputeService

pilotcompute_description = {
    "service_url": "yarn-client://yarn.radical-cybertools.org",
    "number_of_processes": NUMBER_EXECUTORS,
    "physical_memory_per_process": "16G" 
}

print "SPARK HOME: %s"%os.environ["SPARK_HOME"]
print "PYTHONPATH: %s"%os.environ["PYTHONPATH"]

start = time.time()
pilot_spark = PilotSparkComputeService.create_pilot(pilotcompute_description=pilotcompute_description)
sc = pilot_spark.get_spark_context()
print "Spark Startup, %.2f"%(time.time()-start)

### 2.1.3 Start Spark Cluster (Wrangler)

    export JAVA_HOME=/usr/java/jdk1.8.0_45/
    saga-hadoop --resource=slurm://localhost --queue=normal --walltime=59 --number_cores=24 --project=TG-MCB090174 --framework spark

In [None]:
%run util/init_spark_wrangler.py

from pilot_hadoop import PilotComputeService as PilotSparkComputeService

pilotcompute_description = {
    "service_url": "spark://129.114.58.105:7077"
}

print "SPARK HOME: %s"%os.environ["SPARK_HOME"]
print "PYTHONPATH: %s"%os.environ["PYTHONPATH"]

start = time.time()
pilot_spark = PilotSparkComputeService.create_pilot(pilotcompute_description=pilotcompute_description)
sc = pilot_spark.get_spark_context()
print "Spark Startup, %.2f"%(time.time()-start)

## 2.2 Spark Smoke Test

In [None]:
!hadoop fs -text "/data/mdanalysis/synthetic/10.np_txt"

In [None]:
sc.addFile("hdfs:///data/mdanalysis/synthetic/10.np_txt")

In [None]:
rdd=sc.parallelize(range(2))

In [None]:
rdd.count()

In [None]:
import os, subprocess
rdd.map(lambda a: subprocess.check_output('python --version', shell=True, stderr=subprocess.STDOUT)).collect()

# Distance Computation

## Use File Staging

In [None]:
from pyspark import SparkFiles
from MDAnalysis.core.distances import distance_array, self_distance_array
from MDAnalysis.analysis.distances import contact_matrix
import scipy.sparse
import numpy as np
from scipy.spatial.distance import cdist
import sys
import gc

cutoff = 15.0

def get_edges_partition(adjacency_matrix, cutoff=15.0, row_offset=0):
    """Return the edges encoded in a block of a pairwise distance matrix.

    Two points are connected when their distance is strictly below *cutoff*.
    Each undirected edge is reported once, as ``(i, j)`` with ``i <= j``;
    self-edges ``(i, i)`` are kept.

    Parameters
    ----------
    adjacency_matrix : 2-D array
        Distances; row ``r`` belongs to point ``row_offset + r`` and column
        ``c`` belongs to point ``c``.
    cutoff : float
        Distance threshold.
    row_offset : int
        Global index of the first row.  Defaults to 0, i.e. the matrix starts
        at the first point.

    Returns
    -------
    list of (int, int)
        Edges in row-major order.
    """
    distances = np.asarray(adjacency_matrix)
    if distances.size == 0:
        # np.nditer (used previously) raised on zero-sized input.
        return []
    rows, cols = np.nonzero(distances < cutoff)
    # Translate partition-local row numbers into global point indices before
    # comparing them with the (already global) column indices.
    rows = rows + row_offset
    upper = rows <= cols
    return [(int(r), int(c)) for r, c in zip(rows[upper], cols[upper])]


def compute_distance_file(point_index_data_file):
    """Compute the edges for one 1-D partition of a staged coordinate file.

    Parameters
    ----------
    point_index_data_file : sequence
        ``(start, end, name)``: half-open range of point indices handled by
        this task and the name under which the coordinate file was staged
        with ``sc.addFile``.

    Returns
    -------
    list of (int, int)
        Edges between the points of the partition and all points, expressed
        in global point indices.
    """
    point_index_start = point_index_data_file[0]
    point_index_end = point_index_data_file[1]
    data = point_index_data_file[2]
    filename = SparkFiles.get(data)
    # ndmin=2 keeps a single-row file two-dimensional.
    coord_all = np.loadtxt(filename, dtype='float32', ndmin=2)
    coord_part = coord_all[point_index_start:point_index_end]
    if len(coord_part) == 0:
        return []
    adj = cdist(coord_part, coord_all)
    return get_edges_partition(adj, row_offset=point_index_start)

Read File Test

In [None]:
import math
#data=sc.textFile(DATA_FILE).collect()
import hdfs
import numpy as np
client = hdfs.client.Client("http://radical-5:50070")
DATA_FILE = "hdfs:///data/mdanalysis/small/md_centered.xtc_95Atoms.np_txt"
PARTITION_SIZE = 10

# Read the whole coordinate file through the HDFS client; the client expects
# the path without the "hdfs://" scheme prefix.
with client.read(DATA_FILE.replace("hdfs://", "")) as reader:
    content = reader.read()

# The file holds whitespace separated floats; the code reads them as three
# values per point.  reshape(-1, 3) lets NumPy derive the row count (the
# previous len(data)/3 is a float on Python 3 and is rejected by reshape).
data = np.fromstring(content, dtype="float32", sep=" \n")
data = data.reshape(-1, 3)
number_rows = len(data)

# Ceiling division: the previous (rows / size) + 1 produced a trailing empty
# partition whenever the row count was an exact multiple of PARTITION_SIZE.
number_partitions = (number_rows + PARTITION_SIZE - 1) // PARTITION_SIZE
print("Number Partitions: %d" % number_partitions)

# (start, end, staged file name) per partition; end is clamped to the number
# of rows.  A real list is built so that len(partitions) also works on
# Python 3, where map() returns an iterator.
partitions = [(first_row,
               min(first_row + PARTITION_SIZE, number_rows),
               os.path.basename(DATA_FILE))
              for first_row in range(0, number_rows, PARTITION_SIZE)]
partitions

In [None]:
len(partitions)

In [None]:
sc.addFile(DATA_FILE)
part_rdd=sc.parallelize(partitions, len(partitions))
start = time.time()
edges_list=part_rdd.map(compute_distance_file).flatMap(lambda a: a).collect()
print str(len(edges_list))
print "ComputeDistanceSpark, %d, %d, %.2f"%(len(data), NUMBER_EXECUTORS, (time.time()-start))

### HDFS Staging

In [None]:
import hdfs
import numpy as np
client = hdfs.client.Client("http://radical-5:50070")
content=client.read("/data/mdanalysis/small/md_centered.xtc_95Atoms.np_txt").read()
data_np=np.fromstring(content, dtype="float32", sep=" \n")
data_np=data_np.reshape(len(data_np)/3,3)
len(data_np)

In [None]:
data_np=np.fromstring(content, dtype="float32", sep=" \n")
data_np=data_np.reshape(len(data_np)/3,3)
len(data_np)

In [None]:
NUMBER_EXECUTORS

In [None]:
partitions

In [None]:
os.path.basename(DATA_FILE)

### Broadcast-based Implementation (Optimized)

In [None]:
import numpy as np
#coord = np.loadtxt("vesicle_1_5M_373_stride1000.xtc_145746Atoms.np_txt", dtype='float32')
#coord = np.loadtxt("/data/mdanalysis/medium/md_prod_12x12_everymicroS_pbcmolcenter.xtc_44784Atoms.np_txt", dtype='float32')
DATA_PATH="../data/mdanalysis/synthetic/traj/"
files=[os.path.join(DATA_PATH, i) for i in os.listdir(DATA_PATH) if i.endswith(".np_txt")]
NUMBER_EXECUTORS=48

#coord = np.loadtxt("/data/mdanalysis/small/md_centered.xtc_95Atoms.np_txt", dtype='float32')
coord = np.loadtxt(files[-1], dtype='float32')

In [None]:
!hadoop fs -ls /data/mdanalysis/small

In [None]:
coord_broadcast = sc.broadcast(coord)

In [None]:
coord_all = coord_broadcast.value

In [None]:
coord_all[1:5]

In [None]:
part_rdd=sc.parallelize(range(len(coord_broadcast.value)), NUMBER_EXECUTORS)
part_rdd.cache()

The following code wraps the MDAnalysis functions in Spark code that is executed in a data-parallel way, either on an individual point or on a batch of points (1-D partitioning).

In [None]:
from MDAnalysis.core.distances import distance_array, self_distance_array
from MDAnalysis.analysis.distances import contact_matrix
import scipy.sparse
from scipy.spatial.distance import cdist
import sys
import gc


cutoff = 15.0

################################################################################
# Process batch of points (a partition of the RDD)

def get_edges_partition(adjacency_matrix, cutoff=15.0, row_offset=0):
    """Return the edges encoded in a block of a pairwise distance matrix.

    Two points are connected when their distance is strictly below *cutoff*.
    Each undirected edge is reported once, as ``(i, j)`` with ``i <= j``;
    self-edges ``(i, i)`` are kept.

    Parameters
    ----------
    adjacency_matrix : 2-D array
        Distances; row ``r`` belongs to point ``row_offset + r`` and column
        ``c`` belongs to point ``c``.
    cutoff : float
        Distance threshold.
    row_offset : int
        Global index of the first row.  Defaults to 0.

    Returns
    -------
    list of (int, int)
        Edges in row-major order.
    """
    distances = np.asarray(adjacency_matrix)
    if distances.size == 0:
        # np.nditer (used previously) raised on zero-sized input.
        return []
    rows, cols = np.nonzero(distances < cutoff)
    # Translate partition-local row numbers into global point indices before
    # comparing them with the (already global) column indices.
    rows = rows + row_offset
    upper = rows <= cols
    return [(int(r), int(c)) for r, c in zip(rows[upper], cols[upper])]


def compute_distance_partition(iterator):
    """Compute the edges for one RDD partition of point indices (1-D split).

    Intended for ``rdd.mapPartitions``.  The coordinates are read from the
    ``coord_broadcast`` broadcast variable.

    Parameters
    ----------
    iterator : iterable of int
        Point indices of this partition.  The rows between the smallest and
        the largest index (both inclusive) are processed.

    Returns
    -------
    list of (int, int)
        Edges between the points of the partition and all points, expressed
        in global point indices.  Empty for an empty partition.
    """
    first_row = None
    last_row = None
    for i in iterator:
        if first_row is None or i < first_row:
            first_row = i
        if last_row is None or i > last_row:
            last_row = i
    if first_row is None:
        # Spark may hand over empty partitions.
        return []

    coord_all = coord_broadcast.value
    # +1: the slice end is exclusive, the largest index must be included.
    coord_part = coord_all[first_row:last_row + 1]
    adj = cdist(coord_part, coord_all)

    edge_list = get_edges_partition(adj, row_offset=first_row)
    # The distance block can be large; drop it before the task returns.
    del coord_part
    del coord_all
    del adj
    gc.collect()
    return edge_list


################################################################################
# Process one point at a time
def get_edges_point(point_index, adjacency_matrix, cutoff=15.0):
    """Return the edges of a single point from its boolean contact row.

    Only the upper half of the adjacency matrix is computed: column ``c`` of
    *adjacency_matrix* stands for point ``point_index + c``, so the target
    vertex is obtained by offsetting the column with *point_index*.

    Parameters
    ----------
    point_index : int
        Global index of the source point.
    adjacency_matrix : array of bool
        Contact flags for the points ``point_index, point_index + 1, ...``.
    cutoff : float
        Unused; the matrix is already thresholded.  Kept for call
        compatibility.

    Returns
    -------
    list of (int, int)
        ``(point_index, target)`` pairs in ascending target order.
    """
    contacts = np.atleast_2d(np.asarray(adjacency_matrix, dtype=bool))
    columns = np.nonzero(contacts)[1]
    return [(point_index, point_index + int(c)) for c in columns]


def compute_distance(point_index):
    """Compute the edges from one point to all points with an index >= its own.

    Reads the coordinates from the ``coord_broadcast`` broadcast variable and
    uses the module-level ``cutoff``.

    Parameters
    ----------
    point_index : int
        Zero-based global index of the point.

    Returns
    -------
    list of (int, int)
        Edges ``(point_index, j)`` with ``j >= point_index``.
    """
    # 1-D Partitioning
    coord_all = coord_broadcast.value
    # The previous slice [point_index-1:point_index] selected the preceding
    # point and nothing at all for point_index == 0.
    coord_part = coord_all[point_index:point_index + 1]
    # Upper half only: distances to the points from point_index onwards.
    adj = (cdist(coord_part, coord_all[point_index:]) < cutoff)
    return get_edges_point(point_index, adj)

In [None]:
# Time the partition-wise (mapPartitions) distance computation.
start = time.time()
#edges_list=part_rdd.map(compute_distance).flatMap(lambda a: a).collect()
edges_list_spark = part_rdd.mapPartitions(compute_distance_partition).collect()
# Report the result of this run; the stale edges_list of an earlier cell was
# printed here before.
print(str(len(edges_list_spark)))
print("ComputeDistanceSpark, %d, %d, %.2f" % (len(coord_all), NUMBER_EXECUTORS, (time.time() - start)))

In [None]:
#start = time.time()
#edges_list_local = compute_distance_partition(iter(range(20000)))
#print str(len(edges_list))
#print "ComputeDistanceLocal, %d, %.2f"%(len(coord_all),(time.time()-start))

for i in range(1):
    start = time.time()
    edges_list=part_rdd.map(compute_distance).flatMap(lambda a: a).collect()
    #edges_list_spark=part_rdd.mapPartitions(compute_distance_partition).collect()
    print str(len(edges_list))
    print "ComputeDistanceSpark, %d, %d, %.2f"%(len(coord_all), NUMBER_EXECUTORS, (time.time()-start))
    del edges_list

## Unoptimized using cartesian product

Not good for sparse result data. Only usable on a very small sample:

    sample=row_rdd.sample(False, 0.01, 81)
    sample.count()


In [None]:
import pyspark.mllib.linalg.distributed
coord_matrix=pyspark.mllib.linalg.distributed.RowMatrix(sc.parallelize(coord[:200], 4))
row_rdd=coord_matrix.rows
row_rdd.cache()

In [None]:
row_rdd.count()

In [None]:
coord = np.loadtxt("md_centered.xtc_95Atoms.np_txt", dtype='float32')
#coord_str = np.array2string(coord, separator=",")

In [None]:
coord

In [None]:
coord.shape[0]

In [None]:
coord_str=[]
for i in range(len(coord)):
    coord_str.append(str(coord[i][0]) +","+ str(coord[i][1]) +","+ str(coord[i][2]))

In [None]:
start = time.time()
sc.parallelize(range(200), 4).count()
print "Count, %.2f"%((time.time()-start))

In [None]:
row_rdd.cartesian(row_rdd).map(lambda a: a).collect()

In [None]:
start = time.time()
# squared_distance() returns d**2, so the threshold has to be squared too.
# Pairs closer than the 15.0 cutoff are kept, matching the contact definition
# used by the other implementations in this notebook.
squared_cutoff = 15.0 ** 2
# NOTE: saveAsTextFile() writes the result and returns None, so `distances`
# does not hold the data afterwards.
distances = row_rdd.cartesian(row_rdd).\
            map(lambda a: (a[0].squared_distance(a[1]))).\
            filter(lambda a: a < squared_cutoff).\
            saveAsTextFile("distances.csv")
print("ComputeDistance, %.2f" % (time.time() - start))

## n-D Partitioning

In [None]:
num_partitions=4

def compute_distance_2d(partition_index):
    """Unfinished sketch of a 2-D (tile based) distance computation.

    Only the groundwork is in place: the first 100 broadcast coordinates are
    selected and the side length of the partition grid is derived from
    ``num_partitions``.  No distances are computed yet and nothing is
    returned.

    Parameters
    ----------
    partition_index : int
        Index of the tile to work on (not used yet).
    """
    # 2-D Partitioning
    coord_all = coord_broadcast.value[:100]
    # The former trailing `len=len(coord_all)` made `len` a local name and
    # turned this call into an UnboundLocalError.
    length = len(coord_all)
    # identify square to work on
    xdim = math.sqrt(num_partitions)
    ydim = math.sqrt(num_partitions)
    # TODO: derive the tile's row/column range from partition_index, xdim,
    # ydim and length, then compute the distances of that tile.  (The former
    # bare `xdim/partition_index` expression had no effect and raised
    # ZeroDivisionError for partition 0.)

# Connected Components: Pilot-InMemory Implementation (Graph)

In [None]:
from distributed_inmem.dataunit_spark import DistributedInMemoryDataUnit
import time

FILENAME="../data/mdanalysis/small/graph_edges_95_215.csv"

FILENAME_ALL_EDGES="../data/mdanalysis/small/graph_edges_95_215_alledges.csv"
du = DistributedInMemoryDataUnit(name="LeafletFinderGraph", sc=sc)

#DistributedInMemoryDataUnit.spark_context.version

# Read the edge list (one edge per line) and hand it to the data unit.  The
# context manager closes the file even when reading fails.
with open(FILENAME_ALL_EDGES) as f:
    graph = f.readlines()
du.load(graph)

def identityMapper(edge, args):
    """Parse the textual form of one record into the Python object it denotes.

    Parameters
    ----------
    edge : object
        Record whose ``str()`` is a Python literal, e.g. ``"(0, 67)"``.
    args : object
        Unused; part of the mapper call signature.

    Returns
    -------
    object
        The literal value (for an edge: a tuple).

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a plain Python literal.
    """
    # Imported locally so the function stays self-contained when it is
    # shipped to remote workers.
    import ast
    #comp = edge.strip().split(",")
    #return (int(comp[0]), int(comp[1]))
    # SECURITY: this used eval(), which executes arbitrary expressions found
    # in the input file.  literal_eval only accepts literals.
    return ast.literal_eval(str(edge).strip())

def groupByVertex(data):
    """Debug helper: print the record it is called with; returns nothing."""
    message = "Call reduce on: " + "{0}".format(str(data))
    print(message)
    

new_iteration_needed = du.sc.accumulator(0)

# check for smaller keys in each set
def process_vertex(vertex):
    """Re-emit the edges of one vertex for an iteration of the component search.

    Parameters
    ----------
    vertex : str
        Textual form of ``(source, [adjacent vertices])``,
        e.g. ``"(0, [0, 67, 14])"``; ids may be ints or numeric strings.

    Returns
    -------
    list of (int, int)
        * If *source* is not larger than its smallest neighbour, one edge
          ``(source, n)`` per distinct neighbour ``n``.
        * Otherwise every other neighbour is linked in both directions to the
          smallest neighbour and the ``new_iteration_needed`` accumulator is
          incremented once per re-linked neighbour; ``(source, smallest)`` is
          added when *source* is smaller than the largest neighbour.
        * ``[(source, source)]`` for an empty adjacency list.
    """
    global new_iteration_needed
    # Imported locally so the function stays self-contained on the workers.
    import ast
    # SECURITY: this used eval(), which executes arbitrary expressions found
    # in the input.  literal_eval only accepts literals.
    vertex = ast.literal_eval(vertex)
    source = int(vertex[0])
    # Sorted and de-duplicated.  The loop below used to walk the unsorted
    # vertex[1], so its "skip repeated neighbour" check missed duplicates.
    dest = sorted(set(int(i) for i in vertex[1]))
    if not dest:
        return [(source, source)]

    first_edge_destination = dest[0]
    # Name kept from the original output: True when no neighbour has a
    # smaller id than the source.
    local_max = source <= first_edge_destination
    new_vertices = []
    print("*********Source: %d First Edge Dest: %d" % (source, first_edge_destination))
    if local_max:
        new_vertices.append((source, first_edge_destination))

    print("Process: " + str(vertex) + " Local Max: " + str(local_max))
    last_edge_destination = first_edge_destination

    for current_destination in dest[1:]:
        print("Current destination: %s" % str(current_destination))
        if local_max:
            new_vertices.append((source, current_destination))
        else:
            new_vertices.append((first_edge_destination, current_destination))
            new_vertices.append((current_destination, first_edge_destination))
            print("Add 1 to accumulator")
            new_iteration_needed.add(1)
        last_edge_destination = current_destination

    if (not local_max) and (source < last_edge_destination):
        new_vertices.append((source, first_edge_destination))

    print("Return new vertices: " + str(new_vertices))
    return new_vertices


#process_vertex("('19', ['19', '7', '9', '41'])")
# Repeat map (parse records) + reduce (process_vertex) until a pass leaves
# the new_iteration_needed accumulator unchanged.
num_iterations = 0
start = time.time()
while True:
    # Accumulator value before the pass; compared afterwards to detect
    # whether process_vertex re-linked anything.
    old_accum_value = new_iteration_needed.value
    print("*********** Start iteration: %d " % num_iterations)
    future_result = du.map_pilot(identityMapper, None, number_of_compute_units=2)
    result_du = future_result.result()[0]
    future_result = result_du.reduce_pilot(process_vertex, number_of_compute_units=2)
    output = future_result.result()
    output.export()
    # The output of this pass is the input of the next one.
    du = output
    num_iterations = num_iterations + 1
    print("New iteration accum: %d old value: %d" % (new_iteration_needed.value, old_accum_value))
    if new_iteration_needed.value <= old_accum_value:
        # Nothing was re-linked in this pass: done.
        break
end = time.time()
print("Final results: ")
num_components = du.data.groupByKey().count()
print("Finished after %d Iterations. Found %d components. Time: %.2f" % (num_iterations, num_components, (end - start)))

In [None]:
du.data.groupByKey().count()

## 2.5 Native Spark Implementation

In [None]:
!hadoop fs -ls /data/mdanalysis/large

In [None]:
rdd = sc.textFile("/data/mdanalysis/large/graph_edges_145746_1012872.csv")

### 2.5.1 Load data from text file 

In [None]:
%%time
FILENAME="/data/mdanalysis/small/graph_edges_95_215.csv"
data = sc.textFile(FILENAME).map(lambda line: [int(i) for i in line.split(",")])
# add backward edges
data = data.flatMap(lambda v: [(v[0],v[1]),(v[1],v[0])])

#data.saveAsTextFile("../data/mdanalysis/small/graph_edges_95_215_alledges.csv")
#data = data.filter(lambda v: v[0] != v[1])
#print data.collect()

data_grouped = data.groupByKey().mapValues(lambda a: sorted(set(a)))
print data_grouped.collect()

#### Connected Component Implementation

In [None]:
new_iteration_needed = sc.accumulator(0)
# check for smaller keys in each set
def process_vertex(vertex):
    """Re-emit the edges of one vertex for an iteration of the component search.

    Parameters
    ----------
    vertex : tuple
        ``(source, adjacent vertices)``, e.g. ``(0, [0, 67, 14])``.

    Returns
    -------
    list of tuple
        * If *source* is not larger than its smallest neighbour, one edge
          ``(source, n)`` per distinct neighbour ``n``.
        * Otherwise every other neighbour is linked in both directions to the
          smallest neighbour and the ``new_iteration_needed`` accumulator is
          incremented once per re-linked neighbour; ``(source, smallest)`` is
          added when *source* is smaller than the largest neighbour.
        * ``[(source, source)]`` for an empty adjacency list.
    """
    global new_iteration_needed
    source = vertex[0]
    # Sort and de-duplicate here so the result no longer depends on the
    # caller having prepared the adjacency list.
    destinations = sorted(set(vertex[1]))
    if not destinations:
        return [(source, source)]

    first_edge_destination = destinations[0]
    # Name kept from the original output: True when no neighbour has a
    # smaller id than the source.
    local_max = source <= first_edge_destination
    new_vertices = []
    print("*********Source: %d First Edge Dest: %d" % (source, first_edge_destination))
    if local_max:
        new_vertices.append((source, first_edge_destination))

    print("Process: " + str(vertex) + " Local Max: " + str(local_max))
    last_edge_destination = first_edge_destination

    for current_destination in destinations[1:]:
        if local_max:
            new_vertices.append((source, current_destination))
        else:
            new_vertices.append((first_edge_destination, current_destination))
            new_vertices.append((current_destination, first_edge_destination))
            print("Add 1 to accumulator")
            new_iteration_needed.add(1)
        last_edge_destination = current_destination

    if (not local_max) and (source < last_edge_destination):
        new_vertices.append((source, first_edge_destination))

    return new_vertices


#process_vertex((19, [7, 9, 19, 41]))
# Driver loop of the native Spark connected-component search: apply
# process_vertex to every (vertex, adjacency list) record until a pass leaves
# the new_iteration_needed accumulator unchanged.
num_iterations=0
cc = data_grouped
start = time.time()
while True:
    # Accumulator value before this pass; compared after the pass to decide
    # whether another one is needed.
    old_accum_value = new_iteration_needed.value
    print "*********** Start iteration: %d " % num_iterations
    #print "Accum before iteration: " + str(old_accum_value)
    # One pass: re-emit edges per vertex, regroup them by key and store every
    # adjacency list sorted and de-duplicated.
    cc = cc.flatMap(lambda v: process_vertex(v))\
           .groupByKey()\
           .mapValues(lambda a: sorted(set(a)))
    # The collected data is discarded here.
    # NOTE(review): presumably this only forces the lazy pipeline to run so
    # that the accumulator is up to date before it is read below - confirm.
    # NOTE(review): the accumulator is updated inside a transformation; if
    # Spark re-executes tasks the counts may be applied more than once -
    # verify against the Spark accumulator documentation.
    cc.collect()
    num_iterations = num_iterations + 1
    #print "New iteration accum: %d old value: %d"%(new_iteration_needed.value, old_accum_value)
    if old_accum_value < new_iteration_needed.value:
        #print "Accumulator value was increased. New iteration."
        continue
    else:
        break
end = time.time()

# cc.count() is the number of distinct keys left in cc, reported as the
# number of components.  It runs after `end` was taken and is therefore not
# part of the reported time.
print "Finished after %d Iterations. Found %d components. Time: %.2f"%(num_iterations, cc.count(), (end-start))

# Giannis

In [None]:
import sys
import numpy as np

def get_distance(Atom1, Atom2):
    """Return the Euclidean distance between two coordinate arrays."""
    delta = Atom1 - Atom2
    squared_total = sum(delta ** 2)
    return np.sqrt(squared_total)

def n_dim_input_to_numpy_array(temp):
    """Parse one comma separated line of numbers into a float array.

    Parameters
    ----------
    temp : str
        Text such as ``"1.0,2.5,3"``; whitespace around a field (including a
        trailing newline) is ignored.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per field.

    Raises
    ------
    ValueError
        If a field is not a valid number.
    """
    # A list is built explicitly: on Python 3 map() returns an iterator that
    # NumPy cannot convert, and np.asfarray no longer exists in NumPy 2.
    # The unreachable counter update after the return was dropped.
    return np.array([float(field) for field in temp.split(',')], dtype=float)

if __name__ == '__main__':

    # Command line: WINDOW_SIZE  START_ROW  START_COLUMN  TOTAL_LINES  CUTOFF
    # START_ROW and START_COLUMN are 1-based and converted to 0-based here.
    WINDOW_SIZE = int(sys.argv[1])
    reading_start_point_i = int(sys.argv[2]) - 1
    j_dim = int(sys.argv[3]) - 1
    total_file_lines = int(sys.argv[4])
    cutoff = float(sys.argv[5])

    # A tile on the main diagonal compares a window of atoms with itself.
    on_diagonal = reading_start_point_i == j_dim
    i_end = reading_start_point_i + WINDOW_SIZE
    j_end = j_dim + WINDOW_SIZE
    last_needed = max(i_end, j_end)

    #----------------------Reading Input File-------------------------------#

    # Both windows are collected in one pass by line number.  The column
    # window used to be seeded with the line that ended the first loop, which
    # is its first line only when the two windows are adjacent.
    atoms = list()
    atomsY = list()
    with open('input.txt') as read_file:
        for count, line in enumerate(read_file):
            if count > total_file_lines or count >= last_needed:
                break
            if reading_start_point_i <= count < i_end:
                atoms.append(n_dim_input_to_numpy_array(line))
            if (not on_diagonal) and j_dim <= count < j_end:
                atomsY.append(n_dim_input_to_numpy_array(line))

    # np.zeros instead of np.empty: cells that are never written (lower
    # triangle and diagonal of a diagonal tile, rows/columns beyond a short
    # window at the end of the file) must be False, not uninitialised memory.
    distances = np.zeros((WINDOW_SIZE, WINDOW_SIZE), dtype='bool')
    if on_diagonal:
        # The tile is symmetric, so only the part above the diagonal is
        # computed.
        for i in range(len(atoms)):
            for j in range(i + 1, len(atoms)):
                distances[i][j] = get_distance(atoms[i], atoms[j]) <= cutoff
    else:
        for i in range(len(atoms)):
            for j in range(len(atomsY)):
                distances[i][j] = get_distance(atoms[i], atomsY[j]) <= cutoff

    np.save("distances_%d_%d.npz.npy" % (reading_start_point_i, j_dim), distances)

# Benchmark

---
## Scratch Space

In [None]:
#start = df_grouped
# Count how many of the flattened values are smaller than start_index;
# countByValue() returns a mapping {True: n_smaller, False: n_other}.
result = start.flatMap(lambda v: (v[0], v[1])).map(lambda v: v < start_index).countByValue()

# Was `print sttrresult` (typo, NameError).
print(str(result))

# `in` replaces dict.has_key(), which no longer exists on Python 3.
local_max = True not in result

print("Local Max: " + str(local_max) + " Smaller Index: " + str(True in result))

In [None]:
schema = StructType([
            StructField("source", IntegerType(), True),
            StructField("destination", IntegerType(), True)
        ])
df = sqlCtx.createDataFrame(data, schema)
df.explain()
schema_grouped = StructType([
            StructField("source", IntegerType(), True),
            StructField("destination", ArrayType(IntegerType()), True)
        ])
df_grouped = sqlCtx.createDataFrame(data_grouped, schema_grouped)

In [None]:
from pyspark.sql.functions import udf, lit
from pyspark.sql.functions import *
from pyspark.sql.types import ArrayType

t = udf(lambda s: str(s), StringType())
slen = udf(lambda s: Column(len(s)), IntegerType())

#df.groupBy("source").collect()
#df.groupBy("source").agg(df.source, t(df.source))

c = df.groupBy(df.source).agg(col("source"), slen(df.destination))

#c = df.agg(col("source"), t(df.destination).alias('counts'))
c.head(5)


#c = df.groupBy(df.source).lit(df.destination)

In [None]:
# All vertex ids: every edge source plus every edge destination ...
vertices = df.select(df["source"]).unionAll(df.select(df["destination"]))
# ... without duplicates (the undefined name `di_source` was used here).
vertices = vertices.distinct()

print("Number of vertices: %d" % (vertices.count()))

# GraphLab

In [None]:
from graphlab import SGraph, SFrame
from graphlab import connected_components

PROBLEM = {"small": "./data/mdanalysis/small/graph_edges_95_215.csv",
           "medium": "./data/mdanalysis/medium/graph_edges_24056_71826.csv"}
# Data set to benchmark; `filename` was used below without being defined.
filename = PROBLEM["small"]

d = datetime.datetime.now()
RESULTSFILE = "results-" + d.strftime("%Y%m%d-%H%M%S") + ".csv"
REPEATS = 5

# Load the edge list (headerless CSV, read as columns X1 and X2) and build
# the graph.
start = time.time()
data = SFrame.read_csv(filename, header=False)
sg = SGraph().add_edges(data, src_field="X1", dst_field="X2")
end_read = time.time()
# Connected-component search.
cc = connected_components.create(sg)
s = cc["component_size"]
end_connected = time.time()
print(cc)
print(s)