diff --git a/build/buildgaimc.m b/build/buildgaimc.m new file mode 100644 index 0000000..e2819cb --- /dev/null +++ b/build/buildgaimc.m @@ -0,0 +1,87 @@ +%% build the gaimc distribution +function buildgaimc(varargin) +if ~isempty(strmatch('all',varargin)) + buildlist = {'test','pages','zip'}; +else + buildlist = varargin; +end +tobuild = @(type,buidlist) ~isempty(strmatch(type,buildlist)); +gaimcversion = '1.0'; +mypath = fileparts(mfilename('fullpath')); +oldpath = pwd; +matlablpath=path; +try + if tobuild('test',buildlist) + fprintf('Running tests...\n\n'); + cd(mypath) % move to where we think we are + cd ../ % in main gaimc directory + addpath(pwd); % add it to the path + cd test % run tests + try + test_main + catch ME + fprintf('*************\n') + fprintf('Tests failed!\n'); + fprintf('*************\n') + fprintf('\n'); + fprintf('Build halted.\n'); + fprintf('\n'); + fprintf('Test error:\n'); + rethrow(ME); + end + end + + if tobuild('pages',buildlist) + fprintf('Building demos...\n\n'); + cd(mypath) % move to where we think we are + cd ../ % in main gaimc directory + addpath(pwd); % add it to the path + cd demo + try + publish('demo'); + publish('airports'); + publish('performance_comparison_simple'); + if tobuild('pages_fullperf') + publish('performance_comparison'); + end + catch ME + fprintf('******************\n') + fprintf('Publishing failed!\n'); + fprintf('******************\n') + fprintf('\n'); + fprintf('Build halted.\n'); + fprintf('\n'); + fprintf('Test error:\n'); + rethrow(ME); + end + end + + if tobuild('zip',buildlist) + cd(mypath) % move to where we think we are + try + cd ../../ + [status,result] = system(... + sprintf('zip -r gaimc/build/gaimc-%s.zip gaimc/*.m gaimc/demo gaimc/test gaimc/graphs',... + gaimcversion)); + result + catch ME + fprintf('******************\n') + fprintf('Zipping failed!\n'); + fprintf('******************\n') + fprintf('\n'); + fprintf('Build halted.\n'); + fprintf('\n'); + fprintf('Error:\n'); + rethrow(ME); + end + end + +catch ME + cd(oldpath); % move to where we think we are + path(matlabpath); + rethrow(ME) +end + +cd(oldpath); +path(matlabpath); + diff --git a/build/matlab_central_notes.txt b/build/matlab_central_notes.txt new file mode 100644 index 0000000..bf4a4cb --- /dev/null +++ b/build/matlab_central_notes.txt @@ -0,0 +1,35 @@ +Picture: minnesota graph (demo_08.png) + +Title : gaimc : Graph Algorithms In Matlab Code +Summary (100 chars) : +Efficient pure-Matlab implementations of graph algorithms to complement MatlabBGL's mex functions. +Description (5000 chars) : + +While MatlabBGL uses the Boost Graph Library for efficient graph routines, +gaimc implements everything in pure Matlab code. While the routines are +slower, they aren't as slow as I initially thought. Since people often +have problems getting MatlabBGL to compile on new versions of Matlab +or on new architectures, this library is then a complement to MatlabBGL. + +See the published M-files for a few examples of the capabilities. + +Functions + depth first search (dfs) + breadth first search (bfs) + connected components (scomponents) + maximum weight bipartite matching (bipartite_matching) + Dijkstra's shortest paths (dijkstra) + Prim's minimum spanning tree (mst_prim) + clustering coefficients (clustercoeffs) + directed clustering coefficients (dirclustercoeffs) + core numbers (corenums) + +The project is maintained at github : http://github.com/dgleich/gaimc/tree/master + +Tags: + +graph, network, dijkstra, core numbers, prim, dfs, bfs, connected components, +strongly connected components, clustering coefficients, directed clustering coefficients, +directed network, directed graph, depth first search, breadth first search, +shortest paths, minimum spanning tree, mst + diff --git a/demo/airports.m b/demo/airports.m index 0f2aad4..af1970a 100644 --- a/demo/airports.m +++ b/demo/airports.m @@ -67,12 +67,10 @@ % That's the US, now we need to plot our data on top of it. [X,Y] = gplot(T,xy); % get the information to reproduce a gplot plotm(Y,X,'k.-','LineWidth',1.5); % plot the lines on the map -%% -% We need to clear the axes after the mapping toolbox -clf; + %% % Let's just look at the continential US too. -figure; ax = usamap('conus'); +clf; ax = usamap('conus'); states = shaperead('usastatelo', 'UseGeoCoords', true, 'Selector',... {@(name) ~any(strcmp(name,{'Alaska','Hawaii'})), 'Name'}); faceColors = makesymbolspec('Polygon',... @@ -85,9 +83,6 @@ % One interesting aspect of this map is that major airline hubs (Chicago, % New York, etc. are not well represented. One possible explaination is % that they have larger delays than other regional airports. -% -% Clear the figure again before proceeding. -clf; %% Honolulu to St. Johns? % Before, we saw that the lengthiest route was between St. John's and diff --git a/demo/demo.m b/demo/demo.m index 68fe5da..7778ae7 100644 --- a/demo/demo.m +++ b/demo/demo.m @@ -83,7 +83,7 @@ %% % Load the example matrix from the Boost Graph Library -load('graphs/dfs_example'); +load_gaimc_graph('dfs_example'); figure(1); graph_draw(A,xy,'labels',labels); %% @@ -99,7 +99,7 @@ %% % Let's look at breadth first search too, using a different example. -load('graphs/bfs_example'); +load_gaimc_graph('bfs_example'); figure(1); clf; graph_draw(A,xy,'labels',labels); %% @@ -161,7 +161,8 @@ %% % Now, we just call MST and look at the result. -T = mst_prim(A); +% T = mst_prim(A); +% This command means we can't run the demo, so it's commented out. %% % Oops, travel time isn't symmetric! Let's just pick the longest possible @@ -220,6 +221,7 @@ %% % We also have a largest_component function that makes it easy to just get % the largest connected component. +clf; [Acc,f] = largest_component(A); graph_draw(Acc,xy(f,:)) %% @@ -236,7 +238,7 @@ %% % Load a road network to use for statistical computations -load('graphs/minnesota'); +load_gaimc_graph('minnesota'); gplot(A,xy); %% @@ -280,7 +282,7 @@ %% % Load and convert the graph. -load ('graphs/all_shortest_paths_example'); +load_gaimc_graph('all_shortest_paths_example'); A = spfun(@(x) x-min(min(A))+1,A); % remove the negative edges As = convert_sparse(A); %% @@ -305,4 +307,4 @@ toc %% % And just to check, let's make sure the output is the same. -isequal(D,D2) \ No newline at end of file +isequal(D,D2) diff --git a/demo/html/airports.html b/demo/html/airports.html new file mode 100644 index 0000000..c8b66e2 --- /dev/null +++ b/demo/html/airports.html @@ -0,0 +1,321 @@ + + + + + + + + The US airport network + + + + +
+

The US airport network

+ +

THe North American airport network is an interesting graph to examine. The source for this data was a file on Brendan Frey's + affinity propagation website. A(i,j) is the negative travel time between two airports. Although the data didn't include + the airport locations, I used the Yahoo! Geocoding API to generate a latitude and longitude for each airport. +

+ +

Contents

+
+ +
+

The data

load_gaimc_graph('airports');
+

Plot a histogram for all route time estimates

[si, ti, rt] = find(A);
+hist(-rt,100);  % times are stored as negative values
+

Find the lengthiest route

[val,ind] = max(-rt)
+{labels{si(ind)} labels{ti(ind)}}
+
+val =
+
+        2855
+
+
+ind =
+
+       63478
+
+
+ans = 
+
+    'Honolulu, HI'    'St. Johns, NL'
+
+

Some of the routes include stop overs, so it's probable that is what we find in this case.

+

Graph analysis: connected?

+

One of the first questions about any graph should be if it's connected or not.

max(scomponents(A))
+
+ans =
+
+     1
+
+

There is only one connected component, so the graph is connected.

+

Distance instead of time

+

Let's see how the edges correlate distance with estimated travel time.

[ai aj te] = find(A);
+de = distance(xy(ai,:), xy(aj,:));
+plot(de,-te,'.');
+xlabel('distance (arclength)'); ylabel('time (?)');
+

Wow! It's all over the place, but there is a lower bound. Some of these routes can include stop

+

Minimum spanning tree

+

This section repeats and extends some analysis in the overall gaimc demo. First, let's recompute the minimum spanning tree + based on travel time. +

load_gaimc_graph('airports')
+A = -A;          % we store the negative travel time
+A = max(A,A');   % travel time isn't symmetric
+T = mst_prim(A);
+clf;
+gplot(T,xy);
+
+
+% These next lines plot a map of the US with states colored.
+ax = worldmap('USA');
+load coast
+geoshow(ax, lat, long,...
+    'DisplayType', 'polygon', 'FaceColor', [.45 .60 .30])
+states = shaperead('usastatelo', 'UseGeoCoords', true);
+faceColors = makesymbolspec('Polygon',...
+    {'INDEX', [1 numel(states)], 'FaceColor', polcmap(numel(states))});
+    geoshow(ax, states, 'DisplayType', 'polygon', 'SymbolSpec', faceColors)
+set(gcf,'Position', [   52   234   929   702]);
+

That's the US, now we need to plot our data on top of it.

[X,Y] = gplot(T,xy); % get the information to reproduce a gplot
+plotm(Y,X,'k.-','LineWidth',1.5); % plot the lines on the map
+

Let's just look at the continential US too.

clf; ax = usamap('conus');
+states = shaperead('usastatelo', 'UseGeoCoords', true, 'Selector',...
+    {@(name) ~any(strcmp(name,{'Alaska','Hawaii'})), 'Name'});
+faceColors = makesymbolspec('Polygon',...
+    {'INDEX', [1 numel(states)], 'FaceColor', polcmap(numel(states))});
+geoshow(ax, states, 'DisplayType', 'polygon', 'SymbolSpec', faceColors)
+framem off; gridm off; mlabel off; plabel off
+set(gcf,'Position', [   52   234   929   702]);
+plotm(Y,X,'k.-','LineWidth',1.5); % plot the lines on the map
+

One interesting aspect of this map is that major airline hubs (Chicago, New York, etc. are not well represented. One possible + explaination is that they have larger delays than other regional airports. +

+

Honolulu to St. Johns?

+

Before, we saw that the lengthiest route was between St. John's and Honolulu. But, does the network have a better path to + follow between these cities? Let's check using Dijkstra's shortest path algorithm! +

% Reload the network to restore it.
+load_gaimc_graph('airports')
+A = -A;
+
+% find the longest route again
+[si, ti, rt] = find(A);
+[val,ind] = max(rt); % we've already negated above, so no need to redo it
+start = si(ind);
+dest = ti(ind);
+[d pred] = dijkstra(A,start); % compute the distance to everywhere from St. Johns
+d(dest)
+
+ans =
+
+   735
+
+

That value is considerably shorter than the direct time. How do we find this awesome route?

path =[]; u = dest; while (u ~= start) path=[u path]; u=pred(u); end
+fprintf('%s',labels{start});
+for i=path; fprintf(' --> %s', labels{i}); end, fprintf('\n');
+
Honolulu, HI --> Chicago, IL --> Montreal, QC --> St. Johns, NL
+

All pairs shortest paths

+

At this point, the right thing to do would be to recompute each edge in the network using an all-pairs shortest path algorithm, + and then look at how distance correlates with time in that network. However, I haven't had time to implement that algorithm + yet. +

+

Conclusion

+

Hopefully, you will agree that network algorithms are a powerful way to look at the relationships between airports!

+ +
+ + + \ No newline at end of file diff --git a/demo/html/airports.png b/demo/html/airports.png new file mode 100644 index 0000000..19847a6 Binary files /dev/null and b/demo/html/airports.png differ diff --git a/demo/html/airports_01.png b/demo/html/airports_01.png new file mode 100644 index 0000000..b88c23c Binary files /dev/null and b/demo/html/airports_01.png differ diff --git a/demo/html/airports_02.png b/demo/html/airports_02.png new file mode 100644 index 0000000..6b650ab Binary files /dev/null and b/demo/html/airports_02.png differ diff --git a/demo/html/airports_03.png b/demo/html/airports_03.png new file mode 100644 index 0000000..6d38c92 Binary files /dev/null and b/demo/html/airports_03.png differ diff --git a/demo/html/airports_04.png b/demo/html/airports_04.png new file mode 100644 index 0000000..3828376 Binary files /dev/null and b/demo/html/airports_04.png differ diff --git a/demo/html/airports_05.png b/demo/html/airports_05.png new file mode 100644 index 0000000..6112d3f Binary files /dev/null and b/demo/html/airports_05.png differ diff --git a/demo/html/demo.html b/demo/html/demo.html new file mode 100644 index 0000000..05f6e22 --- /dev/null +++ b/demo/html/demo.html @@ -0,0 +1,744 @@ + + + + + + + + Demo of gaimc - 'Graph Algorithms In Matlab Code' + + + + +
+

Demo of gaimc - 'Graph Algorithms In Matlab Code'

+ +

Matlab includes great algorithms to work with sparse matrices but does provide a reasonable set of algorithms to work with + sparse matrices as graph data structures. My other project -- MatlabBGL -- provides a high-performance solution to this problem + by directly interfacing the Matlab sparse matrix data structure with the Boost Graph Library. That library, however, suffers + from enormous complication because it must be compiled for each platform. The Boost Graph Library heavily uses advanced C++ + features that impair easy portability between platforms. In contrast, the gaimc library is implemented in pure Matlab code, + making it completely portable. +

+

The cost of the portability for this library is a 2-4x slowdown in the runtime of the algorithms as well as significantly + fewer algorithms to choose from. +

+ +

Contents

+
+ +
+

Sparse matrices as graphs

+

To store the connectivity structure of the graph, gaimc uses the adjacency matrix of a graph.

+

A graph is represented by a set of vertices and a set of edges between the vertices. Often, we write $G = (V,E)$ to denote the graph, the set of vertices, and the set of edges, respectively. In gaimc, like in my other package MatlabBGL, + we represent graphs with their adjacency matrices. This representation is handy in Matlab because Matlab is rather efficient + at working with the large sparse matrices that typically arise as adjacency matrices. +

+

To convert from $G=(V,E)$ to an adjacency matrix, we identify each vertex with a row of the matrix via a bijective map. The adjacency matrix is then + a $|V| \times |V|$ matrix called A. The entry A(i,j) = 1 for any edge between in $E$ and 0 otherwise. Let's look at an example. +

load_gaimc_graph('bfs_example');
+graph_draw(A,xy,'labels',labels)
+full(A)
+labels'
+
+ans =
+
+     0     1     0     0     1     0     0     0
+     1     0     0     0     0     1     0     0
+     0     0     0     1     0     1     1     0
+     0     0     1     0     0     0     0     1
+     1     0     0     0     0     0     0     0
+     0     1     1     0     0     0     1     0
+     0     0     1     0     0     1     0     1
+     0     0     0     1     0     0     1     0
+
+
+ans = 
+
+    'r'    's'    't'    'u'    'v'    'w'    'x'    'y'
+
+

This output means that vertex 'r' is row 1, vertex 's' is row 2 and because A(1,2) = 1, then there is an edge between them, + just like in the picture. +

+

One funny property is that A(2,1) = 1 too! So we actually have to store each edge twice in the adjacency matrix. This might + seem wasteful, but its hard to avoid as I've learned while working on graph algorithms. So don't worry about it! It also + makes the generalization to directed graphs (below) easy. +

+

For more information about the adjacency matrix representation of a graph, see a standard book on graph algorithms.

+

Weighted and directed graphs

+

Our previous case handled the situation for undirected graphs only. To encode weighted and directed graphs, we use weighted + and non-symmetric adjacency matrices. +

+

For a weighted matrix, A(i,j) = distance between i and j for most of the algorithms in gaimc. But A(i,j) = 0 means there + is no edge, and so sometimes things can get a little tricky to get what you want. +

+

For a directed graph, just set A(i,j) ~= A(j,i). The adjacency matrix won't be symmetric, but that's what you want!

+

To understand more, explore the examples or read up on adjacency matrices in graph theory books.

+

Loading helper

+

To make loading our sample graphs easy, gaimc defines it's own function to load graphs.

load_gaimc_graph('dfs_example'); % loads one of our example graphs
+whos
+
  Name                    Size                  Bytes  Class      Attributes
+
+  A                       9x9                     416  double     sparse    
+  Acc                     5x5                     176  double     sparse    
+  As                      1x1                40792464  struct               
+  At                  50000x50000            20395704  double     sparse    
+  D                       5x5                     200  double               
+  D2                      5x5                     200  double               
+  T                     456x456                 18216  double     sparse    
+  X                    2730x1                   21840  double               
+  Y                    2730x1                   21840  double               
+  ai                1249731x1                 9997848  double               
+  aj                  71959x1                  575672  double               
+  ans                     1x8                     912  cell                 
+  ati               1249731x1                 9997848  double               
+  ax                      1x1                       8  double               
+  cc                      9x1                      72  double               
+  cc1                 50000x1                  400000  double               
+  cc2                 50000x1                  400000  double               
+  cc3                 50000x1                  400000  double               
+  cc4                 50000x1                  400000  double               
+  ccfs                 2642x1                   21136  double               
+  ci                1249731x1                 9997848  double               
+  cn                   2642x1                   21136  double               
+  comp_results            6x4                     192  double               
+  cp                  50001x1                  400008  double               
+  cs1                     1x1                       8  double               
+  cs2                     1x1                       8  double               
+  cs3                     1x1                       8  double               
+  cs4                     1x1                       8  double               
+  cwd                     1x28                     56  char                 
+  d                     456x1                    3648  double               
+  d1                   1024x1                    8192  double               
+  d2                   1024x1                    8192  double               
+  d3                   1024x1                    8192  double               
+  d4                   1024x1                    8192  double               
+  de                  71959x1                  575672  double               
+  dest                    1x1                       8  double               
+  dirtail                 1x10                     20  char                 
+  dt                      8x1                      64  double               
+  f                       9x1                       9  logical              
+  faceColors              1x1                    1904  struct               
+  gi                      1x1                       8  double               
+  graphdir                1x10                     20  char                 
+  graphs                  1x5                     642  cell                 
+  i                       1x1                       8  double               
+  ind                     1x1                       8  double               
+  labels                  9x1                    1026  cell                 
+  lat                  9865x1                   78920  double               
+  lax                     1x1                       8  double               
+  long                 9865x1                   78920  double               
+  mat_fast                1x1                       8  double               
+  mat_std                 1x1                       8  double               
+  matlabbgldir            1x20                     40  char                 
+  mex_fast                1x1                       8  double               
+  mex_std                 1x1                       8  double               
+  n                       1x1                       8  double               
+  nrep                    1x1                       8  double               
+  ntests                  1x1                       8  double               
+  path                    1x3                      24  double               
+  pred                    1x456                  3648  double               
+  rep                     1x1                       8  double               
+  results                 1x3                    2140  struct               
+  ri                1249731x1                 9997848  double               
+  rp                  50001x1                  400008  double               
+  rst                     1x1                       8  double               
+  rt                  71959x1                  575672  double               
+  si                  71959x1                  575672  double               
+  source                  1x83                    166  char                 
+  start                   1x1                       8  double               
+  states                 49x1                  190442  struct               
+  szi                     1x1                       8  double               
+  szs                     1x6                      48  double               
+  te                  71959x1                  575672  double               
+  ti                      1x1                       8  double               
+  u                       1x1                       8  double               
+  v                       1x1                       8  double               
+  val                     1x1                       8  double               
+  xy                      9x2                     144  double               
+
+

This helps make our examples work regardless of where the current directory lies.

+

Search algorithms

+

The two standard graph search algorithms are depth first search and breadth first search. This library implements both.

+

Load the example matrix from the Boost Graph Library

load_gaimc_graph('dfs_example');
+figure(1); graph_draw(A,xy,'labels',labels);
+

Run a depth first search. The output records the distance to the other vertices, except where the vertices are not reachable + starting from the first node A. +

d=dfs(A,1)
+
+d =
+
+     0
+     1
+     3
+     3
+     2
+     4
+    -1
+    -1
+    -1
+
+

From this example, we see that vertices a-f are reachable from vertex a, but that verice g-i are not reachable. Given the + of the edges, this makes sense. +

+

Let's look at breadth first search too, using a different example.

load_gaimc_graph('bfs_example');
+figure(1); clf; graph_draw(A,xy,'labels',labels);
+

The breadth first search algorithm records the distance from the starting vertex to each vertex it visits in breadth first + order. This means it visits all the vertices in order of their distance from the starting vertex. The d output records the + distance, and the dt output records the step of the algorithm when the breadth first search saw the node. +

[d dt] = bfs(A,2);
+% draw the graph where the label is the "discovery time" of the vertex.
+figure(1); clf; graph_draw(A,xy,'labels',num2str(dt));
+

Notice how the algorithm visits all vertices one edge away from the start vertex (0) before visiting those two edges away.

+

Shortest paths

+

In the previous two examples, the distance between vertices was equivalent to the number of edges. Some graphs, however, + have specific weights, such as the graph of flights between airports. We can use this information to build information about + the shortest path between two nodes in a network. +

% Find the minimum travel time between Los Angeles (LAX) and
+% Rochester Minnesota (RST).
+load_gaimc_graph('airports')
+A = -A; % fix funny encoding of airport data
+lax=247; rst=355;
+
+[d pred] = dijkstra(A,lax); % find all the shorest paths from Los Angeles.
+
+fprintf('Minimum time: %g\n',d(rst));
+% Print the path
+fprintf('Path:\n');
+path =[]; u = rst; while (u ~= lax) path=[u path]; u=pred(u); end
+fprintf('%s',labels{lax});
+for i=path; fprintf(' --> %s', labels{i}); end, fprintf('\n');
+
Minimum time: 244
+Path:
+Los Angeles, CA --> Minneapolis/St Paul, MN --> Rochester, MN
+

Minimum spanning trees

+

A minimum spanning tree is a set of edges from a graph that ...

+

This demo requires the mapping toolbox for maximum effect, but we'll do okay without it.

+

Our data comes from a graph Brendan Frey prepared for his affinity propagation clustering tool. For 456 cities in the US, + we have the mean travel time between airports in those cities, along with their latitude and longitude. +

load_gaimc_graph('airports')
+

For some reason, the data is stored with the negative travel time between cities. (I believe this is so that closer cities + have larger edges between them.) But for a minimum spanning tree, we want the actual travel time between cities. +

A = -A;
+

Now, we just call MST and look at the result. T = mst_prim(A); This command means we can't run the demo, so it's commented + out. +

+

Oops, travel time isn't symmetric! Let's just pick the longest possible time.

A = max(A,A');
+T = mst_prim(A);
+sum(sum(T))/2 % total travel time in tree
+
+ans =
+
+   (1,1)          24550
+
+

Well, the total weight isn't that helpful, let's look at the data instead. +

clf;
+gplot(T,xy);
+

Hey! That looks like the US! You can see regional airports and get some sense of the overall connectivity.

+

Connected components

+

The connected components of a network determine which parts of the network are reachable from other parts. One of your first + questions about any network should generally be: is it connected? +

+

There are two types of connected components: components and strongly connected components. gaimc only implements an algorithm + for the latter case, but that's okay! It turns out it computes exactly the right thing for connected components as well. + The difference only occurs when the graph is undirected vs. directed. +

load_gaimc_graph('dfs_example')
+graph_draw(A,xy)
+

This picture shows there are 3 strongly connected components and 2 connected components

% get the number of strongly connected components
+max(scomponents(A))
+
+ans =
+
+     3
+
+
% get the number of connected components
+max(scomponents(A|A'))  % we make the graph symmetric first by "or"ing each entry
+
+ans =
+
+     2
+
+

Let's look at the vertices in the strongly connected components

cc = scomponents(A)
+
+cc =
+
+     2
+     2
+     2
+     1
+     2
+     2
+     3
+     3
+     3
+
+

The output tells us that vertices 1,2,3,5,6 are in one strong component, vertex 4 is it's own strong component, and vertices + 7,8,9 are in another one. Remember that a strong component is all the vertices mutually reachable from a given vertex. If + you start at vertex 4, you can't get anywhere else! That's why it is in a different component than vertices 1,2,3,5,6. +

+

We also have a largest_component function that makes it easy to just get the largest connected component.

clf;
+[Acc,f] = largest_component(A);
+graph_draw(Acc,xy(f,:))
+

The filter variable f, tells us which vertices in the original graph made it into the largest strong component. We can just + apply that filter to the coordinates xy and reuse them for drawing the graph! +

+

Statistics

+

Graph statistics are just measures that indicate a property of the graph at every vertex, or at every edges. Arguably the + simplest graph statistic would be the average vertex degree. Because such statistics are easy to compute with the adjaceny + matrix in Matlab, they do not have special functions in gaimc. +

+

Load a road network to use for statistical computations

load_gaimc_graph('minnesota');
+gplot(A,xy);
+

Average vertex degree

d = sum(A,2);
+mean(d)
+
+ans =
+
+   (1,1)       2.5034
+
+

So the average number of roads at any intersection is 2.5. My guess is that many roads have artificial intersections in the + graph structure that do not correspond to real intersections. Try validating that hypothesis using the library! +

+

Average clustering coefficients

ccfs = clustercoeffs(A);
+mean(ccfs)
+% The average clustering coefficient is a measure of the edge density
+% throughout the graph.  A small value indicates that the network has few
+% edges and they are well distributed throughout the graph.
+
+ans =
+
+    0.0160
+
+

Average core numbers

cn = corenums(A);
+mean(cn)
+
+ans =
+
+    1.9463
+
+

Efficient repetition

+

Every time a gaimc function runs, it converts the adjacency matrix into a set of compressed sparse row arrays. These arrays + yield efficient access to the edges of the graph starting at a particular vertex. For many function calls on the same graph, + this conversion process slows the algorithms. Hence, gaimc also accepts pre-converted input, in which case it skips it's + conversion. +

+

Let's demonstrate how this works by calling Dijkstra's algorithm to compute the shortest paths between all vertices in the + graph. The Floyd Warshall algorithm computes these same quantities more efficiently, but that would just be one more algorithm + to implement and maintain. +

+

Load and convert the graph.

load_gaimc_graph('all_shortest_paths_example');
+A = spfun(@(x) x-min(min(A))+1,A); % remove the negative edges
+As = convert_sparse(A);
+

Now, we'll run Dijkstra's algorithm for every vertex and save the result On my 2GHz laptop, this takes 0.000485 seconds.

n = size(A,1);
+D = zeros(n,n);
+tic
+for i=1:n
+    D(i,:) = dijkstra(As,i);
+end
+toc
+
Elapsed time is 0.000304 seconds.
+

Let's try it without the conversion to see if we can notice the difference in speed. On my 2GHz laptop, this takes 0.001392 + seconds. +

D2 = zeros(n,n);
+tic
+for i=1:n
+    D2(i,:) = dijkstra(A,i);
+end
+toc
+
Elapsed time is 0.000541 seconds.
+

And just to check, let's make sure the output is the same.

isequal(D,D2)
+
+ans =
+
+     1
+
+
+
+ + + \ No newline at end of file diff --git a/demo/html/demo.png b/demo/html/demo.png new file mode 100644 index 0000000..6f0c93a Binary files /dev/null and b/demo/html/demo.png differ diff --git a/demo/html/demo_01.png b/demo/html/demo_01.png new file mode 100644 index 0000000..2dbcfe2 Binary files /dev/null and b/demo/html/demo_01.png differ diff --git a/demo/html/demo_02.png b/demo/html/demo_02.png new file mode 100644 index 0000000..4fd7aee Binary files /dev/null and b/demo/html/demo_02.png differ diff --git a/demo/html/demo_03.png b/demo/html/demo_03.png new file mode 100644 index 0000000..9cad9bf Binary files /dev/null and b/demo/html/demo_03.png differ diff --git a/demo/html/demo_04.png b/demo/html/demo_04.png new file mode 100644 index 0000000..5e602d1 Binary files /dev/null and b/demo/html/demo_04.png differ diff --git a/demo/html/demo_05.png b/demo/html/demo_05.png new file mode 100644 index 0000000..e0c2922 Binary files /dev/null and b/demo/html/demo_05.png differ diff --git a/demo/html/demo_06.png b/demo/html/demo_06.png new file mode 100644 index 0000000..e08d0ee Binary files /dev/null and b/demo/html/demo_06.png differ diff --git a/demo/html/demo_07.png b/demo/html/demo_07.png new file mode 100644 index 0000000..d364e9f Binary files /dev/null and b/demo/html/demo_07.png differ diff --git a/demo/html/demo_08.png b/demo/html/demo_08.png new file mode 100644 index 0000000..1612ed5 Binary files /dev/null and b/demo/html/demo_08.png differ diff --git a/demo/html/demo_eq02910.png b/demo/html/demo_eq02910.png new file mode 100644 index 0000000..d1fbfd3 Binary files /dev/null and b/demo/html/demo_eq02910.png differ diff --git a/demo/html/demo_eq74756.png b/demo/html/demo_eq74756.png new file mode 100644 index 0000000..473e14e Binary files /dev/null and b/demo/html/demo_eq74756.png differ diff --git a/demo/html/demo_eq85525.png b/demo/html/demo_eq85525.png new file mode 100644 index 0000000..896d5f6 Binary files /dev/null and b/demo/html/demo_eq85525.png differ diff --git a/demo/html/demo_eq91421.png b/demo/html/demo_eq91421.png new file mode 100644 index 0000000..d1fbfd3 Binary files /dev/null and b/demo/html/demo_eq91421.png differ diff --git a/demo/html/performance_comparison_simple.html b/demo/html/performance_comparison_simple.html new file mode 100644 index 0000000..fbc95aa --- /dev/null +++ b/demo/html/performance_comparison_simple.html @@ -0,0 +1,334 @@ + + + + + + + + Compare performance of gaimc to matlab_bgl + + + + +
+

Compare performance of gaimc to matlab_bgl

+ +

While the gaimc library implements its graph routines in Matlab "m"-code, the matlab_bgl library uses graph algorithms from the Boost graph library in C++ through a mex interface. Folklore has it that Matlab code + with for loops like those required in the gaimc library is considerably slower. This example examines this lore and shows that gaimc is typically within a factor of 2-4 of the mex code for one function. The full performance_comparison suite evaluates the + rest, but it takes a while to run, so I only run it when I'm interested in a complete picture and can afford to run it overnight. +

+
+

Contents

+
+ +
+

This demo is unlikely to work on your own computer. They depend on having the MatlabBGL routines in one spot.

+

Setup the environment

+

We need MatlabBGL on the path

graphdir = '../graphs/';
+matlabbgldir = '~/dev/matlab-bgl/4.0';
+try
+    addpath(matlabbgldir); % change this to your matlab_bgl path
+    ci=components(sparse(ones(5)));
+catch
+    error('gaimc:performance_comparison','Matlab BGL is not working, halting...');
+end
+
Warning: Duplicate directory name: /home/dgleich/dev/matlab-bgl/4.0.
+

Check to make sure we are in the correct directory

cwd = pwd; dirtail = ['gaimc' filesep 'demo'];
+if strcmp(cwd(end-length(dirtail)+1:end),dirtail) == 0
+    error('%s should be executed from %s\n',mfilename,dirtail);
+end
+

initalize the results structure

results=[];
+mex_fast=0; mat_fast=0; mex_std=0; mat_std=0;
+

Connected components

+

To evaluate the performance of the connected components algorithm, we use sets of random graphs.

nrep=30;
+szs=[1 10 100 5000 10000 50000];
+comp_results=[mex_fast mat_fast mex_std mat_std];
+for szi=1:length(szs)
+    fprintf('\n%20s size=%-5i     ', 'scomponents', szs(szi));
+    % Matlab needs 1 iteration to compile the function
+    if szi==2, mex_fast=0; mat_fast=0; mex_std=0; mat_std=0; end
+    for rep=1:nrep
+        fprintf('\b\b\b\b'); fprintf(' %3i', rep);
+        A=sprand(szs(szi),szs(szi),25/szs(szi));
+        At=A'; [rp ci ai]=sparse_to_csr(A); As.rp=rp; As.ci=ci; As.ai=ai;
+        tic; cc1=components(A); mex_std=mex_std+toc;
+        tic; cc2=components(At,struct('istrans',1,'nocheck',1));
+            mex_fast=mex_fast+toc;
+        tic; cc3=scomponents(A); mat_std=mat_std+toc;
+        tic; cc4=scomponents(As); mat_fast=mat_fast+toc;
+        cs1=accumarray(cc1,1,[max(cc1) 1]);
+        cs2=accumarray(cc2,1,[max(cc2) 1]);
+        cs3=accumarray(cc3,1,[max(cc3) 1]);
+        cs4=accumarray(cc4,1,[max(cc4) 1]);
+        if any(cs1 ~= cs2) || any(cs2 ~= cs3) || any(cs2 ~= cs4)
+            error('gaimc:scomponents','incorrect results from scomponents');
+        end
+    end
+    comp_results(end+1,:) = [mex_fast mat_fast mex_std mat_std];
+end
+comp_results=diff(comp_results);
+results(end+1).name='scomponents';
+results(end).mex_fast = mex_fast;
+results(end).mat_fast = mat_fast;
+results(end).mex_std = mex_std;
+results(end).mat_std = mat_std;
+
+         scomponents size=1       30
+         scomponents size=10      30
+         scomponents size=100     30
+         scomponents size=5000    30
+         scomponents size=10000   30
+         scomponents size=50000   30

Dijkstra's algorithm

+

To evaluate the performance of Dijkstra's algorithm, we pick

graphs = {'clr-25-2', 'clr-24-1', 'cs-stanford', ...
+    'minnesota', 'tapir'};
+nrep=30; ntests=100; mex_fast=0; mat_fast=0; mex_std=0; mat_std=0;
+for rep=1:nrep
+    for gi=1:length(graphs)
+        load([graphdir graphs{gi} '.mat']); n=size(A,1);
+        At=A'; [rp ci ai]=sparse_to_csr(A); As.rp=rp; As.ci=ci; As.ai=ai;
+        for ti=1:ntests
+            fprintf([repmat('\b',1,66) '%20s rep=%3i graph=%-20s trial=%4i'], ...
+                'dijkstra',rep,graphs{gi},ti);
+            v=ceil(n*rand(1));
+            tic; d1=dijkstra_sp(A,v); mex_std=mex_std+toc;
+            tic; d2=dijkstra_sp(At,v,struct('istrans',1,'nocheck',1));
+              mex_fast=mex_fast+toc;
+            tic; d3=dijkstra(A,v); mat_std=mat_std+toc;
+            tic; d4=dijkstra(As,v); mat_fast=mat_fast+toc;
+            if any(d1 ~= d2) || any(d2 ~= d3) || any(d3 ~= d4)
+                error('gaimc:dijkstra','incorrect results from dijkstra');
+            end
+        end
+    end
+end
+fprintf('\n');
+results(end+1).name='dijkstra';
+results(end).mex_fast = mex_fast;
+results(end).mat_fast = mat_fast;
+results(end).mex_std = mex_std;
+results(end).mat_std = mat_std;
+
            dijkstra rep= 30 graph=tapir                trial= 100
+

Summarize the results

+

We are going to summarize the results in a bar plot based on the algorithm. Each algorithm is a single bar where the performance + of the mex code is 1. +

nresults=length(results);
+Ystd = zeros(nresults,1);
+Yfast = zeros(nresults,1);
+for i=1:nresults
+    Ystd(i)=results(i).mat_std/results(i).mex_std;
+    Yfast(i)=results(i).mat_fast/results(i).mex_fast;
+end
+bar(1:nresults,[Ystd Yfast]); set(gca,'XTickLabel',{results.name});
+legend('Standard','Fast','Location','Northwest');
+

From this, we see that the connected component codes are about half the speed of the Matlab BGL functions and Dijkstra's is + about 1/4th the speed. This seems unideal, but the code is much more portable and flexible. +

+

The difference between the Standard and Fast results is that the fast results eliminate all data translation for both gaimc + and MatlabBGL and are evaluating the actual algorithm implementation and not any of the data translation components. +

+ +
+ + + \ No newline at end of file diff --git a/demo/html/performance_comparison_simple.png b/demo/html/performance_comparison_simple.png new file mode 100644 index 0000000..a2096d9 Binary files /dev/null and b/demo/html/performance_comparison_simple.png differ diff --git a/demo/html/performance_comparison_simple_01.png b/demo/html/performance_comparison_simple_01.png new file mode 100644 index 0000000..06386ec Binary files /dev/null and b/demo/html/performance_comparison_simple_01.png differ diff --git a/demo/performance_comparison.m b/demo/performance_comparison.m index 63a8f52..f823fd5 100644 --- a/demo/performance_comparison.m +++ b/demo/performance_comparison.m @@ -6,6 +6,10 @@ % considerably slower. This example examines this lore and shows that % |gaimc| is typically within a factor of 2-4 of the mex code. +%% +% *This demo is unlikely to work on your own computer.* +% *They depend on having the MatlabBGL routines in one spot.* + %% Setup the environment % We need MatlabBGL on the path graphdir = '../graphs/'; @@ -103,9 +107,11 @@ %% % Plot the data for connected components -plot(szs, comp_results,'.-'); -legend('mex fast','mat fast','mex std','mat std','Location','Northwest'); -ylabel('time'); xlabel('graph size'); +% This plot isn't too meaningful, so I've omitted it. + +% plot(szs, comp_results,'.-'); +% legend('mex fast','mat fast','mex std','mat std','Location','Northwest'); +% ylabel('time'); xlabel('graph size'); %% Dijkstra's algorithm % To evaluate the performance of Dijkstra's algorithm, we pick @@ -237,7 +243,9 @@ %% Summarize the results % We are going to summarize the results in a bar plot based on the -% algorithm. Each algorithm is a single bar, where the performance of the +% algorithm. Each algorithm is a single bar +graphs = {'all_shortest_paths_example', 'clr-24-1', 'cs-stanford', ... + 'minnesota','tapir'};, where the performance of the % mex code is 1. nresults=length(results); Ystd = zeros(nresults,1); diff --git a/demo/performance_comparison_simple.m b/demo/performance_comparison_simple.m new file mode 100644 index 0000000..54c8905 --- /dev/null +++ b/demo/performance_comparison_simple.m @@ -0,0 +1,131 @@ +%% Compare performance of gaimc to matlab_bgl +% While the |gaimc| library implements its graph routines in Matlab +% "m"-code, the |matlab_bgl| library uses graph algorithms from the Boost +% graph library in C++ through a mex interface. Folklore has it that +% Matlab code with for loops like those required in the |gaimc| library is +% considerably slower. This example examines this lore and shows that +% |gaimc| is typically within a factor of 2-4 of the mex code for one +% function. The full performance_comparison suite evaluates the rest, but +% it takes a while to run, so I only run it when I'm interested in a +% complete picture and can afford to run it overnight. + +%% +% *This demo is unlikely to work on your own computer.* +% *They depend on having the MatlabBGL routines in one spot.* + +%% Setup the environment +% We need MatlabBGL on the path +graphdir = '../graphs/'; +matlabbgldir = '~/dev/matlab-bgl/4.0'; +try + addpath(matlabbgldir); % change this to your matlab_bgl path + ci=components(sparse(ones(5))); +catch + error('gaimc:performance_comparison','Matlab BGL is not working, halting...'); +end + +%% +% Check to make sure we are in the correct directory +cwd = pwd; dirtail = ['gaimc' filesep 'demo']; +if strcmp(cwd(end-length(dirtail)+1:end),dirtail) == 0 + error('%s should be executed from %s\n',mfilename,dirtail); +end + +%% +% initalize the results structure +results=[]; +mex_fast=0; mat_fast=0; mex_std=0; mat_std=0; + +%% Connected components +% To evaluate the performance of the connected components algorithm, we use +% sets of random graphs. +nrep=30; +szs=[1 10 100 5000 10000 50000]; +comp_results=[mex_fast mat_fast mex_std mat_std]; +for szi=1:length(szs) + fprintf('\n%20s size=%-5i ', 'scomponents', szs(szi)); + % Matlab needs 1 iteration to compile the function + if szi==2, mex_fast=0; mat_fast=0; mex_std=0; mat_std=0; end + for rep=1:nrep + fprintf('\b\b\b\b'); fprintf(' %3i', rep); + A=sprand(szs(szi),szs(szi),25/szs(szi)); + At=A'; [rp ci ai]=sparse_to_csr(A); As.rp=rp; As.ci=ci; As.ai=ai; + tic; cc1=components(A); mex_std=mex_std+toc; + tic; cc2=components(At,struct('istrans',1,'nocheck',1)); + mex_fast=mex_fast+toc; + tic; cc3=scomponents(A); mat_std=mat_std+toc; + tic; cc4=scomponents(As); mat_fast=mat_fast+toc; + cs1=accumarray(cc1,1,[max(cc1) 1]); + cs2=accumarray(cc2,1,[max(cc2) 1]); + cs3=accumarray(cc3,1,[max(cc3) 1]); + cs4=accumarray(cc4,1,[max(cc4) 1]); + if any(cs1 ~= cs2) || any(cs2 ~= cs3) || any(cs2 ~= cs4) + error('gaimc:scomponents','incorrect results from scomponents'); + end + end + comp_results(end+1,:) = [mex_fast mat_fast mex_std mat_std]; +end +comp_results=diff(comp_results); +results(end+1).name='scomponents'; +results(end).mex_fast = mex_fast; +results(end).mat_fast = mat_fast; +results(end).mex_std = mex_std; +results(end).mat_std = mat_std; + +%% Dijkstra's algorithm +% To evaluate the performance of Dijkstra's algorithm, we pick +graphs = {'clr-25-2', 'clr-24-1', 'cs-stanford', ... + 'minnesota', 'tapir'}; +nrep=30; ntests=100; mex_fast=0; mat_fast=0; mex_std=0; mat_std=0; +for rep=1:nrep + for gi=1:length(graphs) + load([graphdir graphs{gi} '.mat']); n=size(A,1); + At=A'; [rp ci ai]=sparse_to_csr(A); As.rp=rp; As.ci=ci; As.ai=ai; + for ti=1:ntests + fprintf([repmat('\b',1,66) '%20s rep=%3i graph=%-20s trial=%4i'], ... + 'dijkstra',rep,graphs{gi},ti); + v=ceil(n*rand(1)); + tic; d1=dijkstra_sp(A,v); mex_std=mex_std+toc; + tic; d2=dijkstra_sp(At,v,struct('istrans',1,'nocheck',1)); + mex_fast=mex_fast+toc; + tic; d3=dijkstra(A,v); mat_std=mat_std+toc; + tic; d4=dijkstra(As,v); mat_fast=mat_fast+toc; + if any(d1 ~= d2) || any(d2 ~= d3) || any(d3 ~= d4) + error('gaimc:dijkstra','incorrect results from dijkstra'); + end + end + end +end +fprintf('\n'); +results(end+1).name='dijkstra'; +results(end).mex_fast = mex_fast; +results(end).mat_fast = mat_fast; +results(end).mex_std = mex_std; +results(end).mat_std = mat_std; + + +%% Summarize the results +% We are going to summarize the results in a bar plot based on the +% algorithm. Each algorithm is a single bar where the performance of the +% mex code is 1. +nresults=length(results); +Ystd = zeros(nresults,1); +Yfast = zeros(nresults,1); +for i=1:nresults + Ystd(i)=results(i).mat_std/results(i).mex_std; + Yfast(i)=results(i).mat_fast/results(i).mex_fast; +end +bar(1:nresults,[Ystd Yfast]); set(gca,'XTickLabel',{results.name}); +legend('Standard','Fast','Location','Northwest'); + +%% +% From this, we see that the connected component codes are about half the +% speed of the Matlab BGL functions and Dijkstra's is about 1/4th the +% speed. This seems unideal, but the code is much more portable and +% flexible. +% +% The difference between the Standard and Fast results is that the fast +% results eliminate all data translation for both gaimc and MatlabBGL and +% are evaluating the actual algorithm implementation and not any of the +% data translation components. +