Skip to content

Commit

Permalink
Document and get code running.
Browse files Browse the repository at this point in the history
Add instructions to download data from SourceForge and update the relevant code
to work with the new instructions.
  • Loading branch information
cmdevries committed May 12, 2015
1 parent 4ed6c10 commit 7cfeeeb
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 21 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -6,3 +6,4 @@ EMTree.o
external/install
external/build
build/
data/
18 changes: 14 additions & 4 deletions README.md
Expand Up @@ -51,6 +51,7 @@ Currently we use:
- [strtk](https://code.google.com/p/strtk/)

Directory structure:

/

/src - all source contributed by this project where each subdirectory
Expand All @@ -64,8 +65,8 @@ Directory structure:
/external/build - build directory for external libraries
/external/install - installation directory for external libraries

Building
========
Building and Running
====================

Make dependencies using a GNU Makefile (only tested on Linux)

Expand All @@ -79,8 +80,17 @@ We use CMake for making the main project
$ cd build
$ cmake ..
$ make
$ cd ..

Run the program
Fetch some data to cluster

$ LD_LIBRARY_PATH=../external/install/lib ./emtree
$ mkdir data
$ cd data
$ wget http://downloads.sourceforge.net/project/ktree/docclust_ir/inex_xml_mining_subset_2010.txt
$ wget http://downloads.sourceforge.net/project/ktree/docclust_ir/wikisignatures.tar.gz
$ tar xzf wikisignatures.tar.gz
$ cd ..

Run the program

$ LD_LIBRARY_PATH=./external/install/lib ./build/emtree
4 changes: 2 additions & 2 deletions src/CreateSignatures.h
Expand Up @@ -60,8 +60,8 @@ void readSignatures(vector<SVector<bool>*> &vectors, string docidFile, string si
}

// Loads the wiki signature set into `vectors` by forwarding hard-coded docid
// and signature file paths, plus a signature length of 4096, to
// readSignatures(). `veccount` is passed through unchanged.
// NOTE(review): this text is a rendered diff hunk, so each path constant is
// listed twice. The "data/..." pair is presumably the removed version and the
// "data/wikisignatures/..." pair the added one; confirm against the real file.
void loadWikiSignatures(vector<SVector<bool>*>& vectors, int veccount) {
const char docidFile[] = "data/wiki.4096.docids";
const char signatureFile[] = "data/wiki.4096.sig";
const char docidFile[] = "data/wikisignatures/wiki.4096.docids";
const char signatureFile[] = "data/wikisignatures/wiki.4096.sig";
const size_t signatureLength = 4096;
// Paths are relative, so they resolve against the process's working directory.
readSignatures(vectors, docidFile, signatureFile, signatureLength, veccount);
}
Expand Down
30 changes: 15 additions & 15 deletions src/StreamingEMTreeExperiments.h
Expand Up @@ -47,10 +47,10 @@ StreamingEMTree_t* streamingEMTreeInit() {
return tree;
}

// Input locations and signature length passed to the SVectorStream constructor
// in the streaming EM-tree experiments below. Paths are relative.
// NOTE(review): rendered diff hunk; the "data/..." pair is presumably the
// removed version and the "data/wikisignatures/..." pair the added one.
const char wikiDocidFile[] = "data/wiki.4096.docids";
const char wikiSignatureFile[] = "data/wiki.4096.sig";
const char wikiDocidFile[] = "data/wikisignatures/wiki.4096.docids";
const char wikiSignatureFile[] = "data/wikisignatures/wiki.4096.sig";
const size_t wikiSignatureLength = 4096;


void report(StreamingEMTree_t* emtree) {
int maxDepth = emtree->getMaxLevelCount();
Expand All @@ -60,7 +60,7 @@ void report(StreamingEMTree_t* emtree) {
<< emtree->getClusterCount(i + 1) << endl;
}
cout << "streaming EM-tree had " << emtree->getObjCount() << " vectors inserted" << endl;
cout << "RMSE = " << emtree->getRMSE() << endl;
cout << "RMSE = " << emtree->getRMSE() << endl;
}

void insertWriteClusters(StreamingEMTree_t* emtree) {
Expand All @@ -69,32 +69,32 @@ void insertWriteClusters(StreamingEMTree_t* emtree) {

// setup output streams for all levels in the tree
const string prefix = "wikipedia_clusters";

// insert and write cluster assignments
{
boost::timer::auto_cpu_timer insert("inserting and writing clusters: %w seconds\n");
boost::timer::auto_cpu_timer insert("inserting and writing clusters: %w seconds\n");
ClusterWriter cw(emtree->getMaxLevelCount(), prefix);
emtree->visit(vs, cw);
}

// prune
cout << emtree->prune() << " nodes pruned" << endl;

// report tree stats
report(emtree);

// write out cluster statistics
{
boost::timer::auto_cpu_timer update("writing cluster stats: %w seconds\n");
boost::timer::auto_cpu_timer update("writing cluster stats: %w seconds\n");
ClusterStats cs(emtree->getMaxLevelCount(), prefix);
emtree->visit(cs);
}
}
}

void streamingEMTreeInsertPruneReport(StreamingEMTree_t* emtree) {
// open files
SVectorStream<SVector<bool>> vs(wikiDocidFile, wikiSignatureFile, wikiSignatureLength);

// insert from stream
boost::timer::auto_cpu_timer insert("inserting into streaming EM-tree: %w seconds\n");
insert.start();
Expand Down Expand Up @@ -122,7 +122,7 @@ void streamingEMTree() {
}

// streaming EMTree
const int maxIters = 2;
const int maxIters = 10;
StreamingEMTree_t* emtree = streamingEMTreeInit();
cout << endl << "Streaming EM-tree:" << endl;
for (int i = 0; i < maxIters - 1; i++) {
Expand All @@ -131,10 +131,10 @@ void streamingEMTree() {
{
boost::timer::auto_cpu_timer update("update streaming EM-tree: %w seconds\n");
emtree->update();
}
}
cout << "-----" << endl << endl;
}

// last iteration writes cluster assignments and does not update accumulators
insertWriteClusters(emtree);
}
Expand Down

0 comments on commit 7cfeeeb

Please sign in to comment.