From 7cfeeeb895839aabcb567c710041f55851180e05 Mon Sep 17 00:00:00 2001 From: cmdevries Date: Tue, 12 May 2015 08:22:00 +0200 Subject: [PATCH] Document and get code running. Instructions to download data from sourceforge and updated relevant code to work with new instructions. --- .gitignore | 1 + README.md | 18 ++++++++++++++---- src/CreateSignatures.h | 4 ++-- src/StreamingEMTreeExperiments.h | 30 +++++++++++++++--------------- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index e06395c..46675e3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ EMTree.o external/install external/build build/ +data/ diff --git a/README.md b/README.md index e9802e6..6c6f5c4 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ Currently we use: - [strtk](https://code.google.com/p/strtk/) Directory structure: + / /src - all source contributed by this project where each subdirectory @@ -64,8 +65,8 @@ Directory structure: /external/build - build directory for external libraries /external/install - installation directory for external libraries -Building -======== +Building and Running +==================== Make dependencies using a GNU Makefile (only tested on Linux) @@ -79,8 +80,17 @@ We use CMake for making the main project $ cd build $ cmake .. $ make + $ cd .. -Run the program +Fetch some data to cluster - $ LD_LIBRARY_PATH=../external/install/lib ./emtree + $ mkdir data + $ cd data + $ wget http://downloads.sourceforge.net/project/ktree/docclust_ir/inex_xml_mining_subset_2010.txt + $ wget http://downloads.sourceforge.net/project/ktree/docclust_ir/wikisignatures.tar.gz + $ tar xzf wikisignatures.tar.gz + $ cd .. + +Run the program + $ LD_LIBRARY_PATH=./external/install/lib ./build/emtree diff --git a/src/CreateSignatures.h b/src/CreateSignatures.h index 270dd88..5b562fe 100644 --- a/src/CreateSignatures.h +++ b/src/CreateSignatures.h @@ -60,8 +60,8 @@ void readSignatures(vector*> &vectors, string docidFile, string si } void loadWikiSignatures(vector*>& vectors, int veccount) { - const char docidFile[] = "data/wiki.4096.docids"; - const char signatureFile[] = "data/wiki.4096.sig"; + const char docidFile[] = "data/wikisignatures/wiki.4096.docids"; + const char signatureFile[] = "data/wikisignatures/wiki.4096.sig"; const size_t signatureLength = 4096; readSignatures(vectors, docidFile, signatureFile, signatureLength, veccount); } diff --git a/src/StreamingEMTreeExperiments.h b/src/StreamingEMTreeExperiments.h index 66d6569..34ccbbd 100644 --- a/src/StreamingEMTreeExperiments.h +++ b/src/StreamingEMTreeExperiments.h @@ -47,10 +47,10 @@ StreamingEMTree_t* streamingEMTreeInit() { return tree; } -const char wikiDocidFile[] = "data/wiki.4096.docids"; -const char wikiSignatureFile[] = "data/wiki.4096.sig"; +const char wikiDocidFile[] = "data/wikisignatures/wiki.4096.docids"; +const char wikiSignatureFile[] = "data/wikisignatures/wiki.4096.sig"; const size_t wikiSignatureLength = 4096; - + void report(StreamingEMTree_t* emtree) { int maxDepth = emtree->getMaxLevelCount(); @@ -60,7 +60,7 @@ void report(StreamingEMTree_t* emtree) { << emtree->getClusterCount(i + 1) << endl; } cout << "streaming EM-tree had " << emtree->getObjCount() << " vectors inserted" << endl; - cout << "RMSE = " << emtree->getRMSE() << endl; + cout << "RMSE = " << emtree->getRMSE() << endl; } void insertWriteClusters(StreamingEMTree_t* emtree) { @@ -69,32 +69,32 @@ void insertWriteClusters(StreamingEMTree_t* emtree) { // setup output streams for all levels in the tree const string prefix = "wikipedia_clusters"; - + // insert and write cluster assignments { - boost::timer::auto_cpu_timer insert("inserting and writing clusters: %w seconds\n"); + boost::timer::auto_cpu_timer insert("inserting and writing clusters: %w seconds\n"); ClusterWriter cw(emtree->getMaxLevelCount(), prefix); emtree->visit(vs, cw); } - + // prune cout << emtree->prune() << " nodes pruned" << endl; - + // report tree stats report(emtree); - + // write out cluster statistics { - boost::timer::auto_cpu_timer update("writing cluster stats: %w seconds\n"); + boost::timer::auto_cpu_timer update("writing cluster stats: %w seconds\n"); ClusterStats cs(emtree->getMaxLevelCount(), prefix); emtree->visit(cs); - } + } } void streamingEMTreeInsertPruneReport(StreamingEMTree_t* emtree) { // open files SVectorStream> vs(wikiDocidFile, wikiSignatureFile, wikiSignatureLength); - + // insert from stream boost::timer::auto_cpu_timer insert("inserting into streaming EM-tree: %w seconds\n"); insert.start(); @@ -122,7 +122,7 @@ void streamingEMTree() { } // streaming EMTree - const int maxIters = 2; + const int maxIters = 10; StreamingEMTree_t* emtree = streamingEMTreeInit(); cout << endl << "Streaming EM-tree:" << endl; for (int i = 0; i < maxIters - 1; i++) { @@ -131,10 +131,10 @@ void streamingEMTree() { { boost::timer::auto_cpu_timer update("update streaming EM-tree: %w seconds\n"); emtree->update(); - } + } cout << "-----" << endl << endl; } - + // last iteration writes cluster assignments and does not update accumulators insertWriteClusters(emtree); }