Skip to content

Commit

Permalink
Merge pull request #21865 from fwyzard/FastTimerService_updates_100x
Browse files Browse the repository at this point in the history
Further updates to the FastTimerService for 10.0.x
  • Loading branch information
cmsbuild committed Jan 17, 2018
2 parents 225ce8a + 2f27805 commit 85ab2e1
Show file tree
Hide file tree
Showing 6 changed files with 281 additions and 176 deletions.
115 changes: 51 additions & 64 deletions HLTrigger/Timer/interface/FastTimerService.h
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#ifndef FastTimerService_h
#define FastTimerService_h

// system headers
#include <unistd.h>

// C++ headers
#include <chrono>
#include <cmath>
#include <string>
#include <map>
#include <unordered_map>
#include <chrono>
#include <mutex>
#include <unistd.h>
#include <string>
#include <unordered_map>

// boost headers
#include <boost/chrono.hpp>
Expand Down Expand Up @@ -46,40 +48,6 @@ procesing time is divided into
- event processing, sum of the time spent in all the modules
*/

/*
Assuming an HLT process with ~2500 modules and ~500 paths, tracking each step (with two calls per step, to start and stop the timer)
with std::chrono::high_resolution_clock gives a per-event overhead of 1 ms
Detailed informations on different timers can be extracted running $CMSSW_RELEASE_BASE/test/$SCRAM_ARCH/testChrono .
Timer per-call overhead on SLC5:
Linux 2.6.18-371.1.2.el5 x86_64
glibc version: 2.5
clock source: unknown
For each timer the resolution reported is the MINIMUM (MEDIAN) (MEAN +/- its STDDEV) of the increments measured during the test.
Performance of std::chrono::high_resolution_clock
Average time per call: 317.0 ns
Clock tick period: 1.0 ns
Measured resolution: 1000.0 ns (median: 1000.0 ns) (sigma: 199.4 ns) (average: 1007.6 +/- 0.4 ns)
Timer per-call overhead on SLC6 (virtualized):
Linux 2.6.32-358.23.2.el6.x86_64 x86_64
glibc version: 2.12
clock source: kvm-clock
For each timer the resolution reported is the MINIMUM (MEDIAN) (MEAN +/- its STDDEV) of the increments measured during the test.
Performance of std::chrono::high_resolution_clock
Average time per call: 351.2 ns
Clock tick period: 1.0 ns
Measured resolution: 1.0 ns (median: 358.0 ns) (sigma: 30360.8 ns) (average: 685.7 +/- 42.4 ns)
*/


class FastTimerService : public tbb::task_scheduler_observer
{
public:
Expand Down Expand Up @@ -224,6 +192,28 @@ class FastTimerService : public tbb::task_scheduler_observer
static void fillDescriptions(edm::ConfigurationDescriptions & descriptions);

private:
// forward declarations
struct Resources;
struct AtomicResources;

// per-thread measurements
struct Measurement {
public:
Measurement();
void measure();
void measure_and_store(Resources & store);
void measure_and_accumulate(AtomicResources & store);

public:
#ifdef DEBUG_THREAD_CONCURRENCY
std::thread::id id;
#endif // DEBUG_THREAD_CONCURRENCY
boost::chrono::thread_clock::time_point time_thread;
boost::chrono::high_resolution_clock::time_point time_real;
uint64_t allocated;
uint64_t deallocated;
};

// highlight a group of modules
struct GroupOfModules {
public:
Expand Down Expand Up @@ -266,7 +256,6 @@ class FastTimerService : public tbb::task_scheduler_observer

struct ResourcesPerModule {
public:
ResourcesPerModule();
void reset();
ResourcesPerModule & operator+=(ResourcesPerModule const& other);
ResourcesPerModule operator+(ResourcesPerModule const& other) const;
Expand All @@ -276,27 +265,8 @@ class FastTimerService : public tbb::task_scheduler_observer
unsigned events;
};

// per-thread measurements
struct Measurement {
public:
Measurement();
void measure();
void measure_and_store(Resources & store);
void measure_and_accumulate(AtomicResources & store);

public:
#ifdef DEBUG_THREAD_CONCURRENCY
std::thread::id id;
#endif // DEBUG_THREAD_CONCURRENCY
boost::chrono::thread_clock::time_point time_thread;
boost::chrono::high_resolution_clock::time_point time_real;
uint64_t allocated;
uint64_t deallocated;
};

struct ResourcesPerPath {
public:
ResourcesPerPath();
void reset();
ResourcesPerPath & operator+=(ResourcesPerPath const& other);
ResourcesPerPath operator+(ResourcesPerPath const& other) const;
Expand All @@ -323,7 +293,7 @@ class FastTimerService : public tbb::task_scheduler_observer

struct ResourcesPerJob {
public:
ResourcesPerJob();
ResourcesPerJob() = default;
ResourcesPerJob(ProcessCallGraph const& job, std::vector<GroupOfModules> const& groups);
void reset();
ResourcesPerJob & operator+=(ResourcesPerJob const& other);
Expand Down Expand Up @@ -416,14 +386,19 @@ class FastTimerService : public tbb::task_scheduler_observer
void book(DQMStore::ConcurrentBooker &, ProcessCallGraph const&, std::vector<GroupOfModules> const&,
PlotRanges const& event_ranges, PlotRanges const& path_ranges,
PlotRanges const& module_ranges, unsigned int lumisections,
bool bymodule, bool bypath, bool byls);
bool bymodule, bool bypath, bool byls, bool transitions);
void fill(ProcessCallGraph const&, ResourcesPerJob const&, unsigned int ls);
void fill_run(AtomicResources const&);
void fill_lumi(AtomicResources const&, unsigned int lumisection);

private:
// resources spent in all the modules of the job
PlotsPerElement event_;
PlotsPerElement event_ex_;
PlotsPerElement overhead_;
// resources spent in the modules' lumi and run transitions
PlotsPerElement lumi_;
PlotsPerElement run_;
// resources spent in the highlighted modules
std::vector<PlotsPerElement> highlight_;
// resources spent in each module
Expand All @@ -440,7 +415,12 @@ class FastTimerService : public tbb::task_scheduler_observer
std::vector<ResourcesPerJob> streams_;

// concurrent histograms and profiles
std::unique_ptr<PlotsPerJob> plots_;
std::unique_ptr<PlotsPerJob> plots_;

// per-lumi and per-run information
std::vector<AtomicResources> lumi_transition_; // resources spent in the modules' global and stream lumi transitions
std::vector<AtomicResources> run_transition_; // resources spent in the modules' global and stream run transitions
AtomicResources overhead_; // resources spent outside of the modules' transitions

// summary data
ResourcesPerJob job_summary_; // whole event time accounting per-job
Expand All @@ -453,12 +433,14 @@ class FastTimerService : public tbb::task_scheduler_observer

// atomic variables to keep track of the completion of each step, process by process
std::unique_ptr<std::atomic<unsigned int>[]> subprocess_event_check_;
std::unique_ptr<std::atomic<unsigned int>[]> subprocess_global_lumi_check_;
std::unique_ptr<std::atomic<unsigned int>[]> subprocess_global_run_check_;

// retrieve the current thread's per-thread quantities
Measurement & thread();

// job configuration
unsigned int concurrent_lumis_;
unsigned int concurrent_runs_;
unsigned int concurrent_streams_;
unsigned int concurrent_threads_;
Expand All @@ -469,13 +451,12 @@ class FastTimerService : public tbb::task_scheduler_observer
const bool print_job_summary_; // print the time spent in each process, path and module for the whole job

// dqm configuration
unsigned int module_id_; // pseudo module id for the FastTimerService, needed by the thread-safe DQMStore

bool enable_dqm_; // non const, depends on the availability of the DQMStore
const bool enable_dqm_bymodule_;
const bool enable_dqm_bypath_;
const bool enable_dqm_byls_;
const bool enable_dqm_bynproc_;
const bool enable_dqm_transitions_;

const PlotRanges dqm_event_ranges_;
const PlotRanges dqm_path_ranges_;
Expand All @@ -500,6 +481,9 @@ class FastTimerService : public tbb::task_scheduler_observer
template <typename T>
void printEventLine(T& out, Resources const& data, std::string const & label) const;

template <typename T>
void printEventLine(T& out, AtomicResources const& data, std::string const & label) const;

template <typename T>
void printEvent(T& out, ResourcesPerJob const&) const;

Expand All @@ -519,7 +503,10 @@ class FastTimerService : public tbb::task_scheduler_observer
void printPathSummaryLine(T& out, Resources const& data, Resources const& total, uint64_t events, std::string const& label) const;

template <typename T>
void printSummary(T& out, ResourcesPerJob const&, std::string const& label) const;
void printSummary(T& out, ResourcesPerJob const& data, std::string const& label) const;

template <typename T>
void printTransition(T& out, AtomicResources const& data, std::string const& label) const;

// check if this is the first process being signalled
bool isFirstSubprocess(edm::StreamContext const&);
Expand Down
2 changes: 1 addition & 1 deletion HLTrigger/Timer/plugins/FastTimerServiceClient.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ FastTimerServiceClient::fillSummaryPlots(DQMStore::IBooker & booker, DQMStore::I
// the plots are directly in the configured folder
fillProcessSummaryPlots(booker, getter, m_dqm_path);
} else {
static const boost::regex running_n_processes(".*/Running [0-9]+ processes");
static const boost::regex running_n_processes(".*/Running .*");

booker.setCurrentFolder(m_dqm_path);
std::vector<std::string> subdirs = getter.getSubdirs();
Expand Down
2 changes: 1 addition & 1 deletion HLTrigger/Timer/plugins/ThroughputServiceClient.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ ThroughputServiceClient::fillSummaryPlots(DQMStore::IBooker & booker, DQMStore::
// the plots are in the main folder
folders.push_back(m_dqm_path);
} else {
static const boost::regex running_n_processes(".*/Running [0-9]+ processes");
static const boost::regex running_n_processes(".*/Running .*");
booker.setCurrentFolder(m_dqm_path);
std::vector<std::string> subdirs = getter.getSubdirs();
for (auto const & subdir: subdirs) {
Expand Down

0 comments on commit 85ab2e1

Please sign in to comment.