bibliography.bib

################################################################
# Introduction
################################################################

This bibliography is a resource for people writing papers that refer
to the Google cluster traces.  It covers papers that analyze the
traces, as well as ones that use them as inputs to other studies.

  * I recommend using \usepackage{url}.
  * Entries are in publication-date order, with the most recent at the top.
  * Bibtex ignores stuff that is outside the entries, so text like this is safe.

The following are the RECOMMENDED CITATIONS if you just need the basics:

  * Borg:
     * \cite{clusterdata:Verma2015, clusterdata:Tirmazi2020} for Borg itself
  * 2019 traces:
     * \cite{clusterdata:Wilkes2020, clusterdata:Wilkes2020a, clusterdata:Tirmazi2020} for
       the complete set of info about the trace itself.
     * \cite{clusterdata:Wilkes2020} for the 2019 trace announcement
     * \cite{clusterdata:Wilkes2020a} for the details about the 2019 trace contents
     * \cite{clusterdata:Tirmazi2020} for the EuroSys paper about the 2019 and 2011 traces
  * 2011 trace:
     * \cite{clusterdata:Wilkes2011, clusterdata:Reiss2011} for the trace itself
     * \cite{clusterdata:Reiss2012b} for the first thorough analysis of it.

If you use the traces, please send a bibtex entry that looks *exactly* like one
of these to johnwilkes (at) google.com, so your paper can be added - and cited!  A
Github pull request is the best format.

################################################################
# Trace-announcements
################################################################

These entries can be used to cite the traces themselves.

# The May 2019 traces.
# Use clusterdata:Tirmazi2020 for the first paper to analyze them.

# This is the formal announcement of the trace:
@misc{clusterdata:Wilkes2020,
  author       = {John Wilkes},
  title        = {Yet more {Google} compute cluster trace data},
  howpublished = {Google research blog},
  address      = {Mountain View, CA, USA},
  year         = 2020,
  month        = Apr,
  note         = {Posted at \url{https://ai.googleblog.com/2020/04/yet-more-google-compute-cluster-trace.html}.},
}

# If you want to cite details about the trace itself:
@techreport{clusterdata:Wilkes2020a,
  author      = {John Wilkes},
  title       = {{Google} cluster-usage traces v3},
  type        = {Technical Report},
  institution = {Google Inc.},
  address     = {Mountain View, CA, USA},
  month       = Apr,
  year        = 2020,
  note        = {Posted at \url{https://github.com/google/cluster-data/blob/master/ClusterData2019.md}},
  abstract    = {
	This document describes the semantics, data format, and
	schema of usage traces of a few Google compute cells.
	This document describes version 3 of the trace format.},
}

#----------------
The next couple are for the May 2011 "full" trace.
@misc{clusterdata:Wilkes2011,
  author       = {John Wilkes},
  title        = {More {Google} cluster data},
  howpublished = {Google research blog},
  address      = {Mountain View, CA, USA},
  year         = 2011,
  month        = Nov,
  note         = {Posted at \url{http://googleresearch.blogspot.com/2011/11/more-google-cluster-data.html}.},
}

@techreport{clusterdata:Reiss2011,
  author      = {Charles Reiss and John Wilkes and Joseph L. Hellerstein},
  title       = {{Google} cluster-usage traces: format + schema},
  type        = {Technical Report},
  institution = {Google Inc.},
  address     = {Mountain View, CA, USA},
  month       = Nov,
  year        = 2011,
  note        = {Revised 2014-11-17 for version 2.1.  Posted at
	\url{https://github.com/google/cluster-data}},
}


#----------------
# The next one is for the earlier "small" 7-hour trace.
# (Most people should not be using this.)

@misc{clusterdata:Hellersetein2010,
  author       = {Joseph L. Hellerstein},
  title        = {{Google} cluster data},
  howpublished = {Google research blog},
  year         = 2010,
  month        = Jan,
  note         = {Posted at \url{http://googleresearch.blogspot.com/2010/01/google-cluster-data.html}.},
}

#----------------
The canonical Borg paper.
@InProceedings{clusterdata:Verma2015,
  author    = {Abhishek Verma and Luis Pedrosa and Madhukar R. Korupolu and David Oppenheimer and Eric Tune and John Wilkes},
  title     = {Large-scale cluster management at {Google} with {Borg}},
  booktitle = {Proceedings of the European Conference on Computer Systems (EuroSys'15)},
  address   = {Bordeaux, France},
  year      = {2015},
  articleno = {18},
  numpages  = {17},
  doi       = {10.1145/2741948.2741964},
  url       = {https://dl.acm.org/doi/10.1145/2741948.2741964},
  abstract  = {
	Google's Borg system is a cluster manager that runs hundreds of thousands of jobs,
	from many thousands of different applications, across a number of clusters each with
	up to tens of thousands of machines.

	It achieves high utilization by combining admission control, efficient task-packing,
	over-commitment, and machine sharing with process-level performance isolation.
	It supports high-availability applications with runtime features that minimize
	fault-recovery time, and scheduling policies that reduce the probability of correlated
	failures. Borg simplifies life for its users by offering a declarative job specification
	language, name service integration, real-time job monitoring, and tools to analyze and
	simulate system behavior.

	We present a summary of the Borg system architecture and features, important design
	decisions, a quantitative analysis of some of its policy decisions, and a qualitative
	examination of lessons learned from a decade of operational experience with it.},
}


#----------------
The next paper describes the policy choices and technologies used to
make the traces safe to release.

@inproceedings{clusterdata:Reiss2012,
  author    = {Charles Reiss and John Wilkes and Joseph L. Hellerstein},
  title     = {Obfuscatory obscanturism: making workload traces of
	commercially-sensitive systems safe to release},
  booktitle = {3rd International Workshop on Cloud Management (CLOUDMAN)},
  publisher = {IEEE},
  address   = {Maui, HI, USA},
  month     = Apr,
  year      = 2012,
  pages     = {1279--1286},
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6212064},
  abstract  = {Cloud providers such as Google are interested in fostering
	research on the daunting technical challenges they face in
	supporting planetary-scale distributed systems, but no
	academic organizations have similar scale systems on which to
	experiment. Fortunately, good research can still be done using
	traces of real-life production workloads, but there are risks
	in releasing such data, including inadvertently disclosing
	confidential or proprietary information, as happened with the
	Netflix Prize data. This paper discusses these risks, and our
	approach to them, which we call systematic obfuscation. It
	protects proprietary and personal data while leaving it
	possible to answer interesting research questions. We explain
	and motivate some of the risks and concerns and propose how
	they can best be mitigated, using as an example our recent
	publication of a month-long trace of a production system
	workload on a 11k-machine cluster.},
}

################################################################
# Trace-analysis papers
################################################################

These papers are primarily about analyzing the traces.
Order: most recent first.

If you just want one citation about the Cluster2011 trace, then
use \cite{clusterdata:Reiss2012b}.


################ 2023
@INPROCEEDINGS{clusterdata:Tuns2023,
  author = {Tuns, Adrian-Ioan and Sp{\u{a}}taru, Adrian},
  booktitle = {2023 25th International Symposium on Symbolic and Numeric Algorithms for Scientific Computing (SYNASC)},
  title = {Cloud Service Failure Prediction on {Google's Borg} Cluster Traces Using Traditional Machine Learning},
  year = 2023,
  month = Sep,
  ISSN = {2470-881X},
  doi = {10.1109/SYNASC61333.2023.00029},
  url = {https://doi.org/10.1109/SYNASC61333.2023.00029},
  pages = {162--169},
  keywords = {Cloud computing;Machine learning algorithms;Scientific computing;Clustering algorithms;
   Prediction algorithms;Boosting;Classification algorithms;failure prediction;big data;machine learning;
   classification algorithms;Google Borg},
  abstract = {The ability to predict failures in complex systems is crucial for maintaining their
   optimal performance, opening the possibility of reducing downtime and minimizing costs.
   In the context of cloud computing, cloud failure represents one of the most relevant problems,
   which not only leads to substantial financial losses but also negatively impacts the productivity
   of both industrial and end users. This paper presents a comprehensive study on the application of
   failure prediction techniques, by exploring four machine learning algorithms, namely Decision Tree,
   Random Forest, Gradient Boosting, and Logistic Regression. The research focuses on analyzing the
   workload of an industrial set of clusters, provided as traces in Google's Borg cluster workload traces.
   The aim was to develop highly accurate predictive models for both job and task failures, a goal which
   was achieved. A job classifier having a performance of 83.97\% accuracy (Gradient Boosting) and a task
   classifier of 98.79\% accuracy performance (Decision Tree) were obtained.},
}


################ 2022
@inproceedings{clusterdata:jajooSLearn2022,
  author    = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
  title     = {A Case for Task Sampling based Learning for Cluster Job Scheduling},
  booktitle = {19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)},
  publisher = {USENIX Association},
  address   = {Renton, WA, USA},
  year      = {2022},
  url       = {https://www.usenix.org/conference/nsdi22/presentation/jajoo},
  keywords  = {data centers, big data, job scheduling, learning, online learning},
  abstract  = {The ability to accurately estimate job runtime properties allows a
  scheduler to effectively schedule jobs. State-of-the-art online cluster job 
  schedulers use history-based learning, which uses past job execution information 
  to estimate the runtime properties of newly arrived jobs. However, with fast-paced
  development in cluster technology (in both hardware and software) and changing user
  inputs, job runtime properties can change over time, which lead to inaccurate predictions.
  In this paper, we explore the potential and limitation of real-time learning of job 
  runtime properties, by proactively sampling and scheduling a small fraction of the
  tasks of each job. Such a task-sampling-based approach exploits the similarity among
  runtime properties of the tasks of the same job and is inherently immune to changing
  job behavior. Our study focuses on two key questions in comparing task-sampling-based
  learning (learning in space) and history-based learning (learning in time): (1) Can
  learning in space be more accurate than learning in time? (2) If so, can delaying
  scheduling the remaining tasks of a job till the completion of sampled tasks be more
  than compensated by the improved accuracy and result in improved job performance? Our
  analytical and experimental analysis of 3 production traces with different skew and job
  distribution shows that learning in space can be substantially more accurate. Our 
  simulation and testbed evaluation on Azure of the two learning approaches anchored in a
  generic job scheduler using 3 production cluster job traces shows that despite its online
  overhead, learning in space reduces the average Job Completion Time (JCT) by 1.28x, 1.56x,
  and 1.32x compared to the prior-art history-based predictor.},
}


################ 2021
@Article{clusterdata:jajooSLearnTechReport2021,
  author = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
  title = {The Case for Task Sampling based Learning for Cluster Job Scheduling},
  journal = {Computing Research Repository},
  volume = {abs/2108.10464},
  year = {2021},
  eprinttype = {arXiv},
  eprint = {2108.10464},
  url = {https://arxiv.org/abs/2108.10464},
  biburl = {https://dblp.org/rec/journals/corr/abs-2108-10464.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Fri, 27 Aug 2021 15:02:29 +0200},
  keywords = {data centers, big data, job scheduling, learning, online learning},
  abstract = {The ability to accurately estimate job runtime properties allows a
    scheduler to effectively schedule jobs. State-of-the-art online cluster job 
    schedulers use history-based learning, which uses past job execution information 
    to estimate the runtime properties of newly arrived jobs. However, with fast-paced
    development in cluster technology (in both hardware and software) and changing user
    inputs, job runtime properties can change over time, which lead to inaccurate predictions.
    In this paper, we explore the potential and limitation of real-time learning of job 
    runtime properties, by proactively sampling and scheduling a small fraction of the
    tasks of each job. Such a task-sampling-based approach exploits the similarity among
    runtime properties of the tasks of the same job and is inherently immune to changing
    job behavior. Our study focuses on two key questions in comparing task-sampling-based
    learning (learning in space) and history-based learning (learning in time): (1) Can
    learning in space be more accurate than learning in time? (2) If so, can delaying
    scheduling the remaining tasks of a job till the completion of sampled tasks be more
    than compensated by the improved accuracy and result in improved job performance? Our
    analytical and experimental analysis of 3 production traces with different skew and job
    distribution shows that learning in space can be substantially more accurate. Our 
    simulation and testbed evaluation on Azure of the two learning approaches anchored in a
    generic job scheduler using 3 production cluster job traces shows that despite its online
    overhead, learning in space reduces the average Job Completion Time (JCT) by 1.28x, 1.56x,
    and 1.32x compared to the prior-art history-based predictor.},
}

################ 2020

@InProceedings{clusterdata:Tirmazi2020,
  author    = {Tirmazi, Muhammad and Barker, Adam and Deng, Nan and Haque, Md E. and Qin, Zhijing Gene and Hand, Steven and Harchol-Balter, Mor and Wilkes, John},
  title     = {{Borg: the Next Generation}},
  booktitle = {Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys'20)},
  publisher = {ACM},
  address   = {Heraklion, Greece},
  year      = {2020},
  articleno = {30},
  numpages  = {14},
  isbn      = {9781450368827},
  doi       = {10.1145/3342195.3387517},
  url       = {https://doi.org/10.1145/3342195.3387517},
  keywords  = {data centers, cloud computing},
  abstract  = {
	This paper analyzes a newly-published trace that covers 8
	different Borg clusters for the month of May 2019. The
	trace enables researchers to explore how scheduling works in
	large-scale production compute clusters. We highlight how
	Borg has evolved and perform a longitudinal comparison of
	the newly-published 2019 trace against the 2011 trace, which
	has been highly cited within the research community.
	Our findings show that Borg features such as alloc sets
	are used for resource-heavy workloads; automatic vertical
	scaling is effective; job-dependencies account for much of
	the high failure rates reported by prior studies; the workload
	arrival rate has increased, as has the use of resource
	over-commitment; the workload mix has changed, jobs have
	migrated from the free tier into the best-effort batch tier;
	the workload exhibits an extremely heavy-tailed distribution
	where the top 1\% of jobs consume over 99\% of resources; and
	there is a great deal of variation between different clusters.},
}


################ 2018

@article{clusterdata:Sebastio2018,
  author = {Stefano Sebastio and Kishor S. Trivedi and Javier Alonso},
  title = {Characterizing machines lifecycle in {Google} data centers},
  journal = {Performance Evaluation},
  volume = 126,
  pages = {39--63},
  year = 2018,
  issn = {0166-5316},
  doi = {10.1016/j.peva.2018.08.001},
  url = {http://www.sciencedirect.com/science/article/pii/S016653161830004X},
  keywords = {Statistical analysis, Distributed architectures, Cloud computing, System reliability, Large-scale systems, Empirical studies},
  abstract = {Due to the increasing need for computational power, the market has
	shifted towards big centralized data centers.  Understanding the nature
	of the dynamics of these data centers from machine and job/task
	perspective is critical to design efficient data center management
	policies like optimal resource/power utilization, capacity planning and
	optimal (reactive and proactive) maintenance scheduling. Whereas
	jobs/tasks dynamics have received a lot of attention, the study of the
	dynamics of the underlying machines supporting the jobs/tasks execution
	has received much less attention, even when these dynamics would
	substantially affect the performance of the jobs/tasks execution.  Given
	the limited data available from large computing installations, only a
	few previous studies have inspected data centers and only concerning
	failures and their root causes. In this paper, we study the 2011 Google
	data center traces from the machine dynamics perspective. First, we
	characterize the machine events and their underlying distributions in
	order to have a better understanding of the entire machine lifecycle.
	Second, we propose a data-driven model to enable the estimate of the
	expected number of available machines at any instant of time. The model
	is parameterized and validated using the empirical data collected by
	Google during a one month period.},
}

################ 2017

@InCollection{clusterdata:Ray2017,
  author = {Ray, Biplob R. and Chowdhury, Morshed and Atif, Usman},
  editor = {Doss, Robin and Piramuthu, Selwyn and Zhou, Wei},
  title = {Is {High Performance Computing (HPC)} Ready to Handle Big Data?},
  booktitle = {Future Network Systems and Security},
  year = 2017,
  month = Aug,
  publisher = {Springer},
  address = {Cham, Switzerland},
  pages = {97--112},
  abstract={In recent years big data has emerged as a universal term and its
	management has become a crucial research topic. The phrase `big data'
	refers to data sets so large and complex that the processing of them
	requires collaborative High Performance Computing (HPC). How to
	effectively allocate resources is one of the prime challenges in
	HPC. This leads us to the question: are the existing HPC resource
	allocation techniques effective enough to support future big data
	challenges? In this context, we have investigated the effectiveness of
	HPC resource allocation using the Google cluster dataset and a number of
	data mining tools to determine the correlational coefficient between
	resource allocation, resource usages and priority. Our analysis
	initially focused on correlation between resource allocation and
	resource uses. The finding shows that a high volume of resources that
	are allocated by the system for a job are not being used by that same
	job. To investigate further, we analyzed the correlation between
	resource allocation, resource usages and priority. Our clustering,
	classification and prediction techniques identified that the allocation
	and uses of resources are very loosely correlated with priority of the
	jobs. This research shows that our current HPC scheduling needs
	improvement in order to accommodate the big data challenge
	efficiently.},
  keywords = {Big data; HPC; Data mining; QoS; Correlation},
  isbn = {978-3-319-65548-2},
  doi = {10.1007/978-3-319-65548-2_8},
  url = {https://doi.org/10.1007/978-3-319-65548-2_8},
}

@inproceedings{clusterdata:Elsayed2017,
  author = {Nosayba El-Sayed and Hongyu Zhu and Bianca Schroeder},
  title = {Learning from Failure Across Multiple Clusters: A Trace-Driven Approach to Understanding, Predicting, and Mitigating Job Terminations},
  booktitle = {International Conference on Distributed Computing Systems (ICDCS)},
  month = Jun,
  year = 2017,
  pages = {1333--1344},
  issn = {1063-6927},
  doi = {10.1109/ICDCS.2017.317},
  abstract = {In large-scale computing platforms, jobs are prone to interruptions
	and premature terminations, limiting their usability and leading to
	significant waste in cluster resources. In this paper, we tackle this
	problem in three steps. First, we provide a comprehensive study based on
	log data from multiple large-scale production systems to identify
	patterns in the behaviour of unsuccessful jobs across different clusters
	and investigate possible root causes behind job termination. Our results
	reveal several interesting properties that distinguish unsuccessful jobs
	from others, particularly w.r.t. resource consumption patterns and job
	configuration settings. Secondly, we design a machine learning-based
	framework for predicting job and task terminations. We show that job
	failures can be predicted relatively early with high precision and
	recall, and also identify attributes that have strong predictive power
	of job failure. Finally, we demonstrate in a concrete use case how our
	prediction framework can be used to mitigate the effect of unsuccessful
	execution using an effective task-cloning policy that we propose.},
  keywords = {learning (artificial intelligence);parallel
 	processing;resource allocation;software fault tolerance; job
 	configuration settings;job failures prediction;job
 	terminations mitigation;job terminations prediction;
 	large-scale computing platforms;machine learning-based
 	framework;resource consumption patterns; task-cloning
 	policy;trace-driven approach;Computer crashes;Electric
 	breakdown;Google;Large-scale systems; Linear systems;Parallel
 	processing;Program processors;Failure Mitigation;Failure
 	Prediction;Job Failure; Large-Scale Systems;Reliability;Trace
 	Analysis},
}

################ 2014

@inproceedings{clusterdata:Abdul-Rahman2014,
  author    = {Abdul-Rahman, Omar Arif and Aida, Kento},
  title     = {Towards understanding the usage behavior of {Google} cloud
	users: the mice and elephants phenomenon},
  booktitle = {IEEE International Conference on Cloud Computing
	Technology and Science (CloudCom)},
  address   = {Singapore},
  month     = dec,
  year      = 2014,
  pages     = {272--277},
  doi       = {10.1109/CloudCom.2014.75},
  keywords  = {Google trace; Workload trace analysis; User session view;
	Application composition; Mass-Count disparity; Exploratory statistical
	analysis; Visual analysis; Color-schemed graphs; Coarse grain
	classification; Heavy-tailed distributions; Long-tailed lognormal
	distributions; Exponential distribution; Normal distribution; Discrete
	modes; Large web services; Batch processing; MapReduce computation;
	Human users; },
  abstract  = {In the era of cloud computing, users encounter the challenging
	task of effectively composing and running their applications on the
	cloud. In an attempt to understand user behavior in constructing
	applications and interacting with typical cloud infrastructures, we
	analyzed a large utilization dataset of Google cluster. In the present
	paper, we consider user behavior in composing applications from the
	perspective of topology, maximum requested computational resources, and
	workload type. We model user dynamic behavior around the user's session
	view. Mass-Count disparity metrics are used to investigate the
	characteristics of underlying statistical models and to characterize
	users into distinct groups according to their composition and behavioral
	classes and patterns. The present study reveals interesting insight into
	the heterogeneous structure of the Google cloud workload.},
}

################ 2013

@inproceedings{clusterdata:Di2013,
  title = {Characterizing cloud applications on a {Google} data center},
  author = {Di, Sheng and Kondo, Derrick and Cappello, Franck},
  booktitle = {42nd International Conference on Parallel Processing (ICPP)},
  year = 2013,
  month = Oct,
  address = {Lyon, France},
  abstract = {In this paper, we characterize Google applications,
	based on a one-month Google trace with over 650k jobs running
	across over 12000 heterogeneous hosts from a Google data
	center. On one hand, we carefully compute the valuable
	statistics about task events and resource utilization for
	Google applications, based on various types of resources (such
	as CPU, memory) and execution types (e.g., whether they can
	run batch tasks or not). Resource utilization per application
	is observed with an extremely typical Pareto principle. On the
	other hand, we classify applications via a K-means clustering
	algorithm with optimized number of sets, based on task events
	and resource usage. The number of applications in the Kmeans
	clustering sets follows a Pareto-similar distribution. We
	believe our work is very interesting and valuable for the
	further investigation of Cloud environment.},
}

################ 2012

@inproceedings{clusterdata:Reiss2012b,
  author      = {Charles Reiss and Alexey Tumanov and Gregory R. Ganger and
	Randy H. Katz and Michael A. Kozuch},
  title       = {Heterogeneity and dynamicity of clouds at scale: {Google}
	trace analysis},
  booktitle   = {ACM Symposium on Cloud Computing (SoCC)},
  address     = {San Jose, CA, USA},
  month       = Oct,
  year        = 2012,
  url         = {http://www.pdl.cmu.edu/PDL-FTP/CloudComputing/googletrace-socc2012.pdf},
  abstract    = {To better understand the challenges in developing effective
	cloud-based resource schedulers, we analyze the first publicly available
	trace data from a sizable multi-purpose cluster.  The most notable
	workload characteristic is heterogeneity: in resource types (e.g.,
	cores:RAM per machine) and their usage (e.g., duration and resources
	needed). Such heterogeneity reduces the effectiveness of traditional
	slot- and core-based scheduling.  Furthermore, some tasks are
	constrained as to the kind of machine types they can use, increasing the
	complexity of resource assignment and complicating task migration. The
	workload is also highly dynamic, varying over time and most workload
	features, and is driven by many short jobs that demand quick scheduling
	decisions.  While few simplifying assumptions apply, we find that many
	longer-running jobs have relatively stable resource utilizations, which
	can help adaptive resource schedulers.},
  privatenote = {An earlier version of this was posted at
	\url{http://www.istc-cc.cmu.edu/publications/papers/2012/ISTC-CC-TR-12-101.pdf},
	and included here as clusterdata:Reiss2012a. Please use this
	version instead of that.},
}

@inproceedings{clusterdata:Liu2012,
  author    = {Zitao Liu and Sangyeun Cho},
  title     = {Characterizing machines and workloads on a {Google} cluster},
  booktitle = {8th International Workshop on Scheduling and Resource
	Management for Parallel and Distributed Systems (SRMPDS)},
  address   = {Pittsburgh, PA, USA},
  month     = Sep,
  year      = 2012,
  url       = {http://www.cs.pitt.edu/cast/abstract/liu-srmpds12.html},
  abstract  = {Cloud computing offers high scalability, flexibility and
	cost-effectiveness to meet emerging computing
	requirements. Understanding the characteristics of real workloads on a
	large production cloud cluster benefits not only cloud service providers
	but also researchers and daily users.  This paper studies a large-scale
	Google cluster usage trace dataset and characterizes how the machines in
	the cluster are managed and the workloads submitted during a 29-day
	period behave. We focus on the frequency and pattern of machine
	maintenance events, job- and task-level workload behavior, and how the
	overall cluster resources are utilized.},
}

@inproceedings{clusterdata:Di2012a,
  author      = {Sheng Di and Derrick Kondo and Walfredo Cirne},
  title       = {Characterization and comparison of cloud versus {Grid} workloads},
  booktitle   = {International Conference on Cluster Computing (IEEE CLUSTER)},
  address     = {Beijing, China},
  month       = Sep,
  year        = 2012,
  pages       = {230--238},
  doi         = {10.1109/CLUSTER.2012.35},
  abstract    = {A new era of Cloud Computing has emerged, but the characteristics
	of Cloud load in data centers is not perfectly clear. Yet this
	characterization is critical for the design of novel Cloud job and
	resource management systems. In this paper, we comprehensively
	characterize the job/task load and host load in a real-world production
	data center at Google Inc. We use a detailed trace of over 25 million
	tasks across over 12,500 hosts. We study the differences between a
	Google data center and other Grid/HPC systems, from the perspective of
	both work load (w.r.t. jobs and tasks) and host load
	(w.r.t. machines). In particular, we study the job length, job
	submission frequency, and the resource utilization of jobs in the
	different systems, and also investigate valuable statistics of machine's
	maximum load, queue state and relative usage levels, with different job
	priorities and resource attributes. We find that the Google data center
	exhibits finer resource allocation with respect to CPU and memory than
	that of Grid/HPC systems. Google jobs are always submitted with much
	higher frequency and they are much shorter than Grid jobs. As such,
	Google host load exhibits higher variance and noise.},
  keywords    = {cloud computing;computer centres;grid computing;queueing
	theory;resource allocation;search engines;CPU;Google data
	center;cloud computing;cloud job;cloud load;data centers;grid
	workloads;grid-HPC systems;host load;job length;job submission
	frequency;jobs resource utilization;machine maximum load;queue
	state;real-world production data center;relative usage
	levels;resource allocation;resource attributes;resource
	management systems;task load;Capacity
	planning;Google;Joints;Load modeling;Measurement;Memory
	management;Resource management;Cloud Computing;Grid
	Computing;Load Characterization},
  privatenote = {An earlier version is available at
	\url{http://hal.archives-ouvertes.fr/hal-00705858}.  It used
	to be included here as clusterdata:Di2012.},
}

################ 2010

@Article{clusterdata:Mishra2010,
  author = {Mishra, Asit K. and Hellerstein, Joseph L. and Cirne,
	Walfredo and Das, Chita R.},
  title = {Towards characterizing cloud backend workloads: insights
	from {Google} compute clusters},
  journal = {SIGMETRICS Perform. Eval. Rev.},
  volume = {37},
  number = {4},
  month = Mar,
  year = 2010,
  issn = {0163-5999},
  pages = {34--41},
  numpages = {8},
  url = {http://doi.acm.org/10.1145/1773394.1773400},
  doi = {10.1145/1773394.1773400},
  publisher = {ACM},
  abstract = {The advent of cloud computing promises highly available,
	efficient, and flexible computing services for applications such as web
	search, email, voice over IP, and web search alerts. Our experience at
	Google is that realizing the promises of cloud computing requires an
	extremely scalable backend consisting of many large compute clusters
	that are shared by application tasks with diverse service level
	requirements for throughput, latency, and jitter. These considerations
	impact (a) capacity planning to determine which machine resources must
	grow and by how much and (b) task scheduling to achieve high machine
	utilization and to meet service level objectives.

	Both capacity planning and task scheduling require a good understanding
	of task resource consumption (e.g., CPU and memory usage). This in turn
	demands simple and accurate approaches to workload
	classification---determining how to form groups of tasks (workloads) with
	similar resource demands. One approach to workload classification is to
	make each task its own workload. However, this approach scales poorly
	since tens of thousands of tasks execute daily on Google compute
	clusters. Another approach to workload classification is to view all
	tasks as belonging to a single workload. Unfortunately, applying such a
	coarse-grain workload classification to the diversity of tasks running
	on Google compute clusters results in large variances in predicted
	resource consumptions.

	This paper describes an approach to workload classification and its
	application to the Google Cloud Backend, arguably the largest cloud
	backend on the planet. Our methodology for workload classification
	consists of: (1) identifying the workload dimensions; (2) constructing
	task classes using an off-the-shelf algorithm such as k-means; (3)
	determining the break points for qualitative coordinates within the
	workload dimensions; and (4) merging adjacent task classes to reduce the
	number of workloads. We use the foregoing, especially the notion of
	qualitative coordinates, to glean several insights about the Google
	Cloud Backend: (a) the duration of task executions is bimodal in that
	tasks either have a short duration or a long duration; (b) most tasks
	have short durations; and (c) most resources are consumed by a few tasks
	with long duration that have large demands for CPU and memory.},
}


################################################################
# Trace-usage papers
################################################################

These entries are for papers that primarily focus on some other topic, but
use the traces as inputs, e.g., in simulations or load predictions.
Order: most recent first.

################ 2023
# IEEE TCC journal article on SLearn; see also clusterdata:jajooSLearnNSDI2022.
@article{clusterdata:jajooSLearnTCC2023,
  author = {Jajoo, Akshay and Hu, Y. Charlie and Lin, Xiaojun and Deng, Nan},
  title = {{SLearn}: A Case for Task Sampling Based Learning for Cluster Job Scheduling},
  journal = {IEEE Transactions on Cloud Computing},
  year = {2023},
  volume = {11},
  number = {3},
  pages = {2664--2680},
  publisher = {IEEE},
  doi = {10.1109/TCC.2022.3222649},
  keywords = {data centers, big data, job scheduling, learning, online learning},
  abstract = {The ability to accurately estimate job runtime properties allows a
    scheduler to effectively schedule jobs. State-of-the-art online cluster
    job schedulers use history-based learning, which uses past job execution
    information to estimate the runtime properties of newly arrived jobs.
    However, with fast-paced development in cluster technology (in both hardware
    and software) and changing user inputs, job runtime properties can change over
    time, which lead to inaccurate predictions. In this article, we explore the
    potential and limitation of real-time learning of job runtime properties,
    by proactively sampling and scheduling a small fraction of the tasks of
    each job. Such a task-sampling-based approach exploits the similarity among
    runtime properties of the tasks of the same job and is inherently immune to
    changing job behavior. Our analytical and experimental analysis of 3 production
    traces with different skew and job distribution shows that learning in space can
    be substantially more accurate. Our simulation and testbed evaluation on Azure of
    the two learning approaches anchored in a generic job scheduler using 3 production
    cluster job traces shows that despite its online overhead, learning in space reduces
    the average Job Completion Time (JCT) by 1.28×, 1.56×, and 1.32× compared to the
    prior-art history-based predictor. We further analyze the experimental results to
    give intuitive explanations to why learning in space outperforms learning in time
    in these experiments. Finally, we show how sampling-based learning can be extended
    to schedule DAG jobs and achieve similar speedups over the prior-art history-based
    predictor.},
}


################ 2022
# NSDI 2022 conference paper on task-sampling-based learning for job scheduling.
@inproceedings{clusterdata:jajooSLearnNSDI2022,
  author = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
  title = {A Case for Task Sampling based Learning for Cluster Job Scheduling},
  booktitle = {19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)},
  year = {2022},
  month = Apr,
  address = {Renton, WA, USA},
  publisher = {USENIX Association},
  url = {https://www.usenix.org/conference/nsdi22/presentation/jajoo},
  keywords = {data centers, big data, job scheduling, learning, online learning},
  abstract = {The ability to accurately estimate job runtime properties allows a
    scheduler to effectively schedule jobs. State-of-the-art online cluster job
    schedulers use history-based learning, which uses past job execution information
    to estimate the runtime properties of newly arrived jobs. However, with fast-paced
    development in cluster technology (in both hardware and software) and changing user
    inputs, job runtime properties can change over time, which lead to inaccurate predictions.
    In this paper, we explore the potential and limitation of real-time learning of job
    runtime properties, by proactively sampling and scheduling a small fraction of the
    tasks of each job. Such a task-sampling-based approach exploits the similarity among
    runtime properties of the tasks of the same job and is inherently immune to changing
    job behavior. Our study focuses on two key questions in comparing task-sampling-based
    learning (learning in space) and history-based learning (learning in time): (1) Can
    learning in space be more accurate than learning in time? (2) If so, can delaying
    scheduling the remaining tasks of a job till the completion of sampled tasks be more
    than compensated by the improved accuracy and result in improved job performance? Our
    analytical and experimental analysis of 3 production traces with different skew and job
    distribution shows that learning in space can be substantially more accurate. Our
    simulation and testbed evaluation on Azure of the two learning approaches anchored in a
    generic job scheduler using 3 production cluster job traces shows that despite its online
    overhead, learning in space reduces the average Job Completion Time (JCT) by 1.28x, 1.56x,
    and 1.32x compared to the prior-art history-based predictor.},
}


################ 2021
# arXiv (CoRR) preprint; field values follow the dblp export.
@article{clusterdata:jajooSLearnTechReport2021,
  author = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
  title = {The Case for Task Sampling based Learning for Cluster Job Scheduling},
  journal = {Computing Research Repository},
  volume = {abs/2108.10464},
  year = {2021},
  eprinttype = {arXiv},
  eprint = {2108.10464},
  url = {https://arxiv.org/abs/2108.10464},
  timestamp = {Fri, 27 Aug 2021 15:02:29 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-2108-10464.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  keywords = {data centers, big data, job scheduling, learning, online learning},
  abstract = {The ability to accurately estimate job runtime properties allows a
	scheduler to effectively schedule jobs. State-of-the-art online cluster job
	schedulers use history-based learning, which uses past job execution information
	to estimate the runtime properties of newly arrived jobs. However, with fast-paced
	development in cluster technology (in both hardware and software) and changing user
	inputs, job runtime properties can change over time, which lead to inaccurate predictions.
	In this paper, we explore the potential and limitation of real-time learning of job
	runtime properties, by proactively sampling and scheduling a small fraction of the
	tasks of each job. Such a task-sampling-based approach exploits the similarity among
	runtime properties of the tasks of the same job and is inherently immune to changing
	job behavior. Our study focuses on two key questions in comparing task-sampling-based
	learning (learning in space) and history-based learning (learning in time): (1) Can
	learning in space be more accurate than learning in time? (2) If so, can delaying
	scheduling the remaining tasks of a job till the completion of sampled tasks be more
	than compensated by the improved accuracy and result in improved job performance? Our
	analytical and experimental analysis of 3 production traces with different skew and job
	distribution shows that learning in space can be substantially more accurate. Our
	simulation and testbed evaluation on Azure of the two learning approaches anchored in a
	generic job scheduler using 3 production cluster job traces shows that despite its online
	overhead, learning in space reduces the average Job Completion Time (JCT) by 1.28x, 1.56x,
	and 1.32x compared to the prior-art history-based predictor.},
}

################ 2020

# IMC 2020 paper introducing DoppelGANger (DG) for sharing synthetic time series.
@inproceedings{clusterdata:Lin2020,
  author = {Lin, Zinan and Jain, Alankar and Wang, Chen and Fanti, Giulia and Sekar, Vyas},
  title = {Using {GANs} for Sharing Networked Time Series Data: Challenges, Initial Promise, and Open Questions},
  booktitle = {Proceedings of the ACM Internet Measurement Conference (IMC 2020)},
  year = {2020},
  month = Oct,
  pages = {464--483},
  numpages = {20},
  publisher = {Association for Computing Machinery},
  isbn = {9781450381383},
  doi = {10.1145/3419394.3423643},
  url = {https://doi.org/10.1145/3419394.3423643},
  keywords = {privacy, synthetic data generation, time series,
    generative adversarial networks},
  abstract = {Limited data access is a longstanding barrier to data-driven
    research and development in the networked systems community. In this work,
    we explore if and how generative adversarial networks (GANs) can be used to
    incentivize data sharing by enabling a generic framework for sharing
    synthetic datasets with minimal expert knowledge. As a specific target, our
    focus in this paper is on time series datasets with metadata (e.g., packet
    loss rate measurements with corresponding ISPs). We identify key challenges
    of existing GAN approaches for such workloads with respect to fidelity
    (e.g., long-term dependencies, complex multidimensional relationships, mode
    collapse) and privacy (i.e., existing guarantees are poorly understood and
    can sacrifice fidelity). To improve fidelity, we design a custom workflow
    called DoppelGANger (DG) and demonstrate that across diverse real-world
    datasets (e.g., bandwidth measurements, cluster requests, web sessions) and
    use cases (e.g., structural characterization, predictive modeling, algorithm
    comparison), DG achieves up to 43\% better fidelity than baseline models.
    Although we do not resolve the privacy problem in this work, we identify
    fundamental challenges with both classical notions of privacy and recent
    advances to improve the privacy properties of GANs, and suggest a potential
    roadmap for addressing these challenges. By shedding light on the promise
    and challenges, we hope our work can rekindle the conversation on workflows
    for data sharing.},
}

# Elsevier article identified by article number (104959) rather than a page range.
@article{clusterdata:Aydin2020,
  author = {Aydin, Nur{\c{s}}en and Muter, Ibrahim and Birbil, {\c{S}}. Ilker},
  title = {Multi-objective temporal bin packing problem: an application in cloud computing},
  journal = {Computers \& Operations Research},
  volume = 121,
  pages = {104959},
  year = 2020,
  month = Sep,
  issn = {0305-0548},
  doi = {10.1016/j.cor.2020.104959},
  url = {http://www.sciencedirect.com/science/article/pii/S0305054820300769},
  keywords = {Bin packing, Cloud computing, Heuristics, Exact methods, Column generation},
  abstract = {Improving energy efficiency and lowering operational
	costs are the main challenges faced in systems with multiple
	servers. One prevalent objective in such systems is to
	minimize the number of servers required to process a given set
	of tasks under server capacity constraints. This objective
	leads to the well-known bin packing problem. In this study, we
	consider a generalization of this problem with a time
	dimension, where the tasks are to be performed with predefined
	start and end times. This new dimension brings about new
	performance considerations, one of which is the uninterrupted
	utilization of servers. This study is motivated by the problem
	of energy efficient assignment of virtual machines to physical
	servers in a cloud computing service. We address the virtual
	machine placement problem and present a binary integer
	programming model to develop different assignment policies. By
	analyzing the structural properties of the problem, we propose
	an efficient heuristic method based on solving smaller
	versions of the original problem iteratively. Moreover, we
	design a column generation algorithm that yields a lower bound
	on the objective value, which can be utilized to evaluate the
	performance of the heuristic algorithm. Our numerical study
	indicates that the proposed heuristic is capable of solving
	large-scale instances in a short time with small optimality
	gaps.},
}

# Accented given name is written as a BibTeX special character so that
# abbreviating styles produce a valid initial under 8-bit BibTeX.
@article{clusterdata:Milocco2020,
  author = {Ruben Milocco and Pascale Minet and {\'E}ric Renault and Selma Boumerdassi},
  title = {Evaluating the Upper Bound of Energy Cost Saving by Proactive Data Center Management},
  journal = {IEEE Transactions on Network and Service Management},
  year = 2020,
  issn = {1932-4537},
  doi = {10.1109/TNSM.2020.2988346},
  url = {https://ieeexplore.ieee.org/abstract/document/9069318},
  keywords = {Data center management, Proactive management, Machine Learning, Prediction, Energy cost},
  abstract = {
	Data Centers (DCs) need to periodically configure their servers in order to meet user demands.
	Since appropriate proactive management to meet demands reduces the cost, either by improving Quality of
	Service (QoS) or saving energy, there is a great interest in studying different proactive strategies
	based on predictions of the energy used to serve CPU and memory requests. The amount of savings that can
	be achieved depends not only on the selected proactive strategy but also on user-demand statistics and the
	predictors used. Despite its importance, it is difficult to find theoretical studies that quantify the
	savings that can be made, due to the problem complexity. A proactive DC management strategy is presented
	together with its upper bound of energy cost savings obtained with respect to a purely reactive management.
	Using this method together with records of the recent past, it is possible to quantify the efficiency of
	different predictors. Both linear and nonlinear predictors are studied, using a Google data set collected
	over 29 days, to evaluate the benefits that can be obtained with these two predictors.},
}


################ 2018

# Abstract uses ASCII punctuation only, so it is safe under 8-bit BibTeX.
@article{clusterdata:Sliwko2018,
  author = {Sliwko, Leszek},
  title = {A Scalable Service Allocation Negotiation For Cloud Computing},
  journal = {Journal of Theoretical and Applied Information Technology},
  volume = 96,
  number = 20,
  month = Oct,
  year = 2018,
  issn = {1817-3195},
  pages = {6751--6782},
  numpages = {32},
  keywords = {distributed scheduling, agents, load balancing, MASB},
  abstract = {This paper presents a detailed design of a decentralised agent-based
    scheduler, which can be used to manage workloads within the computing cells
    of a Cloud system. This scheme is based on the concept of service allocation
    negotiation, whereby all system nodes communicate between themselves and
    scheduling logic is decentralised. The architecture presented has been
    implemented, with multiple simulations run using real-world workload traces from
    the Google Cluster Data project. The results were then compared to the
    scheduling patterns of Google's Borg system.},
}

# IEEE CLUSTER 2018 paper on the Dependency-aware Scheduling and Preemption system (DSP).
@inproceedings{clusterdata:Liu2018gh,
  title = {Leveraging Dependency in Scheduling and Preemption for High Throughput in Data-Parallel Clusters},
  author = {Liu, Jinwei and Shen, Haiying and Sarker, Ankur and Chung, Wingyan},
  booktitle = {2018 IEEE International Conference on Cluster Computing (CLUSTER)},
  publisher = {IEEE},
  month = Sep,
  year = {2018},
  pages = {359--369},
  doi = {10.1109/CLUSTER.2018.00054},
  abstract = {Task scheduling and preemption are two important functions in
    data-parallel clusters. Though directed acyclic graph task dependencies
    are common in data-parallel clusters, previous task scheduling and
    preemption methods do not fully utilize such task dependency to increase
    throughput since they simply schedule precedent tasks prior to their
    dependent tasks or neglect the dependency. We notice that in both
    scheduling and preemption, choosing a task with more dependent tasks to
    run allows more tasks to be runnable next, which facilitates to select a
    task that can more increase throughput. Accordingly, in this paper, we
    propose a Dependency-aware Scheduling and Preemption system (DSP) to
    achieve high throughput. First, we build an integer linear programming
    model to minimize the makespan (i.e., the time when all jobs finish
    execution) with the consideration of task dependency and deadline, and
    derive the target server and start time for each task, which can
    minimize the makespan. Second, we utilize task dependency to determine
    tasks' priorities for preemption. Finally, we propose a method to reduce
    the number of unnecessary preemptions that cause more overhead than the
    throughput gain. Extensive experimental results based on a real cluster
    and Amazon EC2 cloud service show that DSP achieves much higher
    throughput compared to existing strategies.},
}

# Accented given name is written as a BibTeX special character so that
# abbreviating styles produce a valid initial under 8-bit BibTeX.
@inproceedings{clusterdata:Minet2018j,
  author = {Pascale Minet and {\'E}ric Renault and Ines Khoufi and Selma Boumerdassi},
  title = {Analyzing Traces from a {Google} Data Center},
  booktitle = {14th International Wireless Communications and Mobile Computing Conference (IWCMC 2018)},
  year = 2018,
  month = Jun,
  publisher = {IEEE},
  address = {Limassol, Cyprus},
  pages = {1167--1172},
  url = {https://doi.org/10.1109/IWCMC.2018.8450304},
  doi = {10.1109/IWCMC.2018.8450304},
  abstract = {
	Traces collected from an operational Google data center over 29 days represent a very rich
	and useful source of information for understanding the main features of a data center. In this
	paper, we characterize the strong heterogeneity of jobs and the medium heterogeneity of machine
	configurations. We analyze the off-periods of machines. We study the distribution of jobs per
	category, per scheduling class, per priority and per number of tasks. The distribution of job
	execution durations shows a high disparity, as does the job waiting time before being scheduled.
	The resource requests in terms of CPU and memory are also analyzed. The distribution of these
	parameter values is very useful to develop accurate models and algorithms for resource allocation
	in data centers.},
  keywords = {Data analysis, data center, big data application, resource allocation, scheduling},
}

# Accented given name is written as a BibTeX special character so that
# abbreviating styles produce a valid initial under 8-bit BibTeX.
@inproceedings{clusterdata:Minet2018m,
  author = {Pascale Minet and {\'E}ric Renault and Ines Khoufi and Selma Boumerdassi},
  title = {Data Analysis of a {Google} Data Center},
  booktitle = {18th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID 2018)},
  year = 2018,
  month = May,
  publisher = {IEEE},
  address = {Washington DC, USA},
  pages = {342--343},
  url = {https://doi.org/10.1109/CCGRID.2018.00049},
  doi = {10.1109/CCGRID.2018.00049},
  abstract = {
	Data collected from an operational Google data center during 29 days represent a very rich
	and very useful source of information for understanding the main features of a data center.
	In this paper, we highlight the strong heterogeneity of jobs. The distribution of job execution
	duration shows a high disparity, as well as the job waiting time before being scheduled. The
	resource requests in terms of CPU and memory are also analyzed. The knowledge of all these features
	is needed to design models of jobs, machines and resource requests that are representative of a
	real data center.},
}

# IEEE TSC article on availability models for containerized deployments.
@article{clusterdata:Sebastio2018b,
  title = {An availability analysis approach for deployment configurations of containers},
  author = {Stefano Sebastio and Rahul Ghosh and Tridib Mukherjee},
  journal = {IEEE Transactions on Services Computing},
  month = Jan,
  year = {2018},
  issn = {1939-1374},
  doi = {10.1109/TSC.2017.2788442},
  keywords = {Containers;Analytical models;Cloud computing;Stochastic
    processes;Tools;Computer architecture;Google;container;system
    availability;virtual machine;cloud computing;analytic model;stochastic
    reward net},
  abstract = {Operating system (OS) containers enabling the microservice-oriented
    architecture are becoming popular in the context of Cloud
    services. Containers provide the ability to create lightweight and
    portable runtime environments decoupling the application requirements
    from the characteristics of the underlying system. Services built on
    containers have a small resource footprint in terms of processing,
    storage, memory and network, allowing a denser deployment
    environment. While the performance of such containers is addressed in
    few previous studies, understanding the failure-repair behavior of the
    containers remains unexplored. In this paper, from an availability point
    of view, we propose and compare different configuration models for
    deploying a containerized software system. Inspired by Google
    Kubernetes, a container management system, these configurations are
    characterized with a failure response and migration service. We develop
    novel non-state-space and state-space analytic models for container
    availability analysis. Analytical as well as simulative solutions are
    obtained for the developed models. Our analysis provides insights on k
    out-of N availability and sensitivity of system availability for key
    system parameters. Finally, we build an open-source software tool
    powered by these models. The tool helps Cloud administrators to assess
    the availability of containerized systems and to conduct a what-if
    analysis based on user-provided parameters and configurations.},
}


# ACM TOMACS article; pages are given in ACM article-number form (14:1--14:27).
@article{clusterdata:Sebastio2018c,
  title = {A Holistic Approach for Collaborative Workload Execution in Volunteer Clouds},
  author = {Sebastio, Stefano and Amoretti, Michele and Lafuente, Alberto Lluch and Scala, Antonio},
  journal = {ACM Transactions on Modeling and Computer Simulation (TOMACS)},
  publisher = {ACM},
  year = 2018,
  month = Mar,
  volume = 28,
  number = 2,
  articleno = {14},
  pages = {14:1--14:27},
  numpages = {27},
  issn = {1049-3301},
  doi = {10.1145/3155336},
  url = {http://doi.acm.org/10.1145/3155336},
  acmid = {3155336},
  keywords = {Collective adaptive systems, ant colony optimization (ACO),
    autonomic computing, cloud computing, collaborative computing,
    computational fields, multiagent optimization, peer-to-peer (P2P), task
    scheduling},
  abstract = {The demand for provisioning, using, and maintaining distributed
    computational resources is growing hand in hand with the quest for
    ubiquitous services. Centralized infrastructures such as cloud computing
    systems provide suitable solutions for many applications, but their
    scalability could be limited in some scenarios, such as in the case of
    latency-dependent applications. The volunteer cloud paradigm aims at
    overcoming this limitation by encouraging clients to offer their own
    spare, perhaps unused, computational resources. Volunteer clouds are
    thus complex, large-scale, dynamic systems that demand for self-adaptive
    capabilities to offer effective services, as well as modeling and
    analysis techniques to predict their behavior. In this article, we
    propose a novel holistic approach for volunteer clouds supporting
    collaborative task execution services able to improve the quality of
    service of compute-intensive workloads. We instantiate our approach by
    extending a recently proposed ant colony optimization algorithm for
    distributed task execution with a workload-based partitioning of the
    overlay network of the volunteer cloud. Finally, we evaluate our
    approach using simulation-based statistical analysis techniques on a
    workload benchmark provided by Google. Our results show that the
    proposed approach outperforms some traditional distributed task
    scheduling algorithms in the presence of compute-intensive workloads.},
}

# Optimization Letters article on ADMM-based green task scheduling in volunteer clouds.
@article{clusterdata:Sebastio2018d,
  title = {A green policy to schedule tasks in a distributed cloud},
  author = {Stefano Sebastio and Giorgio Gnecco},
  journal = {Optimization Letters},
  volume = 12,
  number = 7,
  pages = {1535--1551},
  day = 01,
  month = Oct,
  year = 2018,
  issn = {1862-4480},
  doi = {10.1007/s11590-017-1208-8},
  url = {https://doi.org/10.1007/s11590-017-1208-8},
  abstract = {In the last years, demand and availability of computational
    capabilities experienced radical changes. Desktops and laptops increased
    their processing resources, exceeding users' demand for large part of
    the day. On the other hand, computational methods are more and more
    frequently adopted by scientific communities, which often experience
    difficulties in obtaining access to the required
    resources. Consequently, data centers for outsourcing use, relying on
    the cloud computing paradigm, are proliferating. Notwithstanding the
    effort to build energy-efficient data centers, their energy footprint is
    still considerable, since cooling a large number of machines situated in
    the same room or container requires a significant amount of power. The
    volunteer cloud, exploiting the users' willingness to share a quote of
    their underused machine resources, can constitute an effective solution
    to have the required computational resources when needed. In this paper,
    we foster the adoption of the volunteer cloud computing as a green
    (i.e., energy efficient) solution even able to outperform existing data
    centers in specific tasks. To manage the complexity of such a large
    scale heterogeneous system, we propose a distributed optimization policy
    to task scheduling with the aim of reducing the overall energy
    consumption executing a given workload. To this end, we consider an
    integer programming problem relying on the Alternating Direction Method
    of Multipliers (ADMM) for its solution. Our approach is compared with a
    centralized one and other non-green targeting solutions. Results show
    that the distributed solution found by the ADMM constitutes a good
    suboptimal solution, worth to be applied in a real environment.},
}
################ 2017

# The accent in the surname uses the {\'e} special-character form so that
# BibTeX treats it as a single letter when sorting and building labels.
@article{clusterdata:Carvalho2017b,
  author = {Marcus Carvalho and Daniel A. Menasc{\'e} and Francisco Brasileiro},
  title = {Capacity planning for {IaaS} cloud providers offering multiple
	service classes},
  journal = {Future Generation Computer Systems},
  volume = {77},
  pages = {97--111},
  month = Dec,
  year = 2017,
  abstract = {Infrastructure as a Service (IaaS) cloud providers typically offer
	multiple service classes to satisfy users with different requirements
	and budgets. Cloud providers are faced with the challenge of estimating
	the minimum resource capacity required to meet Service Level Objectives
	(SLOs) defined for all service classes. This paper proposes a capacity
	planning method that is combined with an admission control mechanism to
	address this challenge. The capacity planning method uses analytical
	models to estimate the output of a quota-based admission control
	mechanism and find the minimum capacity required to meet availability
	SLOs and admission rate targets for all classes. An evaluation using
	trace-driven simulations shows that our method estimates the best cloud
	capacity with a mean relative error of 2.5\% with respect to the
	simulation, compared to a 36\% relative error achieved by a single-class
	baseline method that does not consider admission control
	mechanisms. Moreover, our method exhibited a high SLO fulfillment for
	both availability and admission rates, and obtained mean CPU utilization
	over 91\%, while the single-class baseline method had values not greater
	than 78\%.},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X16308561},
  doi = {10.1016/j.future.2017.07.019},
  issn = {0167-739X},
}

# SoCC 2017 paper analysing version 2.1 of the Google cluster trace (per its abstract).
@inproceedings{clusterdata:Janus2017,
  title = {{SLO}-aware Colocation of Data Center Tasks Based on Instantaneous Processor Requirements},
  author = {Pawel Janus and Krzysztof Rzadca},
  booktitle = {ACM Symposium on Cloud Computing (SoCC)},
  publisher = {ACM},
  address = {Santa Clara, CA, USA},
  month = Sep,
  year = 2017,
  pages = {256--268},
  doi = {10.1145/3127479.3132244},
  url = {http://arxiv.org/abs/1709.01384},
  abstract = {In a cloud data center, a single physical machine simultaneously
    executes dozens of highly heterogeneous tasks.  Such colocation results
    in more efficient utilization of machines, but, when tasks' requirements
    exceed available resources, some of the tasks might be throttled down or
    preempted.  We analyze version 2.1 of the Google cluster trace that
    shows short-term (1 second) task CPU usage.  Contrary to the assumptions
    taken by many theoretical studies, we demonstrate that the empirical
    distributions do not follow any single distribution. However, high
    percentiles of the total processor usage (summed over at least 10 tasks)
    can be reasonably estimated by the Gaussian distribution.  We use this
    result for a probabilistic fit test, called the Gaussian Percentile
    Approximation (GPA), for standard bin-packing algorithms.  To check
    whether a new task will fit into a machine, GPA checks whether the
    resulting distribution's percentile corresponding to the requested
    service level objective, SLO is still below the machine's capacity.  In
    our simulation experiments, GPA resulted in colocations exceeding the
    machines' capacity with a frequency similar to the requested SLO.},
}


# The accent in "Joao" uses the {\~a} special-character form so that BibTeX
# treats it as a single letter when processing the name.
@inproceedings{clusterdata:Carvalho2017,
  title = {Multi-dimensional admission control and capacity planning for {IaaS} clouds with multiple service classes},
  author = {Carvalho, Marcus and Brasileiro, Francisco and Lopes, Raquel and Farias, Giovanni and Fook, Alessandro and Mafra, Jo{\~a}o and Turull, Daniel},
  booktitle = {IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGrid)},
  year = 2017,
  month = May,
  pages = {160--169},
  address = {Madrid, Spain},
  keywords = {admission control, capacity planning, cloud computing, performance models, simulation},
  abstract = {Infrastructure as a Service (IaaS) providers typically offer
	multiple service classes to deal with the wide variety of users adopting
	this cloud computing model. In this scenario, IaaS providers need to
	perform efficient admission control and capacity planning in order to
	minimize infrastructure costs, while fulfilling the different Service
	Level Objectives (SLOs) defined for all service classes
	offered. However, most of the previous work on this field consider a
	single resource dimension -- typically CPU -- when making such
	management decisions. We show that this approach will either increase
	infrastructure costs due to over-provisioning, or violate SLOs due to
	lack of capacity for the resource dimensions being ignored. To fill this
	gap, we propose admission control and capacity planning methods that
	consider multiple service classes and multiple resource dimensions.  Our
	results show that our admission control method can guarantee a high
	availability SLO fulfillment in scenarios where both CPU and memory can
	become the bottleneck resource.  Moreover, we show that our capacity
	planning method can find the minimum capacity required for both CPU and
	memory to meet SLOs with good accuracy. We also analyze how the load
	variation on one resource dimension can affect another, highlighting the
	need to manage resources for multiple dimensions simultaneously.},
  url = {https://doi.org/10.1109/CCGRID.2017.14},
  doi = {10.1109/CCGRID.2017.14},
}

# Computers & Operations Research article on ADMM-based task scheduling in volunteer clouds.
@article{clusterdata:Sebastio2017,
  author = {Stefano Sebastio and Giorgio Gnecco and Alberto Bemporad},
  title = {Optimal distributed task scheduling in volunteer clouds},
  journal = {Computers \& Operations Research},
  volume = 81,
  pages = {231--246},
  year = 2017,
  month = May,
  issn = {0305-0548},
  doi = {10.1016/j.cor.2016.11.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0305054816302660},
  keywords = {Cloud computing, Distributed optimization, Integer programming,
	Combinatorial optimization, ADMM},
  abstract = {The ever increasing request of computational resources has shifted
	the computing paradigm towards solutions where less computation is
	performed locally. The most widely adopted approach nowadays is
	represented by cloud computing. With the cloud, users can transparently
	access to virtually infinite resources with the same aptitude of using
	any other utility. Next to the cloud, the volunteer computing paradigm
	has gained attention in the last decade, where the spared resources on
	each personal machine are shared thanks to the users' willingness to
	cooperate. Cloud and volunteer paradigms have been recently seen as
	companion technologies to better exploit the use of local
	resources. Conversely, this scenario places complex challenges in
	managing such a large-scale environment, as the resources available on
	each node and the presence of the nodes online are not known
	a-priori. The complexity further increases in presence of tasks that
	have an associated Service Level Agreement specified, e.g., through a
	deadline. Distributed management solutions have then be advocated as the
	only approaches that are realistically applicable. In this paper, we
	propose a framework to allocate tasks according to different policies,
	defined by suitable optimization problems. Then, we provide a
	distributed optimization approach relying on the Alternating Direction
	Method of Multipliers (ADMM) for one of these policies, and we compare
	it with a centralized approach. Results show that, when a centralized
	approach can not be adopted in a real environment, it could be possible
	to rely on the good suboptimal solutions found by the ADMM.},
}
################ 2016

@InProceedings{clusterdata:Zakarya2016,
  title = {An energy aware cost recovery approach for virtual machine migration},
  author = {Muhammad Zakarya and Lee Gillam},
  year = 2016,
  booktitle = {13th International Conference on Economics of Grids, Clouds, Systems and Services (GECON2016)},
  month = Sep,
  address = {Athens, Greece},
  abstract = {Datacenters provide an IT backbone for today's business and
	economy, and are the principal electricity consumers for Cloud
	computing. Various studies suggest that approximately 30\% of the
	running servers in US datacenters are idle and the others are
	under-utilized, making it possible to save energy and money by using
	Virtual Machine (VM) consolidation to reduce the number of hosts in
	use. However, consolidation involves migrations that can be expensive in
	terms of energy consumption, and sometimes it will be more energy
	efficient not to consolidate. This paper investigates how migration
	decisions can be made such that the energy costs involved with the
	migration are recovered, as only when costs of migration have been
	recovered will energy start to be saved. We demonstrate through a number
	of experiments, using the Google workload traces for 12,583 hosts and
	1,083,309 tasks, how different VM allocation heuristics, combined with
	different approaches to migration, will impact on energy efficiency. We
	suggest, using reasonable assumptions for datacenter setup, that a
	combination of energy-aware fill-up VM allocation and energy-aware
	migration, and migration only for relatively long running VMs, provides
	for optimal energy efficiency.},
  url = {http://epubs.surrey.ac.uk/id/eprint/813810},
}

@INPROCEEDINGS{clusterdata:Sliwko2016,
  title = {{AGOCS} - Accurate {Google} Cloud Simulator Framework},
  author = {Leszek Sliwko and Vladimir Getov},
  booktitle = {16th IEEE International Conference on Scalable Computing and Communications (ScalCom 2016)},
  year = 2016,
  month = Jul,
  pages = {550--558},
  address = {Toulouse, France},
  keywords = {cloud system; workload traces; workload simulation framework; google cluster data},
  abstract = {This paper presents the Accurate Google Cloud Simulator (AGOCS) -
	a novel high-fidelity Cloud workload simulator based on parsing real
	workload traces, which can be conveniently used on a desktop machine for
	day-to-day research. Our simulation is based on real-world workload
	traces from a Google Cluster with 12.5K nodes, over a period of a
	calendar month. The framework is able to reveal very precise and
	detailed parameters of the executed jobs, tasks and nodes as well as to
	provide actual resource usage statistics. The system has been
	implemented in Scala language with focus on parallel execution and an
	easy-to-extend design concept. The paper presents the detailed
	structural framework for AGOCS and discusses our main design decisions,
	whilst also suggesting alternative and possibly performance enhancing
	future approaches. The framework is available via the Open Source GitHub
	repository.},
  url = {http://dx.doi.org/10.1109/UIC-ATC-ScalCom-CBDCom-IoP-SmartWorld.2016.10},
  doi = {10.1109/UIC-ATC-ScalCom-CBDCom-IoP-SmartWorld.2016.10},
}

################ 2015

@InProceedings{clusterdata:Carvalho2015,
  author = {Marcus Carvalho and Daniel Menasce and Francisco Brasileiro},
  title = {Prediction-Based Admission Control for {IaaS} Clouds with Multiple Service Classes},
  booktitle = {IEEE International Conference on Cloud Computing Technology and Science (CloudCom)},
  year = 2015,
  month = Nov,
  pages = {82--90},
  address = {Vancouver, BC, Canada},
  doi = {10.1109/CloudCom.2015.16},
  url = {http://dx.doi.org/10.1109/CloudCom.2015.16},
  keywords = {admission control;cloud computing;infrastructure-as-a-service;
	performance prediction;quality of service;resource management},
  abstract = {There is a growing adoption of cloud computing services,
	attracting users with different requirements and budgets to run
	their applications in cloud infrastructures. In order to match
	users' needs, cloud providers can offer multiple service
	classes with different pricing and Service Level Objective (SLO)
	guarantees. Admission control mechanisms can help providers to
	meet target SLOs by limiting the demand at peak periods. This
	paper proposes a prediction-based admission control model for
	IaaS clouds with multiple service classes, aiming to maximize
	request admission rates while fulfilling availability SLOs
	defined for each class. We evaluate our approach with trace-driven
	simulations fed with data from production systems. Our results
	show that admission control can reduce SLO violations
	significantly, specially in underprovisioned scenarios. Moreover,
	our predictive heuristics are less sensitive to different capacity
	planning and SLO decisions, as they fulfill availability SLOs for
	more than 91\% of requests even in the worst case scenario, for
	which only 56\% of SLOs are fulfilled by a simpler greedy heuristic
	and as little as 0.2\% when admission control is not used.},
}

@InProceedings{clusterdata:Ismaeel2015,
  title = {Using {ELM} Techniques to Predict Data Centre {VM} Requests},
  author = {Salam Ismaeel and Ali Miri},
  booktitle = {IEEE International Conference on Cyber Security and Cloud Computing (CSCloud)},
  year = 2015,
  month = Nov,
  address = {New York, NY, USA},
  publisher = {IEEE},
  abstract = {Data centre prediction models can be used to forecast future loads
	for a given centre in terms of CPU, memory, VM requests, and other
	parameters. An effective and efficient model can not only be used to
	optimize resource allocation, but can also be used as part of a strategy
	to conserve energy, improve performance and increase profits for both
	clients and service providers. In this paper, we have developed a
	prediction model, which combines k-means clustering techniques and
	Extreme Learning Machines (ELMs). We have shown the effectiveness of our
	proposed model by using it to estimate future VM requests in a data
	centre based on its historical usage. We have tested our model on real
	Google traces that feature over 25 million tasks collected over a 29-day
	time period. Experimental results presented show that our proposed
	system outperforms other models reported in the literature.},
}

@InProceedings{clusterdata:Sebastio2015b,
  title = {A workload-based approach to partition the volunteer cloud},
  author = {Stefano Sebastio and Antonio Scala},
  booktitle = {2015 IEEE Conference on Collaboration and Internet Computing (CIC)},
  year = 2015,
  month = Oct,
  pages = {210--218},
  doi = {10.1109/CIC.2015.27},
  keywords = {cloud computing;computer centres;digital simulation;quality of
	service;statistical analysis;volunteer computing;workload-based
	approach;volunteer cloud partitioning;computational resources;cloud
	computing technologies;remote computing capabilities;volunteer computing
	paradigm;volunteer nodes;data centers;cloud platform;dynamic complex
	system;heterogeneous nodes;task execution request;quality of service
	requirements;simulation-based statistical analysis;Google workload data
	trace;Cloud computing;Measurement;Peer-to-peer computing;Quality of
	service;Google;Overlay networks;Computer applications;cloud
	computing;autonomic clouds;autonomous systems;volunteer
	computing;distributed tasks execution},
  abstract = {The growing demand of computational resources has shifted users
	towards the adoption of cloud computing technologies. Cloud allows users
	to transparently access to remote computing capabilities as an
	utility. The volunteer computing paradigm, another ICT trend of the last
	years, can be considered a companion force to enhance the cloud in
	fulfilling specific domain requirements, such as computational intensive
	requests. Combining the spared resources provided by volunteer nodes
	with few data centers is possible to obtain a robust and scalable cloud
	platform. The price for such benefits relies in increased challenges to
	design and manage a dynamic complex system composed by heterogeneous
	nodes. Task execution requests submitted in the volunteer cloud are
	usually associated with Quality of Service requirements e.g., Specified
	through an execution deadline. In this paper, we present a preliminary
	evaluation of a cloud partitioning approach to distribute task execution
	requests in volunteer cloud, that has been validated through a
	simulation-based statistical analysis using the Google workload data
	trace.},
}

@InProceedings{clusterdata:Sirbu2015,
  author = {Alina S{\^\i}rbu and Ozalp Babaoglu},
  title = {Towards Data-Driven Autonomics in Data Centers},
  booktitle = {International Conference on Cloud and Autonomic Computing (ICCAC)},
  year = 2015,
  month = Sep,
  address = {Cambridge, MA, USA},
  publisher = {IEEE Computer Society},
  url = {http://www.cs.unibo.it/babaoglu/papers/pdf/CAC2015.pdf},
  keywords = {Data science; predictive analytics; Google cluster
	trace; log data analysis; failure prediction; machine learning
	classification; ensemble classifier; random forest; BigQuery},
  abstract = {Continued reliance on human operators for managing data centers is
	a major impediment for them from ever reaching extreme dimensions.
	Large computer systems in general, and data centers in particular, will
	ultimately be managed using predictive computational and executable
	models obtained through data-science tools, and at that point, the
	intervention of humans will be limited to setting high-level goals and
	policies rather than performing low-level operations. Data-driven
	autonomics, where management and control are based on holistic
	predictive models that are built and updated using generated data, opens
	one possible path towards limiting the role of operators in data
	centers.  In this paper, we present a data-science study of a public
	Google dataset collected in a 12K-node cluster with the goal of building
	and evaluating a predictive model for node failures.  We use BigQuery,
	the big data SQL platform from the Google Cloud suite, to process
	massive amounts of data and generate a rich feature set characterizing
	machine state over time. We describe how an ensemble classifier can be
	built out of many Random Forest classifiers each trained on these
	features, to predict if machines will fail in a future 24-hour
	window. Our evaluation reveals that if we limit false positive rates to
	5\%, we can achieve true positive rates between 27\% and 88\% with
	precision varying between 50\% and 72\%.  We discuss the practicality of
	including our predictive model as the central component of a data-driven
	autonomic manager and operating it on-line with live data streams
	(rather than off-line on data logs).  All of the scripts used for
	BigQuery and classification analyses are publicly available from the
	authors' website.},
}

@inproceedings{clusterdata:Delgado2015hawk,
  author = {Pamela Delgado and Florin Dinu and Anne-Marie Kermarrec and Willy Zwaenepoel},
  title = {{Hawk}: hybrid datacenter scheduling},
  year = 2015,
  booktitle = {USENIX Annual Technical Conference (USENIX ATC)},
  month = Jul,
  publisher = {USENIX Association},
  pages = {499--510},
  address = {Santa Clara, CA, USA},
  isbn = {978-1-931971-22-5},
  url = {https://www.usenix.org/conference/atc15/technical-session/presentation/delgado},
  abstract = {This paper addresses the problem of efficient scheduling of large
	clusters under high load and heterogeneous workloads. A heterogeneous
	workload typically consists of many short jobs and a small number of
	large jobs that consume the bulk of the cluster's resources.

	Recent work advocates distributed scheduling to overcome the limitations
	of centralized schedulers for large clusters with many competing
	jobs. Such distributed schedulers are inherently scalable, but may make
	poor scheduling decisions because of limited visibility into the overall
	resource usage in the cluster. In particular, we demonstrate that under
	high load, short jobs can fare poorly with such a distributed scheduler.

	We propose instead a new hybrid centralized/distributed scheduler,
	called Hawk. In Hawk, long jobs are scheduled using a centralized
	scheduler, while short ones are scheduled in a fully distributed
	way. Moreover, a small portion of the cluster is reserved for the use of
	short jobs. In order to compensate for the occasional poor decisions
	made by the distributed scheduler, we propose a novel and efficient
	randomized work-stealing algorithm.

	We evaluate Hawk using a trace-driven simulation and a prototype
	implementation in Spark. In particular, using a Google trace, we show
	that under high load, compared to the purely distributed Sparrow
	scheduler, Hawk improves the 50th and 90th percentile runtimes by 80\%
	and 90\% for short jobs and by 35\% and 10\% for long jobs,
	respectively. Measurements of a prototype implementation using Spark on
	a 100-node cluster confirm the results of the simulation.},
}

@article{clusterdata:Sebastio2015,
  author = {Sebastio, Stefano and Amoretti, Michele and Lluch-Lafuente, Alberto},
  title = {{AVOCLOUDY}: a simulator of volunteer clouds},
  journal = {Software: Practice and Experience},
  volume = {46},
  number = {1},
  pages = {3--30},
  year = 2015,
  month = Jan,
  keywords = {cloud computing, volunteer computing, autonomic computing, distributed computing, discrete event simulation},
  doi = {10.1002/spe.2345},
  url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/spe.2345},
  eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/spe.2345},
  abstract = {The increasing demand of computational and storage resources is
	shifting users toward the adoption of cloud technologies. Cloud
	computing is based on the vision of computing as utility, where users no
	more need to buy machines but simply access remote resources made
	available on-demand by cloud providers. The relationship between users
	and providers is defined by a service-level agreement, where the
	non-fulfillment of its terms is regulated by the associated penalty
	fees. Therefore, it is important that the providers adopt proper
	monitoring and managing strategies. Despite their reduced application,
	intelligent agents constitute a feasible technology to add autonomic
	features to cloud operations. Furthermore, the volunteer computing
	paradigm---one of the Information and Communications Technology (ICT)
	trends of the last decade---can be pulled alongside traditional cloud
	approaches, with the purpose to `green' them. Indeed, the
	combination of data center and volunteer resources, managed by agents,
	allows one to obtain a more robust and scalable cloud computing
	platform. The increased challenges in designing such a complex system
	can benefit from a simulation-based approach, to test autonomic
	management solutions before their deployment in the production
	environment. However, currently available simulators of cloud platforms
	are not suitable to model and analyze such heterogeneous, large-scale,
	and highly dynamic systems. We propose the AVOCLOUDY simulator to fill
	this gap. This paper presents the internal architecture of the
	simulator, provides implementation details, summarizes several notable
	applications, and provides experimental results that measure the
	simulator performance and its accuracy. The latter experiments are based
	on real-world worldwide distributed computations on top of the PlanetLab
	platform.}
}

################ 2014

@InProceedings{clusterdata:Iglesias2014:task-estimation,
  author = {Jesus Omana Iglesias and Liam Murphy and De Cauwer, Milan
	and Deepak Mehta and Barry O'Sullivan},
  title = {A methodology for online consolidation of tasks through
	more accurate resource estimations},
  year = 2014,
  month = Dec,
  booktitle = {IEEE/ACM Intl. Conf. on Utility and Cloud Computing (UCC)},
  address = {London, UK},
  abstract = {Cloud providers aim to provide computing services for a wide range
	of applications, such as web applications, emails, web searches, and map
	reduce jobs. These applications are commonly scheduled to run on
	multi-purpose clusters that nowadays are becoming larger and more
	heterogeneous. A major challenge is to efficiently utilize the cluster's
	available resources, in particular to maximize overall machine
	utilization levels while minimizing application waiting time. We studied
	a publicly available trace from a large Google cluster ($\sim$12,000
	machines) and observed that users generally request more resources than
	required for running their tasks, leading to low levels of utilization.
	In this paper, we propose a methodology for achieving an efficient
	utilization of the cluster's resources while providing the users with
	fast and reliable computing services. The methodology consists of three
	main modules: i) a prediction module that forecasts the maximum resource
	requirement of a task; ii) a scalable scheduling module that efficiently
	allocates tasks to machines; and iii) a monitoring module that tracks
	the levels of utilization of the machines and tasks. We present results
	that show that the impact of more accurate resource estimations for the
	scheduling of tasks can lead to an increase in the average utilization
	of the cluster, a reduction in the number of tasks being evicted, and a
	reduction in task waiting time.},
  keywords = {online scheduling, Cloud computing, forecasting, resource provisioning,
	constraint programming},
}

@InProceedings{clusterdata:Balliu2014,
  author = {Alkida Balliu and Dennis Olivetti and Ozalp Babaoglu and
	Moreno Marzolla and Alina S{\^\i}rbu},
  title = {{BiDAl: Big Data Analyzer} for cluster traces},
  year = 2014,
  booktitle = {Informatik Workshop on System Software Support for Big Data (BigSys)},
  month = Sep,
  publisher = {GI-Edition Lecture Notes in Informatics},
  abstract = {Modern data centers that provide Internet-scale services are
	stadium-size structures housing tens of thousands of heterogeneous
	devices (server clusters, networking equipment, power and cooling
	infrastructures) that must operate continuously and reliably.  As part
	of their operation, these devices produce large amounts of data in the
	form of event and error logs that are essential not only for identifying
	problems but also for improving data center efficiency and
	management. These activities employ data analytics and often exploit
	hidden statistical patterns and correlations among different factors
	present in the data.  Uncovering these patterns and correlations is
	challenging due to the sheer volume of data to be analyzed. This paper
	presents BiDAl, a prototype ``log-data analysis framework'' that
	incorporates various Big Data technologies to simplify the analysis of
	data traces from large clusters. BiDAl is written in Java with a modular
	and extensible architecture so that different storage backends
	(currently, HDFS and SQLite are supported), as well as different
	analysis languages (current implementation supports SQL, R and Hadoop
	MapReduce) can be easily selected as appropriate. We present the design
	of BiDAl and describe our experience using it to analyze several public
	traces of Google data clusters for building a simulation model capable
	of reproducing observed behavior.},
}

@InProceedings{clusterdata:Caglar2014,
  author = {Faruk Caglar and Aniruddha Gokhale},
  title = {{iOverbook}: intelligent resource-overbooking to support
	soft real-time applications in the cloud},
  booktitle = {7th IEEE International Conference on Cloud Computing (IEEE CLOUD)},
  year = 2014,
  month = {Jun--Jul},
  address = {Anchorage, AK, USA},
  url = {http://www.dre.vanderbilt.edu/~gokhale/WWW/papers/CLOUD-2014.pdf},
  abstract = {Cloud service providers (CSPs) often overbook their resources
	with user applications despite having to maintain service-level
	agreements with their customers. Overbooking is attractive to CSPs
	because it helps to reduce power consumption in the data center by
	packing more user jobs in less number of resources while improving their
	profits. Overbooking becomes feasible because user applications tend to
	overestimate their resource requirements utilizing only a fraction of
	the allocated resources. Arbitrary resource overbooking ratios, however,
	may be detrimental to soft real-time applications, such as airline
	reservations or Netflix video streaming, which are increasingly hosted
	in the cloud. The changing dynamics of the cloud preclude an offline
	determination of overbooking ratios. To address these concerns, this
	paper presents iOverbook, which uses a machine learning approach to make
	systematic and online determination of overbooking ratios such that the
	quality of service needs of soft real-time systems can be met while
	still benefiting from overbooking. Specifically, iOverbook utilizes
	historic data of tasks and host machines in the cloud to extract their
	resource usage patterns and predict future resource usage along with the
	expected mean performance of host machines. To evaluate our approach, we
	have used a large usage trace made available by Google of one of its
	production data centers. In the context of the traces, our experiments
	show that iOverbook can help CSPs improve their resource utilization by
	an average of 12.5\% and save 32\% power in the data center.},
}

@InProceedings{clusterdata:Sebastio2014,
  title = {A computational field framework for collaborative task
	execution in volunteer clouds},
  author = {Sebastio, Stefano and Amoretti, Michele and Lluch Lafuente, Alberto},
  booktitle = {International Symposium on Software Engineering for
	Adaptive and Self-Managing Systems (SEAMS)},
  year = 2014,
  month = Jun,
  pages = {105--114},
  address = {Hyderabad, India},
  publisher = {ACM},
  isbn = {978-1-4503-2864-7},
  doi = {10.1145/2593929.2593943},
  url = {http://doi.acm.org/10.1145/2593929.2593943},
  keywords = {ant colony optimization, bio-inspired algorithms, cloud computing,
	distributed tasks execution, peer-to-peer, self-* systems, spatial
	computing, volunteer computing},
  abstract = {The increasing diffusion of cloud technologies offers new
	opportunities for distributed and collaborative computing. Volunteer
	clouds are a prominent example, where participants join and leave the
	platform and collaborate by sharing computational resources. The high
	complexity, dynamism and unpredictability of such scenarios call for
	decentralized self-* approaches. We present in this paper a framework
	for the design and evaluation of self-adaptive collaborative task
	execution strategies in volunteer clouds.  As a byproduct, we propose a
	novel strategy based on the Ant Colony Optimization paradigm, that we
	validate through simulation-based statistical analysis over Google
	cluster data.},
}

@inproceedings{clusterdata:Breitgand2014-adaptive,
  title = {An adaptive utilization accelerator for virtualized environments},
  author = {Breitgand, David and Dubitzky, Zvi and Epstein, Amir and
	Feder, Oshrit and Glikson, Alex and Shapira, Inbar and
	Toffetti, Giovanni},
  booktitle = {International Conference on Cloud Engineering (IC2E)},
  pages = {165--174},
  year = 2014,
  month = Mar,
  publisher = {IEEE},
  address = {Boston, MA, USA},
  abstract = {One of the key enablers of a cloud provider competitiveness is
	ability to over-commit shared infrastructure at ratios that are higher
	than those of other competitors, without compromising non-functional
	requirements, such as performance. A widely recognized impediment to
	achieving this goal is so called ``Virtual Machines sprawl'', a
	phenomenon referring to the situation when customers order Virtual
	Machines (VM) on the cloud, use them extensively and then leave them
	inactive for prolonged periods of time. Since a typical cloud
	provisioning system treats new VM provision requests according to the
	nominal virtual hardware specification, an often occurring situation is
	that the nominal resources of a cloud/pool become exhausted fast while
	the physical hosts utilization remains low. We present IBM adaPtive
	UtiLiSation AcceleratoR (IBM PULSAR), a cloud resources scheduler that
	extends OpenStack Nova Filter Scheduler. IBM PULSAR recognises that
	effective safely attainable over-commit ratio varies with time due to
	workloads' variability and dynamically adapts the effective over-commit
	ratio to these changes.},
}

@Article{clusterdata:Zhang2014-Harmony,
  title = {Dynamic heterogeneity-aware resource provisioning in the cloud},
  author = {Qi Zhang and Mohamed Faten Zhani and Raouf Boutaba and
	Joseph L Hellerstein},
  journal = {IEEE Transactions on Cloud Computing (TCC)},
  volume = 2,
  number = 1,
  year = 2014,
  month = Mar,
  abstract = {Data centers consume tremendous amounts of energy in terms of
	power distribution and cooling. Dynamic capacity provisioning is a
	promising approach for reducing energy consumption by dynamically
	adjusting the number of active machines to match resource
	demands. However, despite extensive studies of the problem, existing
	solutions have not fully considered the heterogeneity of both workload
	and machine hardware found in production environments. In particular,
	production data centers often comprise heterogeneous machines with
	different capacities and energy consumption characteristics. Meanwhile,
	the production cloud workloads typically consist of diverse applications
	with different priorities, performance and resource
	requirements. Failure to consider the heterogeneity of both machines and
	workloads will lead to both sub-optimal energy-savings and long
	scheduling delays, due to incompatibility between workload requirements
	and the resources offered by the provisioned machines. To address this
	limitation, we present Harmony, a Heterogeneity-Aware dynamic capacity
	provisioning scheme for cloud data centers. Specifically, we first use
	the K-means clustering algorithm to divide workload into distinct task
	classes with similar characteristics in terms of resource and
	performance requirements. Then we present a technique that dynamically
	adjusting the number of machines to minimize total energy consumption
	and scheduling delay. Simulations using traces from a Google's compute
	cluster demonstrate Harmony can reduce energy by 28 percent compared to
	heterogeneity-oblivious solutions.},
}

################ 2013

@INPROCEEDINGS{clusterdata:Di2013a,
  title = {Optimization of cloud task processing with checkpoint-restart mechanism},
  author = {Di, Sheng and Robert, Yves and Vivien, Fr{\'e}d{\'e}ric and
	Kondo, Derrick and Wang, Cho-Li and Cappello, Franck},
  booktitle = {25th International Conference on High Performance
	Computing, Networking, Storage and Analysis (SC)},
  year = 2013,
  month = Nov,
  address = {Denver, CO, USA},
  abstract = {In this paper, we aim at optimizing fault-tolerance techniques
	based on a checkpointing/restart mechanism, in the context of cloud
	computing. Our contribution is three-fold. (1) We derive a fresh formula
	to compute the optimal number of checkpoints for cloud jobs with varied
	distributions of failure events. Our analysis is not only generic with
	no assumption on failure probability distribution, but also attractively
	simple to apply in practice. (2) We design an adaptive algorithm to
	optimize the impact of checkpointing regarding various costs like
	checkpointing/restart overhead. (3) We evaluate our optimized solution
	in a real cluster environment with hundreds of virtual machines and
	Berkeley Lab Checkpoint/Restart tool. Task failure events are emulated
	via a production trace produced on a large-scale Google data
	center. Experiments confirm that our solution is fairly suitable for
	Google systems. Our optimized formula outperforms Young's formula by
	3--10 percent, reducing wallclock lengths by 50--100 seconds per job on
	average.},
}

@InProceedings{clusterdata:Qiang2013-anomaly,
  title = {Adaptive Anomaly Identification by Exploring Metric
	Subspace in Cloud Computing Infrastructures},
  author = {Qiang Guan and Song Fu},
  booktitle = {32nd IEEE Symposium on Reliable Distributed Systems (SRDS)},
  pages = {205--214},
  address = {Braga, Portugal},
  month = Sep,
  year = 2013,
  abstract = {Cloud computing has become increasingly popular by obviating the
	need for users to own and maintain complex computing
	infrastructures. However, due to their inherent complexity and large
	scale, production cloud computing systems are prone to various runtime
	problems caused by hardware and software faults and environmental
	factors. Autonomic anomaly detection is a crucial technique for
	understanding emergent, cloud-wide phenomena and self-managing cloud
	resources for system-level dependability assurance. To detect anomalous
	cloud behaviors, we need to monitor the cloud execution and collect
	runtime cloud performance data. These data consist of values of
	performance metrics for different types of failures, which display
	different correlations with the performance metrics. In this paper, we
	present an adaptive anomaly identification mechanism that explores the
	most relevant principal components of different failure types in cloud
	computing infrastructures. It integrates the cloud performance metric
	analysis with filtering techniques to achieve automated, efficient, and
	accurate anomaly identification. The proposed mechanism adapts itself by
	recursively learning from the newly verified detection results to refine
	future detections. We have implemented a prototype of the anomaly
	identification system and conducted experiments in an on-campus cloud
	computing environment and by using the Google data center traces. Our
	experimental results show that our mechanism can achieve more efficient
	and accurate anomaly detection than other existing schemes.},
}

@InProceedings{clusterdata:Zhani2013-HARMONY,
  title = {{HARMONY}: dynamic heterogeneity-aware resource provisioning in the cloud},
  author = {Qi Zhang and Mohamed Faten Zhani and Raouf Boutaba and
	Joseph L. Hellerstein},
  booktitle = {The 33rd International Conference on Distributed Computing Systems (ICDCS)},
  year = 2013,
  pages = {510--519},
  month = Jul,
  address = {Philadelphia, PA, USA},
  abstract = {Data centers today consume tremendous amount of energy in terms
	of power distribution and cooling. Dynamic capacity provisioning is a
	promising approach for reducing energy consumption by dynamically
	adjusting the number of active machines to match resource
	demands. However, despite extensive studies of the problem, existing
	solutions for dynamic capacity provisioning have not fully considered
	the heterogeneity of both workload and machine hardware found in
	production environments. In particular, production data centers often
	comprise several generations of machines with different capacities,
	capabilities and energy consumption characteristics. Meanwhile, the
	workloads running in these data centers typically consist of a wide
	variety of applications with different priorities, performance
	objectives and resource requirements. Failure to consider heterogenous
	characteristics will lead to both sub-optimal energy-savings and long
	scheduling delays, due to incompatibility between workload requirements
	and the resources offered by the provisioned machines. To address this
	limitation, in this paper we present HARMONY, a Heterogeneity-Aware
	Resource Management System for dynamic capacity provisioning in cloud
	computing environments. Specifically, we first use the K-means
	clustering algorithm to divide the workload into distinct task classes
	with similar characteristics in terms of resource and performance
	requirements. Then we present a novel technique for dynamically
	adjusting the number of machines of each type to minimize total energy
	consumption and performance penalty in terms of scheduling
	delay. Through simulations using real traces from Google's compute
	clusters, we found that our approach can improve data center energy
	efficiency by up to 28\% compared to heterogeneity-oblivious
	solutions.},
}

# Cooperative task execution in autonomic clouds; evaluated in the DEUS
# simulator with workload data from the Google Cloud Backend.
@INPROCEEDINGS{clusterdata:Amoretti2013,
  title = {A cooperative approach for distributed task execution in autonomic clouds},
  author = {Amoretti, M. and Lafuente, A. L. and Sebastio, S.},
  booktitle = {21st Euromicro International Conference on Parallel,
	Distributed and Network-Based Processing (PDP)},
  publisher = {IEEE},
  year = 2013,
  month = Feb,
  pages = {274--281},
  abstract = {Virtualization and distributed computing are two key pillars that
	guarantee scalability of applications deployed in the Cloud. In
	Autonomous Cooperative Cloud-based Platforms, autonomous computing nodes
	cooperate to offer a PaaS Cloud for the deployment of user
	applications. Each node must allocate the necessary resources for
	applications to be executed with certain QoS guarantees. If the QoS of
	an application cannot be guaranteed a node has mainly two options: to
	allocate more resources (if it is possible) or to rely on the
	collaboration of other nodes. Making a decision is not trivial since it
	involves many factors (e.g. the cost of setting up virtual machines,
	migrating applications, discovering collaborators). In this paper we
	present a model of such scenarios and experimental results validating
	the convenience of cooperative strategies over selfish ones, where nodes
	do not help each other. We describe the architecture of the platform of
	autonomous clouds and the main features of the model, which has been
	implemented and evaluated in the DEUS discrete-event simulator. From the
	experimental evaluation, based on workload data from the Google Cloud
	Backend, we can conclude that (modulo our assumptions and
	simplifications) the performance of a volunteer cloud can be compared to
	that of a Google Cluster.},
  doi = {10.1109/PDP.2013.47},
  issn = {1066-6192},
  address = {Belfast, UK},
  url = {http://doi.ieeecomputersociety.org/10.1109/PDP.2013.47},
}

################ 2012

# Bayesian host-load prediction, evaluated on a one-month trace of a
# Google data center.
@INPROCEEDINGS{clusterdata:Di2012b,
  author    = {Di, Sheng and Kondo, Derrick and Cirne, Walfredo},
  title     = {Host load prediction in a {Google} compute cloud with a {Bayesian} model},
  booktitle = {International Conference on High Performance Computing,
	Networking, Storage and Analysis (SC)},
  year      = 2012,
  month     = Nov,
  address   = {Salt Lake City, UT, USA},
  pages     = {21:1--21:11},
  publisher = {IEEE Computer Society Press},
  isbn      = {978-1-4673-0804-5},
  url       = {http://dl.acm.org/citation.cfm?id=2388996.2389025},
  abstract  = {Prediction of host load in Cloud systems is critical for achieving
	service-level agreements. However, accurate prediction of host load in
	Clouds is extremely challenging because it fluctuates drastically at
	small timescales. We design a prediction method based on Bayes model to
	predict the mean load over a long-term time interval, as well as the
	mean load in consecutive future time intervals. We identify novel
	predictive features of host load that capture the expectation,
	predictability, trends and patterns of host load. We also determine the
	most effective combinations of these features for prediction. We
	evaluate our method using a detailed one-month trace of a Google data
	center with thousands of machines. Experiments show that the Bayes
	method achieves high accuracy with a mean squared error of
	0.0014. Moreover, the Bayes method improves the load prediction accuracy
	by 5.6--50\% compared to other state-of-the-art methods based on moving
	averages, auto-regression, and/or noise filters.},
}

# Control-theoretic (MPC) dynamic capacity provisioning, simulated with
# workload traces from Google's compute clusters.
@INPROCEEDINGS{clusterdata:Zhang2012,
  author    = {Zhang, Qi and Zhani, Mohamed Faten and Zhang, Shuo and
	Zhu, Quanyan and Boutaba, Raouf and Hellerstein, Joseph L.},
  title     = {Dynamic energy-aware capacity provisioning for cloud computing environments},
  booktitle = {9th ACM International Conference on Autonomic Computing (ICAC)},
  year      = 2012,
  month     = Sep,
  address   = {San Jose, CA, USA},
  pages     = {145--154},
  publisher = {ACM},
  isbn      = {978-1-4503-1520-3},
  doi       = {10.1145/2371536.2371562},
  acmid     = {2371562},
  keywords  = {cloud computing, energy management, model predictive
	control, resource management},
  abstract  = {Data centers have recently gained significant popularity as a
	cost-effective platform for hosting large-scale service
	applications. While large data centers enjoy economies of scale by
	amortizing initial capital investment over large number of machines,
	they also incur tremendous energy cost in terms of power distribution
	and cooling. An effective approach for saving energy in data centers is
	to adjust dynamically the data center capacity by turning off unused
	machines. However, this dynamic capacity provisioning problem is known
	to be challenging as it requires a careful understanding of the resource
	demand characteristics as well as considerations to various cost
	factors, including task scheduling delay, machine reconfiguration cost
	and electricity price fluctuation.  In this paper, we provide a
	control-theoretic solution to the dynamic capacity provisioning problem
	that minimizes the total energy cost while meeting the performance
	objective in terms of task scheduling delay. Specifically, we model this
	problem as a constrained discrete-time optimal control problem, and use
	Model Predictive Control (MPC) to find the optimal control
	policy. Through extensive analysis and simulation using real workload
	traces from Google's compute clusters, we show that our proposed
	framework can achieve significant reduction in energy cost, while
	maintaining an acceptable average scheduling delay for individual
	tasks.},
}

# Hybrid reactive-adaptive elasticity controller, evaluated by simulation
# with Web and cluster workload traces.
@INPROCEEDINGS{clusterdata:Ali-Eldin2012,
  title = {Efficient provisioning of bursty scientific workloads on the
	cloud using adaptive elasticity control},
  author = {Ahmed Ali-Eldin and Maria Kihl and Johan Tordsson and Erik Elmroth},
  booktitle = {3rd Workshop on Scientific Cloud Computing (ScienceCloud)},
  year = 2012,
  month = Jun,
  address = {Delft, The Netherlands},
  isbn = {978-1-4503-1340-7},
  pages = {31--40},
  url = {http://dl.acm.org/citation.cfm?id=2287044},
  doi = {10.1145/2287036.2287044},
  publisher = {ACM},
  abstract = {Elasticity is the ability of a cloud infrastructure to dynamically
	change the amount of resources allocated to a running service as load
	changes. We build an autonomous elasticity controller that changes the
	number of virtual machines allocated to a service based on both
	monitored load changes and predictions of future load. The cloud
	infrastructure is modeled as a G/G/N queue. This model is used to
	construct a hybrid reactive-adaptive controller that quickly reacts to
	sudden load changes, prevents premature release of resources, takes into
	account the heterogeneity of the workload, and avoids
	oscillations. Using simulations with Web and cluster workload traces, we
	show that our proposed controller lowers the number of delayed requests
	by a factor of 70 for the Web traces and 3 for the cluster traces when
	compared to a reactive controller. Our controller also decreases the
	average number of queued requests by a factor of 3 for both traces, and
	reduces oscillations by a factor of 7 for the Web traces and 3 for the
	cluster traces. This comes at the expense of between 20\% and 30\%
	over-provisioning, as compared to a few percent for the reactive
	controller.},
}

################ 2011

# Task placement constraints in Google compute clusters and how to
# incorporate them into performance benchmarks.
@INPROCEEDINGS{clusterdata:Sharma2011,
  author    = {Sharma, Bikash and Chudnovsky, Victor and Hellerstein,
	Joseph L. and Rifaat, Rasekh and Das, Chita R.},
  title     = {Modeling and synthesizing task placement constraints in
	{Google} compute clusters},
  booktitle = {2nd ACM Symposium on Cloud Computing (SoCC)},
  year      = 2011,
  month     = Oct,
  address   = {Cascais, Portugal},
  pages     = {3:1--3:14},
  publisher = {ACM},
  isbn      = {978-1-4503-0976-9},
  doi       = {10.1145/2038916.2038919},
  url       = {http://doi.acm.org/10.1145/2038916.2038919},
  keywords  = {benchmarking, benchmarks, metrics, modeling, performance
	evaluation, workload characterization},
  abstract  = {Evaluating the performance of large compute clusters requires
	benchmarks with representative workloads. At Google, performance
	benchmarks are used to obtain performance metrics such as task
	scheduling delays and machine resource utilizations to assess changes in
	application codes, machine configurations, and scheduling
	algorithms. Existing approaches to workload characterization for high
	performance computing and grids focus on task resource requirements for
	CPU, memory, disk, I/O, network, etc. Such resource requirements address
	how much resource is consumed by a task. However, in addition to
	resource requirements, Google workloads commonly include task placement
	constraints that determine which machine resources are consumed by
	tasks. Task placement constraints arise because of task dependencies
	such as those related to hardware architecture and kernel version.

	This paper develops methodologies for incorporating task placement
	constraints and machine properties into performance benchmarks of large
	compute clusters. Our studies of Google compute clusters show that
	constraints increase average task scheduling delays by a factor of 2 to
	6, which often results in tens of minutes of additional task wait
	time. To understand why, we extend the concept of resource utilization
	to include constraints by introducing a new metric, the Utilization
	Multiplier (UM). UM is the ratio of the resource utilization seen by
	tasks with a constraint to the average utilization of the resource. UM
	provides a simple model of the performance impact of constraints in that
	task scheduling delays increase with UM. Last, we describe how to
	synthesize representative task constraints and machine properties, and
	how to incorporate this synthesis into existing performance
	benchmarks. Using synthetic task constraints and machine properties
	generated by our methodology, we accurately reproduce performance
	metrics for benchmarks of Google compute clusters with a discrepancy of
	only 13\% in task scheduling delay and 5\% in resource utilization.},
}

# Analysis of publicly available workload traces (e.g., from Google) as a
# basis for realistic Hadoop workloads.
@INPROCEEDINGS{clusterdata:Wang2011,
  author    = {Wang, Guanying and Butt, Ali R. and Monti, Henry and Gupta, Karan},
  title     = {Towards synthesizing realistic workload traces for studying
	the {Hadoop} ecosystem},
  booktitle = {19th IEEE Annual International Symposium on Modelling,
	Analysis, and Simulation of Computer and Telecommunication
	Systems (MASCOTS)},
  year      = 2011,
  month     = Jul,
  address   = {Raffles Hotel, Singapore},
  pages     = {400--408},
  publisher = {IEEE Computer Society},
  isbn      = {978-0-7695-4430-4},
  doi       = {10.1109/MASCOTS.2011.59},
  url       = {http://people.cs.vt.edu/~butta/docs/mascots11-hadooptrace.pdf},
  keywords  = {Cloud computing, Performance analysis, Design
	optimization, Software performance modeling},
  abstract  = {Designing cloud computing setups is a challenging task. It
	involves understanding the impact of a plethora of parameters ranging
	from cluster configuration, partitioning, networking characteristics,
	and the targeted applications' behavior. The design space, and the scale
	of the clusters, make it cumbersome and error-prone to test different
	cluster configurations using real setups. Thus, the community is
	increasingly relying on simulations and models of cloud setups to infer
	system behavior and the impact of design choices. The accuracy of the
	results from such approaches depends on the accuracy and realistic
	nature of the workload traces employed. Unfortunately, few cloud
	workload traces are available (in the public domain). In this paper, we
	present the key steps towards analyzing the traces that have been made
	public, e.g., from Google, and inferring lessons that can be used to
	design realistic cloud workloads as well as enable thorough quantitative
	studies of Hadoop design. Moreover, we leverage the lessons learned from
	the traces to undertake two case studies: (i) Evaluating Hadoop job
	schedulers, and (ii) Quantifying the impact of shared storage on Hadoop
	system performance.},
}