Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding BibTeX refs of SLearn (Cluster Scheduler) papers in bibliography.bib #29

Merged
merged 1 commit into from
Jun 26, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 107 additions & 2 deletions bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,6 @@ @InProceedings{clusterdata:Reiss2012
If you just want one citation about the Cluster2011 trace, then
use \cite{clusterdata:Reiss2012b}.



################ 2022
@inproceedings {clusterdata:jajooSLearn2022,
author = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
Expand Down Expand Up @@ -617,6 +615,113 @@ @Article{clusterdata:Mishra2010
use the traces as inputs, e.g., in simulations or load predictions.
Order: most recent first.

################ 2023
@article{clusterdata:jajooSLearnTCC2023,
  author   = {Jajoo, Akshay and Hu, Y. Charlie and Lin, Xiaojun and Deng, Nan},
  title    = {{SLearn}: A Case for Task Sampling Based Learning for Cluster Job Scheduling},
  journal  = {IEEE Transactions on Cloud Computing},
  year     = {2023},
  volume   = {11},
  number   = {3},
  pages    = {2664--2680},
  publisher = {IEEE},
  keywords = {data centers, big data, job scheduling, learning, online learning},
  abstract = {The ability to accurately estimate job runtime properties allows a
scheduler to effectively schedule jobs. State-of-the-art online cluster
job schedulers use history-based learning, which uses past job execution
information to estimate the runtime properties of newly arrived jobs.
However, with fast-paced development in cluster technology (in both hardware
and software) and changing user inputs, job runtime properties can change over
time, which lead to inaccurate predictions. In this article, we explore the
potential and limitation of real-time learning of job runtime properties,
by proactively sampling and scheduling a small fraction of the tasks of
each job. Such a task-sampling-based approach exploits the similarity among
runtime properties of the tasks of the same job and is inherently immune to
changing job behavior. Our analytical and experimental analysis of 3 production
traces with different skew and job distribution shows that learning in space can
be substantially more accurate. Our simulation and testbed evaluation on Azure of
the two learning approaches anchored in a generic job scheduler using 3 production
cluster job traces shows that despite its online overhead, learning in space reduces
the average Job Completion Time (JCT) by 1.28x, 1.56x, and 1.32x compared to the
prior-art history-based predictor. We further analyze the experimental results to
give intuitive explanations to why learning in space outperforms learning in time
in these experiments. Finally, we show how sampling-based learning can be extended
to schedule DAG jobs and achieve similar speedups over the prior-art history-based
predictor.},
  doi      = {10.1109/TCC.2022.3222649},
}


################ 2022
@inproceedings{clusterdata:jajooSLearnNSDI2022,
  author    = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
  title     = {A Case for Task Sampling based Learning for Cluster Job Scheduling},
  booktitle = {19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)},
  year      = {2022},
  address   = {Renton, WA, USA},
  publisher = {USENIX Association},
  url       = {https://www.usenix.org/conference/nsdi22/presentation/jajoo},
  keywords  = {data centers, big data, job scheduling, learning, online learning},
  abstract  = {The ability to accurately estimate job runtime properties allows a
scheduler to effectively schedule jobs. State-of-the-art online cluster job
schedulers use history-based learning, which uses past job execution information
to estimate the runtime properties of newly arrived jobs. However, with fast-paced
development in cluster technology (in both hardware and software) and changing user
inputs, job runtime properties can change over time, which lead to inaccurate predictions.
In this paper, we explore the potential and limitation of real-time learning of job
runtime properties, by proactively sampling and scheduling a small fraction of the
tasks of each job. Such a task-sampling-based approach exploits the similarity among
runtime properties of the tasks of the same job and is inherently immune to changing
job behavior. Our study focuses on two key questions in comparing task-sampling-based
learning (learning in space) and history-based learning (learning in time): (1) Can
learning in space be more accurate than learning in time? (2) If so, can delaying
scheduling the remaining tasks of a job till the completion of sampled tasks be more
than compensated by the improved accuracy and result in improved job performance? Our
analytical and experimental analysis of 3 production traces with different skew and job
distribution shows that learning in space can be substantially more accurate. Our
simulation and testbed evaluation on Azure of the two learning approaches anchored in a
generic job scheduler using 3 production cluster job traces shows that despite its online
overhead, learning in space reduces the average Job Completion Time (JCT) by 1.28x, 1.56x,
and 1.32x compared to the prior-art history-based predictor.},
}


################ 2021
@article{clusterdata:jajooSLearnTechReport2021,
  author     = {Akshay Jajoo and Y. Charlie Hu and Xiaojun Lin and Nan Deng},
  title      = {The Case for Task Sampling based Learning for Cluster Job Scheduling},
  journal    = {Computing Research Repository},
  volume     = {abs/2108.10464},
  year       = {2021},
  url        = {https://arxiv.org/abs/2108.10464},
  eprinttype = {arXiv},
  eprint     = {2108.10464},
  timestamp  = {Fri, 27 Aug 2021 15:02:29 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2108-10464.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  keywords   = {data centers, big data, job scheduling, learning, online learning},
  abstract   = {The ability to accurately estimate job runtime properties allows a
scheduler to effectively schedule jobs. State-of-the-art online cluster job
schedulers use history-based learning, which uses past job execution information
to estimate the runtime properties of newly arrived jobs. However, with fast-paced
development in cluster technology (in both hardware and software) and changing user
inputs, job runtime properties can change over time, which lead to inaccurate predictions.
In this paper, we explore the potential and limitation of real-time learning of job
runtime properties, by proactively sampling and scheduling a small fraction of the
tasks of each job. Such a task-sampling-based approach exploits the similarity among
runtime properties of the tasks of the same job and is inherently immune to changing
job behavior. Our study focuses on two key questions in comparing task-sampling-based
learning (learning in space) and history-based learning (learning in time): (1) Can
learning in space be more accurate than learning in time? (2) If so, can delaying
scheduling the remaining tasks of a job till the completion of sampled tasks be more
than compensated by the improved accuracy and result in improved job performance? Our
analytical and experimental analysis of 3 production traces with different skew and job
distribution shows that learning in space can be substantially more accurate. Our
simulation and testbed evaluation on Azure of the two learning approaches anchored in a
generic job scheduler using 3 production cluster job traces shows that despite its online
overhead, learning in space reduces the average Job Completion Time (JCT) by 1.28x, 1.56x,
and 1.32x compared to the prior-art history-based predictor.},
}

################ 2020

@INPROCEEDINGS{clusterdata:Lin2020,
Expand Down