Skip to content

Commit c2faa9c

Browse files
author
alrossi
committed
(2.6) webadmin: fix exit logic in billing refresh loop
Currently the thread which refreshes the billing plots will exit if it encounters an unspecified error from the billing service. But this behavior does not take into account slow start-up of domains (i.e., the billing service may not be there yet but will be eventually).

The patch fixes the logic to treat NoRouteToCell exceptions differently by waiting for a short time and retrying.

Testing done: On deployed service without billing, and then with billing booted.

Target: 2.6
Patch: https://rb.dcache.org/r/7306
Acked-by: Gerd
Committed: b315fc0
Require-note: yes
Require-book: no

RELEASE NOTES: Fixes a bug where a simple timeout (NoRouteToCell) causes an exit from the billing service refresh loop requiring a restart of the domain in order to reconnect.
1 parent 3f5d2f0 commit c2faa9c

File tree

2 files changed

+95
-57
lines changed

2 files changed

+95
-57
lines changed

modules/webadmin/src/main/java/org/dcache/webadmin/controller/IBillingService.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
6161

6262
import java.io.File;
6363

64+
import javax.naming.ServiceUnavailableException;
65+
66+
import dmg.cells.nucleus.NoRouteToCellException;
67+
6468
/**
6569
* Provides plot images to billing page.
6670
*
@@ -83,5 +87,5 @@ public interface IBillingService {
8387

8488
void initialize();
8589

86-
void refresh();
90+
void refresh() throws NoRouteToCellException, ServiceUnavailableException;
8791
}

modules/webadmin/src/main/java/org/dcache/webadmin/controller/impl/StandardBillingService.java

Lines changed: 90 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
5959
*/
6060
package org.dcache.webadmin.controller.impl;
6161

62+
import com.google.common.base.Throwables;
63+
import com.google.common.util.concurrent.RateLimiter;
64+
import org.apache.wicket.util.lang.Exceptions;
6265
import org.slf4j.Logger;
6366
import org.slf4j.LoggerFactory;
6467

@@ -77,6 +80,8 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
7780
import java.util.Properties;
7881
import java.util.concurrent.TimeUnit;
7982

83+
import dmg.cells.nucleus.NoRouteToCellException;
84+
8085
import org.dcache.cells.CellStub;
8186
import org.dcache.services.billing.histograms.ITimeFrameHistogramFactory;
8287
import org.dcache.services.billing.histograms.ITimeFrameHistogramFactory.Style;
@@ -106,6 +111,7 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
106111
*/
107112
public final class StandardBillingService implements IBillingService, Runnable {
108113
private static final Logger logger = LoggerFactory.getLogger(StandardBillingService.class);
114+
private static final double ERRORS_PER_SECOND = 1.0 / 120.0;
109115

110116
/**
111117
* injected
@@ -135,48 +141,78 @@ public final class StandardBillingService implements IBillingService, Runnable {
135141
* refreshing can be done periodically by the daemon, or forced
136142
* through the web interface directly
137143
*/
144+
private final RateLimiter rate = RateLimiter.create(ERRORS_PER_SECOND);
145+
138146
private long timeout;
139147
private int popupWidth;
140148
private int popupHeight;
141149
private long lastUpdate = System.currentTimeMillis();
142150
private Thread refresher;
143151

144152
public List<TimeFrameHistogramData> load(PlotType plotType,
145-
TimeFrame timeFrame) {
153+
TimeFrame timeFrame) throws NoRouteToCellException,
154+
ServiceUnavailableException {
146155
logger.debug("remote fetch of {} {}", plotType, timeFrame);
147156
List<TimeFrameHistogramData> histograms = new ArrayList<>();
148-
switch (plotType) {
149-
case BYTES_READ:
150-
add(client.getDcBytesHistogram(timeFrame, false), histograms);
151-
add(client.getHsmBytesHistogram(timeFrame, false), histograms);
152-
break;
153-
case BYTES_WRITTEN:
154-
add(client.getDcBytesHistogram(timeFrame, true), histograms);
155-
add(client.getHsmBytesHistogram(timeFrame, true), histograms);
156-
break;
157-
case BYTES_P2P:
158-
add(client.getP2pBytesHistogram(timeFrame), histograms);
159-
break;
160-
case TRANSFERS_READ:
161-
add(client.getDcTransfersHistogram(timeFrame, false),
162-
histograms);
163-
add(client.getHsmTransfersHistogram(timeFrame, false),
164-
histograms);
165-
break;
166-
case TRANSFERS_WRITTEN:
167-
add(client.getDcTransfersHistogram(timeFrame, true), histograms);
168-
add(client.getHsmTransfersHistogram(timeFrame, true),
169-
histograms);
170-
break;
171-
case TRANSFERS_P2P:
172-
add(client.getP2pTransfersHistogram(timeFrame), histograms);
173-
break;
174-
case CONNECTION_TIME:
175-
add(client.getDcConnectTimeHistograms(timeFrame), histograms);
176-
break;
177-
case CACHE_HITS:
178-
add(client.getHitHistograms(timeFrame), histograms);
179-
break;
157+
try {
158+
switch (plotType) {
159+
case BYTES_READ:
160+
add(client.getDcBytesHistogram(timeFrame, false),
161+
histograms);
162+
add(client.getHsmBytesHistogram(timeFrame, false),
163+
histograms);
164+
break;
165+
case BYTES_WRITTEN:
166+
add(client.getDcBytesHistogram(timeFrame, true),
167+
histograms);
168+
add(client.getHsmBytesHistogram(timeFrame, true),
169+
histograms);
170+
break;
171+
case BYTES_P2P:
172+
add(client.getP2pBytesHistogram(timeFrame),
173+
histograms);
174+
break;
175+
case TRANSFERS_READ:
176+
add(client.getDcTransfersHistogram(timeFrame, false),
177+
histograms);
178+
add(client.getHsmTransfersHistogram(timeFrame, false),
179+
histograms);
180+
break;
181+
case TRANSFERS_WRITTEN:
182+
add(client.getDcTransfersHistogram(timeFrame, true),
183+
histograms);
184+
add(client.getHsmTransfersHistogram(timeFrame, true),
185+
histograms);
186+
break;
187+
case TRANSFERS_P2P:
188+
add(client.getP2pTransfersHistogram(timeFrame),
189+
histograms);
190+
break;
191+
case CONNECTION_TIME:
192+
add(client.getDcConnectTimeHistograms(timeFrame),
193+
histograms);
194+
break;
195+
case CACHE_HITS:
196+
add(client.getHitHistograms(timeFrame),
197+
histograms);
198+
break;
199+
}
200+
} catch (UndeclaredThrowableException ute) {
201+
Throwable cause
202+
= Exceptions.findCause(ute, ServiceUnavailableException.class);
203+
if (cause != null) {
204+
throw (ServiceUnavailableException)cause;
205+
}
206+
cause = Exceptions.findCause(ute, NoRouteToCellException.class);
207+
if (cause != null) {
208+
throw (NoRouteToCellException)cause;
209+
}
210+
cause = ute.getCause();
211+
Throwables.propagateIfPossible(cause);
212+
throw new RuntimeException("Unexpected error: "
213+
+ "this is probably a bug. Please report "
214+
+ "to the dCache team.",
215+
cause);
180216
}
181217
return histograms;
182218
}
@@ -288,7 +324,8 @@ public void initialize() {
288324
}
289325

290326
@Override
291-
public void refresh() {
327+
public void refresh() throws NoRouteToCellException,
328+
ServiceUnavailableException{
292329
TimeFrame[] timeFrames = generateTimeFrames();
293330
for (int tFrame = 0; tFrame < timeFrames.length; tFrame++) {
294331
Date low = timeFrames[tFrame].getLow();
@@ -304,29 +341,25 @@ public void refresh() {
304341
@Override
305342
public void run() {
306343
try {
307-
while(true) {
308-
refresh();
309-
Thread.sleep(timeout);
344+
while (true) {
345+
try {
346+
refresh();
347+
Thread.sleep(timeout);
348+
} catch (ServiceUnavailableException e) {
349+
logger.error("The billing database has been disabled."
350+
+ " To generate plots, please restart the service when"
351+
+ " the billing database is once again available");
352+
break;
353+
} catch (NoRouteToCellException e) {
354+
if (rate.tryAcquire()) {
355+
logger.warn("No route to the billing service yet; "
356+
+ "retrying every 10 seconds");
357+
}
358+
Thread.sleep(TimeUnit.SECONDS.toMillis(10));
359+
}
310360
}
311361
} catch (InterruptedException interrupted) {
312362
logger.trace("{} interrupted; exiting ...", refresher);
313-
} catch (UndeclaredThrowableException ute) {
314-
Throwable cause = ute.getCause();
315-
if (cause instanceof ServiceUnavailableException) {
316-
logger.error("The billing database has been disabled. "
317-
+ "To generate plots, please restart the service when "
318-
+ "the billing database is once again available");
319-
} else if (cause instanceof Error) {
320-
throw ute;
321-
}
322-
323-
/*
324-
* if the service can't handle the client's requests, then we
325-
* back out here because there is nothing we can do
326-
*/
327-
logger.error("fatal billing request exception; "
328-
+ "client loop is exiting");
329-
logger.debug("refresh", ute);
330363
}
331364
}
332365

@@ -361,7 +394,8 @@ public void shutDown() {
361394
}
362395

363396
private void generatePlot(PlotType type, TimeFrame timeFrame,
364-
String fileName, String title) {
397+
String fileName, String title) throws ServiceUnavailableException,
398+
NoRouteToCellException {
365399
List<TimeFrameHistogramData> data = load(type, timeFrame);
366400
List<HistogramWrapper<?>> config = new ArrayList<>();
367401
int i = 0;
@@ -452,4 +486,4 @@ private void synchronizeTimeFramePlotProperties() {
452486

453487
logger.debug("plot properties are {}", properties.toJavaProperties());
454488
}
455-
}
489+
}

0 commit comments

Comments
 (0)