Skip to content

Commit

Permalink
chimera: introduce lazy update of parent directory attributes
Browse files Browse the repository at this point in the history
Motivation:
Updating the parent directory attributes on create is a bottleneck
when multiple clients create files in a single directory. To improve
the file creation rate, such updates can be batched so that only the
aggregated result is applied.

Modification:
Introduce a new table t_lazy_wcc that is populated on create with
timestamp, nlink count change and inumber. A periodic thread merges
the file system changes into the t_inodes table. A Java property allows
to enable/disable this functionality, so that the NFS door might enforce
strong consistency, if needed, while the other doors might relax it.

The nlink count of a directory is incremented for subdirectories only. A
file/dir removal uses a `NOT EXISTS` sub-query, which performs well
on directories, as there is sufficient entropy for correct statistics.

Unit tests adjusted to match new behaviour.

Result:
Better throughput with multiple clients into single directory.

64 threads creating in one (all-in-one) or different (one-per-thread) directories
Benchmark                     (dir)   (wcc)   Mode  Cnt     Score     Error  Units
benchmarkCreateFile      all-in-one    weak  thrpt   10  1263.034 ± 392.351  ops/s
benchmarkCreateFile      all-in-one  strong  thrpt   10   243.822 ±  33.384  ops/s
benchmarkCreateFile  one-per-thread    weak  thrpt   10  1142.829 ± 432.221  ops/s
benchmarkCreateFile  one-per-thread  strong  thrpt   10  1227.483 ± 178.552  ops/s

NOTE: on existing installations the nlink count might run out of sync, but
as nlink is no longer used, this discrepancy can be ignored.

Acked-by: Dmitry Litvintsev
Target: master
Require-book: no
Require-notes: yes
  • Loading branch information
kofemann committed Apr 28, 2023
1 parent 7565591 commit f3286f9
Show file tree
Hide file tree
Showing 8 changed files with 478 additions and 126 deletions.
@@ -0,0 +1,185 @@
package org.dcache.chimera;


import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.UUID.randomUUID;

import com.google.common.io.Resources;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Properties;
import java.util.UUID;
import liquibase.Liquibase;
import liquibase.database.Database;
import liquibase.database.DatabaseFactory;
import liquibase.database.jvm.JdbcConnection;
import liquibase.exception.LiquibaseException;
import liquibase.resource.ClassLoaderResourceAccessor;
import org.dcache.chimera.CreateBenchmark.DB.ThreadCtx;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.slf4j.bridge.SLF4JBridgeHandler;
import org.springframework.jdbc.datasource.DataSourceTransactionManager;
import org.springframework.transaction.PlatformTransactionManager;

/**
 * JMH throughput benchmark for entry creation (and removal) in Chimera.
 *
 * <p>Every benchmark method runs with 64 threads that either share a single
 * directory ("all-in-one") or each work in their own directory
 * ("one-per-thread"), see {@link DB.ThreadCtx}.
 *
 * <p>The database described by {@code chimera-benchmark.properties} is wiped
 * (the {@code public} schema is dropped and re-created) on every trial setup.
 */
@BenchmarkMode(Mode.Throughput)
public class CreateBenchmark {

    static {
        // redirect java.util.logging used by liquibase
        SLF4JBridgeHandler.removeHandlersForRootLogger();
        SLF4JBridgeHandler.install();
    }

    private final static URL DB_TEST_PROPERTIES =
          Resources.getResource("org/dcache/chimera/chimera-benchmark.properties");

    /**
     * Benchmark-wide state: the connection pool, the file system provider
     * and the root inode shared by all benchmark threads.
     */
    @State(Scope.Benchmark)
    public static class DB {

        // NOTE(review): "week_softupdate" looks like a typo of "weak_softupdate";
        // the value is kept as is because it is part of the benchmark's
        // command-line/report interface.
        @Param(value = {"weak", "strong", "week_softupdate"})
        String wcc;

        @Param(value = {"nlink", "no_nlink"})
        String ref;

        protected FileSystemProvider _fs;

        protected HikariDataSource _dataSource;

        protected FsInode _rootInode;

        /**
         * Translates the benchmark parameters into system properties,
         * re-creates the database schema and opens the file system.
         *
         * @throws IOException if the benchmark properties cannot be read
         * @throws SQLException on database errors while preparing the schema
         * @throws LiquibaseException if the schema migration fails
         * @throws IllegalArgumentException if {@code wcc} has an unknown value
         */
        @Setup
        public void setUp() throws IOException, SQLException, LiquibaseException {

            if ("nlink".equals(ref)) {
                System.setProperty("nlink", "true");
            }

            switch (wcc) {
                case "week_softupdate":
                    System.setProperty("chimera_soft_update", "true");
                    // fall through: soft update additionally enables lazy wcc
                case "weak":
                    System.setProperty("chimera_lazy_wcc", "true");
                    break;
                case "strong":
                    // no property to set
                    break;
                default:
                    throw new IllegalArgumentException("Invalid wcc mode: " + wcc);
            }

            Properties dbProperties = new Properties();
            try (InputStream input = Resources.asByteSource(DB_TEST_PROPERTIES).openStream()) {
                dbProperties.load(input);
            }

            HikariConfig config = new HikariConfig();
            config.setJdbcUrl(dbProperties.getProperty("chimera.db.url"));
            config.setUsername(dbProperties.getProperty("chimera.db.user"));
            config.setPassword(dbProperties.getProperty("chimera.db.password"));
            config.setMaximumPoolSize(Runtime.getRuntime().availableProcessors() * 2);
            // NOTE(review): HikariCP appears to cap minimumIdle at
            // maximumPoolSize, so 100 is likely only effective on hosts
            // with 50+ cores - confirm the intent.
            config.setMinimumIdle(100);

            _dataSource = new HikariDataSource(config);

            try (Connection conn = _dataSource.getConnection()) {

                execute(conn, "DROP SCHEMA public CASCADE;");
                execute(conn, "CREATE SCHEMA public;");
                conn.setTransactionIsolation(Connection.TRANSACTION_READ_COMMITTED);

                Database database = DatabaseFactory.getInstance()
                      .findCorrectDatabaseImplementation(new JdbcConnection(conn));
                Liquibase liquibase = new Liquibase(
                      "org/dcache/chimera/changelog/changelog-master.xml",
                      new ClassLoaderResourceAccessor(), database);

                liquibase.update("");
                execute(conn, "ALTER TABLE t_inodes SET (fillfactor = 50);");

                // NOTE(review): Liquibase may have switched the connection to
                // manual commit; commit explicitly so that the ALTER TABLE is
                // not discarded when the connection goes back to the pool.
                if (!conn.getAutoCommit()) {
                    conn.commit();
                }
            }

            PlatformTransactionManager txManager = new DataSourceTransactionManager(_dataSource);
            _fs = new JdbcFs(_dataSource, txManager);
            _rootInode = _fs.path2inode("/");

            _fs.createTag(_rootInode, "aTag");
            FsInode tagInode = new FsInode_TAG(_fs, _rootInode.ino(), "aTag");
            byte[] data = "data".getBytes(UTF_8);
            tagInode.write(0, data, 0, data.length);
        }

        /**
         * Runs a single SQL statement and releases the JDBC statement
         * afterwards.
         *
         * @param conn connection to run the statement on
         * @param sql statement to execute
         * @throws SQLException if the statement fails
         */
        private static void execute(Connection conn, String sql) throws SQLException {
            try (var stmt = conn.createStatement()) {
                stmt.execute(sql);
            }
        }

        /**
         * Per-thread state: the directory in which the thread creates its
         * entries.
         */
        @State(Scope.Thread)
        public static class ThreadCtx {

            @Param(value = {"all-in-one", "one-per-thread"})
            String dir;

            FsInode threadRoot;
            FileSystemProvider fs;

            /**
             * Selects the shared root directory or creates a directory named
             * after the current thread, depending on {@code dir}.
             *
             * @param db the benchmark-wide state
             * @throws ChimeraFsException if the per-thread directory cannot be created
             */
            @Setup
            public void setUp(DB db) throws ChimeraFsException {
                fs = db._fs;
                threadRoot = dir.equals("all-in-one") ? db._rootInode
                      : db._rootInode.mkdir(Thread.currentThread().getName());
            }

        }

        /**
         * Releases the file system provider and then the connection pool.
         *
         * @throws Exception if closing either resource fails
         */
        @TearDown
        public void tearDown() throws Exception {
            // The file system was created on top of the data source, so it is
            // closed first; the pool is closed even if that fails.
            try {
                _fs.close();
            } finally {
                _dataSource.close();
            }
        }
    }

    /**
     * Creates a uniquely named sub-directory in the thread's directory.
     *
     * @param ctx per-thread state
     * @return the inode of the new directory
     * @throws ChimeraFsException if the directory cannot be created
     */
    @Benchmark
    @Threads(value = 64)
    public FsInode benchmarkCreateDir(ThreadCtx ctx) throws ChimeraFsException {
        var dirName = UUID.randomUUID().toString();
        return ctx.threadRoot.mkdir(dirName);
    }

    /**
     * Creates a uniquely named sub-directory and removes it again.
     *
     * @param ctx per-thread state
     * @return the inode of the (already removed) directory
     * @throws ChimeraFsException if creation or removal fails
     */
    @Benchmark
    @Threads(value = 64)
    public FsInode benchmarkCreateDeleteDir(ThreadCtx ctx) throws ChimeraFsException {
        var dirName = UUID.randomUUID().toString();
        FsInode sub = ctx.threadRoot.mkdir(dirName);
        ctx.fs.remove(ctx.threadRoot, dirName, sub);
        return sub;
    }

    /**
     * Creates a uniquely named file in the thread's directory.
     *
     * @param ctx per-thread state
     * @return the inode of the new file
     * @throws ChimeraFsException if the file cannot be created
     */
    @Benchmark
    @Threads(value = 64)
    public FsInode benchmarkCreateFile(ThreadCtx ctx) throws ChimeraFsException {
        // permission bits are octal: 0644 == rw-r--r-- (decimal 644 is 01204)
        return ctx.threadRoot.create(randomUUID().toString(), 0, 0, 0644);
    }

    /**
     * Runs all benchmarks of this class with the default JMH options.
     *
     * @param args ignored
     * @throws RunnerException if the JMH runner fails
     */
    public static void main(String[] args) throws RunnerException {
        Options opt = new OptionsBuilder()
              .include(CreateBenchmark.class.getSimpleName())
              .build();

        new Runner(opt).run();
    }
}
@@ -0,0 +1,6 @@
#
# JDBC properties for Chimera
#
chimera.db.url=jdbc:postgresql://localhost/dcache
chimera.db.user=dcache
chimera.db.password=let-me-in
67 changes: 40 additions & 27 deletions modules/chimera/src/main/java/org/dcache/chimera/FsSqlDriver.java
Expand Up @@ -306,15 +306,12 @@ private boolean removeDir(FsInode parent, FsInode inode, String name)
return false;
}

// A directory contains two pseudo entries for '.' and '..'
decNlink(inode, 2);

// ensure that t_inodes and t_tags_inodes updated in the same order as
// in mkdir
decNlink(parent);
removeTag(inode);

if (!removeInodeIfUnlinked(inode)) {
if (!removeInodeIfUnlinked(inode, true)) {
throw new DirNotEmptyChimeraFsException("directory is not empty");
}

Expand All @@ -327,45 +324,49 @@ private boolean removeFile(FsInode parent, FsInode inode, String name)
if (!removeEntryInParent(parent, name, inode)) {
return false;
}
// hard link counts
decNlink(inode);
// ignore the result as the file might have a hardlink
removeInodeIfUnlinked(inode, false);

removeInodeIfUnlinked(inode);

/* During bulk deletion of files in the same directory,
* updating the parent inode is often a contention point. The
* link count on the parent is updated last to reduce the time
* in which the directory inode is locked by the database.
*/
decNlink(parent);

// trigger mtime update of parent dir.
// Postgres driver makes it different.
decNlink(parent, 0);
return true;
}

void remove(FsInode inode) {
if (inode.isDirectory()) {
removeTag(inode);
}
void remove(FsInode inode) throws DirNotEmptyChimeraFsException {

/* Updating the inode effectively blocks anybody else from changing it and thus also from
* adding more links.
*/
_jdbc.update("UPDATE t_inodes SET inlink=0 WHERE inumber=?", inode.ino());
_jdbc.update("UPDATE t_inodes SET ictime=now() WHERE inumber=?", inode.ino());

/* Remove all hard-links. */
List<Long> parents =
_jdbc.queryForList(
"SELECT iparent FROM t_dirs WHERE ichild=?",
Long.class, inode.ino());

boolean isDir = inode.isDirectory();
if (isDir) {
removeTag(inode);
}

for (Long parent : parents) {
decNlink(new FsInode(inode.getFs(), parent));
decNlink(new FsInode(inode.getFs(), parent), isDir ? 1 : 0);
}

int n = _jdbc.update("DELETE FROM t_dirs WHERE ichild=?", inode.ino());
if (n != parents.size()) {
throw new JdbcUpdateAffectedIncorrectNumberOfRowsException(
"DELETE FROM t_dirs WHERE ichild=?", parents.size(), n);
}

removeInodeIfUnlinked(inode);
boolean removed = removeInodeIfUnlinked(inode, isDir);
if (!removed && isDir) {
throw new DirNotEmptyChimeraFsException("directory is not empty");
}
}

public Stat stat(String id) {
Expand Down Expand Up @@ -485,9 +486,14 @@ boolean rename(FsInode inode, FsInode srcDir, String source, FsInode destDir, St
throw new JdbcUpdateAffectedIncorrectNumberOfRowsException(moveLink, 1, n);
}

int nlinkDelta = 0;
if (inode.isDirectory()) {
nlinkDelta = 1;
}

if (!srcDir.equals(destDir)) {
incNlink(destDir);
decNlink(srcDir);
incNlink(destDir, nlinkDelta);
decNlink(srcDir, nlinkDelta);
} else {
incNlink(srcDir, 0);
}
Expand Down Expand Up @@ -572,7 +578,7 @@ FsInode createInodeInParent(FsInode parent, String name, String id, int owner, i
Stat stat = createInode(id, type, owner, group, mode, nlink, size);
FsInode inode = new FsInode(parent.getFs(), stat.getIno(), FsInodeType.INODE, 0, stat);
createEntryInParent(parent, name, inode);
incNlink(parent);
incNlink(parent, type == UnixPermission.S_IFDIR ? 1 : 0);
return inode;
}

Expand Down Expand Up @@ -672,10 +678,16 @@ FsInode createLevel(FsInode inode, int uid, int gid, int mode, int level) {
return new FsInode(inode.getFs(), inode.ino(), FsInodeType.INODE, level, stat);
}

boolean removeInodeIfUnlinked(FsInode inode) {
boolean removeInodeIfUnlinked(FsInode inode, boolean isDir) {
if (isDir) {
int n = _jdbc.update("DELETE FROM t_inodes WHERE inumber=? AND NOT EXISTS (SELECT 1 FROM t_dirs WHERE iparent = ? LIMIT 1)",
inode.ino(), inode.ino());
return n > 0;
}

List<String> ids
= _jdbc.queryForList(
"SELECT ipnfsid FROM t_inodes WHERE inumber=? AND inlink=0 FOR UPDATE",
"SELECT ipnfsid FROM t_inodes WHERE inumber=? FOR UPDATE",
String.class, inode.ino());
if (ids.isEmpty()) {
return false;
Expand All @@ -702,8 +714,9 @@ boolean removeInodeIfUnlinked(FsInode inode) {
ps.setTimestamp(2, now);
ps.setTimestamp(3, now);
});
_jdbc.update("DELETE FROM t_inodes WHERE inumber=?", inode.ino());
return true;
int n = _jdbc.update("DELETE FROM t_inodes WHERE inumber=? AND NOT EXISTS (SELECT 1 FROM t_dirs WHERE ichild = ? LIMIT 1)",
inode.ino(), inode.ino());
return n > 0;
}

boolean removeInodeLevel(FsInode inode, int level) {
Expand Down
Expand Up @@ -363,7 +363,7 @@ public FsInode createHLink(FsInode parent, FsInode inode, String name)

_sqlDriver.createEntryInParent(parent, name, inode);
_sqlDriver.incNlink(inode);
_sqlDriver.incNlink(parent);
_sqlDriver.incNlink(parent, 0);
} catch (DuplicateKeyException e) {
throw new FileExistsChimeraFsException(e);
}
Expand Down

0 comments on commit f3286f9

Please sign in to comment.