Cleanup hdfs docs (instead of #6876 for #5371) #7016

Merged: 4 commits, Sep 23, 2019
4 changes: 2 additions & 2 deletions dbms/src/IO/HDFSCommon.cpp
@@ -17,8 +17,8 @@ HDFSBuilderPtr createHDFSBuilder(const std::string & uri_str)
     const Poco::URI uri(uri_str);
     auto & host = uri.getHost();
     auto port = uri.getPort();
-    auto & path = uri.getPath();
-    if (host.empty() || path.empty())
+    const std::string path = "//";
+    if (host.empty())
         throw Exception("Illegal HDFS URI: " + uri.toString(), ErrorCodes::BAD_ARGUMENTS);

     HDFSBuilderPtr builder(hdfsNewBuilder());
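To see what the relaxed check accepts: the new code pins `path` to a constant and validates only the host, so a URI with an empty path part no longer throws at builder-creation time. A minimal Python 3 sketch of the same validation (illustrative only, not part of the PR; `urlparse` stands in for `Poco::URI`):

```python
# Sketch of the URI check above: the path no longer participates in validation.
from urllib.parse import urlparse

def validate_hdfs_uri(uri_str):
    uri = urlparse(uri_str)
    if not uri.hostname:  # the old code also required a non-empty path
        raise ValueError("Illegal HDFS URI: " + uri_str)
    return uri.hostname, uri.port

print(validate_hdfs_uri("hdfs://hdfs1:9000/some/path"))  # ('hdfs1', 9000)
print(validate_hdfs_uri("hdfs://hdfs1:9000"))            # now accepted as well
```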
7 changes: 5 additions & 2 deletions dbms/src/IO/WriteBufferFromHDFS.cpp
@@ -15,6 +15,7 @@ namespace ErrorCodes
     extern const int NETWORK_ERROR;
     extern const int CANNOT_OPEN_FILE;
     extern const int CANNOT_FSYNC;
+    extern const int BAD_ARGUMENTS;
 }

@@ -32,10 +33,12 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl
     {
         const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2);
         const std::string path = hdfs_uri.substr(begin_of_path);
-        if (path.find("*?{") != std::string::npos)
+        if (path.find_first_of("*?{") != std::string::npos)
             throw Exception("URI '" + hdfs_uri + "' contains globs, so the table is in readonly mode", ErrorCodes::CANNOT_OPEN_FILE);

-        fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0);
+        if (!hdfsExists(fs.get(), path.c_str()))
+            throw Exception("File " + path + " already exists", ErrorCodes::BAD_ARGUMENTS);
+        fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0); /// O_WRONLY means create or overwrite, i.e. it implies O_TRUNC here

         if (fout == nullptr)
         {
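Two behavioral notes on this hunk: libhdfs's `hdfsExists` returns 0 when the path exists, which is why the guard reads `!hdfsExists(...)`; and `O_WRONLY` here creates or truncates, so the existence check is what prevents silent overwrites. A rough Python 3 sketch of the resulting write-open policy (`hdfs_exists` is a hypothetical stand-in for the real client call; exception types are illustrative):

```python
GLOB_CHARS = "*?{"  # mirrors path.find_first_of("*?{") in the C++ code

def open_for_write_checks(hdfs_uri, hdfs_exists):
    # The path starts at the first '/' after the 'hdfs://host:port' authority.
    begin_of_path = hdfs_uri.find("/", hdfs_uri.find("//") + 2)
    path = hdfs_uri[begin_of_path:]
    if any(c in path for c in GLOB_CHARS):
        raise PermissionError("URI '%s' contains globs, so the table is in readonly mode" % hdfs_uri)
    if hdfs_exists(path):  # in C: hdfsExists(...) == 0 means the path exists
        raise FileExistsError("File %s already exists" % path)
    return path  # safe to open with O_WRONLY (create-or-truncate)
```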
4 changes: 2 additions & 2 deletions dbms/src/Storages/StorageFile.cpp
@@ -87,7 +87,7 @@ std::vector<std::string> listFilesWithRegexpMatching(const std::string & path_fo
     {
         if (re2::RE2::FullMatch(file_name, matcher))
         {
-            /// TODO: No recursion depth check. No protection for cyclic symlinks. It is a bug.
+            /// Recursion depth is limited by the pattern: '*' works only for depth = 1; for depth = 2 the pattern is '*/*'. So we do not need an additional check.
             Strings result_part = listFilesWithRegexpMatching(full_path + "/", suffix_with_globs.substr(next_slash));
             std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
         }
@@ -284,7 +284,7 @@ class StorageFileBlockOutputStream : public IBlockOutputStream
     else
     {
         if (storage.paths.size() != 1)
-            throw Exception("Table '" + storage.table_name + "' is in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED);
+            throw Exception("Table '" + storage.table_name + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED);
         write_buf = std::make_unique<WriteBufferFromFile>(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT);
     }

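The replaced TODO claimed a missing recursion-depth check; the new comment argues none is needed because every recursive call consumes exactly one glob component. A simplified Python 3 model of `listFilesWithRegexpMatching` (a sketch under that reading, not the actual implementation) makes the bound visible: a pattern with k components can recurse at most k levels, regardless of directory depth or symlinks.

```python
import os
import re

def component_to_regex(comp):
    # Minimal translation for this sketch: only '*' and '?' are handled.
    return "".join("[^/]*" if c == "*" else "[^/]" if c == "?" else re.escape(c)
                   for c in comp)

def list_matching(prefix, pattern_suffix):
    """Each call handles exactly one path component of the pattern, so the
    recursion depth equals the number of remaining pattern components."""
    component, slash, rest = pattern_suffix.partition("/")
    matcher = re.compile(component_to_regex(component) + r"$")
    result = []
    for name in os.listdir(prefix or "."):
        full_path = prefix + name
        if matcher.match(name):
            if not slash:                    # last pattern component: a match
                result.append(full_path)
            elif os.path.isdir(full_path):   # recurse on the remainder only
                result.extend(list_matching(full_path + "/", rest))
    return result

# '*' looks only at depth 1; matching two levels down needs the pattern '*/*'.
```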
1 change: 1 addition & 0 deletions dbms/src/Storages/StorageHDFS.cpp
@@ -171,6 +171,7 @@ Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, c
         if (re2::RE2::FullMatch(file_name, matcher))
         {
+            /// Recursion depth is limited by the pattern: '*' works only for depth = 1; for depth = 2 the pattern is '*/*'. So we do not need an additional check.
             Strings result_part = LSWithRegexpMatching(full_path + "/", fs, suffix_with_globs.substr(next_slash));
             std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
         }
     }
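For reference, the `matcher` used by both listing functions is a regular expression derived from the glob pattern; the diff does not show that translation. The following Python 3 sketch implements the documented semantics (`*`, `?`, `{a,b}`, `{N..M}`) and is an assumption about the shape of that helper, not the actual C++ code:

```python
import re

def globs_to_regex(pattern):
    """'*' -> any chars except '/'; '?' -> one char except '/';
    '{a,b}' -> (a|b); '{N..M}' -> (N|...|M), unpadded."""
    out, i = [], 0
    while i < len(pattern):
        c = pattern[i]
        if c == "*":
            out.append("[^/]*")
        elif c == "?":
            out.append("[^/]")
        elif c == "{":
            j = pattern.index("}", i)          # non-nested braces only
            body = pattern[i + 1:j]
            if ".." in body:
                lo, hi = body.split("..")
                out.append("(" + "|".join(str(n) for n in range(int(lo), int(hi) + 1)) + ")")
            else:
                out.append("(" + "|".join(map(re.escape, body.split(","))) + ")")
            i = j
        else:
            out.append(re.escape(c))
        i += 1
    return re.compile("".join(out) + r"$")

assert globs_to_regex("storage{1..5}").match("storage3")
assert not globs_to_regex("some_dir/*").match("some_dir/a/b")  # '*' stops at '/'
```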
44 changes: 32 additions & 12 deletions dbms/tests/integration/test_storage_hdfs/test.py
@@ -29,7 +29,6 @@ def started_cluster():

 def test_read_write_storage(started_cluster):
     hdfs_api = HDFSApi("root")
-    hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n")

     node1.query("create table SimpleHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/simple_storage', 'TSV')")
     node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)")
@@ -39,19 +38,40 @@ def test_read_write_storage(started_cluster):
 def test_read_write_storage_with_globs(started_cluster):
     hdfs_api = HDFSApi("root")

-    for i in ["1", "2", "3"]:
-        hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n")
-        assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n"
-
     node1.query("create table HDFSStorageWithRange (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1..5}', 'TSV')")
     node1.query("create table HDFSStorageWithEnum (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1,2,3,4,5}', 'TSV')")
     node1.query("create table HDFSStorageWithQuestionMark (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage?', 'TSV')")
     node1.query("create table HDFSStorageWithAsterisk (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage*', 'TSV')")

-    assert node1.query("select count(*) from HDFSStorageWithRange") == '3\n'
-    assert node1.query("select count(*) from HDFSStorageWithEnum") == '3\n'
-    assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == '3\n'
-    assert node1.query("select count(*) from HDFSStorageWithAsterisk") == '3\n'
+    for i in ["1", "2", "3"]:
+        hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n")
+        assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n"
+
+    assert node1.query("select count(*) from HDFSStorageWithRange") == "3\n"
+    assert node1.query("select count(*) from HDFSStorageWithEnum") == "3\n"
+    assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == "3\n"
+    assert node1.query("select count(*) from HDFSStorageWithAsterisk") == "3\n"
+
+    try:
+        node1.query("insert into HDFSStorageWithEnum values (1, 'NEW', 4.2)")
+        assert False, "Exception has to be thrown"
+    except Exception as ex:
+        print ex
+        assert "in readonly mode" in str(ex)
+
+    try:
+        node1.query("insert into HDFSStorageWithQuestionMark values (1, 'NEW', 4.2)")
+        assert False, "Exception has to be thrown"
+    except Exception as ex:
+        print ex
+        assert "in readonly mode" in str(ex)
+
+    try:
+        node1.query("insert into HDFSStorageWithAsterisk values (1, 'NEW', 4.2)")
+        assert False, "Exception has to be thrown"
+    except Exception as ex:
+        print ex
+        assert "in readonly mode" in str(ex)

 def test_read_write_table(started_cluster):
     hdfs_api = HDFSApi("root")
@@ -78,18 +98,18 @@ def test_bad_hdfs_uri(started_cluster):
         node1.query("create table BadStorage1 (id UInt32, name String, weight Float64) ENGINE = HDFS('hads:hgsdfs100500:9000/other_storage', 'TSV')")
     except Exception as ex:
         print ex
-        assert 'Illegal HDFS URI' in str(ex)
+        assert "Illegal HDFS URI" in str(ex)
     try:
         node1.query("create table BadStorage2 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs100500:9000/other_storage', 'TSV')")
     except Exception as ex:
         print ex
-        assert 'Unable to create builder to connect to HDFS' in str(ex)
+        assert "Unable to create builder to connect to HDFS" in str(ex)

     try:
         node1.query("create table BadStorage3 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/<>', 'TSV')")
     except Exception as ex:
         print ex
-        assert 'Unable to open HDFS file' in str(ex)
+        assert "Unable to open HDFS file" in str(ex)

 def test_globs_in_read_table(started_cluster):
     hdfs_api = HDFSApi("root")
52 changes: 52 additions & 0 deletions docs/en/operations/table_engines/hdfs.md
@@ -13,6 +13,7 @@ The `format` parameter specifies one of the available file formats. To perform
 `SELECT` queries, the format must be supported for input, and to perform
 `INSERT` queries -- for output. The available formats are listed in the
 [Formats](../../interfaces/formats.md#formats) section.
+The path part of `URI` may contain globs. In this case the table is in readonly mode.

 **Example:**

@@ -48,4 +49,55 @@ SELECT * FROM hdfs_engine_table LIMIT 2
 - Indexes.
 - Replication.

+**Globs in path**
+
+Multiple path components can have globs. To be processed, a file must exist and match the whole path pattern. The listing of files is determined during `SELECT` (not at `CREATE` time).
+
+- `*` — Substitutes any number of any characters except `/`, including the empty string.
+- `?` — Substitutes any single character.
+- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Substitutes any number in the range from N to M, including both borders.
+
+Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md).
+
+**Example**
+
+1. Suppose we have several files in TSV format with the following URIs on HDFS:
+
+- 'hdfs://hdfs1:9000/some_dir/some_file_1'
+- 'hdfs://hdfs1:9000/some_dir/some_file_2'
+- 'hdfs://hdfs1:9000/some_dir/some_file_3'
+- 'hdfs://hdfs1:9000/another_dir/some_file_1'
+- 'hdfs://hdfs1:9000/another_dir/some_file_2'
+- 'hdfs://hdfs1:9000/another_dir/some_file_3'
+
+2. There are several ways to make a table consisting of all six files:
+
+```sql
+CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV')
+```
+
+Another way:
+
+```sql
+CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV')
+```
+
+A table that consists of all the files in both directories (all files must satisfy the format and schema described in the query):
+
+```sql
+CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV')
+```
+
+!!! warning
+    If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
+
+**Example**
+
+Create a table with files named `file000`, `file001`, ... , `file999`:
+
+```sql
+CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV')
+```
+
+[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/hdfs/) <!--hide-->
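One concrete illustration of the warning above (an editorial aside, not part of the documentation diff): `{N..M}` expands to unpadded integers, so a single range can never match zero-padded names, while one single-digit range per position can. A quick Python 3 check:

```python
# '{0..999}' would expand to 'file0', 'file1', ..., 'file999' (no zero padding).
single_range = ["file%d" % n for n in range(1000)]
# '{0..9}{0..9}{0..9}' yields exactly the zero-padded names 'file000'..'file999'.
per_digit = ["file%d%d%d" % (a, b, c)
             for a in range(10) for b in range(10) for c in range(10)]

assert "file007" not in single_range
assert "file007" in per_digit
```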
47 changes: 41 additions & 6 deletions docs/en/query_language/table_functions/file.md
@@ -1,7 +1,7 @@

 # file

-Creates a table from a file.
+Creates a table from a file. This table function is similar to the [url](url.md) and [hdfs](hdfs.md) table functions.

 ```
 file(path, format, structure)
@@ -53,14 +53,49 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U

 **Globs in path**

-- `*` — Matches any number of any characters including none.
-- `?` — Matches any single character.
-- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`.
-- `{N..M}` — Matches any number in range from N to M including both borders.
+Multiple path components can have globs. To be processed, a file must exist and match the whole path pattern (not only a suffix or prefix).
+
+- `*` — Substitutes any number of any characters except `/`, including the empty string.
+- `?` — Substitutes any single character.
+- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Substitutes any number in the range from N to M, including both borders.
+
+Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md).
+
+**Example**
+
+1. Suppose we have several files with the following relative paths:
+
+- 'some_dir/some_file_1'
+- 'some_dir/some_file_2'
+- 'some_dir/some_file_3'
+- 'another_dir/some_file_1'
+- 'another_dir/some_file_2'
+- 'another_dir/some_file_3'
+
+2. Query the number of rows in these files:
+
+```sql
+SELECT count(*)
+FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32')
+```
+
+3. Query the number of rows in all files of these two directories:
+
+```sql
+SELECT count(*)
+FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32')
+```
+
+!!! warning
+    If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
+
+**Example**
+
+Query the data from files named `file000`, `file001`, ... , `file999`:
+
+```sql
+SELECT count(*)
+FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32')
+```

 [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/file/) <!--hide-->
49 changes: 43 additions & 6 deletions docs/en/query_language/table_functions/hdfs.md
@@ -1,7 +1,7 @@

 # hdfs

-Creates a table from a file in HDFS.
+Creates a table from files in HDFS. This table function is similar to the [url](url.md) and [file](file.md) table functions.

 ```
 hdfs(URI, format, structure)
@@ -36,14 +36,51 @@ LIMIT 2

 **Globs in path**

-- `*` — Matches any number of any characters including none.
-- `?` — Matches any single character.
-- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`.
-- `{N..M}` — Matches any number in range from N to M including both borders.
+Multiple path components can have globs. To be processed, a file must exist and match the whole path pattern (not only a suffix or prefix).
+
+- `*` — Substitutes any number of any characters except `/`, including the empty string.
+- `?` — Substitutes any single character.
+- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Substitutes any number in the range from N to M, including both borders.
+
+Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md).
+
+**Example**
+
+1. Suppose that we have several files with the following URIs on HDFS:
+
+- 'hdfs://hdfs1:9000/some_dir/some_file_1'
+- 'hdfs://hdfs1:9000/some_dir/some_file_2'
+- 'hdfs://hdfs1:9000/some_dir/some_file_3'
+- 'hdfs://hdfs1:9000/another_dir/some_file_1'
+- 'hdfs://hdfs1:9000/another_dir/some_file_2'
+- 'hdfs://hdfs1:9000/another_dir/some_file_3'
+
+2. Query the number of rows in these files:
+
+```sql
+SELECT count(*)
+FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32')
+```
+
+3. Query the number of rows in all files of these two directories:
+
+```sql
+SELECT count(*)
+FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32')
+```
+
+!!! warning
+    If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
+
+**Example**
+
+Query the data from files named `file000`, `file001`, ... , `file999`:
+
+```sql
+SELECT count(*)
+FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32')
+```

 [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/hdfs/) <!--hide-->