Cleanup hdfs docs (instead of #6876 for #5371) #7016

Merged: 4 commits, Sep 23, 2019
4 changes: 2 additions & 2 deletions dbms/src/IO/HDFSCommon.cpp
@@ -17,8 +17,8 @@ HDFSBuilderPtr createHDFSBuilder(const std::string & uri_str)
     const Poco::URI uri(uri_str);
     auto & host = uri.getHost();
     auto port = uri.getPort();
-    auto & path = uri.getPath();
-    if (host.empty() || path.empty())
+    const std::string path = "//";
+    if (host.empty())
         throw Exception("Illegal HDFS URI: " + uri.toString(), ErrorCodes::BAD_ARGUMENTS);

     HDFSBuilderPtr builder(hdfsNewBuilder());
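To see what the relaxed check accepts: the new code pins `path` to a constant and validates only the host, so a URI with an empty path part no longer throws at builder-creation time. A minimal Python 3 sketch of the same validation (illustrative only, not part of the PR; `urlparse` stands in for `Poco::URI`):

```python
# Sketch of the URI check above: the path no longer participates in validation.
from urllib.parse import urlparse

def validate_hdfs_uri(uri_str):
    uri = urlparse(uri_str)
    if not uri.hostname:  # the old code also required a non-empty path
        raise ValueError("Illegal HDFS URI: " + uri_str)
    return uri.hostname, uri.port

print(validate_hdfs_uri("hdfs://hdfs1:9000/some/path"))  # ('hdfs1', 9000)
print(validate_hdfs_uri("hdfs://hdfs1:9000"))            # now accepted as well
```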
7 changes: 5 additions & 2 deletions dbms/src/IO/WriteBufferFromHDFS.cpp
@@ -15,6 +15,7 @@ namespace ErrorCodes
     extern const int NETWORK_ERROR;
     extern const int CANNOT_OPEN_FILE;
     extern const int CANNOT_FSYNC;
+    extern const int BAD_ARGUMENTS;
 }

@@ -32,10 +33,12 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl
     {
         const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2);
         const std::string path = hdfs_uri.substr(begin_of_path);
-        if (path.find("*?{") != std::string::npos)
+        if (path.find_first_of("*?{") != std::string::npos)
             throw Exception("URI '" + hdfs_uri + "' contains globs, so the table is in readonly mode", ErrorCodes::CANNOT_OPEN_FILE);

-        fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0);
+        if (!hdfsExists(fs.get(), path.c_str()))
+            throw Exception("File " + path + " already exists", ErrorCodes::BAD_ARGUMENTS);
+        fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0); /// O_WRONLY means create or overwrite, i.e. it implies O_TRUNC here

         if (fout == nullptr)
         {
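Two behavioral notes on this hunk: libhdfs's `hdfsExists` returns 0 when the path exists, which is why the guard reads `!hdfsExists(...)`; and `O_WRONLY` here creates or truncates, so the existence check is what prevents silent overwrites. A rough Python 3 sketch of the resulting write-open policy (`hdfs_exists` is a hypothetical stand-in for the real client call; exception types are illustrative):

```python
GLOB_CHARS = "*?{"  # mirrors path.find_first_of("*?{") in the C++ code

def open_for_write_checks(hdfs_uri, hdfs_exists):
    # The path starts at the first '/' after the 'hdfs://host:port' authority.
    begin_of_path = hdfs_uri.find("/", hdfs_uri.find("//") + 2)
    path = hdfs_uri[begin_of_path:]
    if any(c in path for c in GLOB_CHARS):
        raise PermissionError("URI '%s' contains globs, so the table is in readonly mode" % hdfs_uri)
    if hdfs_exists(path):  # in C: hdfsExists(...) == 0 means the path exists
        raise FileExistsError("File %s already exists" % path)
    return path  # safe to open with O_WRONLY (create-or-truncate)
```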
4 changes: 2 additions & 2 deletions dbms/src/Storages/StorageFile.cpp
@@ -87,7 +87,7 @@ std::vector<std::string> listFilesWithRegexpMatching(const std::string & path_fo
     {
         if (re2::RE2::FullMatch(file_name, matcher))
         {
-            /// TODO: No recursion depth check. No protection for cyclic symlinks. It is a bug.
+            /// Recursion depth is limited by the pattern: '*' works only for depth = 1; for depth = 2 the pattern is '*/*'. So we do not need an additional check.
             Strings result_part = listFilesWithRegexpMatching(full_path + "/", suffix_with_globs.substr(next_slash));
             std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
         }
@@ -284,7 +284,7 @@ class StorageFileBlockOutputStream : public IBlockOutputStream
     else
     {
         if (storage.paths.size() != 1)
-            throw Exception("Table '" + storage.table_name + "' is in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED);
+            throw Exception("Table '" + storage.table_name + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED);
         write_buf = std::make_unique<WriteBufferFromFile>(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT);
     }

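The replaced TODO claimed a missing recursion-depth check; the new comment argues none is needed because every recursive call consumes exactly one glob component. A simplified Python 3 model of `listFilesWithRegexpMatching` (a sketch under that reading, not the actual implementation) makes the bound visible: a pattern with k components can recurse at most k levels, regardless of directory depth or symlinks.

```python
import os
import re

def component_to_regex(comp):
    # Minimal translation for this sketch: only '*' and '?' are handled.
    return "".join("[^/]*" if c == "*" else "[^/]" if c == "?" else re.escape(c)
                   for c in comp)

def list_matching(prefix, pattern_suffix):
    """Each call handles exactly one path component of the pattern, so the
    recursion depth equals the number of remaining pattern components."""
    component, slash, rest = pattern_suffix.partition("/")
    matcher = re.compile(component_to_regex(component) + r"$")
    result = []
    for name in os.listdir(prefix or "."):
        full_path = prefix + name
        if matcher.match(name):
            if not slash:                    # last pattern component: a match
                result.append(full_path)
            elif os.path.isdir(full_path):   # recurse on the remainder only
                result.extend(list_matching(full_path + "/", rest))
    return result

# '*' looks only at depth 1; matching two levels down needs the pattern '*/*'.
```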
1 change: 1 addition & 0 deletions dbms/src/Storages/StorageHDFS.cpp
@@ -171,6 +171,7 @@ Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, c
         if (re2::RE2::FullMatch(file_name, matcher))
         {
+            /// Recursion depth is limited by the pattern: '*' works only for depth = 1; for depth = 2 the pattern is '*/*'. So we do not need an additional check.
             Strings result_part = LSWithRegexpMatching(full_path + "/", fs, suffix_with_globs.substr(next_slash));
             std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
         }
     }
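For reference, the `matcher` used by both listing functions is a regular expression derived from the glob pattern; the diff does not show that translation. The following Python 3 sketch implements the documented semantics (`*`, `?`, `{a,b}`, `{N..M}`) and is an assumption about the shape of that helper, not the actual C++ code:

```python
import re

def globs_to_regex(pattern):
    """'*' -> any chars except '/'; '?' -> one char except '/';
    '{a,b}' -> (a|b); '{N..M}' -> (N|...|M), unpadded."""
    out, i = [], 0
    while i < len(pattern):
        c = pattern[i]
        if c == "*":
            out.append("[^/]*")
        elif c == "?":
            out.append("[^/]")
        elif c == "{":
            j = pattern.index("}", i)          # non-nested braces only
            body = pattern[i + 1:j]
            if ".." in body:
                lo, hi = body.split("..")
                out.append("(" + "|".join(str(n) for n in range(int(lo), int(hi) + 1)) + ")")
            else:
                out.append("(" + "|".join(map(re.escape, body.split(","))) + ")")
            i = j
        else:
            out.append(re.escape(c))
        i += 1
    return re.compile("".join(out) + r"$")

assert globs_to_regex("storage{1..5}").match("storage3")
assert not globs_to_regex("some_dir/*").match("some_dir/a/b")  # '*' stops at '/'
```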
44 changes: 32 additions & 12 deletions dbms/tests/integration/test_storage_hdfs/test.py
@@ -29,7 +29,6 @@ def started_cluster():

 def test_read_write_storage(started_cluster):
     hdfs_api = HDFSApi("root")
-    hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n")

     node1.query("create table SimpleHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/simple_storage', 'TSV')")
     node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)")
@@ -39,19 +38,40 @@ def test_read_write_storage(started_cluster):
 def test_read_write_storage_with_globs(started_cluster):
     hdfs_api = HDFSApi("root")

-    for i in ["1", "2", "3"]:
-        hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n")
-        assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n"
-
     node1.query("create table HDFSStorageWithRange (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1..5}', 'TSV')")
     node1.query("create table HDFSStorageWithEnum (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1,2,3,4,5}', 'TSV')")
     node1.query("create table HDFSStorageWithQuestionMark (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage?', 'TSV')")
     node1.query("create table HDFSStorageWithAsterisk (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage*', 'TSV')")

-    assert node1.query("select count(*) from HDFSStorageWithRange") == '3\n'
-    assert node1.query("select count(*) from HDFSStorageWithEnum") == '3\n'
-    assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == '3\n'
-    assert node1.query("select count(*) from HDFSStorageWithAsterisk") == '3\n'
+    for i in ["1", "2", "3"]:
+        hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n")
+        assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n"
+
+    assert node1.query("select count(*) from HDFSStorageWithRange") == "3\n"
+    assert node1.query("select count(*) from HDFSStorageWithEnum") == "3\n"
+    assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == "3\n"
+    assert node1.query("select count(*) from HDFSStorageWithAsterisk") == "3\n"
+
+    try:
+        node1.query("insert into HDFSStorageWithEnum values (1, 'NEW', 4.2)")
+        assert False, "Exception has to be thrown"
+    except Exception as ex:
+        print ex
+        assert "in readonly mode" in str(ex)
+
+    try:
+        node1.query("insert into HDFSStorageWithQuestionMark values (1, 'NEW', 4.2)")
+        assert False, "Exception has to be thrown"
+    except Exception as ex:
+        print ex
+        assert "in readonly mode" in str(ex)
+
+    try:
+        node1.query("insert into HDFSStorageWithAsterisk values (1, 'NEW', 4.2)")
+        assert False, "Exception has to be thrown"
+    except Exception as ex:
+        print ex
+        assert "in readonly mode" in str(ex)

 def test_read_write_table(started_cluster):
     hdfs_api = HDFSApi("root")
@@ -78,18 +98,18 @@ def test_bad_hdfs_uri(started_cluster):
         node1.query("create table BadStorage1 (id UInt32, name String, weight Float64) ENGINE = HDFS('hads:hgsdfs100500:9000/other_storage', 'TSV')")
     except Exception as ex:
         print ex
-        assert 'Illegal HDFS URI' in str(ex)
+        assert "Illegal HDFS URI" in str(ex)
     try:
         node1.query("create table BadStorage2 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs100500:9000/other_storage', 'TSV')")
     except Exception as ex:
         print ex
-        assert 'Unable to create builder to connect to HDFS' in str(ex)
+        assert "Unable to create builder to connect to HDFS" in str(ex)

     try:
         node1.query("create table BadStorage3 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/<>', 'TSV')")
     except Exception as ex:
         print ex
-        assert 'Unable to open HDFS file' in str(ex)
+        assert "Unable to open HDFS file" in str(ex)

 def test_globs_in_read_table(started_cluster):
     hdfs_api = HDFSApi("root")
52 changes: 52 additions & 0 deletions docs/en/operations/table_engines/hdfs.md
@@ -13,6 +13,7 @@ The `format` parameter specifies one of the available file formats. To perform
 `SELECT` queries, the format must be supported for input, and to perform
 `INSERT` queries -- for output. The available formats are listed in the
 [Formats](../../interfaces/formats.md#formats) section.
+The path part of `URI` may contain globs. In this case the table is in readonly mode.

 **Example:**

@@ -48,4 +49,55 @@ SELECT * FROM hdfs_engine_table LIMIT 2
 - Indexes.
 - Replication.

+**Globs in path**
+
+Multiple path components can have globs. To be processed, a file must exist and match the whole path pattern. The listing of files is determined during `SELECT` (not at `CREATE` time).
+
+- `*` — Substitutes any number of any characters except `/`, including the empty string.
+- `?` — Substitutes any single character.
+- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Substitutes any number in the range from N to M, including both borders.
+
+Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md).
+
+**Example**
+
+1. Suppose we have several files in TSV format with the following URIs on HDFS:
+
+- 'hdfs://hdfs1:9000/some_dir/some_file_1'
+- 'hdfs://hdfs1:9000/some_dir/some_file_2'
+- 'hdfs://hdfs1:9000/some_dir/some_file_3'
+- 'hdfs://hdfs1:9000/another_dir/some_file_1'
+- 'hdfs://hdfs1:9000/another_dir/some_file_2'
+- 'hdfs://hdfs1:9000/another_dir/some_file_3'
+
+2. There are several ways to make a table consisting of all six files:
+
+```sql
+CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV')
+```
+
+Another way:
+
+```sql
+CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV')
+```
+
+A table that consists of all the files in both directories (all files must satisfy the format and schema described in the query):
+
+```sql
+CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV')
+```
+
+!!! warning
+    If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
+
+**Example**
+
+Create a table with files named `file000`, `file001`, ... , `file999`:
+
+```sql
+CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV')
+```
+
+[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/hdfs/) <!--hide-->
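One concrete illustration of the warning above (an editorial aside, not part of the documentation diff): `{N..M}` expands to unpadded integers, so a single range can never match zero-padded names, while one single-digit range per position can. A quick Python 3 check:

```python
# '{0..999}' would expand to 'file0', 'file1', ..., 'file999' (no zero padding).
single_range = ["file%d" % n for n in range(1000)]
# '{0..9}{0..9}{0..9}' yields exactly the zero-padded names 'file000'..'file999'.
per_digit = ["file%d%d%d" % (a, b, c)
             for a in range(10) for b in range(10) for c in range(10)]

assert "file007" not in single_range
assert "file007" in per_digit
```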
47 changes: 41 additions & 6 deletions docs/en/query_language/table_functions/file.md
@@ -1,7 +1,7 @@

 # file

-Creates a table from a file.
+Creates a table from a file. This table function is similar to the [url](url.md) and [hdfs](hdfs.md) table functions.

 ```
 file(path, format, structure)
@@ -53,14 +53,49 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U

 **Globs in path**

-- `*` — Matches any number of any characters including none.
-- `?` — Matches any single character.
-- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`.
-- `{N..M}` — Matches any number in range from N to M including both borders.
+Multiple path components can have globs. To be processed, a file must exist and match the whole path pattern (not only a suffix or prefix).
+
+- `*` — Substitutes any number of any characters except `/`, including the empty string.
+- `?` — Substitutes any single character.
+- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Substitutes any number in the range from N to M, including both borders.
+
+Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md).
+
+**Example**
+
+1. Suppose we have several files with the following relative paths:
+
+- 'some_dir/some_file_1'
+- 'some_dir/some_file_2'
+- 'some_dir/some_file_3'
+- 'another_dir/some_file_1'
+- 'another_dir/some_file_2'
+- 'another_dir/some_file_3'
+
+2. Query the number of rows in these files:
+
+```sql
+SELECT count(*)
+FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32')
+```
+
+3. Query the number of rows in all files of these two directories:
+
+```sql
+SELECT count(*)
+FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32')
+```
+
+!!! warning
+    If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
+
+**Example**
+
+Query the data from files named `file000`, `file001`, ... , `file999`:
+
+```sql
+SELECT count(*)
+FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32')
+```

 [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/file/) <!--hide-->
49 changes: 43 additions & 6 deletions docs/en/query_language/table_functions/hdfs.md
@@ -1,7 +1,7 @@

 # hdfs

-Creates a table from a file in HDFS.
+Creates a table from files in HDFS. This table function is similar to the [url](url.md) and [file](file.md) table functions.

 ```
 hdfs(URI, format, structure)
@@ -36,14 +36,51 @@ LIMIT 2

 **Globs in path**

-- `*` — Matches any number of any characters including none.
-- `?` — Matches any single character.
-- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`.
-- `{N..M}` — Matches any number in range from N to M including both borders.
+Multiple path components can have globs. To be processed, a file must exist and match the whole path pattern (not only a suffix or prefix).
+
+- `*` — Substitutes any number of any characters except `/`, including the empty string.
+- `?` — Substitutes any single character.
+- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Substitutes any number in the range from N to M, including both borders.
+
+Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md).
+
+**Example**
+
+1. Suppose that we have several files with the following URIs on HDFS:
+
+- 'hdfs://hdfs1:9000/some_dir/some_file_1'
+- 'hdfs://hdfs1:9000/some_dir/some_file_2'
+- 'hdfs://hdfs1:9000/some_dir/some_file_3'
+- 'hdfs://hdfs1:9000/another_dir/some_file_1'
+- 'hdfs://hdfs1:9000/another_dir/some_file_2'
+- 'hdfs://hdfs1:9000/another_dir/some_file_3'
+
+2. Query the number of rows in these files:
+
+```sql
+SELECT count(*)
+FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32')
+```
+
+3. Query the number of rows in all files of these two directories:
+
+```sql
+SELECT count(*)
+FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32')
+```
+
+!!! warning
+    If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
+
+**Example**
+
+Query the data from files named `file000`, `file001`, ... , `file999`:
+
+```sql
+SELECT count(*)
+FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32')
+```

 [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/hdfs/) <!--hide-->