In [None]:
import requests
import pyarrow as pa
import pyarrow.fs
import io

### HDFS

In [None]:
#q1
# Print the dfsadmin report for the file system at hdfs://main:9000.
!hdfs dfsadmin -fs hdfs://main:9000 -report

#### File system commands

- `mkdir`
- `cp`
- `ls`
- `cat`, etc.

Create a "data" directory.

In [None]:
!hdfs dfs -mkdir hdfs://main:9000/data

Let's copy over "/hadoop-3.3.6/LICENSE.txt" from our VM to the "data" directory inside HDFS.

In [None]:
# Confirm the source file is present locally before copying it into HDFS.
!ls /hadoop-3.3.6/LICENSE.txt # on our VM

In [None]:
!hdfs dfs -cp /hadoop-3.3.6/LICENSE.txt hdfs://main:9000/data/

Let's try `ls` inside HDFS now.

In [None]:
# List the contents of the HDFS "data" directory.
!hdfs dfs -ls hdfs://main:9000/data/

Let's see how much disk space the file is using.

In [None]:
# Report disk usage for the entries under the HDFS "data" directory.
!hdfs dfs -du hdfs://main:9000/data/

15217 is the logical size (actual size).<br>
45651 is the physical size (replicated size).<br>
Why don't they match? We only have 1 data node.

In [None]:
# Ratio of the physical size to the logical size quoted above.
45651 / 15217 # 3 replicas

In [None]:
# Print the dfsadmin report again, now that a file has been copied in.
!hdfs dfsadmin -fs hdfs://main:9000 -report

`fsck` is the DFS filesystem checking utility.

In [None]:
# Run the filesystem check on the copied LICENSE.txt.
!hdfs fsck hdfs://main:9000/data/LICENSE.txt

#### Let's create a copy with a replication factor of 1.

`-D`: OPTIONS_TO_JAVA (example: `dfs.replication`)

In [None]:
!hdfs dfs -D dfs.replication=1 -cp /hadoop-3.3.6/LICENSE.txt hdfs://main:9000/data/v2.txt

In [None]:
# Alternative (disabled): -cat would print the entire file.
#!hdfs dfs -cat hdfs://main:9000/data/LICENSE.txt
# -head prints only the beginning of the file, which keeps the output short.
!hdfs dfs -head hdfs://main:9000/data/LICENSE.txt

Let's check v2.txt's health.

In [None]:
# Run the filesystem check on v2.txt, the copy made with dfs.replication=1.
!hdfs fsck hdfs://main:9000/data/v2.txt

### WebHDFS

Documentation: https://hadoop.apache.org/docs/r3.3.6/hadoop-project-dist/hadoop-hdfs/WebHDFS.html

Response with headers (`-i` flag).

In [None]:
# General form of the request:
# curl -i  "http://<HOST>:<PORT>/webhdfs/v1/<PATH>?op=LISTSTATUS"
# List the "data" directory over WebHDFS; -i includes the HTTP response headers in the output.
! curl -i  "http://main:9870/webhdfs/v1/data?op=LISTSTATUS"

Response without headers.

In [None]:
# Same LISTSTATUS request without -i, so only the response body is printed.
! curl "http://main:9870/webhdfs/v1/data?op=LISTSTATUS"

Let's access the file contents.<br>

The `-L` flag tells curl to follow redirects.

In [None]:
# General form of the request (bracketed query parameters are optional):
# curl -i -L "http://<HOST>:<PORT>/webhdfs/v1/<PATH>?op=OPEN
#                     [&offset=<LONG>][&length=<LONG>][&buffersize=<INT>]"
# Read 200 bytes of v2.txt from offset 0, following redirects (-L) and showing headers (-i).
! curl -i -L "http://main:9870/webhdfs/v1/data/v2.txt?op=OPEN&offset=0&length=200"

In [None]:
# Same OPEN request with redirects followed, headers omitted, and length raised to 400 bytes.
! curl -L "http://main:9870/webhdfs/v1/data/v2.txt?op=OPEN&offset=0&length=400"

In [None]:
# OPEN without -L: curl does not follow redirects, so -i shows the initial response as-is.
! curl -i "http://main:9870/webhdfs/v1/data/v2.txt?op=OPEN&offset=0&length=200"

In [None]:
# OPEN with noredirect=true and without -L: print the body of the initial response.
! curl "http://main:9870/webhdfs/v1/data/v2.txt?op=OPEN&offset=0&length=200&noredirect=true"

#### Sending requests from `requests` module

In [None]:
# Issue the noredirect OPEN request from Python instead of curl.
r = requests.get("http://main:9870/webhdfs/v1/data/v2.txt?op=OPEN&offset=0&length=200&noredirect=true")
r.raise_for_status()  # raises requests.HTTPError if the status code is 4xx or 5xx
r.content

In [None]:
# Inspect the Python type of the raw response body.
type(r.content)

In [None]:
# Parse the response body as JSON.
r.json()

In [None]:
# Pull the "Location" entry out of the parsed JSON response.
r.json()["Location"]

In [None]:
# Same OPEN request without noredirect; r.content is whatever body requests ends up with.
r = requests.get("http://main:9870/webhdfs/v1/data/v2.txt?op=OPEN&offset=0&length=200")
r.raise_for_status()  # raises requests.HTTPError if the status code is 4xx or 5xx
r.content

### Using PyArrow to read data

In [None]:
# Create a PyArrow HadoopFileSystem handle for host "main", port 9000.
hdfs = pa.fs.HadoopFileSystem("main", 9000)

In [None]:
# Open /data/v2.txt for reading; the handle stays open for the inspection cells that follow.
f = hdfs.open_input_file("/data/v2.txt")

In [None]:
# Show the concrete class of the file object PyArrow returned.
type(f)

In [None]:
# List the classes that the file object's type inherits from.
type(f).__mro__ # method resolution order

In [None]:
# List the attributes and methods available on the file object.
dir(f)

In [None]:
f.read_at(200, 100) # read_at(nbytes, offset): read 200 bytes starting at offset 100

In [None]:
# Iterating directly over the PyArrow file object is not expected to work.
# The failure is the point of this cell, so catch it and display the message
# instead of letting the exception stop a "Restart & Run All".
try:
    for line in f:
        print(line)
except (TypeError, io.UnsupportedOperation) as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# Layer a buffered reader and then a text decoder on top of the binary HDFS
# file so it can be consumed line by line; echo the leading lines only.
with hdfs.open_input_file("/data/v2.txt") as f:
    buffered = io.BufferedReader(f)
    reader = io.TextIOWrapper(buffered)
    for i, line in enumerate(reader):
        # Lines already end with a newline, so suppress print's own.
        print(line, end="")
        # Stop once the line at index 11 has been shown.
        if i >= 11:
            break