## Client

In [1]:
import sys
import json
import requests


class Client:
    """
    Minimal WebHDFS REST client built on a ``requests.Session``.

    Every operation is first sent to the namenode. For data transfer (CREATE,
    APPEND, OPEN) the namenode is asked, via ``noredirect=true``, to return the
    datanode URL in its JSON reply; that URL is rewritten by :meth:`transform`
    to ``localhost:<mapped port>`` before the data request is sent.

        official docs:
            https://hadoop.apache.org/docs/r3.3.5/hadoop-project-dist/hadoop-hdfs/WebHDFS.html
        python library:
            https://github.com/mtth/hdfs
    """

    # Datanode hostname -> port on localhost used to reach that datanode.
    DEFAULT_PORTS = {'datanode1': '9862', 'datanode2': '9863', 'datanode3': '9864'}
    # Chunk size used when streaming a download to the local file.
    DOWNLOAD_CHUNK_SIZE = 64 * 1024

    def __init__(self, namenode='http://localhost:9870', user='root', ports=None):
        """
        Args:
            namenode: Base URL of the namenode HTTP endpoint.
            user: Value sent as the ``user.name`` query parameter on every request.
            ports: Optional mapping of datanode hostname to localhost port.
                Defaults to ``DEFAULT_PORTS``; the mapping is copied.
        """
        self.namenode = namenode
        self.ports = dict(self.DEFAULT_PORTS if ports is None else ports)
        self.session = requests.Session()
        self.session.params['user.name'] = user

    def _path_url(self, hdfs_path):
        """Return the namenode WebHDFS URL for *hdfs_path*, without a query string."""
        return f"{self.namenode}/webhdfs/v1/{hdfs_path.lstrip('/')}"

    def ls(self, hdfs_path):
        """Return the ``pathSuffix`` of every entry in the LISTSTATUS reply for *hdfs_path*.

        Raises:
            KeyError: If the reply has no ``FileStatuses``/``FileStatus`` entry.
        """
        url = f'{self._path_url(hdfs_path)}?op=LISTSTATUS'
        res = self.session.get(url)
        return [status['pathSuffix'] for status in res.json()['FileStatuses']['FileStatus']]

    def mkdir(self, hdfs_path):
        """Send MKDIRS for *hdfs_path* and return the decoded JSON reply."""
        url = f'{self._path_url(hdfs_path)}?op=MKDIRS'
        res = self.session.put(url)
        return res.json()

    def write(self, hdfs_path, data, overwrite=False, buffersize=128*1024, append=False):
        """
        Write *data* to *hdfs_path* with CREATE (PUT) or, if *append*, APPEND (POST).

        Args:
            hdfs_path: Target path; leading slashes are stripped.
            data: Request body passed unchanged to ``requests``.
            overwrite: Sent as the ``overwrite`` query parameter.
            buffersize: Sent as the ``buffersize`` query parameter.
            append: Use APPEND instead of CREATE.

        Returns:
            The ``requests`` response of the datanode request. Its status is
            NOT checked here; callers should inspect ``res.ok``.

        Raises:
            ValueError: If both *overwrite* and *append* are set.
            KeyError: If the namenode reply has no ``Location`` or names a
                datanode missing from ``self.ports``.
        """
        if append and overwrite:
            raise ValueError('Cannot both overwrite and append.')

        # Step 1: ask the namenode which datanode to write to.
        op = 'APPEND' if append else 'CREATE'
        url = (f"{self._path_url(hdfs_path)}?op={op}"
               f"&noredirect=true&overwrite={str(overwrite).lower()}&buffersize={buffersize}")
        send = self.session.post if append else self.session.put
        res = send(url)

        # Step 2: rewrite the datanode address to the locally reachable one.
        location = self.transform(res.json()['Location'])

        # Step 3: send the data to the datanode.
        return send(location, data=data)

    def upload(self, hdfs_path, local_path, buffersize=128*1024):
        """
        Upload the local file *local_path* to *hdfs_path* (CREATE, no overwrite).

        Returns:
            True if the datanode answered with a non-error status, else False.
            A local read error returns False without contacting HDFS at all.
        """
        hdfs_path = hdfs_path.lstrip('/')

        # Read the local file first: a failure here has nothing to do with
        # HDFS and must never trigger a remote delete.
        try:
            with open(local_path, 'rb') as reader:
                data = reader.read()
        except OSError:
            return False

        try:
            res = self.write(hdfs_path, data, buffersize=buffersize)
        except (KeyError, ValueError):
            # The namenode reply could not be used (no Location, unknown
            # datanode, invalid JSON): no data request was sent, so there is
            # nothing to clean up.
            return False
        except requests.RequestException:
            # Transport failure: a partial file may have been left behind, so
            # try to remove it. The cleanup is best-effort.
            # NOTE(review): if the path already existed before this call, the
            # delete removes that file too -- confirm this is acceptable.
            try:
                self.delete(hdfs_path)
            except (requests.RequestException, ValueError):
                pass
            return False

        # An error status means the server rejected the request; do not
        # delete, since the path may hold a pre-existing file.
        return bool(res.ok)

    def read(self, hdfs_path, buffersize=128*1024, stream=False):
        """
        Open *hdfs_path* for reading and return the datanode response.

        Args:
            hdfs_path: Path to read; leading slashes are stripped.
            buffersize: Sent as the ``buffersize`` query parameter.
            stream: Passed to ``requests``; when True the body is not
                downloaded up front and the caller must close the response.

        Raises:
            KeyError: If the namenode reply has no ``Location`` or names a
                datanode missing from ``self.ports``.
        """
        # Step 1: ask the namenode which datanode to read from.
        url = f"{self._path_url(hdfs_path)}?op=OPEN&noredirect=true&buffersize={buffersize}"
        res = self.session.get(url)

        # Step 2: rewrite the datanode address to the locally reachable one.
        location = self.transform(res.json()['Location'])

        # Step 3: request the data from the datanode.
        return self.session.get(location, stream=stream)

    def download(self, hdfs_path, local_path, buffersize=128*1024):
        """
        Stream *hdfs_path* into the local file *local_path*.

        The local file is opened only after the datanode has answered with a
        non-error status, so a failed lookup does not truncate an existing file
        and an error body is never written to disk.

        Returns:
            True on success, False on any HTTP, transport or local I/O failure.
        """
        try:
            with self.read(hdfs_path, buffersize=buffersize, stream=True) as reader:
                if not reader.ok:
                    return False
                with open(local_path, 'wb') as writer:
                    for chunk in reader.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
                        writer.write(chunk)
            return True
        except (OSError, KeyError, ValueError, requests.RequestException):
            return False

    def delete(self, hdfs_path, isdir=False):
        """Send DELETE for *hdfs_path* (``recursive`` = *isdir*) and return the JSON reply."""
        url = f'{self._path_url(hdfs_path)}?op=DELETE&recursive={str(isdir).lower()}'
        res = self.session.delete(url)
        return res.json()

    def transform(self, location):
        """
        Rewrite the host of a datanode URL to ``localhost:<mapped port>``.

        The hostname (third ``/``-separated component, before any ``:``) is
        looked up in ``self.ports``; the rest of the URL is kept unchanged.

        Raises:
            KeyError: If the hostname is not in ``self.ports``.
        """
        parts = location.split("/")
        # partition() tolerates an authority with no explicit port.
        datanode = parts[2].partition(":")[0]
        parts[2] = f'localhost:{self.ports[datanode]}'
        return '/'.join(parts)

In [493]:
# Create a client with the defaults: namenode http://localhost:9870, user 'root'.
hdfs = Client()

#### make directory

In [474]:
# Send MKDIRS for /test; the cell shows the decoded JSON reply.
hdfs.mkdir('/test')

{'boolean': True}

#### list directory

In [475]:
# List the entry names returned for the HDFS root.
hdfs.ls('/')

['hbase', 'models', 'test']

#### write data

In [476]:
# CREATE /test/test.txt from a string, sending overwrite=true; res is the datanode response.
res = hdfs.write('/test/test.txt', 'test text file!', overwrite=True, append=False)

#### read data

In [477]:
# Read the file back; res is the datanode response, so .content is the raw body bytes.
res = hdfs.read('/test/test.txt')
res.content

b'test text file!'

#### upload data

In [478]:
import pickle

# Sample object with mixed types, used as the payload for the upload/download cells.
data = {
    'a': [1, 2.0, 3, 4+6j],
    'b': ('string', u'Unicode string'),
    'c': None
}

# Pickle it to a local file so there is something to upload.
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

In [479]:
# Upload the local data.pkl to /test/data.pkl; the cell shows upload()'s boolean result.
hdfs.upload('/test/data.pkl', 'data.pkl')

True

In [480]:
# Read the uploaded bytes back and unpickle them to check the round trip.
# NOTE: pickle.loads is only appropriate here because this notebook wrote the data itself.
res = hdfs.read('/test/data.pkl')
pickle.loads(res.content)

{'a': [1, 2.0, 3, (4+6j)], 'b': ('string', 'Unicode string'), 'c': None}

#### download data

In [481]:
# Download /test/data.pkl into the local file downloaded.pkl; the cell shows the boolean result.
hdfs.download('/test/data.pkl', 'downloaded.pkl')

False

In [482]:
# Unpickle the downloaded file and print it to compare with the original object.
with open('downloaded.pkl', 'rb') as f:
    print(pickle.load(f))

{'a': [1, 2.0, 3, (4+6j)], 'b': ('string', 'Unicode string'), 'c': None}


#### delete data

In [373]:
# List /test before deleting anything.
hdfs.ls('/test')

['data.pkl', 'test.txt', 'test2.txt']

In [376]:
# Delete a single file; isdir defaults to False, so recursive=false is sent.
hdfs.delete('/test/data.pkl')

{'boolean': False}

In [375]:
# List /test again to see the remaining entries.
hdfs.ls('/test')

['test.txt', 'test2.txt']

In [379]:
# Delete the /test directory; isdir=True sends recursive=true.
hdfs.delete('/test', isdir=True)

{'boolean': True}

In [380]:
# List the root to confirm what is left.
hdfs.ls('/')

['hbase']

### models

In [2]:
# New client with the default namenode and user for the model-storage cells.
hdfs = Client()

In [5]:
# List the entry names returned for the HDFS root.
hdfs.ls('/')

['hbase']

In [6]:
# Send MKDIRS for /models, the directory used for model files below.
hdfs.mkdir('/models')

{'boolean': True}

In [7]:
# List /models.
hdfs.ls('/models')

[]

In [4]:
# Delete /models; isdir=True sends recursive=true.
hdfs.delete('/models', isdir=True)

{'boolean': True}

In [9]:
# Upload a local model file to the matching name under /models on HDFS.
hdfs_path = '/models/2023-06-21.model'
local_path = 'models/2023-06-21.model'

# The cell shows upload()'s boolean result.
hdfs.upload(hdfs_path, local_path)

True