# PART 1

In [1]:
# ! wget https://pages.cs.wisc.edu/~harter/cs639/data/hdma-wi-2021.csv

In [2]:
# Copy hdma-wi-2021.csv into HDFS as single.csv, requesting 1048576-byte (1 MiB) blocks and a replication factor of 1.
! hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=1 -cp hdma-wi-2021.csv hdfs://main:9000/single.csv

In [3]:
# Copy the same CSV into HDFS as double.csv, again with 1048576-byte (1 MiB) blocks but a replication factor of 2.
! hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=2 -cp hdma-wi-2021.csv hdfs://main:9000/double.csv

In [4]:
# Show disk usage for everything under the HDFS root in human-readable units (-h).
! hdfs dfs -du -h hdfs://main:9000/

166.8 M  333.7 M  hdfs://main:9000/double.csv
166.8 M  166.8 M  hdfs://main:9000/single.csv


# PART 2

In [5]:
import requests, math, io, time

In [6]:
#! curl -i "http://main:9870/webhdfs/v1/single.csv?op=OPEN&offset=0"
# resp = requests.get("http://main:9870/webhdfs/v1/single.csv?op=OPEN&offset=0", allow_redirects=False)
# resp.headers
# resp.headers["Location"]
# new_resp = requests.get(resp.headers["Location"])

In [7]:
# Ask the NameNode for the file's metadata so the block size is read from HDFS
# instead of being hard-coded (single.csv was uploaded with 1 MiB = 1048576-byte blocks).
file_status = requests.get(
    "http://main:9870/webhdfs/v1/single.csv?op=GETFILESTATUS",
    allow_redirects=False,
).json()["FileStatus"]
block_bytes = file_status.get("blockSize") or 1048576  # fall back to the upload-time block size
# NOTE: despite its name, block_size holds the NUMBER of blocks in the file.
block_size = math.ceil(file_status["length"] / block_bytes)

# Map each DataNode URL to the number of blocks of single.csv it serves.
urls = {}
for i in range(block_size):
    offset = block_bytes * i  # first byte of block i
    # With redirects disabled, the NameNode's reply to OPEN carries a Location
    # header pointing at the DataNode that holds the block at this offset.
    resp = requests.get(f"http://main:9870/webhdfs/v1/single.csv?op=OPEN&offset={offset}", allow_redirects=False)

    # Keep only the part before "?" so every block on one DataNode shares a key.
    datanode_url = resp.headers["Location"].split("?")[0]
    urls[datanode_url] = urls.get(datanode_url, 0) + 1
urls

{'http://478292fd6023:9864/webhdfs/v1/single.csv': 85,
 'http://e4fe85ff4375:9864/webhdfs/v1/single.csv': 82}

# PART 3

In [24]:
class hdfsFile(io.RawIOBase):
    """Read-only raw byte stream over an HDFS file, fetched through WebHDFS.

    Intended to be wrapped in ``io.BufferedReader``; each ``readinto`` call
    issues one OPEN request to the NameNode and follows its redirect to a
    DataNode.

    Attributes:
        path: File path relative to the WebHDFS root (e.g. ``"single.csv"``).
        offset: Byte position of the next read.
        length: Total file length in bytes, as reported by GETFILESTATUS.
        block_size: HDFS block size of the file in bytes (1 MiB if unreported).
    """

    def __init__(self, path):
        """Look up the file's length and block size on the NameNode.

        Args:
            path: File path relative to the WebHDFS root.
        """
        super().__init__()
        self.path = path
        self.offset = 0
        status = requests.get(f"http://main:9870/webhdfs/v1/{self.path}?op=GETFILESTATUS", allow_redirects=False).json()["FileStatus"]
        self.length = status["length"]
        # Needed to skip exactly one block when a block turns out to be lost.
        self.block_size = status.get("blockSize") or 1024**2

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def readinto(self, b):
        """Fill ``b`` with the next bytes of the file.

        Args:
            b: Writable bytes-like buffer supplied by the caller.

        Returns:
            Number of bytes written into ``b``; 0 at end of file. When the
            NameNode does not redirect (the block is unavailable), a single
            newline is written and the stream skips to the next block.

        Raises:
            requests.HTTPError: If the DataNode answers with an error status.
        """
        if self.offset >= self.length:
            return 0

        redirect = requests.get(f"http://main:9870/webhdfs/v1/{self.path}?op=OPEN&offset={self.offset}&length={len(b)}", allow_redirects=False)

        # PART 4: anything other than a 307 redirect means no DataNode can
        # serve this block. Jump to the start of the next block (correct even
        # if the current offset is not block-aligned) and hand back a newline
        # so line-based consumers keep working.
        if redirect.status_code != 307:
            self.offset = (self.offset // self.block_size + 1) * self.block_size
            b[:1] = b"\n"
            return 1

        resp = requests.get(redirect.headers["Location"])
        resp.raise_for_status()  # never pass an error body off as file data
        # Use the raw payload: decoding to text and re-encoding can change the
        # byte count when a multi-byte character straddles a read boundary.
        data = resp.content[:len(b)]
        b[:len(data)] = data
        self.offset += len(data)  # advance by what was actually delivered
        return len(data)

In [9]:
# Time one full pass over single.csv through a 1 MiB BufferedReader, tallying
# the lines that mention "Single Family" and the lines that mention "Multifamily".
single_family_count_1MB = 0
multi_family_count_1MB = 0

start = time.time()
for raw_line in io.BufferedReader(hdfsFile("single.csv"), buffer_size=1024 * 1024):  # 1MB
    row = raw_line.decode("utf-8")
    # A bool adds as 0 or 1, so each tally only grows when its label is present.
    single_family_count_1MB += "Single Family" in row
    multi_family_count_1MB += "Multifamily" in row
elapsed = time.time() - start

print("Counts from single.csv using 1MB buffer_size")
print("Single Family:", single_family_count_1MB)
print("Multi Family:", multi_family_count_1MB)
print("Seconds:", elapsed)

Counts from single.csv using 1MB buffer_size
Single Family: 444874
Multi Family: 2493
Seconds: 14.653716802597046


In [10]:
# Time one full pass over single.csv through a 2 MiB BufferedReader, tallying
# the lines that mention "Single Family" and the lines that mention "Multifamily".
single_family_count_2MB = 0
multi_family_count_2MB = 0

start = time.time()
# 2 * 1024**2 (not 2 * 1000**2) so the buffer is exactly two 1048576-byte HDFS
# blocks: reads stay block-aligned and the size is consistent with the 1MB run.
for line in io.BufferedReader(hdfsFile("single.csv"), buffer_size=2 * 1024**2):  # 2MB
    line = str(line, "utf-8")

    if "Single Family" in line:
        single_family_count_2MB += 1

    if "Multifamily" in line:
        multi_family_count_2MB += 1

end = time.time()
elapsed = end - start

print("Counts from single.csv using 2MB buffer_size")
print("Single Family:", single_family_count_2MB)
print("Multi Family:", multi_family_count_2MB)
print("Seconds:", elapsed)

Counts from single.csv using 2MB buffer_size
Single Family: 444874
Multi Family: 2493
Seconds: 5.959940195083618


# PART 4

In [15]:
# Print the cluster report (capacity, usage, block health and the list of live DataNodes).
!hdfs dfsadmin -fs hdfs://main:9000/ -report

Configured Capacity: 51642105856 (48.10 GB)
Present Capacity: 36001053820 (33.53 GB)
DFS Remaining: 35472068608 (33.04 GB)
DFS Used: 528985212 (504.48 MB)
DFS Used%: 1.47%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 172.18.0.4:9866 (project-3-cam-worker-2.cs544net)
Hostname: e4fe85ff4375
Decommission Status : Normal
Configured Capacity: 25821052928 (24.05 GB)
DFS Used: 262822741 (250.65 MB)
Non DFS Used: 7805433003 (7.27 GB)
DFS Remaining: 17736019968 (16.52 GB)
DFS Used%: 1.02%
DFS Remaining%: 68.69%
Configu

In [25]:
# Tally the lines of double.csv that mention "Single Family" and "Multifamily",
# reading through a 1 MiB BufferedReader.
double_single_family_count = 0
double_multi_family_count = 0

for raw_line in io.BufferedReader(hdfsFile("double.csv"), buffer_size=1024 * 1024):  # 1MB
    row = raw_line.decode("utf-8")
    # A bool adds as 0 or 1, so each tally only grows when its label is present.
    double_single_family_count += "Single Family" in row
    double_multi_family_count += "Multifamily" in row

print("Counts from double.csv")
print("Single Family:", double_single_family_count)
print("Multi Family:", double_multi_family_count)

Counts from double.csv
Single Family: 444874
Multi Family: 2493


In [26]:
# Tally the lines of single.csv that mention "Single Family" and "Multifamily",
# reading through a 1 MiB BufferedReader.
single_single_family_count = 0
single_multi_family_count = 0

for raw_line in io.BufferedReader(hdfsFile("single.csv"), buffer_size=1024 * 1024):  # 1MB
    row = raw_line.decode("utf-8")
    # A bool adds as 0 or 1, so each tally only grows when its label is present.
    single_single_family_count += "Single Family" in row
    single_multi_family_count += "Multifamily" in row

print("Counts from single.csv")
print("Single Family:", single_single_family_count)
print("Multi Family:", single_multi_family_count)

Counts from single.csv
Single Family: 218589
Multi Family: 993
