In [None]:
# 1. Read and display the core components of Hadoop from a configuration file:
def read_hadoop_config(config_file):
    """Return the property names declared in a Hadoop XML configuration file.

    Args:
        config_file: Path to an XML file whose ``<name>`` elements hold the
            entries of interest.

    Returns:
        A list with the whitespace-stripped text of every non-empty
        ``<name>`` element, in document order.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Imported locally so this function does not rely on file-level imports.
    import xml.etree.ElementTree as ET

    # A real XML parser replaces the previous per-line string matching:
    # ``line.startswith("<name>")`` missed indented tags, and
    # ``str.strip("<name>")`` removes a *set of characters* (so it ate leading
    # n/a/m/e letters of the value and left ``</name>`` attached).
    tree = ET.parse(config_file)
    core_components = []
    for element in tree.iter("name"):
        text = (element.text or "").strip()
        if text:
            core_components.append(text)
    return core_components

# Driver: parse the local file "hadoop-config.xml" and print each entry
# returned by read_hadoop_config() on its own line under a heading.
config_file = "hadoop-config.xml"
core_components = read_hadoop_config(config_file)
print("Core components of Hadoop:")
for component in core_components:
    print(component)


In [None]:
# 2. Calculate the total file size in a Hadoop Distributed File System (HDFS) directory:
import subprocess

def get_directory_size(directory):
    """Return the size that ``hadoop fs -du -s`` reports for *directory*.

    Args:
        directory: HDFS path to summarize.

    Returns:
        The first whitespace-separated field of the command output, as an int.

    Raises:
        FileNotFoundError: If the ``hadoop`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If the command exits non-zero.
        IndexError, ValueError: If the command prints nothing, or its first
            field is not an integer.
    """
    # An argument list (no shell) delivers the path to hadoop as exactly one
    # argument, so spaces or shell metacharacters in it can neither split the
    # argument nor inject additional commands.
    cmd = ["hadoop", "fs", "-du", "-s", str(directory)]
    output = subprocess.check_output(cmd)
    return int(output.decode().split()[0])

# Driver: query the hard-coded HDFS directory "/user/data" and print the
# value returned by get_directory_size().
hdfs_directory = "/user/data"
total_size = get_directory_size(hdfs_directory)
print(f"Total file size in {hdfs_directory}: {total_size} bytes")


In [None]:
# 3. Extract and display the top N most frequent words from a large text file using the MapReduce approach:
from pyspark import SparkContext

def get_top_words(text_file, n):
    """Return the *n* most frequent whitespace-separated words in *text_file*.

    Args:
        text_file: Path or URI passed to ``SparkContext.textFile``.
        n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count.
    """
    sc = SparkContext("local", "WordCount")
    try:
        word_counts = (
            sc.textFile(text_file)
            # split() with no argument splits on any run of whitespace and
            # never yields "", so repeated spaces or tabs are not counted as
            # a word (split(" ") produced empty-string "words").
            .flatMap(lambda line: line.split())
            .map(lambda word: (word, 1))
            .reduceByKey(lambda a, b: a + b)
        )
        # Negating the count makes takeOrdered (ascending) return the largest.
        return word_counts.takeOrdered(n, key=lambda pair: -pair[1])
    finally:
        # Always release the context: creating a second SparkContext while
        # this one is still active fails, so a repeat call would break.
        sc.stop()

# Driver: compute the top 10 words of "large_text.txt" and print each
# (word, count) pair returned by get_top_words() as "word: count".
text_file = "large_text.txt"
n = 10
top_words = get_top_words(text_file, n)
print(f"Top {n} most frequent words:")
for word, count in top_words:
    print(f"{word}: {count}")


In [None]:
# 4. Check the health status of the NameNode and DataNodes in a Hadoop cluster using the daemons' JMX HTTP endpoints:
import requests

def check_health_status(timeout=10):
    """Fetch the ``State`` reported by the NameNode and DataNode JMX endpoints.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(namenode_status, datanode_status)`` tuple holding the ``State``
        attribute of the first bean in each JMX response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP response.
        IndexError, KeyError: If a response contains no beans, or the first
            bean has no ``State`` attribute.
    """
    namenode_url = "http://namenode:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
    # NOTE(review): confirm that the DataNodeInfo bean actually exposes a
    # "State" attribute on the target Hadoop version.
    datanode_url = "http://datanode:50075/jmx?qry=Hadoop:service=DataNode,name=DataNodeInfo"

    def fetch_state(url):
        # Without an explicit timeout, requests may wait forever on an
        # unresponsive daemon.
        response = requests.get(url, timeout=timeout)
        # Surface HTTP errors here instead of as a JSON/KeyError further down.
        response.raise_for_status()
        return response.json()['beans'][0]['State']

    namenode_status = fetch_state(namenode_url)
    datanode_status = fetch_state(datanode_url)
    return namenode_status, datanode_status

# Driver: query both daemons once and print the two values returned by
# check_health_status().
nn_status, dn_status = check_health_status()
print("Health Status:")
print(f"NameNode: {nn_status}")
print(f"DataNode: {dn_status}")


In [None]:
# 5. List all the files and directories in a specific HDFS path:
import subprocess

def list_hdfs_path(hdfs_path):
    """Return the entry paths that ``hadoop fs -ls`` prints for *hdfs_path*.

    Args:
        hdfs_path: HDFS file or directory to list.

    Returns:
        A list of path strings, one per listed entry, in output order.

    Raises:
        FileNotFoundError: If the ``hadoop`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    # Argument list instead of a shell string: the path reaches hadoop as a
    # single argument and cannot inject shell commands.
    output = subprocess.check_output(["hadoop", "fs", "-ls", str(hdfs_path)])

    file_list = []
    for line in output.decode().splitlines():
        # Skip blank lines and the "Found N items" summary by content rather
        # than by position: the summary is absent when a single file is
        # listed, and slicing [1:-1] then discarded the only real entry.
        if not line.strip() or line.startswith("Found "):
            continue
        # An entry has 8 columns (permissions, replication, owner, group,
        # size, date, time, path); capping the split at 7 keeps a path that
        # contains spaces in one piece.
        file_list.append(line.split(None, 7)[-1])
    return file_list

# Driver: list the hard-coded HDFS path "/user/data" and print each entry
# returned by list_hdfs_path() on its own line.
hdfs_path = "/user/data"
files = list_hdfs_path(hdfs_path)
print("Files and directories:")
for file in files:
    print(file)


In [None]:
# 6. Analyze the storage utilization of DataNodes in a Hadoop cluster and identify the nodes with the highest and lowest storage utilization:
import requests

def analyze_data_nodes(timeout=10):
    """Rank the FSDatasetState beans of a DataNode by storage utilization.

    Utilization is ``DfsUsed / Capacity`` for each bean returned by the JMX
    query; a bean reporting zero capacity is treated as 0.0 utilization.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(highest, lowest)`` tuple of dicts with the keys ``node``,
        ``capacity``, ``used``, ``remaining`` and ``utilization``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP response.
        IndexError: If the response contains no beans.
        KeyError: If a bean lacks one of the expected attributes.
    """
    datanodes_url = "http://datanode:50075/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState-*"
    response = requests.get(datanodes_url, timeout=timeout)
    response.raise_for_status()
    datanodes = response.json()['beans']
    if not datanodes:
        # Same exception type the unguarded [0] lookup used to raise, but
        # with a message that says what is actually wrong.
        raise IndexError("JMX response contained no FSDatasetState beans")

    storage_utilization = []
    for datanode in datanodes:
        capacity = float(datanode['Capacity'])
        used = float(datanode['DfsUsed'])
        storage_utilization.append({
            'node': datanode['name'].split('=')[-1],
            'capacity': datanode['Capacity'],
            'used': datanode['DfsUsed'],
            'remaining': datanode['Remaining'],
            # Guard against ZeroDivisionError when no capacity is reported.
            'utilization': used / capacity if capacity else 0.0,
        })

    # Descending order: first element is the most utilized, last the least.
    storage_utilization.sort(key=lambda info: info['utilization'], reverse=True)
    return storage_utilization[0], storage_utilization[-1]

# Driver: print the node name and utilization (as a percentage with two
# decimals) of the two entries returned by analyze_data_nodes().
highest_util, lowest_util = analyze_data_nodes()
print("Storage Utilization:")
print(f"Highest Utilization: {highest_util['node']} ({highest_util['utilization']*100:.2f}%)")
print(f"Lowest Utilization: {lowest_util['node']} ({lowest_util['utilization']*100:.2f}%)")


In [None]:
# 7. Interact with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output:
import requests

def submit_hadoop_job(jar_path, input_path, output_path, timeout=10):
    """Request a new YARN application id and submit a job under it.

    Args:
        jar_path: Jar passed to ``hadoop jar`` in the container command.
        input_path: File registered as the ``input`` local resource (sent as
            a ``file://`` URI).
        output_path: Output location passed as the last command argument.
        timeout: Seconds to wait for each ResourceManager HTTP call.

    Returns:
        The ``application-id`` string issued by the ResourceManager.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response from either call.
    """
    apps_url = "http://resourcemanager:8088/ws/v1/cluster/apps"

    response = requests.post(f"{apps_url}/new-application", timeout=timeout)
    response.raise_for_status()
    application_id = response.json()['application-id']

    # NOTE(review): the payload shape is kept exactly as before; confirm the
    # "local-resources" layout and the relative file:// URI against the
    # ResourceManager REST documentation for the target Hadoop version.
    submit_payload = {
        "application-id": application_id,
        "application-name": "MyHadoopJob",
        "am-container-spec": {
            "commands": {
                "command": f"hadoop jar {jar_path} input {output_path}"
            },
            "local-resources": {
                "resource": [
                    {
                        "name": "input",
                        "type": "FILE",
                        "visibility": "APPLICATION",
                        "uri": f"file://{input_path}"
                    }
                ]
            }
        },
        "unmanaged-AM": False
    }
    # The documented submit operation is a POST to the apps collection itself
    # (the application id travels in the body); there is no per-application
    # ".../{id}/submit" resource.
    submit_response = requests.post(apps_url, json=submit_payload, timeout=timeout)
    # Previously a rejected submission went unnoticed and the id of a job
    # that never started was returned.
    submit_response.raise_for_status()

    return application_id

def monitor_job_progress(application_id, timeout=10):
    """Fetch the current state of a YARN application (single snapshot).

    Args:
        application_id: Id of the application to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(state, final_status)`` tuple taken from the ``app`` object of
        the ResourceManager response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP response.
        KeyError: If the response lacks the expected fields.
    """
    url = f"http://resourcemanager:8088/ws/v1/cluster/apps/{application_id}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Decode the body once and read both fields from the same object.
    app = response.json()['app']
    return app['state'], app['finalStatus']

def retrieve_job_output(output_path, application_id=None, timeout=10):
    """Return the stdout log text of an application's first listed container.

    Args:
        output_path: Unused; kept so existing callers keep working.
        application_id: Application whose containers are queried. When
            omitted, the module-level ``application_id`` is used, which is
            what this function implicitly depended on before.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The response body of the container's ``stdout`` log page.

    Raises:
        NameError: If no id is passed and no module-level ``application_id``
            exists.
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP response.
        IndexError, KeyError: If no container is listed or the response lacks
            the expected fields.
    """
    if application_id is None:
        # Backward compatibility with callers that only pass output_path and
        # rely on the global assigned by the submitting code.
        try:
            application_id = globals()['application_id']
        except KeyError:
            raise NameError("name 'application_id' is not defined") from None

    # NOTE(review): endpoint paths and the "containers" response key are kept
    # as before; confirm them against the ResourceManager/NodeManager REST
    # documentation for the target Hadoop version.
    url = f"http://resourcemanager:8088/ws/v1/cluster/apps/{application_id}/containers"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    container_id = response.json()['containers'][0]['id']

    output_url = f"http://node:8042/node/containerlogs/{container_id}/stdout"
    log_response = requests.get(output_url, timeout=timeout)
    log_response.raise_for_status()
    return log_response.text

# Driver: submit the job, take one status snapshot, fetch the container
# stdout, and print everything.
jar_path = "myjob.jar"
input_path = "input.txt"
output_path = "output"
# application_id must stay a module-level name: retrieve_job_output() looks
# it up as a global when called with only output_path.
application_id = submit_hadoop_job(jar_path, input_path, output_path)
# monitor_job_progress() is called exactly once, immediately after
# submission, so the printed state is a single snapshot rather than the
# result of polling until completion.
state, final_status = monitor_job_progress(application_id)
job_output = retrieve_job_output(output_path)

print("Job Progress:")
print(f"Application ID: {application_id}")
print(f"State: {state}")
print(f"Final Status: {final_status}")

print("Job Output:")
print(job_output)


In [None]:
# 8. Interact with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution:
import requests

def submit_hadoop_job(jar_path, input_path, output_path, num_executors, executor_memory, timeout=10):
    """Request a new YARN application id and submit a job with resource settings.

    Args:
        jar_path: Jar passed to ``hadoop jar`` in the container command.
        input_path: File registered as the ``input`` local resource (sent as
            a ``file://`` URI).
        output_path: Output location passed as the last command argument.
        num_executors: Value sent as the payload's ``instances`` field.
        executor_memory: Value sent as the ``SPARK_EXECUTOR_MEMORY``
            environment variable.
        timeout: Seconds to wait for each ResourceManager HTTP call.

    Returns:
        The ``application-id`` string issued by the ResourceManager.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response from either call.
    """
    apps_url = "http://resourcemanager:8088/ws/v1/cluster/apps"

    response = requests.post(f"{apps_url}/new-application", timeout=timeout)
    response.raise_for_status()
    application_id = response.json()['application-id']

    # NOTE(review): apart from "instances", the payload shape is kept exactly
    # as before; confirm the placement of "resource", "instances" and
    # "environment" against the ResourceManager REST documentation.
    submit_payload = {
        "application-id": application_id,
        "application-name": "MyHadoopJob",
        "am-container-spec": {
            "commands": {
                "command": f"hadoop jar {jar_path} input {output_path}"
            },
            "local-resources": {
                "resource": [
                    {
                        "name": "input",
                        "type": "FILE",
                        "visibility": "APPLICATION",
                        "uri": f"file://{input_path}"
                    }
                ]
            },
            "resource": {
                "vCores": 1,
                "memory": 1024
            },
            # Previously hard-coded to 1, which silently ignored the
            # num_executors argument supplied by the caller.
            "instances": num_executors,
            "environment": {
                "variables": {
                    "SPARK_EXECUTOR_MEMORY": executor_memory
                }
            }
        },
        "unmanaged-AM": False
    }
    # The documented submit operation is a POST to the apps collection itself
    # (the application id travels in the body); there is no per-application
    # ".../{id}/submit" resource.
    submit_response = requests.post(apps_url, json=submit_payload, timeout=timeout)
    # Previously a rejected submission went unnoticed and the id of a job
    # that never started was returned.
    submit_response.raise_for_status()

    return application_id

def monitor_resource_usage(application_id, timeout=10):
    """Sum the memory and vcores allocated to an application's containers.

    Args:
        application_id: Id of the application whose containers are queried.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(total_memory, total_vcores)`` tuple: the sums of each listed
        container's ``allocatedMB`` and ``allocatedVCores`` (both 0 when no
        containers are listed).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP response.
        KeyError: If the response or a container lacks the expected fields.
    """
    # NOTE(review): endpoint path and response keys are kept as before;
    # confirm them against the ResourceManager REST documentation.
    url = f"http://resourcemanager:8088/ws/v1/cluster/apps/{application_id}/containers"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    containers = response.json()['containers']

    total_memory = sum(container['allocatedMB'] for container in containers)
    total_vcores = sum(container['allocatedVCores'] for container in containers)
    return total_memory, total_vcores

# Driver: submit a job with 4 executors / "4g" executor memory, then print
# the totals returned by monitor_resource_usage().
# NOTE: the five-argument submit_hadoop_job defined in this cell shares its
# name with the three-argument version defined earlier in the file, so
# whichever definition ran last is the one called here.
jar_path = "myjob.jar"
input_path = "input.txt"
output_path = "output"
num_executors = 4
executor_memory = "4g"
application_id = submit_hadoop_job(jar_path, input_path, output_path, num_executors, executor_memory)
# Resource usage is sampled once, immediately after submission.
total_memory, total_vcores = monitor_resource_usage(application_id)

print("Resource Usage:")
print(f"Application ID: {application_id}")
print(f"Total Memory Allocated: {total_memory} MB")
print(f"Total vCores Allocated: {total_vcores}")


In [None]:
# 9. Compare the performance of a MapReduce job with different input split sizes, showcasing the impact on the overall job execution time:
import subprocess
import time

def run_mapreduce_job(input_file, split_size):
    """Run ``mapreduce.jar`` once and return how long the command took.

    Args:
        input_file: Value passed after the ``input`` argument.
        split_size: Value passed after the ``split`` argument.

    Returns:
        Elapsed time of the ``hadoop jar`` command in seconds, as a float.

    Raises:
        FileNotFoundError: If the ``hadoop`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    # Argument list instead of a shell string: each value reaches hadoop as
    # exactly one argument and cannot inject shell commands.
    cmd = [
        "hadoop", "jar", "mapreduce.jar",
        "input", str(input_file),
        "split", str(split_size),
    ]
    # perf_counter() is monotonic, so the measurement cannot be skewed (or
    # go negative) if the system clock is adjusted while the job runs.
    start_time = time.perf_counter()
    subprocess.check_output(cmd)
    return time.perf_counter() - start_time

# Driver: run the job once per split size and print each elapsed time.
# The split sizes are passed to the job verbatim; their unit is not stated
# here and is whatever mapreduce.jar expects.
input_file = "large_input.txt"
split_sizes = [64, 128, 256]
for split_size in split_sizes:
    execution_time = run_mapreduce_job(input_file, split_size)
    print(f"Execution time for split size {split_size}: {execution_time} seconds")
