# Installing Java 8
Hadoop is a Java-based data processing framework, so a Java runtime must be installed before Hadoop can run.

OpenJDK is a development environment for building applications, applets, and components using the Java programming language.

In [1]:
# Install the headless OpenJDK 8 JDK quietly (-qq); stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Show which Java version is the default before the alternatives are switched.
!java -version

# Point the java, javac and jps alternatives at the OpenJDK 8 binaries.
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!update-alternatives --set javac /usr/lib/jvm/java-8-openjdk-amd64/bin/javac
!update-alternatives --set jps /usr/lib/jvm/java-8-openjdk-amd64/bin/jps
# Confirm that `java` now resolves to version 1.8.
!java -version

# Find the default Java path: resolve the /usr/bin/java symlink chain and
# strip the trailing "bin/java" from the result.
!readlink -f /usr/bin/java | sed "s:bin/java::"
# Install the OpenSSH server quietly and start the ssh service.
# NOTE(review): presumably needed because the Hadoop start scripts used later
# reach the local daemons over SSH - confirm.
!apt-get install openssh-server -qq > /dev/null
!service ssh start

# Show the Port-related lines of the sshd configuration.
!grep Port /etc/ssh/sshd_config

# Create a new RSA key pair with an empty passphrase.
# The here-string feeds "y" to ssh-keygen's stdin (presumably to answer the
# overwrite prompt when the key file already exists).
!ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa <<< y

# Show the content of the public key.
!more /root/.ssh/id_rsa.pub

# Copy the public key to authorized_keys (">" overwrites any existing file).
!cat $HOME/.ssh/id_rsa.pub > $HOME/.ssh/authorized_keys
# Restrict the permissions on authorized_keys to owner read/write only.
!chmod 0600 ~/.ssh/authorized_keys

# Connect to the local machine to verify key-based login works;
# StrictHostKeyChecking=no skips the interactive host-key confirmation.
!ssh -o StrictHostKeyChecking=no localhost uptime

# Alternative download locations (not used by the commands below):
# https://archive.apache.org/dist/hadoop/common/hadoop-3.2.3/hadoop-3.2.3.tar.gz
# mirror download: http://mirrors.cloud.aliyuncs.com/apache/hadoop/common/hadoop-3.2.4/hadoop-3.2.4.tar.gz

# Download the Hadoop 3.2.4 tarball quietly (-q) into the current directory.
!wget -q https://downloads.apache.org/hadoop/common/hadoop-3.2.4/hadoop-3.2.4.tar.gz

# Extract the gzip-compressed archive; this creates ./hadoop-3.2.4/
!sudo tar -xzf hadoop-3.2.4.tar.gz
# Remove the tarball now that it has been extracted.
!rm hadoop-3.2.4.tar.gz


# Copy the Hadoop files to /usr/local (result: /usr/local/hadoop-3.2.4).
!cp -r hadoop-3.2.4/ /usr/local/
# -r copies directories recursively

# Add the JAVA_HOME directory to hadoop-env.sh: sed appends (a) an
# "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" line after every line
# that contains "export JAVA_HOME=", editing the file in place (-i).
!sed -i '/export JAVA_HOME=/a export JAVA_HOME=\/usr\/lib\/jvm\/java-8-openjdk-amd64' /usr/local/hadoop-3.2.4/etc/hadoop/hadoop-env.sh

import os

# Environment variables read by the shell commands in the following cells
# (`!cmd` and `%%bash` subprocesses inherit os.environ).
os.environ["HADOOP_HOME"] = "/usr/local/hadoop-3.2.4"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"

# Extend PATH with the Java and Hadoop script directories.
# The entries are joined with os.pathsep: a plain `PATH += "dir1:dir2"` would
# glue "dir1" onto the last existing entry and leave it unreachable.
# Entries already present are skipped so re-running the cell does not grow PATH.
path_entries = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p]
for extra_dir in (
    f'{os.environ["JAVA_HOME"]}/bin',
    f'{os.environ["JRE_HOME"]}/bin',
    f'{os.environ["HADOOP_HOME"]}/sbin',
):
    if extra_dir not in path_entries:
        path_entries.append(extra_dir)
os.environ["PATH"] = os.pathsep.join(path_entries)

# Download a sample text (saved as 101.txt in the current directory).
# NOTE(review): no later cell shown here reads 101.txt - the MapReduce section
# downloads its own input as wordcount.txt; confirm whether this is still needed.
!wget -q https://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/0/101/101.txt

openjdk version "11.0.26" 2025-01-21
OpenJDK Runtime Environment (build 11.0.26+4-post-Ubuntu-1ubuntu122.04)
OpenJDK 64-Bit Server VM (build 11.0.26+4-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/javac to provide /usr/bin/javac (javac) in manual mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jps to provide /usr/bin/jps (jps) in manual mode
openjdk version "1.8.0_442"
OpenJDK Runtime Environment (build 1.8.0_442-8u442-b06~us1-0ubuntu1~22.04-b06)
OpenJDK 64-Bit Server VM (build 25.442-b06, mixed mode)
/usr/lib/jvm/java-8-openjdk-amd64/jre/
 * Starting OpenBSD Secure Shell server sshd
   ...done.
#Port 22
#GatewayPorts no
Generating public/private rsa key pair.
Created directory '/root/.ssh'.
Your identification has been saved in /root/.ssh/id_rsa
Your public key has been saved in

In [2]:
%%bash
# Add the required property to core-site.xml.
# (%%bash must be the first line of the cell: IPython only recognises a cell
# magic there, so this comment lives below it.)
# The heredoc overwrites the file and sets fs.defaultFS to hdfs://localhost:9000.
cat <<EOF > $HADOOP_HOME/etc/hadoop/core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
          <name>fs.defaultFS</name>
          <value>hdfs://localhost:9000</value>
          <description>Where HDFS NameNode can be found on the network</description>
  </property>
</configuration>
EOF

In [3]:
%%bash
# Overwrite hdfs-site.xml with a configuration that sets dfs.replication to 1.
cat <<EOF > $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>

</configuration>
EOF

In [4]:
%%bash
# Overwrite mapred-site.xml: run MapReduce on YARN and set the application classpath.
# The heredoc delimiter is unquoted, so bash expands $HADOOP_HOME while writing:
# the file ends up containing the value HADOOP_HOME has in this environment,
# not the literal text "$HADOOP_HOME".
cat <<EOF > $HADOOP_HOME/etc/hadoop/mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*</value>
  </property>

</configuration>
EOF

In [5]:
%%bash
# Overwrite yarn-site.xml with three properties: the ResourceManager hostname
# (localhost), the mapreduce_shuffle auxiliary service for the NodeManager,
# and the NodeManager environment-variable whitelist.
cat <<EOF > $HADOOP_HOME/etc/hadoop/yarn-site.xml
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<property>
    <description>The hostname of the RM.</description>
    <name>yarn.resourcemanager.hostname</name>
    <value>localhost</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
  </property>

<!-- Site specific YARN configuration properties -->

</configuration>
EOF

In [6]:
# Print core-site.xml to verify that the configuration file was written as intended.
!cat $HADOOP_HOME/etc/hadoop/core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
          <name>fs.defaultFS</name>
          <value>hdfs://localhost:9000</value>
          <description>Where HDFS NameNode can be found on the network</description>
  </property>
</configuration>


# Formatting the HDFS Filesystem

Before HDFS can be used for the first time, the file system must be formatted. The formatting process creates an empty file system by creating the storage directories and the initial versions of the NameNode's persistent data structures.

In [7]:
!$HADOOP_HOME/bin/hdfs namenode -format

#Creating other necessary enviroment variables before starting nodes
os.environ["HDFS_NAMENODE_USER"] = "root"
os.environ["HDFS_DATANODE_USER"] = "root"
os.environ["HDFS_SECONDARYNAMENODE_USER"] = "root"
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

#Launching hdfs deamons
!$HADOOP_HOME/sbin/start-dfs.sh

#Launching yarn deamons
#nohup causes a process to ignore a SIGHUP signal
!nohup $HADOOP_HOME/sbin/start-yarn.sh

#Listing the running deamons
!jps

2025-03-13 17:03:25,608 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = d5cdf3b79686/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.2.4
STARTUP_MSG:   classpath = /usr/local/hadoop-3.2.4/etc/hadoop:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/jetty-webapp-9.4.43.v20210629.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/kerby-pkix-1.0.1.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/error_prone_annotations-2.2.0.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/commons-compress-1.21.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/checker-qual-2.5.2.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/commons-io-2.8.0.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/httpcore-4.4.13.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar:/usr/local/hadoop-3.2.4/share/hadoop/common/lib

### Monitoring Hadoop cluster with hadoop admin commands

In [8]:
# Report basic file system information and statistics: overall capacity and
# usage, followed by a section for each live DataNode.
!$HADOOP_HOME/bin/hdfs dfsadmin -report

Configured Capacity: 115658190848 (107.72 GB)
Present Capacity: 73758453760 (68.69 GB)
DFS Remaining: 73758429184 (68.69 GB)
DFS Used: 24576 (24 KB)
DFS Used%: 0.00%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 127.0.0.1:9866 (localhost)
Hostname: d5cdf3b79686
Decommission Status : Normal
Configured Capacity: 115658190848 (107.72 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 41882959872 (39.01 GB)
DFS Remaining: 73758429184 (68.69 GB)
DFS Used%: 0.00%
DFS Remaining%: 63.77%
Configured Cache Capacity: 0 (0 B)
Cache

# MapReduce

In [9]:
#Dowloading text example to use as input (if it has not been donwloaded yet)
!wget -q https://raw.githubusercontent.com/ayyoubmaul/hadoop-docker/refs/heads/main/data/uud45.txt -O wordcount.txt

# Create a directory in HDFS and upload the file
!$HADOOP_HOME/bin/hdfs dfs -mkdir /word_count
!$HADOOP_HOME/bin/hdfs dfs -put /content/wordcount.txt /word_count

#Exploring Hadoop folder
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count

Found 1 items
-rw-r--r--   1 root supergroup       1412 2025-03-13 17:04 /word_count/wordcount.txt


In [10]:
#Exploring the created output directory
#part-r-00000 contains the actual ouput
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count/output_wordcount

ls: `/word_count/output_wordcount': No such file or directory


# Mapper and Reducer

In [11]:
%%writefile mapper.py
#!/usr/bin/env python
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()
    for word in words:
        print('%s\t%s' % (word, 1))

Writing mapper.py


In [12]:
%%writefile reducer.py
#!/usr/bin/env python
import sys

current_word = None
current_count = 0
word = None

for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    try:
        count = int(count)
    except ValueError:
        continue

    if current_word == word:
        current_count += count
    else:
        if current_word:
            print('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

if current_word == word:
    print('%s\t%s' % (current_word, current_count))

Writing reducer.py


In [13]:
!chmod +x /content/mapper.py
!chmod +x /content/reducer.py

In [14]:
# Run the MapReduce job
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.2.4.jar \
  -input /word_count/wordcount.txt \
  -output /word_count/output \
  -mapper "python /content/mapper.py" \
  -reducer "python /content/reducer.py"

packageJobJar: [/tmp/hadoop-unjar7922688624283937706/] [] /tmp/streamjob2485452831778438858.jar tmpDir=null
2025-03-13 17:04:22,681 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
2025-03-13 17:04:23,074 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
2025-03-13 17:04:23,511 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1741885439730_0001
2025-03-13 17:04:23,979 INFO mapred.FileInputFormat: Total input files to process : 1
2025-03-13 17:04:24,132 INFO mapreduce.JobSubmitter: number of splits:2
2025-03-13 17:04:24,880 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1741885439730_0001
2025-03-13 17:04:24,882 INFO mapreduce.JobSubmitter: Executing with tokens: []
2025-03-13 17:04:25,192 INFO conf.Configuration: resource-types.xml not found
2025-03-13 17:04:25,193 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2025-03-13 17:04:25,6

In [15]:
# Copy output from HDFS to local
!$HADOOP_HOME/bin/hdfs dfs -copyToLocal /word_count/output/part-00000 /content/output_word_count.txt

# View the output
!cat /content/output_word_count.txt

:	1
Allah	1
Atas	1
Bahwa	1
Dan	1
Dasar	1
Esa,	1
Indonesia	8
Indonesia,	3
Indonesia.	1
Kebangsaan	1
Kemerdekaan	1
Kemudian	1
Ketuhanan	1
Kuasa	1
Maha	2
Negara	3
Pemerintah	1
Republik	1
Undang-Undang	1
Yang	2
abadi	1
adil	2
bagi	1
bangsa	2
bangsa,	1
bebas,	1
beradab,	1
berbahagia	1
berdasar	1
berdasarkan	1
berdaulat,	1
berkat	1
berkedaulatan	1
berkehidupan	1
bersatu,	1
dalam	3
dan	10
darah	1
daripada	1
dengan	6
depan	1
diatas	1
didorongkan	1
dihapuskan,	1
dipimpin	1
disusunlah	1
dunia	2
gerbang	1
hak	1
harus	1
hikmat	1
ialah	1
ikut	1
ini	1
itu	3
itu,	1
karena	1
ke	1
keadilan	2
kebangsaan	1
kebijaksanaan	1
kehidupan	1
keinginan	1
kemanusiaan	1
kemerdekaan	3
kemerdekaan,	1
kemerdekaannya.	1
kepada	2
kerakyatan	1
kesejahteraan	1
ketertiban	1
luhur,	1
maka	3
makmur.	1
melaksanakan	1
melindungi	1
memajukan	1
membentuk	1
mencerdaskan	1
mengantarkan	1
menyatakan	1
merdeka,	1
mewujudkan	1
negara	1
oleh	3
penjajahan	1
perdamaian	1
pergerakan	1
perikeadilan.	1
perikemanusiaan	1
perjuangan	1
permus