# Test HDFS and Spark setup for cluster

This notebook runs a few smoke-test commands to verify that HDFS and Spark are set up correctly on the cluster.

In [1]:
# Fetch a sample text file into the home directory to use as test data.
!wget --output-document ~/holmes.txt https://www.gutenberg.org/files/1661/1661-0.txt

--2024-05-31 09:13:11--  https://www.gutenberg.org/files/1661/1661-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607504 (593K) [text/plain]
Saving to: ‘/home/cluster/holmes.txt’


2024-05-31 09:13:12 (1.17 MB/s) - ‘/home/cluster/holmes.txt’ saved [607504/607504]



In [2]:
# Verify the download by printing the first lines of the file.
# Use the same ~/holmes.txt path as the download and upload cells rather than
# a hard-coded home directory, so the check works for any user account.
!head -n 5 ~/holmes.txt

﻿The Project Gutenberg eBook of The Adventures of Sherlock Holmes,
by Arthur Conan Doyle

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions


In [14]:
# Upload dummy data to hdfs.
# -p: do not fail if /test already exists; -f: overwrite an existing copy.
# Together they make this cell safe to re-run.
!hdfs dfs -mkdir -p /test
!hdfs dfs -put -f ~/holmes.txt /test/

In [15]:
# Verify upload: list the /test directory in HDFS; holmes.txt should appear
# in the listing.
!hdfs dfs -ls /test

Found 1 items
-rw-r--r--   2 cluster supergroup     607504 2024-05-31 09:19 /test/holmes.txt


In [12]:
# Read dummy data from hdfs: stream the file back out and count its lines
# with wc as a simple read check.
!hdfs dfs -cat /test/holmes.txt | wc -l

12306


In [None]:
# Run findspark first so that pyspark can be imported from this kernel.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create the session used by the rest of the notebook (or reuse an existing one).
spark = (
    SparkSession.builder
    .appName("Test Spark and HDFS")
    .getOrCreate()
)

In [2]:
# Smoke-test plain pyspark without touching HDFS: build a DataFrame from
# range(1000) with a single "number" column and show five of the even rows.
numbers = spark.range(1000).toDF("number")
numbers.filter("number % 2 = 0").show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+
only showing top 5 rows



                                                                                

In [3]:
# Read dummy data from hdfs into a DataFrame.
# NOTE(review): the path has no scheme, so it is resolved against the
# session's default filesystem -- presumably HDFS on this cluster; confirm.
holmes_raw = spark.read.text("/test/holmes.txt")

In [4]:
# Simple word count example for dummy data.

from pyspark.sql.functions import split, col

# Build the word-count DataFrame and keep it in a variable. show() only prints
# and returns None, so it is called separately instead of ending the
# assignment chain (which would leave the variable bound to None).
word_counts = (
    holmes_raw
    .select(split(col("value"), " ").alias("sentence"))  # split each line on single spaces
    .selectExpr("explode(sentence) as word")              # one row per token
    .selectExpr("lower(word) as word")                    # count case-insensitively
    .filter("word != ''")                                 # drop empty tokens
    .groupBy("word")
    .count()
    .orderBy("count", ascending=False)                    # most frequent first
)
word_counts.show(20)

[Stage 1:>                                                          (0 + 1) / 1]

+----+-----+
|word|count|
+----+-----+
| the| 5709|
| and| 2878|
|  of| 2759|
|  to| 2721|
|   a| 2648|
|   i| 2533|
|  in| 1761|
|that| 1604|
| was| 1371|
|  he| 1278|
|  it| 1267|
| you| 1176|
| his| 1146|
|  is| 1079|
|  my|  955|
|have|  903|
|with|  869|
|  as|  848|
| had|  813|
|  at|  768|
+----+-----+
only showing top 20 rows



                                                                                

In [5]:
# Stop the Spark session now that the tests are done.
spark.stop()

In [6]:
# Remove dummy data.
# -f: do not report an error if the file is already gone, so the file removal
# can be re-run. rmdir is kept (instead of a recursive delete) so that /test
# is only removed when it is empty.
!hdfs dfs -rm -f /test/holmes.txt
!hdfs dfs -rmdir /test

Deleted /test/holmes.txt
