## Test Spark setup for cluster

Here we run a few test commands to verify that Spark and HDFS access work on the cluster.

In [None]:
# findspark must be initialised before pyspark is imported.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Test Spark and HDFS")
    .getOrCreate()
)

In [7]:
# Smoke test for plain pyspark: build a 1000-row DataFrame with a single
# "number" column and display the first five even values.
myRange = spark.range(1000).toDF("number")
myRange.filter("number % 2 = 0").show(n=5)

[Stage 3:>                                                          (0 + 1) / 1]

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+
only showing top 5 rows



                                                                                

In [8]:
# Load the dummy text file from HDFS.
holmes_raw = spark.read.format("text").load("/test/holmes.txt")

In [10]:
# Simple word count example for the dummy data.

from pyspark.sql.functions import split, col

# Build the word-count DataFrame first and display it in a separate step:
# DataFrame.show() returns None, so chaining it into the assignment would
# leave `wc` bound to None instead of the result.
wc = (
    holmes_raw
    # Split each line on single spaces into an array of tokens.
    .select(split(col("value"), " ").alias("sentence"))
    # Produce one row per token.
    .selectExpr("(explode(sentence)) as word")
    # Lower-case so that differently-cased spellings are counted together.
    .selectExpr("lower(word) as word")
    # Drop the empty tokens.
    .filter("word != ''")
    .groupBy("word")
    .count()
    .orderBy("count", ascending=False)
)
wc.show(20)

[Stage 6:>                                                          (0 + 1) / 1]

+----+-----+
|word|count|
+----+-----+
| the| 5709|
| and| 2878|
|  of| 2759|
|  to| 2721|
|   a| 2648|
|   i| 2533|
|  in| 1761|
|that| 1604|
| was| 1371|
|  he| 1278|
|  it| 1267|
| you| 1176|
| his| 1146|
|  is| 1079|
|  my|  955|
|have|  903|
|with|  869|
|  as|  848|
| had|  813|
|  at|  768|
+----+-----+
only showing top 20 rows



                                                                                

In [11]:
# Stop the SparkSession used for these tests.
spark.stop()