# Quick Start

From [Quick Start](https://spark.apache.org/docs/latest/quick-start.html)

In [17]:
# Print environment information

import sys
import os

print(f"Python version:\n\t{sys.version}")
print("\n\n")
print(f"Environment:\n\t{repr(os.environ)}")



Python version:
	3.7.0 (default, Jul 10 2018, 07:48:31) 
[Clang 9.1.0 (clang-902.0.39.2)]



Environment:
	environ({'SPARK_HOME': '/usr/local/Cellar/apache-spark/2.3.2/libexec', 'TERM_PROGRAM': 'iTerm.app', 'rvm_bin_path': '/Users/dra/.rvm/bin', 'GEM_HOME': '/Users/dra/.rvm/gems/ruby-2.5.1@ios', 'NVM_CD_FLAGS': '', 'SHELL': '/usr/local/bin/fish', 'TERM': 'xterm-color', 'OMF_CONFIG': '/Users/dra/.config/omf', 'TMPDIR': '/var/folders/60/9grv09hj6c39tv4rlmyjc4pm94m_nc/T/', 'Apple_PubSub_Socket_Render': '/private/tmp/com.apple.launchd.uy4QaOFFql/Render', 'TERM_PROGRAM_VERSION': '3.2.5', 'TERM_SESSION_ID': 'w0t0p0:7C79ECDD-608F-4371-85BB-54AD8841C156', 'SPARK_CONF_DIR': '/usr/local/Cellar/apache-spark/2.3.2/libexec/conf', 'NVM_DIR': '/Users/dra/.nvm', 'USER': 'dra', 'COMMAND_MODE': 'unix2003', 'PYSPARK_PYTHON': 'python', 'rvm_path': '/Users/dra/.rvm', 'PYSPARK_DRIVER_PYTHON': 'jupyter', 'SSH_AUTH_SOCK': '/private/tmp/com.apple.launchd.nLcGXejecb/Listeners', '__CF_USER_TEXT_ENCODING': '0x124

In [15]:
# Print the spark context.
#
# pyspark will create a SparkContext. You can configure the SparkContext by passing command line args to `pyspark`
#
# Adds test.py to the path - which can be imported later.
#
# $ pyspark --py-files test.py
#
# You can customize the Jupyter commands using the PYSPARK_DRIVER_PYTHON_OPTS environment variables.
# 
# These variables are set at: ~/.config/fish/config.fish
# set --export PYSPARK_DRIVER_PYTHON jupyter
# set --export PYSPARK_DRIVER_PYTHON_OPTS 'notebook'

spark

In [2]:
# Read a text file into an RDD of a collection of lines.
#
# NOTE: For local files, the file must also be accessible at the same path on worker nodes. Either
#       use HDFS or a network file share when operating in a cluster.

textFile = spark.read.text("data/names.txt")
print(f"Count is: {textFile.count()}")

Count is: 5


In [8]:
#
# RDDs support two types of operations:
#
# * Transformations (map, filter) - return new RDDs.
# * Actions (reduce) - collects results.
#
# Transformations are lazy - they are not executed until an action is performed.
#
# Create a new Dataset by filtering the existing one.
#

linesWithDamon = textFile.filter(textFile.value.contains("damon"))
print(f"Damon count is: {linesWithDamon.count()}")

spark.

Damon count is: 1


In [4]:
# Find the line with the most words

from pyspark.sql.functions import *

textFile.select(size(split(textFile.value, "\s+")).name("numWords")).agg(max(col("numWords"))).collect()


[Row(max(numWords)=3)]

In [5]:
# MapReduce Example
# 
# Use the explode function in select to transform a Dataset of lines into a Dataset of words, and then combine
# them with groupBy and count to compute the per-word countes in the file as a Dataset of 2 columns - "word" and "count"

wordCounts = textFile.select(explode(split(textFile.value, "\s+")).alias("word")).groupBy("word").count()

In [6]:
wordCounts.collect()

[Row(word='elizabeth', count=1),
 Row(word='kari', count=1),
 Row(word='damon', count=1),
 Row(word='grace', count=1),
 Row(word='allison', count=4),
 Row(word='lily', count=1),
 Row(word='cole', count=1)]

In [7]:
# Cache the dataset into a cluster-wide in-memory cache.
# 
# This is useful when data is accessed multiple times.

wordCounts.cache()

DataFrame[word: string, count: bigint]