# Quick Start

Based on the Apache Spark [Quick Start](https://spark.apache.org/docs/latest/quick-start.html) guide.

In [2]:
# Load the text file as a Dataset and report how many rows it has.

textFile = (
    spark.read
    .text("data/names.txt")
)
print(f"Count is: {textFile.count()}")

Count is: 5


In [3]:
# Derive a new Dataset that keeps only the rows whose `value` column contains "damon".

containsDamon = textFile["value"].contains("damon")
linesWithDamon = textFile.where(containsDamon)
print(f"Damon count is: {linesWithDamon.count()}")

Damon count is: 1


In [4]:
# Find the line with the most words.
#
# NOTE: the star import below rebinds the builtin name `max` to
# pyspark.sql.functions.max, which is the function called in this cell.
# The star import is kept because later cells use the bare names it provides
# (e.g. `explode`, `split`).

from pyspark.sql.functions import *

# Split each line on runs of whitespace, count the words per line, then take
# the largest count. The pattern is a raw string so that `\s` reaches the regex
# engine verbatim instead of being an invalid Python string escape.
(
    textFile
    .select(size(split(textFile.value, r"\s+")).alias("numWords"))
    .agg(max(col("numWords")))
    .collect()
)


[Row(max(numWords)=3)]

In [5]:
# MapReduce example
#
# Use the explode function in select to transform a Dataset of lines into a Dataset of words, and then combine
# them with groupBy and count to compute the per-word counts in the file as a Dataset of two columns: "word" and "count".
#
# The split pattern is a raw string so that `\s` is passed to the regex engine as-is rather than being
# treated as an (invalid) Python string escape sequence.

wordCounts = (
    textFile
    .select(explode(split(textFile.value, r"\s+")).alias("word"))
    .groupBy("word")
    .count()
)

In [7]:
# Run the aggregation and show every (word, count) row; the result is displayed as a list of Row objects.
wordCounts.collect()

[Row(word='elizabeth', count=1),
 Row(word='kari', count=1),
 Row(word='damon', count=1),
 Row(word='grace', count=1),
 Row(word='allison', count=4),
 Row(word='lily', count=1),
 Row(word='cole', count=1)]