In [None]:
# https://spark.apache.org/docs/latest/quick-start.html

# Interactive analysis

## Basics

In [1]:
# Load README.md as a DataFrame with a single string column, one row per line of the file.
textFile = spark.read.format("text").load("README.md")

In [5]:
!head -5 README.md

# Assignment 3: Apache Spark

The goal of this assignment is to learn how to do large-scale data analysis tasks using Apache Spark: for this assignment, we will use relatively small datasets and  we won't run anything in distributed mode; however Spark can be easily used to run the same programs on much larger datasets.

### Getting Started with Spark


In [8]:
# Printing a DataFrame shows only its schema (column names and types), not its rows.
print(textFile)

DataFrame[value: string]
DataFrame[value: string]


In [11]:
# Render the leading rows as a text table. 20 rows with long cells truncated
# are show()'s defaults; they are spelled out here for the reader.
textFile.show(n=20, truncate=True)

+--------------------+
|               value|
+--------------------+
|# Assignment 3: A...|
|                    |
|The goal of this ...|
|                    |
|### Getting Start...|
|                    |
|This guide is bas...|
|                    |
|[Apache Spark](ht...|
|                    |
|Spark can be used...|
|                    |
|### Installing Spark|
|                    |
|Since the Spark d...|
|                    |
|1. Download the S...|
|2. Move the downl...|
|`tar zxvf spark-3...|
|3. This will crea...|
+--------------------+
only showing top 20 rows



In [2]:
# Count the rows (one per line of the file) and print the total.
num_rows = textFile.count()
print(num_rows)

140


In [10]:
# collect() returns every row to the notebook as a Python list of Row objects.
# NOTE(review): fine for this small file; presumably too heavy for a large
# dataset, where take(n) or show() would be the safer preview — confirm.
textFile.collect()

[Row(value='# Assignment 3: Apache Spark'),
 Row(value=''),
 Row(value="The goal of this assignment is to learn how to do large-scale data analysis tasks using Apache Spark: for this assignment, we will use relatively small datasets and  we won't run anything in distributed mode; however Spark can be easily used to run the same programs on much larger datasets."),
 Row(value=''),
 Row(value='### Getting Started with Spark'),
 Row(value=''),
 Row(value='This guide is basically a summary of the excellent tutorials that can be found at the [Spark website](http://spark.apache.org).'),
 Row(value=''),
 Row(value='[Apache Spark](https://spark.apache.org) is a relatively new cluster computing framework, developed originally at UC Berkeley. It significantly generalizes the 2-stage Map-Reduce paradigm (originally proposed by Google and popularized by open-source Hadoop system); Spark is instead based on the abstraction of **resilient distributed datasets (RDDs)**. An RDD is basically a distribu

In [6]:
# first() returns just the first row, as a Row object.
textFile.first()

Row(value='# Assignment 3: Apache Spark')

In [12]:
# Keep only the lines that mention "Spark", then report how many there are.
mentions_spark = textFile["value"].contains("Spark")
linesWithSpark = textFile.filter(mentions_spark)
num_matching = linesWithSpark.count()
print(num_matching)

17


## More on Dataset Operations

In [25]:
from pyspark.sql.functions import *

# Split each line into words on runs of whitespace and count the words per line.
# The pattern is a raw string: in a plain literal "\s" is an invalid escape
# sequence, which newer Python versions warn about.
df = textFile.select(size(split(textFile.value, r"\s+")))
df.show()

+---------------------------+
|size(split(value, \s+, -1))|
+---------------------------+
|                          5|
|                          1|
|                         50|
|                          1|
|                          5|
|                          1|
|                         18|
|                          1|
|                         87|
|                          1|
|                         19|
|                          1|
|                          3|
|                          1|
|                         17|
|                          1|
|                         18|
|                         20|
|                          3|
|                          9|
+---------------------------+
only showing top 20 rows



In [26]:
# Per-line word count again, this time with a readable column name.
# The regex is a raw string so "\s" is not parsed as an (invalid) string escape;
# alias() is used instead of name() to match the other cells in this notebook.
df = textFile.select(size(split(textFile.value, r"\s+")).alias("numWords"))
df.show()

+--------+
|numWords|
+--------+
|       5|
|       1|
|      50|
|       1|
|       5|
|       1|
|      18|
|       1|
|      87|
|       1|
|      19|
|       1|
|       3|
|       1|
|      17|
|       1|
|      18|
|      20|
|       3|
|       9|
+--------+
only showing top 20 rows



In [27]:
# Largest per-line word count. `max` here is the Spark SQL aggregate brought in
# by the pyspark.sql.functions star import, not Python's builtin.
df.agg(max("numWords")).collect()

[Row(max(numWords)=101)]

In [32]:
# Word count in the map-reduce style: explode every line into one row per word
# (the "map" step), then group identical words and count them (the "reduce" step).
# The regex is a raw string so "\s" is not parsed as an (invalid) string escape.
wordCounts = (
    textFile
    .select(explode(split(textFile.value, r"\s+")).alias("word"))
    .groupBy("word")
    .count()
)
wordCounts.show()

+--------------------+-----+
|                word|count|
+--------------------+-----+
|                some|    3|
|                 few|    1|
|               input|    5|
|            `(user1,|    1|
|               those|    1|
|   self-explanatory,|    1|
|              [Spark|    2|
|                 map|    1|
|        Messenger]`.|    1|
|               Nobel|    2|
|website](http://s...|    1|
|           typically|    1|
|           sanitized|    1|
|               ready|    1|
|                port|    1|
|        manipulation|    1|
|         interfaces.|    1|
|                  If|    1|
|           `pyspark`|    1|
|                used|    2|
+--------------------+-----+
only showing top 20 rows



In [36]:
# Convert a dataset of lines into a dataset of words (one row per word).
# The regex is a raw string so "\s" is not parsed as an (invalid) string escape.
words_per_line = split(textFile.value, r"\s+")
map_ = textFile.select(explode(words_per_line).alias("word"))
map_.show()

+-----------+
|       word|
+-----------+
|          #|
| Assignment|
|         3:|
|     Apache|
|      Spark|
|           |
|        The|
|       goal|
|         of|
|       this|
| assignment|
|         is|
|         to|
|      learn|
|        how|
|         to|
|         do|
|large-scale|
|       data|
|   analysis|
+-----------+
only showing top 20 rows



In [40]:
# Group identical words and count each group, then bring all (word, count) rows
# back to the notebook. Use result.show() instead for a 20-row preview.
word_groups = map_.groupBy("word")
result = word_groups.count()

result.collect()

                                                                                

[Row(word='some', count=3),
 Row(word='few', count=1),
 Row(word='input', count=5),
 Row(word='`(user1,', count=1),
 Row(word='those', count=1),
 Row(word='self-explanatory,', count=1),
 Row(word='[Spark', count=2),
 Row(word='map', count=1),
 Row(word='Messenger]`.', count=1),
 Row(word='Nobel', count=2),
 Row(word='website](http://spark.apache.org).', count=1),
 Row(word='typically', count=1),
 Row(word='sanitized', count=1),
 Row(word='ready', count=1),
 Row(word='port', count=1),
 Row(word='manipulation', count=1),
 Row(word='interfaces.', count=1),
 Row(word='If', count=1),
 Row(word='`pyspark`', count=1),
 Row(word='used', count=2),
 Row(word='basically', count=3),
 Row(word='Application', count=2),
 Row(word='Spark](https://spark.apache.org)', count=1),
 Row(word='(we', count=1),
 Row(word='local', count=2),
 Row(word='returns', count=4),
 Row(word='present', count=1),
 Row(word='`spark_assignment.py`', count=2),
 Row(word='assignment,', count=1),
 Row(word='consisting', count=4

# RDD Programming guide

In [43]:
class DisplayRDD:
    """Notebook helper that renders an RDD as an HTML table, one column per partition.

    Jupyter calls ``_repr_html_`` when an instance is the last expression of a
    cell, so ``DisplayRDD(rdd)`` shows which records live in which partition.
    """

    def __init__(self, rdd):
        """Store the wrapped RDD; the constructor itself triggers no computation."""
        self.rdd = rdd

    def _repr_html_(self):
        """Collect the RDD partition by partition and return it as an HTML table.

        The header row holds one ``Partition <i>`` cell per partition and the
        body row holds one bulleted list of that partition's records. Every
        element is properly closed, all body cells carry the same alignment
        attributes, record text is HTML-escaped, and an empty partition
        produces an empty list rather than a stray bullet.

        Returns:
            str: the HTML markup for the table.
        """
        from html import escape

        # Tag every partition's records with the partition index so the
        # grouping survives collect().
        partitions = self.rdd.mapPartitionsWithIndex(
            lambda index, records: [(index, list(records))]
        ).collect()

        header_cells = "".join(
            "<th>Partition {}</th>".format(index) for index, _ in partitions
        )
        body_cells = "".join(
            '<td valign="bottom" align="left"><ul>{}</ul></td>'.format(
                "".join("<li>{}</li>".format(escape(str(record))) for record in records)
            )
            for _, records in partitions
        )
        return "<table><tr>{}</tr><tr>{}</tr></table>".format(header_cells, body_cells)

In [47]:
# Distribute the integers 0..19 across Spark's default number of partitions.
data = [n for n in range(20)]

data_rdd = sc.parallelize(data)
print(data_rdd)

# As the cell's last expression, this renders the per-partition table.
DisplayRDD(data_rdd)

ParallelCollectionRDD[117] at readRDDFromFile at PythonRDD.scala:274


Partition 0,Partition 1,Partition 2,Partition 3,Partition 4,Partition 5
12,345,678,91011,121314,1516171819


In [48]:
# Same data, this time explicitly split into 10 partitions.
data = [n for n in range(20)]

data_rdd = sc.parallelize(data, numSlices=10)
print(data_rdd)

DisplayRDD(data_rdd)

ParallelCollectionRDD[119] at readRDDFromFile at PythonRDD.scala:274


Partition 0,Partition 1,Partition 2,Partition 3,Partition 4,Partition 5,Partition 6,Partition 7,Partition 8,Partition 9
1,23,45,67,89,1011,1213,1415,1617,1819


In [51]:
# Read states.txt as an RDD with one record per line, asking for 10 partitions.
states_rdd = sc.textFile('states.txt', minPartitions=10)
print(states_rdd)
DisplayRDD(states_rdd)

states.txt MapPartitionsRDD[127] at textFile at NativeMethodAccessorImpl.java:0


Partition 0,Partition 1,Partition 2,Partition 3,Partition 4,Partition 5,Partition 6,Partition 7,Partition 8,Partition 9
AlabamaHawaiiMassachusettsNew MexicoSouth Dakota,AlaskaIdahoMichiganNew YorkTennesseeArizona,IllinoisMinnesotaNorth CarolinaTexas,ArkansasIndianaMississippiNorth DakotaUtah,CaliforniaIowaMissouriOhioVermontColorado,KansasMontanaOklahomaVirginiaConnecticutKentucky,NebraskaOregonWashingtonDelawareLouisiana,NevadaPennsylvaniaWest VirginiaFlorida,MaineNew HampshireRhode IslandWisconsinGeorgia,MarylandNew JerseySouth CarolinaWyoming


## Basics

In [54]:
# Neither `lines` nor `lineLengths` is computed here: transformations are lazy.
lines = sc.textFile("states.txt")
lineLengths = lines.map(len)
# reduce is an action, so this is the call that triggers the execution.
totalLength = lineLengths.reduce(lambda acc, n: acc + n)
print(totalLength)

422


## Passing functions.

In [58]:
def myFunc(s):
    """Return the number of pieces obtained by splitting ``s`` on single spaces."""
    return len(s.split(" "))


# Pass the named function straight to map(); Spark ships it to the workers.
lines = sc.textFile("states.txt")
lineLengths = lines.map(myFunc)
# reduce is an action and triggers the execution.
totalLength = lineLengths.reduce(lambda acc, n: acc + n)
print(totalLength)

60


In [61]:
# Anti-pattern demonstration: trying to accumulate into a driver-side global
# from inside a function that Spark runs via foreach().
counter = 0
# `data` is the list built in an earlier cell.
print(data)
rdd = sc.parallelize(data)

# Wrong: Don't do this!!
def increment_counter(x):
    """Add ``x`` to the module-level ``counter`` (kept only to show why this fails)."""
    global counter
    counter += x
rdd.foreach(increment_counter)

# Prints 0: the executors update their own copy of `counter`, so the variable
# in this notebook process is never changed.
print("Counter value: ", counter)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Counter value:  0


In [66]:
# Union the file with itself so every state name occurs twice, then count the
# occurrences of each name.
lines = sc.textFile("states.txt").union(sc.textFile("states.txt"))
pairs = lines.map(lambda name: (name, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)
print(counts.collect())

[('Tennessee', 2), ('Arizona', 2), ('North Dakota', 2), ('Nebraska', 2), ('Washington', 2), ('West Virginia', 2), ('New Hampshire', 2), ('Maryland', 2), ('New Jersey', 2), ('South Carolina', 2), ('Alabama', 2), ('Massachusetts', 2), ('Michigan', 2), ('Mississippi', 2), ('Utah', 2), ('Iowa', 2), ('Missouri', 2), ('Ohio', 2), ('Montana', 2), ('Connecticut', 2), ('Kentucky', 2), ('Nevada', 2), ('Rhode Island', 2), ('Georgia', 2), ('Hawaii', 2), ('New Mexico', 2), ('Illinois', 2), ('Minnesota', 2), ('North Carolina', 2), ('Texas', 2), ('Arkansas', 2), ('Indiana', 2), ('Vermont', 2), ('Colorado', 2), ('Kansas', 2), ('Oregon', 2), ('Delaware', 2), ('Louisiana', 2), ('Florida', 2), ('Maine', 2), ('South Dakota', 2), ('Alaska', 2), ('Idaho', 2), ('New York', 2), ('California', 2), ('Oklahoma', 2), ('Virginia', 2), ('Pennsylvania', 2), ('Wisconsin', 2), ('Wyoming', 2)]


## Pi

In [77]:
# Estimate π (compute-intensive task).
# Pick random points in the unit square [(0,0)-(1,1)].
# See how many fall in the unit circle center=(0, 0), radius=1.
# The fraction should be π / 4.

import random

SEED = 314
NUM_SAMPLES = int(1e6)

# Seeding the global generator only affects this (driver) process. Spark runs
# `sample` in separate Python worker processes that do not share that state, so
# on its own this call does not make the estimate reproducible. It is kept for
# any driver-side use of `random`; the workers are seeded per partition below.
random.seed(SEED)


def sample(p, rng=random):
    """Draw one point uniformly from the unit square and test it against the unit circle.

    Args:
        p: Ignored; present so the function can be mapped over a sequence.
        rng: Object providing ``random()``; defaults to the ``random`` module.

    Returns:
        1 if the point lies strictly inside the unit circle, otherwise 0.
    """
    x, y = rng.random(), rng.random()
    return 1 if x * x + y * y < 1 else 0


def count_hits(partition_index, elements):
    """Yield the number of in-circle samples for one partition.

    Each partition gets its own generator seeded from ``SEED`` and the
    partition index, so a rerun with the same partitioning draws the same
    points. A different number of partitions gives a different estimate.
    """
    rng = random.Random(SEED + partition_index)
    yield sum(sample(p, rng) for p in elements)


# “parallelize” method creates an RDD.
count = (
    sc.parallelize(range(NUM_SAMPLES))
    .mapPartitionsWithIndex(count_hits)
    .reduce(lambda a, b: a + b)
)
approx_pi = 4.0 * count / NUM_SAMPLES
print("pi is roughly %f" % approx_pi)

pi is roughly 3.141400


## Working with key-value pairs

In [84]:
!more data.txt

One a penny, two a penny, hot cross buns


In [83]:
# Word count, step by step: split every line into words, pair each word with 1,
# add up the ones per word, and fetch the totals.
lines = (
    sc.textFile("data.txt")
    .flatMap(lambda text: text.split(" "))
)
pairs = lines.map(lambda word: (word, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)
result = counts.collect()
print(result)

[('One', 1), ('two', 1), ('hot', 1), ('cross', 1), ('a', 2), ('penny,', 2), ('buns', 1)]


In [85]:
# The same word count as one chained expression. Parentheses replace the
# backslash continuations, and collect() is restored: with it commented out,
# `result` was an RDD and print() showed its description rather than the
# (word, count) pairs.
result = (
    sc.textFile("data.txt")
    .flatMap(lambda line: line.split(" "))
    .map(lambda s: (s, 1))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)
print(result)

[('One', 1), ('two', 1), ('hot', 1), ('cross', 1), ('a', 2), ('penny,', 2), ('buns', 1)]


In [None]:
## Transformations

In [None]:
## Actions

In [None]:
## Shuffle operation

In [None]:
## RDD persistence

In [None]:
## Shared variables

In [None]:
# Spark SQL

In [None]:
# Structured Streaming

In [None]:
# MLlib

In [None]:
# GraphX

# pyspark

https://spark.apache.org/docs/latest/api/python/getting_started/index.html

In [90]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting numpy>=1.21.0
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, pandas
Successfully installed numpy-1.23.5 pandas-1.5.2
[0m

In [88]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one, otherwise create it.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x7fdce815d840>


In [91]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Build a small DataFrame from explicit Row objects. No schema is passed, so
# Spark derives the column types from the Python values.
rows = [
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(rows)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]