In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get (or reuse) a session attached to the cluster master at boss:7077,
# with each executor limited to 512 MB of memory.
spark = (
    SparkSession.builder
    .master("spark://boss:7077")
    .appName("cs544")
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/09 03:43:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# The SparkContext held by the session is the entry point for the RDD API.
sc = spark.sparkContext

In [4]:
# One million consecutive integers (0 through 999,999) as a plain Python list.
nums = [*range(1_000_000)]

In [5]:
# Distribute the local list as an RDD; the partition count is left to Spark's default.
rdd = sc.parallelize(nums)

In [6]:
# Pattern: rdd.map(lambda ARG: RETVAL)
# map is a TRANSFORMATION: it only records the function to apply to each
# element. Nothing runs yet, so the 1/0 for the element 0 is not hit here;
# it would surface only once an action forces the computation.
inverses = rdd.map(lambda n: 1 / n)

In [35]:
# inverses.collect() # ACTION get all the numbers, maybe using a lot of RAM

In [8]:
# ACTION triggers the work, including the divide by zero
# inverses.take(3) # ACTION get first N results

In [9]:
# Drop non-positive values (this removes the 0 that would cause a divide by
# zero), then invert what is left. Both steps are lazy transformations.
inverses = (
    rdd
    .filter(lambda n: n > 0)
    .map(lambda n: 1 / n)
)
inverses

PythonRDD[2] at RDD at PythonRDD.scala:53

In [10]:
# ACTION: runs the filter/map pipeline and returns the mean to the driver.
inverses.mean()

24/01/09 03:43:54 WARN TaskSetManager: Stage 1 contains a task of very large size (2332 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1.4392740115605892e-05

In [11]:
# How many partitions the RDD got when no count was requested.
rdd.getNumPartitions()

2

In [12]:
# Rebuild the RDD, this time asking for 10 partitions explicitly,
# and confirm the resulting partition count.
rdd = sc.parallelize(nums, numSlices=10)
rdd.getNumPartitions()

10

In [13]:
# Reading the progress indicator while this runs, e.g. "4 + 2 / 10":
#   4 tasks done, 2 tasks running, 10 tasks total
# Same pipeline as before, now built on the 10-partition RDD.
inverses = (
    rdd
    .filter(lambda n: n > 0)
    .map(lambda n: 1 / n)
)
inverses.mean()

                                                                                

1.4392740115605814e-05

# Sample and cache

In [14]:
# Roughly 10% sample, drawn with replacement; the fixed seed keeps it repeatable.
sample = rdd.sample(withReplacement=True, fraction=0.1, seed=544)

In [15]:
# Mark the sample for caching; it is stored when an action first computes it.
sample.cache()

PythonRDD[6] at RDD at PythonRDD.scala:53

In [16]:
%%time
# First action on the cached sample: it has to be computed before it can be stored.
sample.mean()



CPU times: user 6.11 ms, sys: 823 µs, total: 6.93 ms
Wall time: 1.1 s


                                                                                

498504.761576394

In [17]:
%%time
# Same action again; this run should be able to reuse the cached data.
sample.mean()

CPU times: user 6.04 ms, sys: 788 µs, total: 6.82 ms
Wall time: 554 ms


498504.761576394

# Repartition

In [18]:
# Same seeded 10% sample as before, then collapsed into a single partition.
sample = (
    rdd
    .sample(withReplacement=True, fraction=0.1, seed=544)
    .repartition(numPartitions=1)
)

In [23]:
# Mark the single-partition sample for caching (stored on the next action).
sample.cache()

MapPartitionsRDD[13] at coalesce at NativeMethodAccessorImpl.java:0

In [20]:
%%time
# Time the mean over the single-partition sample.
sample.mean()

CPU times: user 479 µs, sys: 4.25 ms, total: 4.73 ms
Wall time: 1.07 s


498504.7615763908

In [58]:
%%time
# Repeat the same action to compare its wall time with the previous run.
sample.mean()

CPU times: user 7.68 ms, sys: 0 ns, total: 7.68 ms
Wall time: 288 ms


498504.7615763908

In [21]:
%%time
# One more repetition of the same action, for another timing data point.
sample.mean()

CPU times: user 1.5 ms, sys: 3.04 ms, total: 4.54 ms
Wall time: 318 ms


498504.7615763908

In [22]:
# Release the cached copy of the sample now that the timing runs are done.
sample.unpersist()

MapPartitionsRDD[13] at coalesce at NativeMethodAccessorImpl.java:0

# Spark DataFrames

In [24]:
# Download the station list; -nc (no-clobber) skips the download when the
# file is already present locally.
!wget -nc https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt

--2024-01-09 03:46:22--  https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt
Resolving pages.cs.wisc.edu (pages.cs.wisc.edu)... 128.105.7.9
Connecting to pages.cs.wisc.edu (pages.cs.wisc.edu)|128.105.7.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10607756 (10M) [text/plain]
Saving to: ‘ghcnd-stations.txt.1’


2024-01-09 03:46:24 (5.99 MB/s) - ‘ghcnd-stations.txt.1’ saved [10607756/10607756]



In [27]:
# List the working directory to check that ghcnd-stations.txt is there.
!ls

demo.ipynb   demo3.ipynb  demo5.ipynb  ghcnd-stations.txt  metastore_db  sf.zip
demo2.ipynb  demo4.ipynb  derby.log    holidays2.csv	   sf.csv


In [28]:
# Define a DataFrame over the text file: one row per line, in a single string
# column named "value". NOTE: this relative path refers to the notebook's
# local filesystem, not to storage shared with the workers.
df = spark.read.text("ghcnd-stations.txt")

In [29]:
# Displaying a DataFrame shows only its schema, not its rows.
df

DataFrame[value: string]

In [30]:
# A DataFrame also exposes its contents as an RDD through .rdd.
type(df), type(df.rdd)

(pyspark.sql.dataframe.DataFrame, pyspark.rdd.RDD)

In [71]:
# The line below raises SparkFileNotFoundException: the file is in my local
# filesystem, but the workers run in different containers and do not have
# access to it.
# df.take(10)

In [31]:
!hdfs dfs -cp ghcnd-stations.txt hdfs://nn:9000/

In [32]:
# Confirm that the file is now present at the HDFS root.
!hdfs dfs -ls hdfs://nn:9000/

Found 1 items
-rw-r--r--   3 root supergroup   10607756 2024-01-09 03:47 hdfs://nn:9000/ghcnd-stations.txt


In [33]:
# Re-create the DataFrame from the HDFS copy, which the workers can reach.
df = spark.read.format("text").load("hdfs://nn:9000/ghcnd-stations.txt")

In [36]:
# Number of partitions backing the HDFS-based DataFrame.
df.rdd.getNumPartitions()

2

In [37]:
# ACTION: fetch the first 5 rows to the driver as a list of Row objects.
df.take(5)

                                                                                

[Row(value='ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       '),
 Row(value='ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    '),
 Row(value='AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196'),
 Row(value='AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194'),
 Row(value='AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217')]

In [38]:
# Limit to 10 rows before converting, so only a small pandas DataFrame is
# built on the driver.
pandas_df = df.limit(10).toPandas()

                                                                                

In [39]:
# Show the 10 converted rows.
pandas_df

Unnamed: 0,value
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...
2,AE000041196 25.3330 55.5170 34.0 SHARJ...
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...
4,AEM00041217 24.4330 54.6510 26.8 ABU D...
5,AEM00041218 24.2620 55.6090 264.9 AL AI...
6,AF000040930 35.3170 69.0170 3366.0 NORTH...
7,AFM00040938 34.2100 62.2280 977.2 HERAT...
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...


In [40]:
# toPandas() produced an ordinary pandas DataFrame, not a Spark one.
type(pandas_df)

pandas.core.frame.DataFrame

## Extract station ID using pandas

In [42]:
# The station ID is the first 11 characters of each line.
pandas_df["station"] = pandas_df["value"].str.slice(stop=11)

In [43]:
# Show the frame again, now with the extracted "station" column.
pandas_df

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217
5,AEM00041218 24.2620 55.6090 264.9 AL AI...,AEM00041218
6,AF000040930 35.3170 69.0170 3366.0 NORTH...,AF000040930
7,AFM00040938 34.2100 62.2280 977.2 HERAT...,AFM00040938
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...,AFM00040948
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...,AFM00040990


## Extract station ID using Spark

In [44]:
from pyspark.sql.functions import col, expr

In [45]:
# expr() turns a SQL expression string into a Column object. It is not
# evaluated here: no DataFrame (and no column "x") is involved yet.
expr("x + 1")

Column<'(x + 1)'>

In [46]:
# expr("SQL STUFF HERE")

In [47]:
# SQL substring(str, pos, len). Positions in Spark SQL are 1-based; a pos of
# 0 is also accepted and starts at the first character, so this selects the
# first 11 characters of "value".
expr("substring(value, 0, 11)")

Column<'substring(value, 0, 11)'>

In [49]:
# Look at a few raw lines again: the station ID is the leading 11 characters.
df.take(3)

[Row(value='ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       '),
 Row(value='ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    '),
 Row(value='AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196')]

In [53]:
# Add a "station" column holding the first 11 characters of each line.
# SQL substring positions are 1-based, so start at 1. Spark happens to treat
# a start of 0 the same way, but 1 is the documented, portable form.
df2 = df.withColumn("station", expr("substring(value, 1, 11)"))

In [54]:
# Schema check: the new "station" column sits next to "value".
df2

DataFrame[value: string, station: string]

In [55]:
# Bring 5 rows to the driver as a pandas DataFrame to check the extracted IDs.
df2.limit(5).toPandas()

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 56990)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/lib/python3.10/dist-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/usr/local/lib/python3.10/dist-packages/pyspark/accumulators.py", line 271, in accum_updates
    num_updates = read_int(