In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the session for app "cs544", pointing at the cluster master
# on boss:7077 and giving each executor 512M of memory.
spark = (
    SparkSession.builder
    .master("spark://boss:7077")
    .appName("cs544")
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/25 18:10:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Grab the SparkContext behind the session; the RDD cells below go through it.
sc = spark.sparkContext # entry point for RDD stuff

In [4]:
# Build the integers 0..999,999 as a plain Python list in the notebook process,
# then hand them to Spark as an RDD (partition count left to the default).
nums = [*range(1_000_000)]
rdd = sc.parallelize(nums)

In [5]:
# TRANSFORMATION: only records the computation; nothing runs until an ACTION.
# rdd contains 0, so evaluating this will hit a divide by zero
# (the filtered version further down avoids it).
inverses = rdd.map(lambda x: 1/x) # TRANSFORMATION

In [6]:
# inverses.collect() # ACTION to get all the numbers, maybe using a lot of RAM

In [7]:
# ACTION triggers the work, including the divide by zero!
# inverses.take(10)  # ACTION to get first N results

In [8]:
# Keep only positive values before inverting, so 1/n can never divide by zero.
# Still just TRANSFORMATIONS: displaying the RDD shows its description, not data.
inverses = (
    rdd
    .filter(lambda n: n > 0)
    .map(lambda n: 1 / n)
)
inverses

PythonRDD[1] at RDD at PythonRDD.scala:53

In [9]:
# Unlike filter/map above, this actually runs the job and returns a number.
inverses.mean() # mean is an ACTION

23/10/25 18:24:58 WARN TaskSetManager: Stage 0 contains a task of very large size (2332 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1.4392740115605892e-05

In [10]:
# How many partitions did parallelize() pick when none were requested?
rdd.getNumPartitions()

2

In [11]:
# Rebuild the RDD from the same list, this time asking for 10 partitions explicitly.
rdd = sc.parallelize(nums, numSlices=10)
rdd.getNumPartitions()

10

In [12]:
# Reading Spark's progress bar, e.g. "(4 + 2) / 10":
#   4  -> tasks done
#   2  -> tasks running
#   10 -> tasks total
inverses = (
    rdd
    .filter(lambda n: n > 0)
    .map(lambda n: 1 / n)
)
inverses.mean()

                                                                                

1.4392740115605814e-05

In [14]:
# Draw a 10% sample with replacement; the fixed seed makes the sample repeatable.
sample = rdd.sample(withReplacement=True, fraction=0.1, seed=544)

In [15]:
import time

In [17]:
# Baseline timing: `sample` has not been cached, so this action computes it from rdd.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time (time.time() is wall-clock and can jump if the system clock changes).
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0  # elapsed seconds



498504.761576394


                                                                                

3.643348455429077

In [18]:
# Mark `sample` to be kept around after it is computed. This returns the RDD itself;
# caching is lazy, so the data is only stored when a later ACTION evaluates it.
sample.cache()

PythonRDD[7] at RDD at PythonRDD.scala:53

In [19]:
# First action after sample.cache(): the sample still has to be computed here, and
# this run additionally has to store it (presumably why it is not faster yet).
# perf_counter() is the monotonic clock intended for measuring elapsed time.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0  # elapsed seconds



498504.761576394


                                                                                

6.206595420837402

In [20]:
# Second action after sample.cache(): the sample's partitions can now be read back
# from the cache instead of being recomputed.
# perf_counter() is the monotonic clock intended for measuring elapsed time.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0  # elapsed seconds



498504.761576394


                                                                                

3.3982625007629395

In [21]:
# Same seeded 10% sample as before, but collapsed into one partition and marked
# for caching, so later actions run as a single task.
sample = (
    rdd.sample(withReplacement=True, fraction=0.1, seed=544)
    .repartition(numPartitions=1)
    .cache()
)

In [22]:
# First action on the repartitioned, cache-marked sample: this run has to compute
# the sample, shuffle it into one partition, and populate the cache.
# perf_counter() is the monotonic clock intended for measuring elapsed time.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0  # elapsed seconds

[Stage 7:>                                                          (0 + 1) / 1]

498504.7615763901


                                                                                

5.471904516220093

In [23]:
# Second action on the repartitioned sample: the single cached partition can be
# reused, so only the mean itself has to be computed.
# perf_counter() is the monotonic clock intended for measuring elapsed time.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0  # elapsed seconds

[Stage 9:>                                                          (0 + 1) / 1]

498504.7615763901


                                                                                

0.9089465141296387

# Spark DataFrames

In [24]:
! wget https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt

--2023-10-25 18:34:59--  https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt
Resolving pages.cs.wisc.edu (pages.cs.wisc.edu)... 128.105.7.9
Connecting to pages.cs.wisc.edu (pages.cs.wisc.edu)|128.105.7.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10607756 (10M) [text/plain]
Saving to: ‘ghcnd-stations.txt’


2023-10-25 18:35:01 (11.4 MB/s) - ‘ghcnd-stations.txt’ saved [10607756/10607756]



In [26]:
# Load the file as text: one row per line, in a single string column named "value".
# NOTE(review): this is a path relative to the notebook's working directory; on a
# multi-node cluster the executors presumably need to see the same file — confirm.
# The cells below copy the file into HDFS and read it from there instead.
df = spark.read.text("ghcnd-stations.txt")

In [27]:
# Displaying a DataFrame shows only its schema (column names and types), not rows.
df

DataFrame[value: string]

In [28]:
# A DataFrame exposes the RDD underneath it through the .rdd attribute.
type(df), type(df.rdd)

(pyspark.sql.dataframe.DataFrame, pyspark.rdd.RDD)

In [30]:
# df.take(10)

In [31]:
! hdfs dfs -cp ghcnd-stations.txt hdfs://nn:9000/

In [32]:
# Confirm the copy by listing the HDFS root directory.
! hdfs dfs -ls hdfs://nn:9000/

Found 1 items
-rw-r--r--   3 root supergroup   10607756 2023-10-25 18:39 hdfs://nn:9000/ghcnd-stations.txt


In [33]:
# Re-create df from the HDFS copy (replaces the DataFrame built from the local path).
df = spark.read.text("hdfs://nn:9000/ghcnd-stations.txt")

In [36]:
# Peek at the first lines of the raw local file to see what each row looks like.
!head ghcnd-stations.txt

ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       
ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    
AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196
AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194
AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217
AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218
AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930
AFM00040938  34.2100   62.2280  977.2    HERAT                                  40938
AFM00040948  34.5660   69.2120 1791.3    KABUL INTL                             40948
AFM00040990  31.5000   65.8500 1010.0    KANDAHAR AIRPORT                       40990


In [35]:
# ACTION: bring the first 10 rows back to the notebook as a list of Row objects.
df.take(10)

                                                                                

[Row(value='ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       '),
 Row(value='ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    '),
 Row(value='AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196'),
 Row(value='AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194'),
 Row(value='AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217'),
 Row(value='AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218'),
 Row(value='AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930'),
 Row(value='AFM00040938  34.2100   62.2280  977.2    HERAT                                  40938'),
 Row(value='AFM00040948  34.5660   69.2120 1791.3    KABUL INTL                             40948'),
 Row(value='AFM00040990  31.5000   65.8500 1010.0    KANDAHAR AIRPORT                       40990')]

In [38]:
# limit(10) keeps the conversion small: only 10 rows are pulled into a pandas
# DataFrame in the notebook process.
pandas_df = df.limit(10).toPandas()
pandas_df

Unnamed: 0,value
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...
2,AE000041196 25.3330 55.5170 34.0 SHARJ...
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...
4,AEM00041217 24.4330 54.6510 26.8 ABU D...
5,AEM00041218 24.2620 55.6090 264.9 AL AI...
6,AF000040930 35.3170 69.0170 3366.0 NORTH...
7,AFM00040938 34.2100 62.2280 977.2 HERAT...
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...


In [39]:
# extract station ID using pandas

In [44]:
# The station ID is the first 11 characters of each line; slice it off with the
# pandas string accessor and store it as a new column.
pandas_df["station"] = pandas_df["value"].str.slice(stop=11)
pandas_df

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217
5,AEM00041218 24.2620 55.6090 264.9 AL AI...,AEM00041218
6,AF000040930 35.3170 69.0170 3366.0 NORTH...,AF000040930
7,AFM00040938 34.2100 62.2280 977.2 HERAT...,AFM00040938
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...,AFM00040948
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...,AFM00040990


In [40]:
# extract station ID using Spark

In [46]:
from pyspark.sql.functions import col, expr

In [47]:
# expr() turns a SQL expression string into a Column object; nothing is evaluated
# here (no DataFrame is involved yet).
expr("x + 1")

Column<'(x + 1)'>

In [None]:
# expr("SQL STUFF HERE")

In [48]:
# Spark SQL substring(str, pos, len) uses 1-based positions, so the first 11
# characters start at position 1. (Position 0 is tolerated and behaves like 1,
# but that is a leniency, not the documented way to address the first character.)
expr("substring(value, 1, 11)")

Column<'substring(value, 0, 11)'>

In [49]:
# Add a "station" column holding the first 11 characters of each line.
# Spark SQL substring(str, pos, len) is 1-based, so start at position 1
# (position 0 happens to give the same result, but only through lenient handling).
df2 = df.withColumn("station", expr("substring(value, 1, 11)"))

In [50]:
# Check the schema: the original "value" column plus the new "station" column.
df2

DataFrame[value: string, station: string]

In [51]:
# Preview 10 rows as a pandas DataFrame to compare with the pandas-only result above.
df2.limit(10).toPandas()

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217
5,AEM00041218 24.2620 55.6090 264.9 AL AI...,AEM00041218
6,AF000040930 35.3170 69.0170 3366.0 NORTH...,AF000040930
7,AFM00040938 34.2100 62.2280 977.2 HERAT...,AFM00040938
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...,AFM00040948
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...,AFM00040990
