In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the SparkSession for this notebook: it connects to the
# standalone cluster master at boss:7077 and requests 512 MB per executor.
spark = (
    SparkSession.builder
    .master("spark://boss:7077")
    .appName("cs544")
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/25 14:24:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# The SparkContext is reached through the session; it is what the RDD cells below call.
sc = spark.sparkContext # low-level handle for working directly with RDDs

In [4]:
# Driver-side Python list holding the integers 0 .. 999_999 (note: it includes 0).
nums = [*range(1_000_000)]

In [5]:
# Distribute the driver-side list as an RDD. No partition count is given here, so
# Spark chooses one itself (a later cell passes 10 explicitly).
rdd = sc.parallelize(nums)

In [6]:
# map() is a transformation, so it is lazy: nothing runs here and the division by
# zero for the element 0 only surfaces once an action is executed.
inverses = rdd.map(lambda n: 1 / n)

In [7]:
# inverses.collect()  # action: returns ALL elements to the driver (left commented out: running it hits the divide by zero for x == 0)

In [8]:
# An action triggers the deferred work, which is when the divide by zero (x == 0) actually fails.
# inverses.take(10)  # action: returns only the first N elements

In [9]:
# Rebuild the pipeline so that 0 is filtered out before the division happens.
inverses = (
    rdd
    .filter(lambda n: n > 0)
    .map(lambda n: 1 / n)
)

In [10]:
# First action on this pipeline: this is the point where Spark actually runs the filter and map.
inverses.mean() # action: computes the mean across the cluster and returns it to the driver

23/10/25 14:24:19 WARN TaskSetManager: Stage 0 contains a task of very large size (2332 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1.4392740115605892e-05

In [11]:
# Same data, but explicitly split into 10 partitions so that each task carries a
# smaller slice of the list.
rdd = sc.parallelize(nums, numSlices=10)

In [12]:
# Re-create the pipeline on top of the 10-partition RDD, then run it.
inverses = (
    rdd
    .filter(lambda n: n > 0)   # skip 0 to avoid dividing by zero
    .map(lambda n: 1 / n)
)
inverses.mean()  # action

                                                                                

1.4392740115605814e-05

In [17]:
# 10% sample drawn with replacement; the fixed seed keeps the sample reproducible.
sample = rdd.sample(withReplacement=True, fraction=0.1, seed=544)

In [18]:
import time

In [20]:
# Baseline timing: `sample` has not been cached yet, so the action recomputes it.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
print(sample.mean())  # mean() is an action, so this line runs the Spark job
t1 = time.perf_counter()
t1-t0  # elapsed seconds, shown as the cell output



498504.761576394


                                                                                

3.820559501647949

In [21]:
# Mark `sample` for caching. cache() itself does not run a job; the data is only
# stored when the next action computes it.
sample.cache()

PythonRDD[6] at RDD at PythonRDD.scala:53

In [22]:
# First action after sample.cache(): presumably this run both computes the sample
# and stores it, so it is not expected to be faster than the baseline.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
print(sample.mean())  # mean() is an action, so this line runs the Spark job
t1 = time.perf_counter()
t1-t0  # elapsed seconds, shown as the cell output



498504.761576394


                                                                                

5.748225212097168

In [23]:
# Second action after sample.cache(): the cached partitions can now be reused.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
print(sample.mean())  # mean() is an action, so this line runs the Spark job
t1 = time.perf_counter()
t1-t0  # elapsed seconds, shown as the cell output



498504.761576394


                                                                                

2.614823579788208

### Caching of 1 partition

In [24]:
# Same seeded 10% sample, but collapsed into a single partition before it is
# marked for caching.
sample = (
    rdd
    .sample(withReplacement=True, fraction=0.1, seed=544)
    .repartition(1)
    .cache()
)

In [25]:
# First action on the single-partition, cache-marked RDD: it has to be computed
# (and stored) during this run.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
print(sample.mean())  # mean() is an action, so this line runs the Spark job
t1 = time.perf_counter()
t1-t0  # elapsed seconds, shown as the cell output

[Stage 7:>                                                          (0 + 1) / 1]

498504.7615763949


                                                                                

5.800291538238525

In [26]:
# Second action on the single-partition RDD: the cached partition can be reused.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
print(sample.mean())  # mean() is an action, so this line runs the Spark job
t1 = time.perf_counter()
t1-t0  # elapsed seconds, shown as the cell output

498504.7615763949


                                                                                

0.7088220119476318

In [27]:
# unpersist() is NOT lazy: it marks the RDD as non-persistent right away and asks
# the executors to drop its cached blocks. By default that removal happens
# asynchronously (blocking=False); blocking=True waits until the blocks are gone,
# so a timing taken afterwards is not racing the cleanup.
# NOTE(review): a later action can still be fast, presumably because repartition(1)
# adds a shuffle whose output Spark can reuse — confirm via skipped stages in the Spark UI.
sample.unpersist(blocking=True)

MapPartitionsRDD[13] at coalesce at NativeMethodAccessorImpl.java:0

In [30]:
# slow again?
# Timing after unpersist(): checks whether the action has to recompute the sample.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
print(sample.mean())  # mean() is an action, so this line runs the Spark job
t1 = time.perf_counter()
t1-t0  # elapsed seconds, shown as the cell output

498504.76157639234


                                                                                

0.7619664669036865

# DataFrames

In [32]:
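# One-time download of the station list; uncomment and run this if ghcnd-stations.txt is not in the working directory yet.
# ! wget https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt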

In [33]:
# Peek at the first lines of the local file: fixed-width text with no header row.
! head ghcnd-stations.txt

ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       
ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    
AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196
AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194
AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217
AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218
AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930
AFM00040938  34.2100   62.2280  977.2    HERAT                                  40938
AFM00040948  34.5660   69.2120 1791.3    KABUL INTL                             40948
AFM00040990  31.5000   65.8500 1010.0    KANDAHAR AIRPORT                       40990


In [34]:
# Define a DataFrame over the file using a path on the notebook machine's local
# file system. Each line of the file becomes one row in a single string column.
# NOTE: reading rows from this DataFrame does not work on this cluster, because the
# executors cannot see this local path (a later cell reads from HDFS instead).
df = spark.read.text("ghcnd-stations.txt")

In [35]:
# List the (column name, type) pairs: read.text yields one string column named "value".
df.dtypes

[('value', 'string')]

In [36]:
# Confirm that this is a Spark DataFrame, not a pandas one.
type(df)

pyspark.sql.dataframe.DataFrame

In [38]:
# A DataFrame exposes its rows as an RDD through the .rdd attribute.
type(df.rdd)

pyspark.rdd.RDD

In [40]:
# This won't work: df points at a path on our local file system, which the executors cannot see.
# df.take(10)   # take(n) is Spark's counterpart of "head": it returns the first n rows

In [41]:
! hdfs dfs -cp ghcnd-stations.txt hdfs://nn:9000/

In [42]:
# List the HDFS root directory to verify that the upload landed there.
! hdfs dfs -ls hdfs://nn:9000/

Found 1 items
-rw-r--r--   3 root supergroup   10607756 2023-10-25 15:15 hdfs://nn:9000/ghcnd-stations.txt


In [43]:
# Re-point df at the copy stored in HDFS, which the executors can reach.
df = spark.read.text("hdfs://nn:9000/ghcnd-stations.txt")

In [44]:
# Fetch the first 10 rows to the driver as a list of Row objects.
df.take(10)

                                                                                

[Row(value='ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       '),
 Row(value='ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    '),
 Row(value='AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196'),
 Row(value='AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194'),
 Row(value='AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217'),
 Row(value='AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218'),
 Row(value='AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930'),
 Row(value='AFM00040938  34.2100   62.2280  977.2    HERAT                                  40938'),
 Row(value='AFM00040948  34.5660   69.2120 1791.3    KABUL INTL                             40948'),
 Row(value='AFM00040990  31.5000   65.8500 1010.0    KANDAHAR AIRPORT                      

In [46]:
# How do we get the station ID (the leading 11-character code of each line) with pandas?
# limit(10) keeps the conversion small before toPandas() brings the rows to the driver.
pandas_df = df.limit(10).toPandas()
pandas_df

Unnamed: 0,value
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...
2,AE000041196 25.3330 55.5170 34.0 SHARJ...
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...
4,AEM00041217 24.4330 54.6510 26.8 ABU D...
5,AEM00041218 24.2620 55.6090 264.9 AL AI...
6,AF000040930 35.3170 69.0170 3366.0 NORTH...
7,AFM00040938 34.2100 62.2280 977.2 HERAT...
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...


In [51]:
# pandas string slicing is 0-based and end-exclusive: characters 0..10 are the
# 11-character station ID at the start of each line.
pandas_df["station"] = pandas_df["value"].str[0:11]
pandas_df.head(n=3)

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196


In [52]:
# How do we extract the station ID (the first 11 characters of each line) with Spark?

In [53]:
from pyspark.sql.functions import col, expr

In [54]:
# expr() turns a SQL expression string into a Column object; nothing is evaluated
# here and the column x does not need to exist yet.
expr("x + 1")

Column<'(x + 1)'>

In [58]:
# Add a "station" column holding the 11-character station ID at the start of each line.
# Spark SQL's substring(str, pos, len) is 1-based, so pos=1 is the first character.
# (pos=0 happens to be treated like 1, but it reads like Python slicing and hides
# the indexing difference.)
df2 = df.withColumn("station", expr("substring(value, 1, 11)"))
df2  # shows only the schema: withColumn is lazy, so no job has run yet

DataFrame[value: string, station: string]

In [59]:
# Materialise just the first 10 rows as a pandas DataFrame so they display as a table.
df2.limit(10).toPandas()

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217
5,AEM00041218 24.2620 55.6090 264.9 AL AI...,AEM00041218
6,AF000040930 35.3170 69.0170 3366.0 NORTH...,AF000040930
7,AFM00040938 34.2100 62.2280 977.2 HERAT...,AFM00040938
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...,AFM00040948
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...,AFM00040990
