In [1]:
from pyspark.sql import SparkSession
# We don't construct SparkSession directly: SparkSession.builder is configured through chained calls
# (appName/master/config each set an attribute and return the builder), and getOrCreate() returns the session.
# This cell is slow because it has to set up the JVM (and other steps).
spark = (SparkSession.builder.appName("cs544")
         .master("spark://boss:7077")              # URL of the Spark master to connect to
         .config("spark.executor.memory", "512M")  # sets the spark.executor.memory option
         .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/02 03:49:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# keep a handle on the SparkContext behind the session (the startup log above refers to it as `sc`)
sc = spark.sparkContext

In [3]:
# download the stations file into the working directory; -nc (no-clobber) skips the download if the file is already there
!wget -nc https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt

--2023-11-02 03:51:18--  https://pages.cs.wisc.edu/~harter/cs544/data/ghcnd-stations.txt
Resolving pages.cs.wisc.edu (pages.cs.wisc.edu)... 128.105.7.9
Connecting to pages.cs.wisc.edu (pages.cs.wisc.edu)|128.105.7.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10607756 (10M) [text/plain]
Saving to: ‘ghcnd-stations.txt’


2023-11-02 03:51:19 (14.8 MB/s) - ‘ghcnd-stations.txt’ saved [10607756/10607756]



In [6]:
# read the file as plain text: the resulting DataFrame has a single string column named "value"
df = spark.read.text("ghcnd-stations.txt")
df

DataFrame[value: string]

In [7]:
# a Spark DataFrame exposes an RDD through its .rdd attribute
type(df), type(df.rdd)

(pyspark.sql.dataframe.DataFrame, pyspark.rdd.RDD)

In [None]:
# running the line below reports that the file doesn't exist ---> that's because we need to put it in HDFS so that the Spark workers can access it
# df.take(10)

In [10]:
! hdfs dfs -cp ghcnd-stations.txt hdfs://nn:9000/

In [12]:
# list the root of HDFS: now we can tell that the file is in HDFS
! hdfs dfs -ls hdfs://nn:9000/

Found 1 items
-rw-r--r--   3 root supergroup   10607756 2023-11-02 04:06 hdfs://nn:9000/ghcnd-stations.txt


In [13]:
# read the same file again, this time from HDFS; this replaces the earlier df that pointed at the local path
df = spark.read.text("hdfs://nn:9000/ghcnd-stations.txt")

In [15]:
# print the first lines of the local copy so we can compare them with what Spark reads
! head ghcnd-stations.txt

ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       
ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    
AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196
AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194
AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217
AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218
AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930
AFM00040938  34.2100   62.2280  977.2    HERAT                                  40938
AFM00040948  34.5660   69.2120 1791.3    KABUL INTL                             40948
AFM00040990  31.5000   65.8500 1010.0    KANDAHAR AIRPORT                       40990


In [16]:
# take(10) brings back the first 10 rows as a list of Row objects
df.take(10)  # we can see the rows from the file in our DataFrame

[Row(value='ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       '),
 Row(value='ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    '),
 Row(value='AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196'),
 Row(value='AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194'),
 Row(value='AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217'),
 Row(value='AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218'),
 Row(value='AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930'),
 Row(value='AFM00040938  34.2100   62.2280  977.2    HERAT                                  40938'),
 Row(value='AFM00040948  34.5660   69.2120 1791.3    KABUL INTL                             40948'),
 Row(value='AFM00040990  31.5000   65.8500 1010.0    KANDAHAR AIRPORT                       40990')]

In [18]:
# to make the output easier to read, we can convert to a Pandas DataFrame
# but be careful not to load too much into Pandas (it all has to fit in one computer's memory) -- limit(10) keeps it small
pandas_df = df.limit(10).toPandas()
pandas_df   # this is all bunched up in one column though -- can we extract a column with only the station ID?

Unnamed: 0,value
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...
2,AE000041196 25.3330 55.5170 34.0 SHARJ...
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...
4,AEM00041217 24.4330 54.6510 26.8 ABU D...
5,AEM00041218 24.2620 55.6090 264.9 AL AI...
6,AF000040930 35.3170 69.0170 3366.0 NORTH...
7,AFM00040938 34.2100 62.2280 977.2 HERAT...
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...


#### Extract station ID using Pandas

In [24]:
# a pandas DataFrame can be modified in place, so the new column is simply assigned onto it
# every station ID is exactly 11 characters long, so keeping the first 11 characters of each line is enough
pandas_df["station"] = pandas_df["value"].str.slice(stop=11)
pandas_df

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217
5,AEM00041218 24.2620 55.6090 264.9 AL AI...,AEM00041218
6,AF000040930 35.3170 69.0170 3366.0 NORTH...,AF000040930
7,AFM00040938 34.2100 62.2280 977.2 HERAT...,AFM00040938
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...,AFM00040948
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...,AFM00040990


#### Extract station ID using Spark

In [25]:
# Spark DataFrames are immutable, so we can't just put an extra column in there

In [26]:
from pyspark.sql.functions import expr, col

In [27]:
# expr("SQL STUFF HERE") turns a SQL expression string into a Column
# NOTE: SQL substring positions are 1-based; Spark treats a start of 0 like 1, so this still selects the first 11 characters
expr("substring(value, 0, 11)")   # only a demo of how expr works

Column<'substring(value, 0, 11)'>

In [28]:
# df.withColumn(COLUMN NAME, EXPRESSION) gives back a NEW DataFrame with the extra column (df itself is not modified)
# substring positions in Spark SQL are 1-based, so start at 1 to take the first 11 characters (the station ID)
df2 = df.withColumn("station", expr("substring(value, 1, 11)"))

In [32]:
df2.limit(10).toPandas()    # same result as with Pandas

# Spark disadvantages: more work to get the same result as Pandas
# Spark advantages: can use many machines, can use the Spark optimizer (rearranges the query's execution plan to make it faster),
#     doesn't need the whole dataset in RAM at the same time (or even in the cumulative RAM of all machines)

Unnamed: 0,value,station
0,ACW00011604 17.1167 -61.7833 10.1 ST JO...,ACW00011604
1,ACW00011647 17.1333 -61.7833 19.2 ST JO...,ACW00011647
2,AE000041196 25.3330 55.5170 34.0 SHARJ...,AE000041196
3,AEM00041194 25.2550 55.3640 10.4 DUBAI...,AEM00041194
4,AEM00041217 24.4330 54.6510 26.8 ABU D...,AEM00041217
5,AEM00041218 24.2620 55.6090 264.9 AL AI...,AEM00041218
6,AF000040930 35.3170 69.0170 3366.0 NORTH...,AF000040930
7,AFM00040938 34.2100 62.2280 977.2 HERAT...,AFM00040938
8,AFM00040948 34.5660 69.2120 1791.3 KABUL...,AFM00040948
9,AFM00040990 31.5000 65.8500 1010.0 KANDA...,AFM00040990
