# NOTEBOOK 3.2(b) Spark DataFrames - Advanced Operations

## 0. Preparations

Before starting this practical, ensure that the json data file named **nyt2.json** exists in the HDFS directory named **data**.

In [1]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date, timedelta, datetime
import time

## 1. Initialize SparkSession

https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html?highlight=sparksession#pyspark.sql.SparkSession

In [2]:
spark = SparkSession.builder.appName("PysparkExample")\
.config('spark.executor.instances', '2')\
.config('spark.executor.memory', '1g')\
.config('spark.executor.cores', '4')\
.config ("spark.sql.shuffle.partitions", "50")\
.config("spark.driver.maxResultSize","5g")\
.config("spark.sql.execution.arrow.pyspark.enabled", "true")\
.getOrCreate()

25/06/12 14:15:42 WARN Utils: Your hostname, PC25. resolves to a loopback address: 127.0.1.1; using 192.168.76.195 instead (on interface eth0)
25/06/12 14:15:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/12 14:15:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sc = spark.sparkContext
conf = sc.getConf()

# Get all Spark parameter values as a list of key-value pairs
all_configs = conf.getAll()

# Print them
for key, value in all_configs:
    print(f"{key}: {value}")

spark.executor.cores: 4
spark.rdd.compress: True
spark.executor.memory: 1g
spark.app.id: local-1749708944122
spark.app.submitTime: 1749708943412
spark.master: local[*]
spark.sql.execution.arrow.pyspark.enabled: true
spark.driver.host: 192.168.76.195
spark.driver.maxResultSize: 5g
spark.driver.port: 39845
spark.driver.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java

In [4]:
conf_dict = dict(sc.getConf().getAll())

print(f"Effective spark.executor.instances: {conf_dict.get('spark.executor.instances')}")
print(f"Effective spark.executor.memory: {conf_dict.get('spark.executor.memory')}")
print(f"Effective spark.executor.cores: {conf_dict.get('spark.executor.cores')}")

Effective spark.executor.instances: 2
Effective spark.executor.memory: 1g
Effective spark.executor.cores: 4


## 2. Spark DataFrames


### 2.1 Create Spark DataFrame

In [5]:
# Create Spark dataframe from the json data file
df = spark.read.json("data/nyt2.json")
df.show(2)

                                                                                

+--------------------+--------------------+---------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+---------+-------------+
|                 _id|  amazon_product_url|         author| bestsellers_date|         description|        price|   published_date|    publisher|rank|rank_last_week|    title|weeks_on_list|
+--------------------+--------------------+---------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+---------+-------------+
|{5b4aa4ead3089013...|http://www.amazon...|  Dean R Koontz|{{1211587200000}}|Odd Thomas, who c...|   {NULL, 27}|{{1212883200000}}|       Bantam| {1}|           {0}|ODD HOURS|          {1}|
|{5b4aa4ead3089013...|http://www.amazon...|Stephenie Meyer|{{1211587200000}}|Aliens have taken...|{25.99, NULL}|{{1212883200000}}|Little, Brown| {2}|           {1}| THE HOST|          {3}|
+--------------------+--------------------+------------

### 2.2 Inspecting the DataFrame

In [6]:
# Check the type of df
type(df)

pyspark.sql.dataframe.DataFrame

In [7]:
# Return the dataframe column names and data types
print("DataFrame column names and data types")
df.dtypes

DataFrame column names and data types


[('_id', 'struct<$oid:string>'),
 ('amazon_product_url', 'string'),
 ('author', 'string'),
 ('bestsellers_date', 'struct<$date:struct<$numberLong:string>>'),
 ('description', 'string'),
 ('price', 'struct<$numberDouble:string,$numberInt:string>'),
 ('published_date', 'struct<$date:struct<$numberLong:string>>'),
 ('publisher', 'string'),
 ('rank', 'struct<$numberInt:string>'),
 ('rank_last_week', 'struct<$numberInt:string>'),
 ('title', 'string'),
 ('weeks_on_list', 'struct<$numberInt:string>')]

In [8]:
# Compute basic statistics for the dataframe columns.
print("\nSummary statistics")
df.describe().show()


Summary statistics


25/06/12 14:15:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+--------------------+---------------+--------------------+---------+------------------+
|summary|  amazon_product_url|         author|         description|publisher|             title|
+-------+--------------------+---------------+--------------------+---------+------------------+
|  count|               10195|          10195|               10195|    10195|             10195|
|   mean|                NULL|           NULL|                NULL|     NULL|1877.7142857142858|
| stddev|                NULL|           NULL|                NULL|     NULL| 370.9760613506458|
|    min|http://www.amazon...|        AJ Finn|                    |      ACE|  10TH ANNIVERSARY|
|    max|https://www.amazo...|various authors|’Tis for the Rebe...|allantine|               ZOO|
+-------+--------------------+---------------+--------------------+---------+------------------+



In [9]:
# Returns columns of dataframe
print("\nDataFrame columns")
df.columns


DataFrame columns


['_id',
 'amazon_product_url',
 'author',
 'bestsellers_date',
 'description',
 'price',
 'published_date',
 'publisher',
 'rank',
 'rank_last_week',
 'title',
 'weeks_on_list']

In [10]:
# Counts the number of rows in dataframe
print("\nNumber of rows in the dataframe")
df.count()


Number of rows in the dataframe


10195

In [11]:
# Counts the number of distinct rows in dataframe
print("\nNumber of distinct rows in the dataframe")
df.distinct().count()


Number of distinct rows in the dataframe


10195

In [12]:
# Prints plans including physical and logical
print("\nPhysical and logical dataframe plans")
df.explain()


Physical and logical dataframe plans
== Physical Plan ==
FileScan json [_id#8,amazon_product_url#9,author#10,bestsellers_date#11,description#12,price#13,published_date#14,publisher#15,rank#16,rank_last_week#17,title#18,weeks_on_list#19] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[hdfs://localhost:9000/user/student/data/nyt2.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_id:struct<$oid:string>,amazon_product_url:string,author:string,bestsellers_date:struct<$d...




## 3. Advanced Spark DataFrame Operations
https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html

### 3.1 when() Operation

In [13]:
# Show title and assign 0 or 1 depending on title
df.select("title", when(df.title != 'ODD HOURS', 1).otherwise(0)).show(10)

+--------------------+-----------------------------------------------------+
|               title|CASE WHEN (NOT (title = ODD HOURS)) THEN 1 ELSE 0 END|
+--------------------+-----------------------------------------------------+
|           ODD HOURS|                                                    0|
|            THE HOST|                                                    1|
|LOVE THE ONE YOU'...|                                                    1|
|           THE FRONT|                                                    1|
|               SNUFF|                                                    1|
|SUNDAYS AT TIFFANY’S|                                                    1|
|        PHANTOM PREY|                                                    1|
|          SWINE NOT?|                                                    1|
|     CARELESS IN RED|                                                    1|
|     THE WHOLE TRUTH|                                                    1|

### 3.2 isin Operation

In [14]:
df[df.author.isin("John Sandford", "Emily Giffin")].show(3)

+--------------------+--------------------+-------------+-----------------+--------------------+-------------+-----------------+------------+----+--------------+--------------------+-------------+
|                 _id|  amazon_product_url|       author| bestsellers_date|         description|        price|   published_date|   publisher|rank|rank_last_week|               title|weeks_on_list|
+--------------------+--------------------+-------------+-----------------+--------------------+-------------+-----------------+------------+----+--------------+--------------------+-------------+
|{5b4aa4ead3089013...|http://www.amazon...| Emily Giffin|{{1211587200000}}|A woman's happy m...|{24.95, NULL}|{{1212883200000}}|St. Martin's| {3}|           {2}|LOVE THE ONE YOU'...|          {2}|
|{5b4aa4ead3089013...|http://www.amazon...|John Sandford|{{1211587200000}}|The Minneapolis d...|{26.95, NULL}|{{1212883200000}}|      Putnam| {7}|           {4}|        PHANTOM PREY|          {3}|
|{5b4aa4ead3089

### 3.3 like Operation
The % character is used to filter out all titles containing the word " THE " word.
To match an entire string (i.e., an exact match), omit the % character.

In [15]:
# Show author and title, and whether the title contains the word " THE "
df.select("author", "title", df.title.like("% THE %")).show(15)

+--------------------+--------------------+------------------+
|              author|               title|title LIKE % THE %|
+--------------------+--------------------+------------------+
|       Dean R Koontz|           ODD HOURS|             false|
|     Stephenie Meyer|            THE HOST|             false|
|        Emily Giffin|LOVE THE ONE YOU'...|              true|
|   Patricia Cornwell|           THE FRONT|             false|
|     Chuck Palahniuk|               SNUFF|             false|
|James Patterson a...|SUNDAYS AT TIFFANY’S|             false|
|       John Sandford|        PHANTOM PREY|             false|
|       Jimmy Buffett|          SWINE NOT?|             false|
|    Elizabeth George|     CARELESS IN RED|             false|
|      David Baldacci|     THE WHOLE TRUTH|             false|
|        Troy Denning|          INVINCIBLE|             false|
|          James Frey|BRIGHT SHINY MORNING|             false|
|         Garth Stein|THE ART OF RACING...|            

### 3.4 startswith and endswith Operations

In [16]:
df.select("author", "title", df.title.startswith("THE")).show(5, truncate=False)
df.select("author", "title", df.title.endswith("NT")).show(5, truncate=False)

+-----------------+------------------------+----------------------+
|author           |title                   |startswith(title, THE)|
+-----------------+------------------------+----------------------+
|Dean R Koontz    |ODD HOURS               |false                 |
|Stephenie Meyer  |THE HOST                |true                  |
|Emily Giffin     |LOVE THE ONE YOU'RE WITH|false                 |
|Patricia Cornwell|THE FRONT               |true                  |
|Chuck Palahniuk  |SNUFF                   |false                 |
+-----------------+------------------------+----------------------+
only showing top 5 rows

+-----------------+------------------------+-------------------+
|author           |title                   |endswith(title, NT)|
+-----------------+------------------------+-------------------+
|Dean R Koontz    |ODD HOURS               |false              |
|Stephenie Meyer  |THE HOST                |false              |
|Emily Giffin     |LOVE THE ONE YOU'RE

### 3.5 substring Operation

In [17]:
df.select(df.author.substr(1, 3).alias("title")).show(5)
df.select(df.author.substr(3, 6).alias("title")).show(5)
df.select(df.author.substr(1, 6).alias("title")).show(5)

+-----+
|title|
+-----+
|  Dea|
|  Ste|
|  Emi|
|  Pat|
|  Chu|
+-----+
only showing top 5 rows

+------+
| title|
+------+
|an R K|
|epheni|
|ily Gi|
|tricia|
|uck Pa|
+------+
only showing top 5 rows

+------+
| title|
+------+
|Dean R|
|Stephe|
|Emily |
|Patric|
|Chuck |
+------+
only showing top 5 rows



## 4. Join Operations
https://pedropark99.github.io/Introd-pyspark/Chapters/08-transforming2.html

In [18]:
# Create sample data

artists = [
    ('Mick', 'Rolling Stones', '1943-07-26', True),
    ('John', 'Beatles', '1940-09-10', True),
    ('Paul', 'Beatles', '1942-06-18', True),
    ('George', 'Beatles', '1943-02-25', True),
    ('Ringo', 'Beatles', '1940-07-07', True)
]

artists_df = spark.createDataFrame(
    artists,
    ['name', 'band', 'born', 'children']
)

instruments = [
    ('John', 'guitar'),
    ('Paul', 'bass'),
    ('Keith', 'guitar')
]

instruments_df = spark.createDataFrame(
    instruments,
    ['name', 'plays']
)

print("Artist's info:")
artists_df.show()
print("Instruments:")
instruments_df.show()

Artist's info:
+------+--------------+----------+--------+
|  name|          band|      born|children|
+------+--------------+----------+--------+
|  Mick|Rolling Stones|1943-07-26|    true|
|  John|       Beatles|1940-09-10|    true|
|  Paul|       Beatles|1942-06-18|    true|
|George|       Beatles|1943-02-25|    true|
| Ringo|       Beatles|1940-07-07|    true|
+------+--------------+----------+--------+

Instruments:
+-----+------+
| name| plays|
+-----+------+
| John|guitar|
| Paul|  bass|
|Keith|guitar|
+-----+------+



### 4.1 Inner join (the default)

In [19]:
artists_df.join(instruments_df, on = 'name', how = 'inner').show()

+----+-------+----------+--------+------+
|name|   band|      born|children| plays|
+----+-------+----------+--------+------+
|John|Beatles|1940-09-10|    true|guitar|
|Paul|Beatles|1942-06-18|    true|  bass|
+----+-------+----------+--------+------+



### 4.2 Left anti join

In [20]:
artists_df.join(instruments_df, on = 'name', how = 'leftanti').show()

+------+--------------+----------+--------+
|  name|          band|      born|children|
+------+--------------+----------+--------+
|  Mick|Rolling Stones|1943-07-26|    true|
|George|       Beatles|1943-02-25|    true|
| Ringo|       Beatles|1940-07-07|    true|
+------+--------------+----------+--------+



### 4.3 Left outer join

In [21]:
artists_df.join(instruments_df, on = 'name', how = 'leftouter').show()

+------+--------------+----------+--------+------+
|  name|          band|      born|children| plays|
+------+--------------+----------+--------+------+
|  Mick|Rolling Stones|1943-07-26|    true|  NULL|
|  John|       Beatles|1940-09-10|    true|guitar|
|  Paul|       Beatles|1942-06-18|    true|  bass|
|George|       Beatles|1943-02-25|    true|  NULL|
| Ringo|       Beatles|1940-07-07|    true|  NULL|
+------+--------------+----------+--------+------+



### 4.4 Right outer join

In [22]:
artists_df.join(instruments_df, on = 'name', how = 'rightouter').show()

+-----+-------+----------+--------+------+
| name|   band|      born|children| plays|
+-----+-------+----------+--------+------+
| John|Beatles|1940-09-10|    true|guitar|
| Paul|Beatles|1942-06-18|    true|  bass|
|Keith|   NULL|      NULL|    NULL|guitar|
+-----+-------+----------+--------+------+



### 4.5 Full outer join

In [23]:
artists_df.join(instruments_df, on = 'name', how = 'fullouter').show()

+------+--------------+----------+--------+------+
|  name|          band|      born|children| plays|
+------+--------------+----------+--------+------+
|George|       Beatles|1943-02-25|    true|  NULL|
|  John|       Beatles|1940-09-10|    true|guitar|
| Keith|          NULL|      NULL|    NULL|guitar|
|  Mick|Rolling Stones|1943-07-26|    true|  NULL|
|  Paul|       Beatles|1942-06-18|    true|  bass|
| Ringo|       Beatles|1940-07-07|    true|  NULL|
+------+--------------+----------+--------+------+



### 4.6 Left Semi Join

In [24]:
artists_df.join(instruments_df, on = 'name', how = 'leftsemi').show()

+----+-------+----------+--------+
|name|   band|      born|children|
+----+-------+----------+--------+
|John|Beatles|1940-09-10|    true|
|Paul|Beatles|1942-06-18|    true|
+----+-------+----------+--------+



### 4.7 Cross Join

In [25]:
artists_df.crossJoin(instruments_df).show()



+------+--------------+----------+--------+-----+------+
|  name|          band|      born|children| name| plays|
+------+--------------+----------+--------+-----+------+
|  Mick|Rolling Stones|1943-07-26|    true| John|guitar|
|  Mick|Rolling Stones|1943-07-26|    true| Paul|  bass|
|  Mick|Rolling Stones|1943-07-26|    true|Keith|guitar|
|  John|       Beatles|1940-09-10|    true| John|guitar|
|  John|       Beatles|1940-09-10|    true| Paul|  bass|
|  John|       Beatles|1940-09-10|    true|Keith|guitar|
|  Paul|       Beatles|1942-06-18|    true| John|guitar|
|  Paul|       Beatles|1942-06-18|    true| Paul|  bass|
|  Paul|       Beatles|1942-06-18|    true|Keith|guitar|
|George|       Beatles|1943-02-25|    true| John|guitar|
|George|       Beatles|1943-02-25|    true| Paul|  bass|
|George|       Beatles|1943-02-25|    true|Keith|guitar|
| Ringo|       Beatles|1940-07-07|    true| John|guitar|
| Ringo|       Beatles|1940-07-07|    true| Paul|  bass|
| Ringo|       Beatles|1940-07-

                                                                                

In [26]:
spark.stop()