# NOTEBOOK 3.2 Spark DataFrames

# 1. Create Spark DataFrames

In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below: getOrCreate() returns the
# already-running session if there is one, otherwise it starts a new one.
spark = SparkSession.builder.getOrCreate()

25/06/12 14:16:53 WARN Utils: Your hostname, PC25. resolves to a loopback address: 127.0.1.1; using 192.168.76.195 instead (on interface eth0)
25/06/12 14:16:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/12 14:16:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 1.1 Create Spark DataFrames from a List of Row objects

In [2]:
from datetime import datetime, date
from pyspark.sql import Row

# Three sample records as Row objects; no schema is passed, so the column names
# are taken from the Row field names.
sample_rows = [
    Row(col_1=100, col_2=200., col_3='string_test_1', col_4=date(2023, 1, 1), col_5=datetime(2023, 1, 1, 12, 0)),
    Row(col_1=200, col_2=300., col_3='string_test_2', col_4=date(2023, 2, 1), col_5=datetime(2023, 1, 2, 12, 0)),
    Row(col_1=400, col_2=500., col_3='string_test_3', col_4=date(2023, 3, 1), col_5=datetime(2023, 1, 3, 12, 0)),
]
data_df = spark.createDataFrame(sample_rows)

## 1.2 Create Spark DataFrames with a Schema using DDL

In [3]:
# Same three records as before, but this time the column types are stated
# explicitly through a DDL-formatted schema string instead of being inferred.
typed_rows = [
    Row(col_1=100, col_2=200., col_3='string_test_1', col_4=date(2023, 1, 1), col_5=datetime(2023, 1, 1, 12, 0)),
    Row(col_1=200, col_2=300., col_3='string_test_2', col_4=date(2023, 2, 1), col_5=datetime(2023, 1, 2, 12, 0)),
    Row(col_1=400, col_2=500., col_3='string_test_3', col_4=date(2023, 3, 1), col_5=datetime(2023, 1, 3, 12, 0)),
]
ddl_schema = ' col_1 long, col_2 double, col_3 string, col_4 date, col_5 timestamp'
data_df = spark.createDataFrame(typed_rows, schema=ddl_schema)

## 1.3 Create Spark DataFrames from an RDD

In [4]:
# Plain tuples this time; several fields are None so that the data-cleaning
# cells later in the notebook have missing values to work with.
records = [
    (100, 200., 'string_test_1', date(2023, 1, 1), 5.5),
    (100, 200., 'string_test_1', date(2023, 1, 1), 10.2),
    (200, 300., 'string_test_2', date(2023, 2, 1), None),
    (100, 400., 'string_test_3', date(2023, 3, 1), 3.1),
    (300, 400., 'string_test_3', date(2023, 3, 1), 7.8),
    (300, 400., 'string_test_3', date(2023, 3, 1), 4.9),
    (200, 300., 'string_test_2', date(2023, 2, 1), 6.8),
    (200, 300., 'string_test_2', date(2023, 2, 1), 1.7),
    (400, 400., None, None, 8.8),
    (500, 500., None, date(2023, 2, 1), None),
]
column_names = ['col_1', 'col_2', 'col_3', 'col_4', 'col_5']

# Turn the tuples into an RDD, then build the DataFrame from it. Only column
# names are supplied, so the column types are inferred from the data.
rdd = spark.sparkContext.parallelize(records)
data_df = spark.createDataFrame(rdd, schema=column_names)

                                                                                

# 2. Viewing DataFrames

In [5]:
# Print the DataFrame contents as a text table.
data_df.show()

+-----+-----+-------------+----------+-----+
|col_1|col_2|        col_3|     col_4|col_5|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|
|  100|200.0|string_test_1|2023-01-01| 10.2|
|  200|300.0|string_test_2|2023-02-01| NULL|
|  100|400.0|string_test_3|2023-03-01|  3.1|
|  300|400.0|string_test_3|2023-03-01|  7.8|
|  300|400.0|string_test_3|2023-03-01|  4.9|
|  200|300.0|string_test_2|2023-02-01|  6.8|
|  200|300.0|string_test_2|2023-02-01|  1.7|
|  400|400.0|         NULL|      NULL|  8.8|
|  500|500.0|         NULL|2023-02-01| NULL|
+-----+-----+-------------+----------+-----+



In [6]:
# Limit the printed table to the first 2 rows.
data_df.show(2)

+-----+-----+-------------+----------+-----+
|col_1|col_2|        col_3|     col_4|col_5|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|
|  100|200.0|string_test_1|2023-01-01| 10.2|
+-----+-----+-------------+----------+-----+
only showing top 2 rows



In [7]:
# Print each column's name, type and nullability as a tree.
data_df.printSchema()

root
 |-- col_1: long (nullable = true)
 |-- col_2: double (nullable = true)
 |-- col_3: string (nullable = true)
 |-- col_4: date (nullable = true)
 |-- col_5: double (nullable = true)



In [8]:
# Print the first row vertically: one "column | value" line per field.
data_df.show(1, vertical=True)

-RECORD 0--------------
 col_1 | 100           
 col_2 | 200.0         
 col_3 | string_test_1 
 col_4 | 2023-01-01    
 col_5 | 5.5           
only showing top 1 row



In [9]:
# Column names as a Python list, in DataFrame order.
data_df.columns

['col_1', 'col_2', 'col_3', 'col_4', 'col_5']

In [10]:
# Summary statistics (count, mean, stddev, min, max) per column.
# Note that the date column col_4 does not appear in the result.
data_df.describe().show()

25/06/12 14:17:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+----------------+-------------+-----------------+
|summary|             col_1|           col_2|        col_3|            col_5|
+-------+------------------+----------------+-------------+-----------------+
|  count|                10|              10|            8|                8|
|   mean|             240.0|           340.0|         NULL|              6.1|
| stddev|134.98971154211057|96.6091783079296|         NULL|2.869544513382867|
|    min|               100|           200.0|string_test_1|              1.7|
|    max|               500|           500.0|string_test_3|             10.2|
+-------+------------------+----------------+-------------+-----------------+



In [11]:
# Total number of rows.
data_df.count()

10

In [12]:
# Number of distinct rows. It equals count() here, so no row is an exact duplicate of another.
data_df.distinct().count()

10

In [13]:
# Summary statistics restricted to a chosen subset of columns.
data_df.select('col_1', 'col_2', 'col_3').describe().show()

+-------+------------------+----------------+-------------+
|summary|             col_1|           col_2|        col_3|
+-------+------------------+----------------+-------------+
|  count|                10|              10|            8|
|   mean|             240.0|           340.0|         NULL|
| stddev|134.98971154211057|96.6091783079296|         NULL|
|    min|               100|           200.0|string_test_1|
|    max|               500|           500.0|string_test_3|
+-------+------------------+----------------+-------------+



In [14]:
# First row, returned as a one-element list of Row objects (not printed as a table).
data_df.take(1)

[Row(col_1=100, col_2=200.0, col_3='string_test_1', col_4=datetime.date(2023, 1, 1), col_5=5.5)]

In [15]:
# Last row, returned as a one-element list of Row objects.
data_df.tail(1)

[Row(col_1=500, col_2=500.0, col_3=None, col_4=datetime.date(2023, 2, 1), col_5=None)]

In [16]:
# head(1) gives the same result as take(1) above: a one-element list holding the first Row.
data_df.head(1)

[Row(col_1=100, col_2=200.0, col_3='string_test_1', col_4=datetime.date(2023, 1, 1), col_5=5.5)]

# 3. Manipulating Spark DataFrames

## 3.1 select(), withColumn(), withColumnRenamed()

In [17]:
from pyspark.sql import Column

# Select a single column, referenced as an attribute of the DataFrame.
data_df.select(data_df.col_3).show()

+-------------+
|        col_3|
+-------------+
|string_test_1|
|string_test_1|
|string_test_2|
|string_test_3|
|string_test_3|
|string_test_3|
|string_test_2|
|string_test_2|
|         NULL|
|         NULL|
+-------------+



In [18]:
from pyspark.sql import functions as F
# Add a constant-valued column; F.lit turns the Python literal "A" into a column expression.
# data_df is reassigned, so every later cell sees the extra col_6 column.
data_df = data_df.withColumn("col_6", F.lit("A"))
data_df.show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|        col_3|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  300|400.0|string_test_3|2023-03-01|  7.8|    A|
|  300|400.0|string_test_3|2023-03-01|  4.9|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
|  400|400.0|         NULL|      NULL|  8.8|    A|
|  500|500.0|         NULL|2023-02-01| NULL|    A|
+-----+-----+-------------+----------+-----+-----+



In [19]:
# Using an existing column name replaces that column with the derived value.
# The result is only displayed, not assigned, so data_df keeps the original col_2.
data_df.withColumn("col_2", F.col("col_2") / 100).show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|        col_3|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|  2.0|string_test_1|2023-01-01|  5.5|    A|
|  100|  2.0|string_test_1|2023-01-01| 10.2|    A|
|  200|  3.0|string_test_2|2023-02-01| NULL|    A|
|  100|  4.0|string_test_3|2023-03-01|  3.1|    A|
|  300|  4.0|string_test_3|2023-03-01|  7.8|    A|
|  300|  4.0|string_test_3|2023-03-01|  4.9|    A|
|  200|  3.0|string_test_2|2023-02-01|  6.8|    A|
|  200|  3.0|string_test_2|2023-02-01|  1.7|    A|
|  400|  4.0|         NULL|      NULL|  8.8|    A|
|  500|  5.0|         NULL|2023-02-01| NULL|    A|
+-----+-----+-------------+----------+-----+-----+



In [20]:
# Rename col_3 to string_col. data_df is reassigned, so later cells must use the new name.
data_df = data_df.withColumnRenamed("col_3", "string_col")
data_df.show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  300|400.0|string_test_3|2023-03-01|  7.8|    A|
|  300|400.0|string_test_3|2023-03-01|  4.9|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
|  400|400.0|         NULL|      NULL|  8.8|    A|
|  500|500.0|         NULL|2023-02-01| NULL|    A|
+-----+-----+-------------+----------+-----+-----+



In [21]:
# List the distinct values found in col_6.
data_df.select("col_6").distinct().show()

+-----+
|col_6|
+-----+
|    A|
+-----+



In [22]:
# Count the distinct values of col_6 and give the result column a readable name.
data_df.select(F.countDistinct("col_6").alias("Total_Unique")).show()

+------------+
|Total_Unique|
+------------+
|           1|
+------------+



## 3.2 filter()

In [23]:
# Keep only the rows where col_1 equals 100.
data_df.filter(data_df.col_1 == 100).show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
+-----+-----+-------------+----------+-----+-----+



In [24]:
# Combine conditions with & (AND). Each comparison needs its own parentheses
# because Python's & operator binds more tightly than ==.
data_df.filter((data_df.col_1 == 100) & (data_df.col_6 == 'A')).show()


+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
+-----+-----+-------------+----------+-----+-----+



In [25]:
# Combine conditions with | (OR): a row is kept if either comparison is true.
data_df.filter((data_df.col_1 == 100) | (data_df.col_2 == 300.00)).show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
+-----+-----+-------------+----------+-----+-----+



In [26]:
# Keep rows whose col_1 matches any of the listed values.
# The list has a descriptive name so that Python's built-in `list` type is not
# shadowed for the rest of the notebook session.
col_1_values = [100, 200]
data_df.filter(data_df.col_1.isin(col_1_values)).show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
+-----+-----+-------------+----------+-----+-----+



## 3.3 Casting columns to specific data types

In [27]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType

# Cast col_4 to a string and col_1 to an integer, then inspect both the
# resulting schema and the data.
data_df2 = (
    data_df
    .withColumn("col_4", col("col_4").cast(StringType()))
    .withColumn("col_1", col("col_1").cast(IntegerType()))
)
data_df2.printSchema()
data_df2.show()

root
 |-- col_1: integer (nullable = true)
 |-- col_2: double (nullable = true)
 |-- string_col: string (nullable = true)
 |-- col_4: string (nullable = true)
 |-- col_5: double (nullable = true)
 |-- col_6: string (nullable = false)

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  300|400.0|string_test_3|2023-03-01|  7.8|    A|
|  300|400.0|string_test_3|2023-03-01|  4.9|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
|  400|400.0|         NULL|      NULL|  8.8|    A|
|  500|500.0|         NULL|2023-02-01| NULL|    A|
+-----+-----+-------------+----------+-----+-----+



In [28]:
# Casting with SQL expressions: col_4 back to date and col_1 back to long.
# Only the columns named in the expressions are present in the result.
cast_exprs = [
    "cast(col_4 as date) col_4",
    "cast(col_1 as long) col_1",
]
data_df3 = data_df2.selectExpr(*cast_exprs)
data_df3.printSchema()

root
 |-- col_4: date (nullable = true)
 |-- col_1: long (nullable = true)



## 3.4 Data Cleaning

In [29]:
# Starting point for cleaning: NULLs are present in string_col, col_4 and col_5.
data_df.show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  300|400.0|string_test_3|2023-03-01|  7.8|    A|
|  300|400.0|string_test_3|2023-03-01|  4.9|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
|  400|400.0|         NULL|      NULL|  8.8|    A|
|  500|500.0|         NULL|2023-02-01| NULL|    A|
+-----+-----+-------------+----------+-----+-----+



In [30]:
# Fill NULLs with a string value. Only the string column is affected here;
# the NULLs in the date column col_4 and the numeric column col_5 remain.
# The result is displayed but not assigned, so data_df itself is unchanged.
data_df.na.fill('EMPTY').show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01| NULL|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  300|400.0|string_test_3|2023-03-01|  7.8|    A|
|  300|400.0|string_test_3|2023-03-01|  4.9|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
|  400|400.0|        EMPTY|      NULL|  8.8|    A|
|  500|500.0|        EMPTY|2023-02-01| NULL|    A|
+-----+-----+-------------+----------+-----+-----+



In [31]:
# data_df still contains NULLs rather than 'EMPTY' (the fill in the previous
# cell was only displayed, never assigned), so replace() on data_df alone would
# have nothing to match. Fill first, then swap the placeholder value.
# The expression yields a new DataFrame; append .show() to print its rows.
data_df.na.fill('EMPTY').na.replace('EMPTY', 'Blank')

DataFrame[col_1: bigint, col_2: double, string_col: string, col_4: date, col_5: double, col_6: string]

In [32]:
# Per-column replacement values: a dict maps each column name to the value
# that should stand in for its NULLs. Columns not listed keep their NULLs.
data_df.fillna({'string_col': 'unknown', 'col_5': 0.0}).show()

+-----+-----+-------------+----------+-----+-----+
|col_1|col_2|   string_col|     col_4|col_5|col_6|
+-----+-----+-------------+----------+-----+-----+
|  100|200.0|string_test_1|2023-01-01|  5.5|    A|
|  100|200.0|string_test_1|2023-01-01| 10.2|    A|
|  200|300.0|string_test_2|2023-02-01|  0.0|    A|
|  100|400.0|string_test_3|2023-03-01|  3.1|    A|
|  300|400.0|string_test_3|2023-03-01|  7.8|    A|
|  300|400.0|string_test_3|2023-03-01|  4.9|    A|
|  200|300.0|string_test_2|2023-02-01|  6.8|    A|
|  200|300.0|string_test_2|2023-02-01|  1.7|    A|
|  400|400.0|      unknown|      NULL|  8.8|    A|
|  500|500.0|      unknown|2023-02-01|  0.0|    A|
+-----+-----+-------------+----------+-----+-----+



In [33]:
# Remove col_5. data_df is reassigned, so the column is gone for any later cell.
data_df = data_df.drop("col_5")
data_df.show()

+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
|  100|200.0|string_test_1|2023-01-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
|  100|400.0|string_test_3|2023-03-01|    A|
|  300|400.0|string_test_3|2023-03-01|    A|
|  300|400.0|string_test_3|2023-03-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
|  400|400.0|         NULL|      NULL|    A|
|  500|500.0|         NULL|2023-02-01|    A|
+-----+-----+-------------+----------+-----+



In [34]:
# Small employee data set used for the cleaning and aggregation examples.
# It deliberately contains missing departments and one repeated record.
salary_data = [
    ("John", "Field-eng", 3500),
    ("Michael", "Field-eng", 4500),
    ("Robert", None, 4000),
    ("Maria", "Finance", 3500),
    ("John", "Sales", 3000),
    ("Kelly", "Finance", 3500),
    ("Kate", "Finance", 3000),
    ("Martin", None, 3500),
    ("Kiran", "Sales", 2200),
    ("Michael", "Field-eng", 4500),
]
columns = ["Employee", "Department", "Salary"]

# Only column names are given; the column types are inferred from the tuples.
salary_df = spark.createDataFrame(data=salary_data, schema=columns)

salary_df.printSchema()
salary_df.show()

root
 |-- Employee: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|    John| Field-eng|  3500|
| Michael| Field-eng|  4500|
|  Robert|      NULL|  4000|
|   Maria|   Finance|  3500|
|    John|     Sales|  3000|
|   Kelly|   Finance|  3500|
|    Kate|   Finance|  3000|
|  Martin|      NULL|  3500|
|   Kiran|     Sales|  2200|
| Michael| Field-eng|  4500|
+--------+----------+------+



In [35]:
# Remove rows that contain a NULL (here, the two employees without a Department).
# The result is only displayed; salary_df is unchanged.
salary_df.dropna().show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|    John| Field-eng|  3500|
| Michael| Field-eng|  4500|
|   Maria|   Finance|  3500|
|    John|     Sales|  3000|
|   Kelly|   Finance|  3500|
|    Kate|   Finance|  3000|
|   Kiran|     Sales|  2200|
| Michael| Field-eng|  4500|
+--------+----------+------+



In [36]:
# Remove exact duplicate rows. show() returns None, so the de-duplicated
# DataFrame is stored first and displayed in a separate step; otherwise
# new_salary_df would be None.
new_salary_df = salary_df.dropDuplicates()
new_salary_df.show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|    John| Field-eng|  3500|
| Michael| Field-eng|  4500|
|   Maria|   Finance|  3500|
|    John|     Sales|  3000|
|   Kelly|   Finance|  3500|
|    Kate|   Finance|  3000|
|   Kiran|     Sales|  2200|
|  Robert|      NULL|  4000|
|  Martin|      NULL|  3500|
+--------+----------+------+



# 4. Using Aggregates in a DataFrame

## 4.1 avg(), agg(), countDistinct(), sum, max

In [37]:
from pyspark.sql.functions import countDistinct, avg
# Average of the Salary column across all rows.
salary_df.select(avg('Salary')).show()


+-----------+
|avg(Salary)|
+-----------+
|     3520.0|
+-----------+



In [38]:
# agg() with a {column: function-name} dict: count the Salary values.
salary_df.agg({'Salary':'count'}).show()

+-------------+
|count(Salary)|
+-------------+
|           10|
+-------------+



In [39]:
# Number of different Salary values, shown under a custom column name.
salary_df.select(countDistinct("Salary").alias("Distinct Salary")).show()

+---------------+
|Distinct Salary|
+---------------+
|              5|
+---------------+



In [40]:
# Highest Salary, using the dict form of agg().
salary_df.agg({'Salary':'max'}).show()

+-----------+
|max(Salary)|
+-----------+
|       4500|
+-----------+



In [41]:
# Total of all Salary values, using the dict form of agg().
salary_df.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      35200|
+-----------+



## 4.2 groupBy()

In [42]:
# groupby() on its own yields a GroupedData object, not a DataFrame;
# an aggregation has to follow before there is anything to show.
salary_df.groupby('Department')

GroupedData[grouping expressions: [Department], value: [Employee: string, Department: string ... 1 more field], type: GroupBy]

In [43]:
# Average per department. avg() is called without column names; Salary is the
# only column averaged in the result. Rows with a NULL Department form their own group.
salary_df.groupby('Department').avg().show()

+----------+------------------+
|Department|       avg(Salary)|
+----------+------------------+
| Field-eng| 4166.666666666667|
|      NULL|            3750.0|
|   Finance|3333.3333333333335|
|     Sales|            2600.0|
+----------+------------------+



## 4.3 orderBy()

In [44]:
# Sort by Salary, lowest first.
salary_df.orderBy("Salary").show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|   Kiran|     Sales|  2200|
|    Kate|   Finance|  3000|
|    John|     Sales|  3000|
|    John| Field-eng|  3500|
|  Martin|      NULL|  3500|
|   Kelly|   Finance|  3500|
|   Maria|   Finance|  3500|
|  Robert|      NULL|  4000|
| Michael| Field-eng|  4500|
| Michael| Field-eng|  4500|
+--------+----------+------+



In [45]:
# Sort by Salary, highest first, using the column's desc() ordering.
salary_df.orderBy(salary_df["Salary"].desc()).show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
| Michael| Field-eng|  4500|
| Michael| Field-eng|  4500|
|  Robert|      NULL|  4000|
|   Kelly|   Finance|  3500|
|   Maria|   Finance|  3500|
|  Martin|      NULL|  3500|
|    John| Field-eng|  3500|
|    John|     Sales|  3000|
|    Kate|   Finance|  3000|
|   Kiran|     Sales|  2200|
+--------+----------+------+



In [46]:
from pyspark.sql import functions as F

# Total salary per department, sorted by department name (the NULL group sorts first).
# Spark's round() and sum() are reached through the F alias: importing `round`
# by name would shadow Python's built-in round() for the rest of the notebook.
# The aggregate is rounded and named in one step instead of being patched up
# afterwards with withColumn() + withColumnRenamed().
(
    salary_df
    .groupBy('Department')
    .agg(F.round(F.sum('Salary'), 2).alias('Salary'))
    .orderBy('Department')
    .show()
)

+----------+------+
|Department|Salary|
+----------+------+
|      NULL|  7500|
| Field-eng| 12500|
|   Finance| 10000|
|     Sales|  5200|
+----------+------+



# 5. Reading into a DataFrame with a Programmatically-Defined Schema

In [47]:
# Import only the type classes this cell needs; a star import would pull every
# name from pyspark.sql.types into the notebook namespace.
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

filepath = 'data/salary_data.csv'

# Explicit schema: one StructField(name, type, nullable) per CSV column, so the
# column types are fixed up front rather than inferred from the file.
schema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Employee", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Salary", DoubleType(), True),
])

# The first line of the file is a header row. Note that salary_df is rebound
# here to the data loaded from the CSV file.
salary_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load(filepath)
)
salary_df.show()

+---+--------+----------+------+
| ID|Employee|Department|Salary|
+---+--------+----------+------+
|  1|    John| Field-eng|3500.0|
|  2|  Robert|     Sales|4000.0|
|  3|   Maria|   Finance|3500.0|
|  4| Michael|     Sales|3000.0|
|  5|   Kelly|   Finance|3500.0|
|  6|    Kate|   Finance|3000.0|
|  7|  Martin|   Finance|3500.0|
|  8|   Kiran|     Sales|2200.0|
+---+--------+----------+------+



In [48]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()