Here, we will solve the problem in two ways:
1. First, using PySpark DataFrame functions
2. Second, using Spark SQL

In [1]:
# First, load all the required libraries (the Spark session is started in the next cell)
# Load all the required libraries
from pyspark.sql import SparkSession

In [2]:
# Start the Spark session for this notebook (getOrCreate reuses an existing one)
spark = SparkSession.builder.appName("problem1").getOrCreate()
# The SparkSession is itself the entry point for Spark SQL. The SparkSession
# constructor expects a SparkContext, so wrapping `spark` in a second
# SparkSession is incorrect; keep `sqlContext` as an alias so that later
# cells calling sqlContext.sql(...) continue to work.
sqlContext = spark
# Only log errors, not warnings
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/03 10:13:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [17]:
# Read employee.csv into a DataFrame.
# The first line of the file is used as the header, and column types are
# inferred from the data rather than all being read as strings.
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", "true")
csv_reader = csv_reader.option("inferSchema", "true")
employeedf = csv_reader.load("employee.csv")

In [18]:
# Print the schema to check the column names and the types inferred from the CSV
employeedf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department_id: integer (nullable = true)



In [19]:
# Preview a sample of rows to sanity-check the loaded data
employeedf.show()

+---+----------+---------+------+-------------+
| id|first_name|last_name|salary|department_id|
+---+----------+---------+------+-------------+
|  1|      Todd|   Wilson|110000|         1006|
|  1|      Todd|   Wilson|106119|         1006|
|  2|    Justin|    Simon|128922|         1005|
|  2|    Justin|    Simon|130000|         1005|
|  3|     Kelly|  Rosario| 42689|         1002|
|  4|  Patricia|   Powell|162825|         1004|
|  4|  Patricia|   Powell|170000|         1004|
|  5|    Sherry|   Golden| 44101|         1002|
|  6|   Natasha|  Swanson| 79632|         1005|
|  6|   Natasha|  Swanson| 90000|         1005|
|  7|     Diane|   Gordon| 74591|         1002|
|  8|  Mercedes|Rodriguez| 61048|         1005|
|  9|   Christy| Mitchell|137236|         1001|
|  9|   Christy| Mitchell|140000|         1001|
|  9|   Christy| Mitchell|150000|         1001|
| 10|      Sean| Crawford|182065|         1006|
| 10|      Sean| Crawford|190000|         1006|
| 11|     Kevin| Townsend|166861|       

In [20]:
# Count the total number of rows in the DataFrame
employeedf.count()

95

In [28]:
# Solving the problem with the PySpark DataFrame API
# Goal:
#   1. Print the latest salary of each employee
#   2. Include id, first name, last name, department id and latest salary
#   3. Order the result by id
#
# Step one: project just the columns of interest and sort by id,
# without aggregating yet.
display_columns = ["id", "first_name", "last_name", "department_id", "salary"]
employeedf.select(*display_columns).orderBy("id").show()


+---+----------+---------+-------------+------+
| id|first_name|last_name|department_id|salary|
+---+----------+---------+-------------+------+
|  1|      Todd|   Wilson|         1006|110000|
|  1|      Todd|   Wilson|         1006|106119|
|  2|    Justin|    Simon|         1005|128922|
|  2|    Justin|    Simon|         1005|130000|
|  3|     Kelly|  Rosario|         1002| 42689|
|  4|  Patricia|   Powell|         1004|170000|
|  4|  Patricia|   Powell|         1004|162825|
|  5|    Sherry|   Golden|         1002| 44101|
|  6|   Natasha|  Swanson|         1005| 79632|
|  6|   Natasha|  Swanson|         1005| 90000|
|  7|     Diane|   Gordon|         1002| 74591|
|  8|  Mercedes|Rodriguez|         1005| 61048|
|  9|   Christy| Mitchell|         1001|140000|
|  9|   Christy| Mitchell|         1001|150000|
|  9|   Christy| Mitchell|         1001|137236|
| 10|      Sean| Crawford|         1006|182065|
| 10|      Sean| Crawford|         1006|190000|
| 11|     Kevin| Townsend|         1002|

In [33]:
# Group the rows of each employee and keep the maximum salary per group,
# sorted by id. The result is stored in a DataFrame so it can be reused.
# NOTE(review): the schema has no date column, so max(salary) is used as a
# stand-in for the "latest" salary — confirm salaries never decrease.
employee_keys = ["id", "first_name", "last_name", "department_id"]
finaldf = employeedf.groupBy(*employee_keys).max("salary").orderBy("id")
# Show up to 100 rows so the full result is printed
finaldf.show(n=100)

+---+----------+---------+-------------+-----------+
| id|first_name|last_name|department_id|max(salary)|
+---+----------+---------+-------------+-----------+
|  1|      Todd|   Wilson|         1006|     110000|
|  2|    Justin|    Simon|         1005|     130000|
|  3|     Kelly|  Rosario|         1002|      42689|
|  4|  Patricia|   Powell|         1004|     170000|
|  5|    Sherry|   Golden|         1002|      44101|
|  6|   Natasha|  Swanson|         1005|      90000|
|  7|     Diane|   Gordon|         1002|      74591|
|  8|  Mercedes|Rodriguez|         1005|      61048|
|  9|   Christy| Mitchell|         1001|     150000|
| 10|      Sean| Crawford|         1006|     190000|
| 11|     Kevin| Townsend|         1002|     166861|
| 12|    Joshua|  Johnson|         1004|     123082|
| 13|     Julie|  Sanchez|         1001|     210000|
| 14|      John|  Coleman|         1001|     152434|
| 15|   Anthony|   Valdez|         1001|      96898|
| 16|    Briana|    Rivas|         1005|     1

In [34]:
# Row count of the aggregated result: one row per
# (id, first_name, last_name, department_id) group
finaldf.count()

75

In [35]:
# Now solve the same problem using Spark SQL.
# Register the DataFrame as a temporary view so it can be queried by name in SQL.
# Note: this is a session-scoped temp view, not a persistent Hive table.
employeedf.createOrReplaceTempView("tmpEmployee")

In [36]:
# The temp view can now be queried with plain SQL.
# Example: a simple SELECT over the whole view.
all_rows_df = sqlContext.sql("SELECT * FROM tmpEmployee")
all_rows_df.show()

+---+----------+---------+------+-------------+
| id|first_name|last_name|salary|department_id|
+---+----------+---------+------+-------------+
|  1|      Todd|   Wilson|110000|         1006|
|  1|      Todd|   Wilson|106119|         1006|
|  2|    Justin|    Simon|128922|         1005|
|  2|    Justin|    Simon|130000|         1005|
|  3|     Kelly|  Rosario| 42689|         1002|
|  4|  Patricia|   Powell|162825|         1004|
|  4|  Patricia|   Powell|170000|         1004|
|  5|    Sherry|   Golden| 44101|         1002|
|  6|   Natasha|  Swanson| 79632|         1005|
|  6|   Natasha|  Swanson| 90000|         1005|
|  7|     Diane|   Gordon| 74591|         1002|
|  8|  Mercedes|Rodriguez| 61048|         1005|
|  9|   Christy| Mitchell|137236|         1001|
|  9|   Christy| Mitchell|140000|         1001|
|  9|   Christy| Mitchell|150000|         1001|
| 10|      Sean| Crawford|182065|         1006|
| 10|      Sean| Crawford|190000|         1006|
| 11|     Kevin| Townsend|166861|       

In [38]:
# Same aggregation as the DataFrame version, expressed in Spark SQL:
# GROUP BY collapses the rows of each employee, MAX(salary) picks the value
# reported as the latest salary, and ORDER BY id sorts the result.
# NOTE(review): MAX(salary) equals the latest salary only if salaries never
# decrease — confirm against the data source.
# A triple-quoted string is used instead of backslash continuations inside
# the literal, which break as soon as whitespace follows a backslash.
latest_salary_query = """
    SELECT id, first_name, last_name, MAX(salary) AS LatestSalary, department_id
    FROM tmpEmployee
    GROUP BY id, first_name, last_name, department_id
    ORDER BY id
"""
# Show up to 100 rows so the full result is printed
sqlContext.sql(latest_salary_query).show(n=100)

+---+----------+---------+-----------+-------------+
| id|first_name|last_name|LatesSalary|department_id|
+---+----------+---------+-----------+-------------+
|  1|      Todd|   Wilson|     110000|         1006|
|  2|    Justin|    Simon|     130000|         1005|
|  3|     Kelly|  Rosario|      42689|         1002|
|  4|  Patricia|   Powell|     170000|         1004|
|  5|    Sherry|   Golden|      44101|         1002|
|  6|   Natasha|  Swanson|      90000|         1005|
|  7|     Diane|   Gordon|      74591|         1002|
|  8|  Mercedes|Rodriguez|      61048|         1005|
|  9|   Christy| Mitchell|     150000|         1001|
| 10|      Sean| Crawford|     190000|         1006|
| 11|     Kevin| Townsend|     166861|         1002|
| 12|    Joshua|  Johnson|     123082|         1004|
| 13|     Julie|  Sanchez|     210000|         1001|
| 14|      John|  Coleman|     152434|         1001|
| 15|   Anthony|   Valdez|      96898|         1001|
| 16|    Briana|    Rivas|     151668|        