In [1]:
from pyspark.sql import SparkSession

# Reuse the already-running Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [10]:
# Load the employee and department CSV files. The first row of each file is
# treated as the header; no schema is supplied or inferred, so every column
# is read as a string.
empDF = spark.read.csv("/user/arnavmoutl12edu/module6/emp.csv", header=True)
deptDF = spark.read.csv("/user/arnavmoutl12edu/module6/dept.csv", header=True)

In [11]:
# Print the column names and types of the employee DataFrame.
empDF.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- emp_gender: string (nullable = true)
 |-- emp_salary: string (nullable = true)
 |-- emp_age: string (nullable = true)



In [12]:
# Print the column names and types of the department DataFrame.
deptDF.printSchema()

root
 |-- dept_id: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- dept_region: string (nullable = true)



In [13]:
# Inner-join employees to departments on dept_id, then keep every employee
# column plus the department's name and region (deptDF.dept_id is dropped so
# the result has a single dept_id column).
joinDF = (
    empDF
    .join(deptDF, on=empDF["dept_id"] == deptDF["dept_id"], how="inner")
    .select(empDF["*"], deptDF["dept_name"], deptDF["dept_region"])
)

In [14]:
# Print the column names and types of the joined employee/department DataFrame.
joinDF.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- emp_gender: string (nullable = true)
 |-- emp_salary: string (nullable = true)
 |-- emp_age: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- dept_region: string (nullable = true)



In [16]:
# Persist the joined employee/department rows as CSV.
# - mode="overwrite": the default save mode raises if the target path already
#   exists, which makes this cell fail on every re-run of the notebook.
# - header=True: write the column names as the first row so the output can be
#   read back without having to re-declare the columns.
joinDF.write.csv(
    "/user/arnavmoutl12edu/module6/employee_details.csv",
    mode="overwrite",
    header=True,
)

In [17]:
# Replace the string-typed emp_age column with its integer cast (same column
# name) so that numeric aggregations can be applied to it.
joinDF = joinDF.withColumn(
    "emp_age",
    joinDF.emp_age.cast('int'),
)

In [18]:
# Show the average employee age for each department region.
joinDF.groupBy("dept_region").agg({"emp_age": "avg"}).show()

+-----------+------------------+
|dept_region|      avg(emp_age)|
+-----------+------------------+
|      India|27.333333333333332|
|        USA|26.666666666666668|
|         UK|27.666666666666668|
|     Canada|27.333333333333332|
|  Australia|              28.0|
+-----------+------------------+



In [19]:
# Number of employees in each department.
countDF = joinDF.groupby("dept_id").count()

# Persist the per-department head count as CSV.
# - mode="overwrite": the default save mode raises if the target path already
#   exists, which makes this cell fail on every re-run of the notebook.
# - header=True: write the column names (dept_id, count) as the first row.
countDF.write.csv(
    "/user/arnavmoutl12edu/module6/empcount.csv",
    mode="overwrite",
    header=True,
)

In [20]:
# Number of employees per gender; the result has columns emp_gender and count.
genderDF = (
    joinDF
    .groupBy("emp_gender")
    .count()
)

In [21]:
# Display the per-gender employee counts.
genderDF.show()

+----------+-----+
|emp_gender|count|
+----------+-----+
|         F|   15|
|         M|   15|
+----------+-----+



In [22]:
# Print the column names and types of the per-gender count DataFrame.
genderDF.printSchema()

root
 |-- emp_gender: string (nullable = true)
 |-- count: long (nullable = false)



In [23]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum

In [24]:
# Add each gender's share of the total head count as a percentage.
# Window.partitionBy() with no columns puts every row in one window, so the
# windowed sum of "count" is the grand total across all genders.
distDF = genderDF.withColumn(
    'distribution',
    genderDF["count"] / _sum('count').over(Window.partitionBy()) * 100,
)

In [25]:
# Display the per-gender counts together with their percentage share.
distDF.show()

+----------+-----+------------+
|emp_gender|count|distribution|
+----------+-----+------------+
|         F|   15|        50.0|
|         M|   15|        50.0|
+----------+-----+------------+



In [26]:
# Shut down Spark through the session rather than the raw context:
# SparkSession.stop() stops the underlying SparkContext as well, so the
# session object is not left pointing at an already-stopped context.
spark.stop()