Adding parquet files to the gold folder of the employees container

In [0]:

from pyspark.sql.types import * 
from pyspark.sql.functions import *

In [0]:
# Load the silver-layer tables used to build the gold employees dataset.
# NOTE(review): `countries` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
employees = spark.read.parquet(
    "dbfs:/FileStore/tables/employees_medallion/silver/employees"
)
dept = spark.read.parquet(
    "dbfs:/FileStore/tables/employees_medallion/silver/dept"
)
countries = spark.read.parquet(
    "dbfs:/FileStore/tables/employees_medallion/silver/countries"
)


In [0]:
# Add a FULL_NAME column: FIRST_NAME and LAST_NAME joined by a single space.
employees = employees.withColumn(
    "FULL_NAME",
    concat_ws(" ", col("FIRST_NAME"), col("LAST_NAME")),
)

In [0]:
# Render the employees DataFrame, which now carries the FULL_NAME column.
display(employees)

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,HIRE_DATE,JOB_ID,SALARY,MANAGER_ID,DEPARTMENT_ID,FULL_NAME
100,Steven,King,2009-09-01,AD_PRES,24000,,90.0,Steven King
101,Neena,Kochhar,2011-12-07,AD_VP,17000,100.0,90.0,Neena Kochhar
102,Lex,De Haan,2015-03-31,AD_VP,17000,100.0,90.0,Lex De Haan
103,Alexander,Hunold,2012-03-20,IT_PROG,9000,102.0,60.0,Alexander Hunold
104,Bruce,Ernst,2013-08-05,IT_PROG,6000,103.0,60.0,Bruce Ernst
105,David,Austin,2019-09-10,IT_PROG,4800,103.0,60.0,David Austin
106,Valli,Pataballa,2020-04-22,IT_PROG,4800,103.0,60.0,Valli Pataballa
107,Diana,Lorentz,2021-04-24,IT_PROG,4200,103.0,60.0,Diana Lorentz
108,Nancy,Greenberg,2016-11-01,FI_MGR,12000,101.0,100.0,Nancy Greenberg
109,Daniel,Faviet,2016-10-31,FI_ACCOUNT,9000,108.0,100.0,Daniel Faviet


In [0]:
# FIRST_NAME and LAST_NAME are already merged into FULL_NAME, and MANAGER_ID is
# not selected for the gold table, so remove all three here.
employees = employees.drop(
    "FIRST_NAME",
    "LAST_NAME",
    "MANAGER_ID",
)

In [0]:
# Read the departments parquet file from the silver layer.
# NOTE(review): `dept` was already loaded from this same path in the initial
# load cell, so this read is redundant and could be removed.
dept=spark.read.parquet("dbfs:/FileStore/tables/employees_medallion/silver/dept")

In [0]:
# Render the departments DataFrame (DEPARTMENT_ID -> DEPARTMENT_NAME lookup).
display(dept)

DEPARTMENT_ID,DEPARTMENT_NAME
10,Administration
20,Marketing
30,Purchasing
40,Human Resources
50,Shipping
60,IT
70,Public Relations
80,Sales
90,Executive
100,Finance


In [0]:
# Enrich each employee with its department name, then keep only the columns
# that belong in the gold table.
#
# - Left join on DEPARTMENT_ID so employees without a matching department row
#   are kept (their DEPARTMENT_NAME will be null).
# - The join key is spelled in upper case, matching the column names as stored,
#   so it still resolves when spark.sql.caseSensitive is enabled.
# - DEPARTMENT_ID itself is not selected: the gold table carries
#   DEPARTMENT_NAME instead.
#
# NOTE: this cell reassigns `employees` and removes DEPARTMENT_ID, so it cannot
# be re-run on its own; re-run the cells above first.
employees = (
    employees
    .join(dept, employees["DEPARTMENT_ID"] == dept["DEPARTMENT_ID"], "left")
    .select(
        "EMPLOYEE_ID",
        "FULL_NAME",
        "HIRE_DATE",
        "JOB_ID",
        "SALARY",
        "DEPARTMENT_NAME",
    )
)

In [0]:
# Render the joined result: one row per employee with DEPARTMENT_NAME attached.
employees.display()

EMPLOYEE_ID,FULL_NAME,HIRE_DATE,JOB_ID,SALARY,DEPARTMENT_NAME
100,Steven King,2009-09-01,AD_PRES,24000,Executive
101,Neena Kochhar,2011-12-07,AD_VP,17000,Executive
102,Lex De Haan,2015-03-31,AD_VP,17000,Executive
103,Alexander Hunold,2012-03-20,IT_PROG,9000,IT
104,Bruce Ernst,2013-08-05,IT_PROG,6000,IT
105,David Austin,2019-09-10,IT_PROG,4800,IT
106,Valli Pataballa,2020-04-22,IT_PROG,4800,IT
107,Diana Lorentz,2021-04-24,IT_PROG,4200,IT
108,Nancy Greenberg,2016-11-01,FI_MGR,12000,Finance
109,Daniel Faviet,2016-10-31,FI_ACCOUNT,9000,Finance


In [0]:
# Persist the employees DataFrame to the gold layer as parquet, replacing any
# data already present at the target path.
(
    employees.write
    .mode("overwrite")
    .parquet("dbfs:/FileStore/tables/employees_medallion/gold/employees")
)

In [0]:
# Render the employees DataFrame once more after the write. This shows the
# in-memory DataFrame, not a re-read of the files written to the gold path.
display(employees)

EMPLOYEE_ID,FULL_NAME,HIRE_DATE,JOB_ID,SALARY,DEPARTMENT_NAME
100,Steven King,2009-09-01,AD_PRES,24000,Executive
101,Neena Kochhar,2011-12-07,AD_VP,17000,Executive
102,Lex De Haan,2015-03-31,AD_VP,17000,Executive
103,Alexander Hunold,2012-03-20,IT_PROG,9000,IT
104,Bruce Ernst,2013-08-05,IT_PROG,6000,IT
105,David Austin,2019-09-10,IT_PROG,4800,IT
106,Valli Pataballa,2020-04-22,IT_PROG,4800,IT
107,Diana Lorentz,2021-04-24,IT_PROG,4200,IT
108,Nancy Greenberg,2016-11-01,FI_MGR,12000,Finance
109,Daniel Faviet,2016-10-31,FI_ACCOUNT,9000,Finance
