## Bronze to silver

Employees (DataFrame to Parquet file)

In [0]:
from pyspark.sql.types import * 
from pyspark.sql.functions import *

In [0]:
# Explicit schema for the bronze employees CSV, built field by field.
# The third argument of each add() is the nullable flag; only MANAGER_ID is
# declared nullable. HIRE_DATE is read as a plain string at this stage.
# NOTE(review): the printSchema() output further down this notebook reports
# nullable = true for every column, so the False flags do not appear to be
# enforced when the CSV is read -- confirm before relying on them.
employees_schema = (
    StructType()
    .add("EMPLOYEE_ID", IntegerType(), False)
    .add("FIRST_NAME", StringType(), False)
    .add("LAST_NAME", StringType(), False)
    .add("EMAIL", StringType(), False)
    .add("PHONE_NUMBER", StringType(), False)
    .add("HIRE_DATE", StringType(), False)
    .add("JOB_ID", StringType(), False)
    .add("SALARY", IntegerType(), False)
    .add("MANAGER_ID", IntegerType(), True)
    .add("DEPARTMENT_ID", IntegerType(), False)
)



In [0]:
# Load the bronze employees CSV: first row is a header, columns typed by employees_schema.
employees = (
    spark.read
    .schema(employees_schema)
    .option("header", True)
    .csv("dbfs:/FileStore/tables/employees_medallion/bronze/employees.csv")
)

In [0]:
# Preview the first 10 rows of the raw employees data.
display(employees.limit(10))

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,MANAGER_ID,DEPARTMENT_ID
100,Steven,King,SKING,515.123.4567,09/01/2009,AD_PRES,24000,,90
101,Neena,Kochhar,NKOCHHAR,515.123.4568,12/07/2011,AD_VP,17000,100.0,90
102,Lex,De Haan,LDEHAAN,515.123.4569,03/31/2015,AD_VP,17000,100.0,90
103,Alexander,Hunold,AHUNOLD,590.423.4567,03/20/2012,IT_PROG,9000,102.0,60
104,Bruce,Ernst,BERNST,590.423.4568,08/05/2013,IT_PROG,6000,103.0,60
105,David,Austin,DAUSTIN,590.423.4569,09/10/2019,IT_PROG,4800,103.0,60
106,Valli,Pataballa,VPATABAL,590.423.4560,04/22/2020,IT_PROG,4800,103.0,60
107,Diana,Lorentz,DLORENTZ,590.423.5567,04/24/2021,IT_PROG,4200,103.0,60
108,Nancy,Greenberg,NGREENBE,515.124.4569,11/01/2016,FI_MGR,12000,101.0,100
109,Daniel,Faviet,DFAVIET,515.124.4169,10/31/2016,FI_ACCOUNT,9000,108.0,100


In [0]:
# Remove the contact columns; they are not carried into the silver layer.
employees = employees.drop(
    "EMAIL",
    "PHONE_NUMBER",
)

In [0]:
# Print the remaining columns and their types after EMAIL and PHONE_NUMBER were dropped.
employees.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- MANAGER_ID: integer (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [0]:
# Convert HIRE_DATE from a string to a date with to_date, using the
# "MM/dd/yyyy" pattern. Every other column is passed through unchanged
# and in the same order.
# NOTE(review): how values that do not match the pattern are handled depends
# on the Spark configuration -- verify if the source data may be dirty.

hire_date_as_date = to_date(col("HIRE_DATE"), "MM/dd/yyyy").alias('HIRE_DATE')

employees = employees.select(
    "EMPLOYEE_ID", "FIRST_NAME", "LAST_NAME",
    hire_date_as_date,
    "JOB_ID", "SALARY", "MANAGER_ID", "DEPARTMENT_ID",
)

In [0]:
# Persist the cleaned employees DataFrame to the silver layer as Parquet,
# replacing whatever was previously written at that path.

employees.write.mode("overwrite").parquet(
    "dbfs:/FileStore/tables/employees_medallion/silver/employees"
)

Departments (DataFrame to Parquet file)

In [0]:
# Explicit schema for the bronze departments CSV, built field by field.
# The third argument of each add() is the nullable flag; only MANAGER_ID is
# declared nullable.
# NOTE(review): confirm whether the non-nullable flags are actually enforced
# when reading from CSV.
dept_schema = (
    StructType()
    .add("DEPARTMENT_ID", IntegerType(), False)
    .add("DEPARTMENT_NAME", StringType(), False)
    .add("MANAGER_ID", IntegerType(), True)
    .add("LOCATION_ID", IntegerType(), False)
)
 


In [0]:
# Load the bronze departments CSV: first row is a header, columns typed by dept_schema.
dept = (
    spark.read
    .schema(dept_schema)
    .option("header", True)
    .csv("dbfs:/FileStore/tables/employees_medallion/bronze/departments.csv")
)

In [0]:
# Preview the first 10 rows of the raw departments data.
display(dept.limit(10))

DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
10,Administration,200,1700
20,Marketing,201,1800
30,Purchasing,114,1700
40,Human Resources,203,2400
50,Shipping,121,1500
60,IT,103,1400
70,Public Relations,204,2700
80,Sales,145,2500
90,Executive,100,1700
100,Finance,108,1700


In [0]:
# Keep only the department identifier and name; the manager and location
# columns are removed before writing to silver.

dept = dept.drop(
    "MANAGER_ID",
    "LOCATION_ID",
)

In [0]:
# Preview the first 5 rows to confirm MANAGER_ID and LOCATION_ID are gone.
display(dept.limit(5))

DEPARTMENT_ID,DEPARTMENT_NAME
10,Administration
20,Marketing
30,Purchasing
40,Human Resources
50,Shipping


In [0]:
# Persist the trimmed departments DataFrame to the silver layer as Parquet,
# replacing whatever was previously written at that path.
dept.write.mode("overwrite").parquet(
    "dbfs:/FileStore/tables/employees_medallion/silver/dept"
)

Countries (DataFrame to Parquet file)

In [0]:
# Explicit schema for the bronze countries CSV: two string columns,
# both declared non-nullable.
# NOTE(review): confirm whether the non-nullable flags are actually enforced
# when reading from CSV.
countries_schema = (
    StructType()
    .add("COUNTRY_ID", StringType(), False)
    .add("COUNTRY_NAME", StringType(), False)
)

In [0]:
# Load the bronze countries CSV: first row is a header, columns typed by countries_schema.
countries = (
    spark.read
    .schema(countries_schema)
    .option("header", True)
    .csv("dbfs:/FileStore/tables/employees_medallion/bronze/countries.csv")
)

In [0]:
# Persist the countries DataFrame to the silver layer as Parquet,
# replacing whatever was previously written at that path.
countries.write.mode("overwrite").parquet(
    "dbfs:/FileStore/tables/employees_medallion/silver/countries"
)