# Data Engineer with Azure
## Datapath Project
## Author: Gael Velasquez

In [8]:
# Evaluate the session handle as the cell's only expression so the notebook
# renders it. NOTE(review): `spark` is not defined in the visible code —
# presumably the notebook runtime pre-creates it; confirm.
spark

StatementMeta(DatapathSpark, 6, 6, Finished, Available)

# **EXTRACT DATA**

In [9]:
from pyspark.sql.types import StructField, StructType, StringType, LongType, DateType, IntegerType, FloatType
from pyspark.sql.functions import substring, col, when

StatementMeta(DatapathSpark, 6, 7, Finished, Available)

In [10]:
# Storage roots for the three lake layers. All layers live in the same
# container, so each URI is assembled from one shared prefix.
_container_root = 'abfss://projectcontainer@datapathproject.dfs.core.windows.net'
bronze_path, silver_path, gold_path = (
    f"{_container_root}/{layer}" for layer in ("bronze", "silver", "gold")
)

StatementMeta(DatapathSpark, 6, 8, Finished, Available)

## Departments

In [11]:
# Explicit schema for the departments extract: two nullable string columns.
# NOTE(review): "deparment_id" looks like a misspelling of "department_id". It is
# reproduced unchanged because a schema field name is what readers of the
# resulting data see — confirm with downstream consumers before renaming.
Departments_Schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("deparment_id", "department_name")
])

StatementMeta(DatapathSpark, 6, 9, Finished, Available)

In [12]:
# Load the headerless, pipe-delimited departments files from the bronze layer,
# applying the explicit schema instead of inferring one.
ddepartments = spark.read.load(
    f"{bronze_path}/departments",
    format="csv",
    schema=Departments_Schema,
    header="false",
    delimiter="|",
)

StatementMeta(DatapathSpark, 6, 10, Finished, Available)

In [13]:
# Persist departments to the silver layer as Delta, replacing any previous output.
ddepartments.write.save(
    f"{silver_path}/departments",
    format="delta",
    mode="overwrite",
)

StatementMeta(DatapathSpark, 6, 11, Finished, Available)

## Categories

In [14]:
# Explicit schema for the categories extract: three nullable string columns,
# listed in the order they are applied to the file's fields.
Categories_Schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "category_id",
        "category_department_id",
        "category_name",
    )
])

StatementMeta(DatapathSpark, 6, 12, Finished, Available)

In [15]:
# Load the headerless, pipe-delimited categories files from the bronze layer,
# applying the explicit schema instead of inferring one.
dcategories = spark.read.load(
    f"{bronze_path}/categories",
    format="csv",
    schema=Categories_Schema,
    header="false",
    delimiter="|",
)

StatementMeta(DatapathSpark, 6, 13, Finished, Available)

In [16]:
# Persist categories to the silver layer as Delta, replacing any previous output.
dcategories.write.save(
    f"{silver_path}/categories",
    format="delta",
    mode="overwrite",
)

StatementMeta(DatapathSpark, 6, 14, Finished, Available)

## Products

In [17]:
# Explicit schema for the products extract. Every column is nullable; all are
# strings except the price, which is read as a float.
Products_Schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("product_id", StringType()),
        ("product_category_id", StringType()),
        ("product_name", StringType()),
        ("product_description", StringType()),
        ("product_price", FloatType()),
        ("product_image", StringType()),
    )
])

StatementMeta(DatapathSpark, 6, 15, Finished, Available)

In [18]:
# Load the headerless, pipe-delimited products files from the bronze layer,
# applying the explicit schema instead of inferring one.
dproducts = spark.read.load(
    f"{bronze_path}/products",
    format="csv",
    schema=Products_Schema,
    header="false",
    delimiter="|",
)

StatementMeta(DatapathSpark, 6, 16, Finished, Available)

In [19]:
# Persist products to the silver layer as Delta, replacing any previous output.
dproducts.write.save(
    f"{silver_path}/products",
    format="delta",
    mode="overwrite",
)

StatementMeta(DatapathSpark, 6, 17, Finished, Available)

## Order Items

In [20]:
# Explicit schema for the order_items extract. Identifier columns are nullable
# strings; quantity, subtotal and unit price are nullable floats.
# NOTE(review): quantity is typed as a float here; if it is always a whole
# number an integer type may be more appropriate — confirm against the data
# before changing, since that alters the resulting schema.
Order_Items_Schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("order_item_id", StringType()),
        ("order_item_order_id", StringType()),
        ("order_item_product_id", StringType()),
        ("order_item_quantity", FloatType()),
        ("order_item_subtotal", FloatType()),
        ("order_item_product_price", FloatType()),
    )
])

StatementMeta(DatapathSpark, 6, 18, Finished, Available)

In [21]:
# Load the headerless, pipe-delimited order_items files from the bronze layer,
# applying the explicit schema instead of inferring one.
dorder_items = spark.read.load(
    f"{bronze_path}/order_items",
    format="csv",
    schema=Order_Items_Schema,
    header="false",
    delimiter="|",
)

StatementMeta(DatapathSpark, 6, 19, Finished, Available)

In [22]:
# Persist order items to the silver layer as Delta, replacing any previous output.
dorder_items.write.save(
    f"{silver_path}/order_items",
    format="delta",
    mode="overwrite",
)

StatementMeta(DatapathSpark, 6, 20, Finished, Available)

## Orders

In [23]:
# Explicit schema for the orders extract. Every column is nullable; all are
# strings except order_date, which is read as a date.
Order_Schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("order_id", StringType()),
        ("order_date", DateType()),
        ("order_customer_id", StringType()),
        ("order_status", StringType()),
    )
])

StatementMeta(DatapathSpark, 6, 21, Finished, Available)

In [24]:
# Load the headerless, pipe-delimited orders files from the bronze layer,
# applying the explicit schema instead of inferring one.
dorders = spark.read.load(
    f"{bronze_path}/orders",
    format="csv",
    schema=Order_Schema,
    header="false",
    delimiter="|",
)

StatementMeta(DatapathSpark, 6, 22, Finished, Available)

In [25]:
# Persist orders to the silver layer as Delta, replacing any previous output.
dorders.write.save(
    f"{silver_path}/orders",
    format="delta",
    mode="overwrite",
)

StatementMeta(DatapathSpark, 6, 23, Finished, Available)

## Customers

In [26]:
# Explicit schema for the customer extract: nine nullable string columns,
# listed in the order they are applied to the file's fields.
# NOTE(review): the variable name "Curstomers_Schema" looks like a typo for
# "Customers_Schema"; it is kept because other cells may reference it by name.
# NOTE(review): "order_zipcode" breaks the customer_* naming of the other
# columns and may have been meant as "customer_zipcode" — confirm with
# downstream consumers before renaming.
Curstomers_Schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "customer_id",
        "customer_fname",
        "customer_lname",
        "customer_email",
        "customer_password",
        "customer_street",
        "customer_city",
        "customer_state",
        "order_zipcode",
    )
])


StatementMeta(DatapathSpark, 6, 24, Finished, Available)

In [27]:
# Load the headerless, pipe-delimited customer files from the bronze layer,
# applying the explicit schema instead of inferring one.
dcustomers = spark.read.load(
    f"{bronze_path}/customer",
    format="csv",
    schema=Curstomers_Schema,
    header="false",
    delimiter="|",
)

StatementMeta(DatapathSpark, 6, 25, Finished, Available)

In [28]:
# Persist customers to the silver layer as Delta, replacing any previous output.
dcustomers.write.save(
    f"{silver_path}/customer",
    format="delta",
    mode="overwrite",
)

StatementMeta(DatapathSpark, 6, 26, Finished, Available)