# Query HDFS Data

In this notebook we will see how to query data stored in HDFS from the SQL Server master instance of a SQL Server 2019 Big Data Cluster (BDC) using **PolyBase**. The same task can also be done with **PySpark**.

## Set Database Context

In [None]:
-- Suppress the "rows affected" messages and make coe the current database.
SET NOCOUNT ON;
USE coe;

## Create External Data Source

In [None]:
-- Register the cluster's HDFS storage as an external data source.
-- The create is skipped when a data source with this name already exists,
-- so the cell can be re-run safely.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_data_sources
    WHERE name = 'SqlStoragePool'
)
BEGIN
    CREATE EXTERNAL DATA SOURCE SqlStoragePool
    WITH (LOCATION = 'sqlhdfs://controller-svc/default');
END;

## Create External File Format

In [None]:
-- Define the Parquet external file format used by the external tables.
-- The create is skipped when a format with this name already exists.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_file_formats
    WHERE name = 'parquet_file'
)
BEGIN
    CREATE EXTERNAL FILE FORMAT parquet_file
    WITH (FORMAT_TYPE = PARQUET);
END;

## Create External Table

In [None]:
-- create a schema for external tables
--
-- this step should be performed using a new query window on the target database

## Store data from HDFS into SQL Data Pool

In [None]:
USE COE
GO
--eip-mas data
--

--1."GEN_M_Jobs"

-- Drop-and-recreate the external table over the gen_m_jobs files in HDFS.
-- The existence check is scoped to the eipmas schema so that a same-named
-- external table in another schema cannot trigger a DROP that then fails.
IF EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'gen_m_jobs_pq_hdfs'
      AND schema_id = SCHEMA_ID(N'eipmas')
)
    DROP EXTERNAL TABLE [COE].[eipmas].[gen_m_jobs_pq_hdfs];

-- The create always runs: after the conditional drop above, the table is
-- guaranteed not to exist. Column names are bracket-quoted so the statement
-- does not depend on the session's QUOTED_IDENTIFIER setting.
CREATE EXTERNAL TABLE [COE].[eipmas].[gen_m_jobs_pq_hdfs]
(
    [MJOB_Job_Code]         varchar(8)   NOT NULL,
    [MJOB_Description]      varchar(100) NOT NULL,
    [MJOB_Long_Description] varchar(500) NOT NULL,
    [MJOB_AB_Code]          int          NULL,
    [MJOB_CPLT_Code]        int          NULL,
    [MJOB_Main_Sub_Dept]    varchar(1)   NOT NULL,
    [MJOB_Source_Tag]       varchar(1)   NOT NULL,
    [MJOB_Job_ID]           varchar(4)   NULL,
    [MJOB_ISActive]         varchar(1)   NOT NULL,
    [MJOB_Inserted_On]      datetime2    NOT NULL,
    [MJOB_Inserted_By]      int          NULL,
    [MJOB_Updated_On]       datetime2    NULL,
    [MJOB_Updated_By]       int          NULL,
    [MJOB_Is_EIP_Converted] varchar(1)   NULL,
    [MJOB_Identity]         int          NOT NULL,
    [Batch_Id]              bigint       NULL,
    [Load_Date]             datetime2    NULL
)
WITH
(
    DATA_SOURCE = SqlStoragePool,
    LOCATION    = '/user/hive/warehouse/gen_m_jobs',
    FILE_FORMAT = parquet_file
);

--2. "GEN_M_Materials"

-- Drop-and-recreate the external table over the gen_m_materials files in HDFS.
-- The existence check is scoped to the eipmas schema so that a same-named
-- external table in another schema cannot trigger a DROP that then fails.
IF EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'gen_m_materials_pq_hdfs'
      AND schema_id = SCHEMA_ID(N'eipmas')
)
    DROP EXTERNAL TABLE [COE].[eipmas].[gen_m_materials_pq_hdfs];

-- The create always runs: after the conditional drop above, the table is
-- guaranteed not to exist. Column names are bracket-quoted so the statement
-- does not depend on the session's QUOTED_IDENTIFIER setting.
CREATE EXTERNAL TABLE [COE].[eipmas].[gen_m_materials_pq_hdfs]
(
    [MMAT_Material_Code]             varchar(15)  NOT NULL,
    [MMAT_MG_Code]                   varchar(6)   NOT NULL,
    [MMAT_Company_Code]              int          NOT NULL,
    [MMAT_Short_Name]                varchar(150) NULL,
    [MMAT_Material_Description]      varchar(500) NOT NULL,
    [MMAT_UOM_Code]                  int          NOT NULL,
    [MMAT_Standard_Code]             varchar(15)  NULL,
    [MMAT_BTN_Code]                  varchar(50)  NULL,
    [MMAT_Category_Type_Code]        int          NULL,
    [MMAT_Category_Type_Detail_Code] int          NULL,
    [MMAT_ISActive]                  varchar(1)   NOT NULL,
    [MMAT_Inserted_On]               datetime2    NOT NULL,
    [MMAT_Inserted_By]               int          NULL,
    [MMAT_Updated_On]                datetime2    NULL,
    [MMAT_Updated_By]                int          NULL,
    [MMAT_Is_Planning_Allowed]       varchar(1)   NULL,
    [Batch_Id]                       bigint       NULL,
    [Load_Date]                      datetime2    NULL
)
WITH
(
    DATA_SOURCE = SqlStoragePool,
    LOCATION    = '/user/hive/warehouse/gen_m_materials',
    FILE_FORMAT = parquet_file
);

--eip-pmp data
--

--1. "GEN_L_MATERIAL_CATEGORY_MATERIAL_GROUP"

-- Drop-and-recreate the external table over the
-- gen_l_material_category_material_group files in HDFS.
-- The existence check is scoped to the eippmp schema so that a same-named
-- external table in another schema cannot trigger a DROP that then fails.
IF EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'gen_l_material_category_material_group_pq_hdfs'
      AND schema_id = SCHEMA_ID(N'eippmp')
)
    DROP EXTERNAL TABLE [COE].[eippmp].[gen_l_material_category_material_group_pq_hdfs];

-- The create always runs: after the conditional drop above, the table is
-- guaranteed not to exist. Column names are bracket-quoted so the statement
-- does not depend on the session's QUOTED_IDENTIFIER setting.
CREATE EXTERNAL TABLE [COE].[eippmp].[gen_l_material_category_material_group_pq_hdfs]
(
    [LMCMG_Material_Category_Code] varchar(15) NOT NULL,
    [LMCMG_MG_Code]                varchar(30) NOT NULL,
    [LMCMG_Company_Code]           int         NOT NULL,
    [LMCMG_Inserted_On]            datetime2   NOT NULL,
    [LMCMG_Inserted_By]            int         NOT NULL,
    [LMCMG_Updated_On]             datetime2   NULL,
    [LMCMG_Updated_By]             int         NULL
)
WITH
(
    DATA_SOURCE = SqlStoragePool,
    LOCATION    = '/user/hive/warehouse/gen_l_material_category_material_group',
    FILE_FORMAT = parquet_file
);

-- sqlmas data
--

--1. Gen_M_Vendors 
--

-- Drop-and-recreate the external table over the gen_m_vendors files in HDFS.
-- The existence check is scoped to the sqlmas schema so that a same-named
-- external table in another schema cannot trigger a DROP that then fails.
IF EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'gen_m_vendors_pq_hdfs'
      AND schema_id = SCHEMA_ID(N'sqlmas')
)
    DROP EXTERNAL TABLE [COE].[sqlmas].[gen_m_vendors_pq_hdfs];

-- The create always runs: after the conditional drop above, the table is
-- guaranteed not to exist. Column names are bracket-quoted so the statement
-- does not depend on the session's QUOTED_IDENTIFIER setting.
CREATE EXTERNAL TABLE [COE].[sqlmas].[gen_m_vendors_pq_hdfs]
(
    [Vendor_Code]  varchar(15)  NULL,
    [Vendor_Name]  varchar(200) NULL,
    [Company_Code] int          NULL,
    [IsActive]     varchar(1)   NULL
)
WITH
(
    DATA_SOURCE = SqlStoragePool,
    LOCATION    = '/user/hive/warehouse/gen_m_vendors',
    FILE_FORMAT = parquet_file
);


In [None]:
USE COE
GO
-- Row counts for the parquet-backed external tables in HDFS.
-- Each statement returns its own single-row result set; the column alias
-- names the table being counted. Aliases start with '#', so they are
-- bracket-quoted to make it explicit that they are column names.

-- eipscm tables
SELECT COUNT(*) AS [#scm_h_purchase_orders] FROM eipscm.scm_h_purchase_orders_pq_hdfs;
SELECT COUNT(*) AS [#scm_d_purchase_orders] FROM eipscm.scm_d_purchase_orders_pq_hdfs;
SELECT COUNT(*) AS [#scm_h_mrn_pq_hdfs] FROM eipscm.scm_h_mrn_pq_hdfs;
SELECT COUNT(*) AS [#scm_d_mrn_pq_hdfs] FROM eipscm.scm_d_mrn_pq_hdfs;
SELECT COUNT(*) AS [#scm_h_gin_pq_hdfs] FROM eipscm.scm_h_gin_pq_hdfs;
SELECT COUNT(*) AS [#scm_d_gin_pq_hdfs] FROM eipscm.scm_d_gin_pq_hdfs;
SELECT COUNT(*) AS [#scm_h_bill_of_entry_pq_hdfs] FROM eipscm.scm_h_bill_of_entry_pq_hdfs;
SELECT COUNT(*) AS [#scm_d_bill_of_entry_summary_pq_hdfs] FROM eipscm.scm_d_bill_of_entry_summary_pq_hdfs;
SELECT COUNT(*) AS [#scm_d_offer] FROM eipscm.scm_d_offer_pq_hdfs;
SELECT COUNT(*) AS [#scm_h_offer] FROM eipscm.scm_h_offer_pq_hdfs;

-- eipmas tables
SELECT COUNT(*) AS [#gen_m_jobs_pq_hdfs] FROM eipmas.gen_m_jobs_pq_hdfs;
SELECT COUNT(*) AS [#gen_m_materials_pq_hdfs] FROM eipmas.gen_m_materials_pq_hdfs;

-- eippmp tables
SELECT COUNT(*) AS [#gen_l_material_category_material_group_pq_hdfs] FROM eippmp.gen_l_material_category_material_group_pq_hdfs;

-- sqlmas tables (created alongside the tables above but previously not counted)
SELECT COUNT(*) AS [#gen_m_vendors_pq_hdfs] FROM sqlmas.gen_m_vendors_pq_hdfs;