## Reading Data From ADLS Gen2 With App Details

In [None]:
# --- ADLS Gen2 storage settings (placeholders: fill in before running) ---
storageAccount, containerName = "", ""
# DBFS location under which the container is mounted
mountpoint = "/mnt/Gen2"

# --- client (app) details used for the OAuth login ---
# NOTE(review): clientSecret is a credential; prefer loading it from a secret
# store rather than pasting the value into the notebook.
clientID, tenantID, clientSecret = "", "", ""

In [None]:
# ABFS URI of the container root: abfss://<container>@<account>.dfs.core.windows.net/
storageEndpoint = "abfss://" + containerName + "@" + storageAccount + ".dfs.core.windows.net/"

# OAuth2 token endpoint for the tenant
oauth2Endpoint = "https://login.microsoftonline.com/" + tenantID + "/oauth2/token"

# OAuth client-credentials settings passed to the mount as extra configs
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": clientID,
    "fs.azure.account.oauth2.client.secret": clientSecret,
    "fs.azure.account.oauth2.client.endpoint": oauth2Endpoint
}

# Mount only when the mount point is not already in use, so re-running this
# cell does not fail with an "already mounted" error.
existing_mounts = {m.mountPoint: m.source for m in dbutils.fs.mounts()}
if mountpoint in existing_mounts:
    print("{} is already mounted from {}; skipping mount.".format(
        mountpoint, existing_mounts[mountpoint]))
else:
    try:
        dbutils.fs.mount(
            source = storageEndpoint,
            mount_point = mountpoint,
            extra_configs = configs
        )
    except Exception as e:
        print("Error: \n", e)
        # Re-raise: every later cell reads from the mount, so continuing after
        # a failed mount would only surface as a less helpful error further down.
        raise

In [None]:
%fs ls /mnt/Gen2

path,name,size,modificationTime
dbfs:/mnt/Gen2/Orders.csv,Orders.csv,217,1672387586000


In [None]:
# list the contents of the mounted container; uses the `mountpoint` variable
# rather than a second hard-coded copy of the path, so the two cannot drift apart
display(dbutils.fs.ls(mountpoint))


path,name,size,modificationTime
dbfs:/mnt/Gen2/Orders.csv,Orders.csv,217,1672387586000


In [None]:
# read Orders.csv from the mounted container, using its first line as column names
df = spark.read.csv("dbfs:/mnt/Gen2/Orders.csv", header=True)


In [None]:
# preview the first five rows of the orders DataFrame
display(df.limit(5))

OrderNo,Quantity,Price
1,200,1000
2,201,2000
3,202,1300
4,203,890
5,204,2220


In [None]:
# unmount the container; uses the `mountpoint` variable so the path being
# unmounted is always the one that was mounted
dbutils.fs.unmount(mountpoint)

/mnt/Gen2 has been unmounted.
Out[7]: True

## Reading Data From ADLS Gen2 Storage with Access Key Directly

In [None]:
# account details (placeholders: fill in before running)
filePath = "Orders.csv"
containerName = ""
storageAccount = ""
# Spark configuration key under which this account's access key is registered.
# The variable name is case-sensitive: it must be `storageAccount`, as defined above.
acct_info = "fs.azure.account.key.{}.dfs.core.windows.net".format(storageAccount)
# NOTE(review): accessKey is a credential; prefer loading it from a secret
# store rather than pasting the value into the notebook.
accessKey = ""


# register the access key for the storage account on the Spark session
spark.conf.set(acct_info, accessKey)

In [None]:
# abfss URI of the file: container @ account host, followed by the file path
storageEndpoint = "abfss://{}@{}.dfs.core.windows.net/{}".format(
    containerName, storageAccount, filePath
)
# list the file to confirm it is reachable with the configured key
dbutils.fs.ls(storageEndpoint)

Out[12]: [FileInfo(path='abfss://rawdata@mycookbookadlsgen2store.dfs.core.windows.net/Orders.csv', name='Orders.csv', size=217, modificationTime=1672387586000)]

In [None]:
# read the orders file directly from its abfss URI (first line is the header)
orders_df = spark.read.options(header=True).csv(storageEndpoint)
# preview the first five rows
display(orders_df.limit(5))

OrderNo,Quantity,Price
1,200,1000
2,201,2000
3,202,1300
4,203,890
5,204,2220


## Reading Data From ADLS Gen2 Directly With Client Details Without Mounting The Storage

In [None]:
# storage information (placeholders: fill in before running)
storageAccount = ""
containerName = ""
mountpoint = "/mnt/Gen2"
folderPath = ""

# client (app) information
clientID = ""
tenantID = ""
clientSecret = ""

# host name of the storage account; also the suffix of every per-account setting key
accountHost = storageAccount + ".dfs.core.windows.net"

# abfss URI of the folder to read
storageEndpoint = "abfss://" + containerName + "@" + accountHost + "/" + folderPath

# OAuth2 token endpoint for the tenant
oauth2Endpoint = "https://login.microsoftonline.com/" + tenantID + "/oauth2/token"

# per-account OAuth settings as (key prefix, value) pairs, applied in this order
oauthSettings = [
    # authentication type
    ("fs.azure.account.auth.type.", "OAuth"),
    # token provider class
    ("fs.azure.account.oauth.provider.type.",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    # client ID
    ("fs.azure.account.oauth2.client.id.", clientID),
    # client secret
    ("fs.azure.account.oauth2.client.secret.", clientSecret),
    # OAuth2 endpoint
    ("fs.azure.account.oauth2.client.endpoint.", oauth2Endpoint),
]

# set each value on the Spark session under "<key prefix><account host>"
for settingPrefix, settingValue in oauthSettings:
    spark.conf.set(settingPrefix + accountHost, settingValue)


In [None]:
# define schema
# NOTE(review): the wildcard import is kept so any other cell relying on it keeps
# working; importing only the names used here would be cleaner.
from pyspark.sql.types import *

# explicit column names and types for the customer files, added field by field
# in file order; no nullability is specified, so each field uses the default
customer_schema = (
    StructType()
    .add("C_CUSTKEY", IntegerType())
    .add("C_NAME", StringType())
    .add("C_ADDRESS", StringType())
    .add("C_NATIONKEY", ShortType())
    .add("C_PHONE", StringType())
    .add("C_ACCTBAL", DoubleType())
    .add("C_MKTSEGMENT", StringType())
    .add("C_COMMENT", StringType())
)

In [None]:
# load every file matching part-0*.csv under the storage endpoint, applying the
# explicit customer schema and treating the first line of each file as a header
df_direct = spark.read.csv(
    storageEndpoint + "/part-0*.csv",
    schema=customer_schema,
    header=True,
)

In [None]:
# preview the first ten rows of the customer DataFrame
display(df_direct.limit(10))

C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
35165,Customer#000035165,eNQSvDTld1 f7JmY,0,10-173-541-5438,4767.46,AUTOMOBILE,special excuses. furiously pending packages
30597,Customer#000030597,S9s1dDut8Q,0,10-607-243-5581,-639.62,FURNITURE,lithely ruthless packages alongside of the blithely final p
42279,Customer#000042279,ABcVdNnA3JFB7bK5,0,10-934-981-2863,2236.39,MACHINERY,the even deposits sleep blithely packages. quickly express packages should have to detec
42578,Customer#000042578,l6VNaE7iSZFtkSC5fSuLeaoWTJgx5,0,10-281-998-8028,6429.8,BUILDING,y alongside of the platelets. regular deposits sleep fluffily blithely silent pinto beans: re
37854,Customer#000037854,dL6LCTLpY9hjLTrZ7g,0,10-909-820-4270,9549.78,BUILDING,inder blithely deposits. instructions nag quickly regular packages. regular requests
40053,Customer#000040053,qh8Q6gaffF73cm73K2R,0,10-593-423-2533,209.4,MACHINERY,c pinto beans. special instructions cajole fluffy
44060,Customer#000044060,"XXYMZ4Jd4PY3WJZA5bok4u 7oknfVG,rNobaef",0,10-519-920-9801,1150.68,BUILDING,according to the furiously bold instructions. regular ideas after th
49988,Customer#000049988,"oYYSmhl,K3t AwKzm5FmeEwz5lw hLf7z9m",0,10-704-487-3360,3708.73,BUILDING,"t quickly. pending, special accounts cajole furiously quick pinto beans. fluffily"
43569,Customer#000043569,"ANmTNESWDI17e2pG7j7min2Jm,vHJ",0,10-911-273-3629,3239.41,MACHINERY,"of the slyly even deposits. unusual, even theodolites about the slyly silent accounts bo"
27443,Customer#000027443,xOgk0us699smqWP3US4ufY MhkfbwNdJvCv,0,10-492-101-8357,4304.73,MACHINERY,ly bold accounts at the carefully final ideas nag slyly final accounts. express requests are fluffily


In [None]:
# number of rows in df_direct
df_direct.count()

Out[18]: 75000

In [None]:
# print the column names, types and nullability of df_direct
df_direct.printSchema()

root
 |-- C_CUSTKEY: integer (nullable = true)
 |-- C_NAME: string (nullable = true)
 |-- C_ADDRESS: string (nullable = true)
 |-- C_NATIONKEY: short (nullable = true)
 |-- C_PHONE: string (nullable = true)
 |-- C_ACCTBAL: double (nullable = true)
 |-- C_MKTSEGMENT: string (nullable = true)
 |-- C_COMMENT: string (nullable = true)



In [None]:
# total account balance per market segment
df_agg = df_direct.groupBy("C_MKTSEGMENT").agg({"C_ACCTBAL": "sum"})
# print the aggregated table
df_agg.show()

+------------+-------------------+
|C_MKTSEGMENT|     sum(C_ACCTBAL)|
+------------+-------------------+
|   MACHINERY|6.734692830999999E7|
|  AUTOMOBILE|6.689497999000001E7|
|    BUILDING|6.816101067999995E7|
|   HOUSEHOLD|6.797432754000002E7|
|   FURNITURE|6.629614178000001E7|
+------------+-------------------+



In [None]:
# number of rows in the aggregated result (one per C_MKTSEGMENT group)
df_agg.count()

Out[39]: 5

In [None]:
# destination folder inside the container (placeholder: must be filled in)
dest_folder = ""

# The write below uses mode "overwrite", which replaces existing data at the
# target path. With an empty dest_folder the target would be the container
# root, so refuse to run until a folder has been chosen.
if not dest_folder.strip().strip("/"):
    raise ValueError(
        "dest_folder must name a folder inside the container before writing with mode 'overwrite'"
    )

# abfss URI of the destination folder
destinationDir = "abfss://"+containerName+"@"+storageAccount+".dfs.core.windows.net/" + dest_folder
# write the aggregated result as CSV with a header row
df_agg.write.mode("overwrite").option("header", True).csv(destinationDir)