### Read from ADLS
In Databricks it is easiest to mount the ADLS location. You can read from the local filesystem or from specially mounted folders, depending on the environment you are in. Many different protocols are supported, and some of them may require their own Spark extensions to be installed.

Common protocols:
- dbfs:/     Databricks filesystem
- file://    Local filesystem
- hdfs://    Hadoop filesystem
- abfss://   Azure Data Lake Storage (Gen2)
- wasbs://   Azure Blob Storage
- s3://      AWS S3 (several variations of this)
- gs://      Google Cloud Storage

In [0]:
# CSV
# In Databricks it is easiest to mount the ADLS location.
# NOTE(review): taxi_zone_path is assigned here but is not read by this cell;
# the reader below loads the sample file under /databricks-datasets instead.
# Confirm which source is intended.
taxi_zone_path = "/mnt/adlsdemo/nyctaxi/lookups/taxi_zone"

# Use the first line as column names and let Spark infer the column types.
csv_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/databricks-datasets/nyctaxi/taxizone/taxi_zone_lookup.csv")
)
display(csv_df)

LocationID,Borough,Zone,service_zone
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham Gardens,Boro Zone
4,Manhattan,Alphabet City,Yellow Zone
5,Staten Island,Arden Heights,Boro Zone
6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
7,Queens,Astoria,Boro Zone
8,Queens,Astoria Park,Boro Zone
9,Queens,Auburndale,Boro Zone
10,Queens,Baisley Park,Boro Zone


In [0]:
# Parquet
# Folder on the mounted path that holds the yellow-taxi trip data as Parquet.
yellow_parquet_path = "/mnt/adlsdemo/nyctaxi/tripdata/yellow_parquet"

# No reader options are set; the format is named and the path is loaded as-is.
parquet_df = spark.read.format("parquet").load(yellow_parquet_path)
display(parquet_df)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_dt,dropoff_dt,tip_pct,year_month
2,2019-07-01 00:51:04,2019-07-01 00:51:33,1,0.0,1,N,193,193,1,2.5,0.5,0.5,1.14,0.0,0.3,4.94,2019-07-01,2019-07-01,0.2307692307692307,2019_07
2,2019-07-01 00:46:04,2019-07-01 01:05:46,1,4.16,1,N,234,25,2,16.5,0.5,0.5,0.0,0.0,0.3,20.3,2019-07-01,2019-07-01,0.0,2019_07
1,2019-07-01 00:25:09,2019-07-01 01:00:56,1,18.8,2,N,132,42,1,52.0,0.0,0.5,11.75,6.12,0.3,70.67,2019-07-01,2019-07-01,0.1662657421819725,2019_07
2,2019-07-01 00:33:32,2019-07-01 01:15:27,1,18.46,2,N,132,142,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2019-07-01,2019-07-01,0.1666666666666666,2019_07
1,2019-07-01 00:00:55,2019-07-01 00:13:05,0,1.7,1,N,107,114,1,9.5,3.0,0.5,2.0,0.0,0.3,15.3,2019-07-01,2019-07-01,0.130718954248366,2019_07
1,2019-07-01 00:18:23,2019-07-01 00:30:42,0,2.1,1,N,125,45,1,10.0,3.0,0.5,2.75,0.0,0.3,16.55,2019-07-01,2019-07-01,0.1661631419939577,2019_07
1,2019-07-01 00:28:09,2019-07-01 00:51:00,1,2.4,1,N,142,68,1,13.0,3.0,0.5,3.35,0.0,0.3,20.15,2019-07-01,2019-07-01,0.1662531017369727,2019_07
1,2019-07-01 00:57:07,2019-07-01 01:11:41,1,3.0,1,N,246,141,2,12.5,3.0,0.5,0.0,0.0,0.3,16.3,2019-07-01,2019-07-01,0.0,2019_07
4,2019-07-01 00:06:16,2019-07-01 00:33:14,1,7.89,1,N,50,80,1,26.0,0.5,0.5,5.96,0.0,0.3,35.76,2019-07-01,2019-07-01,0.1666666666666666,2019_07
4,2019-07-01 00:37:19,2019-07-01 00:52:30,1,4.09,1,N,80,97,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,2019-07-01,2019-07-01,0.0,2019_07


In [0]:
# Delta
# Folder on the mounted path that holds the yellow-taxi trip data as a Delta table.
yellow_delta_path = "/mnt/adlsdemo/nyctaxi/tripdata/yellow_delta"

# The format is passed by name to the generic load() call.
delta_df = spark.read.load(yellow_delta_path, format="delta")
display(delta_df)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,year_month,pickup_dt,dropoff_dt,tip_pct,Borough,PickupZone,PickupServiceZone
1,2019-03-22 10:15:43,2019-03-22 10:35:51,1,3.6,1,N,229,74,1,15.5,2.5,0.5,3.75,0.0,0.3,22.55,2019_03,2019-03-22,2019-03-22,0.1662971175166297,Manhattan,Sutton Place/Turtle Bay North,Yellow Zone
1,2019-03-01 00:24:41,2019-03-01 00:25:31,1,0.0,1,N,145,145,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,2019_03,2019-03-01,2019-03-01,0.0,Queens,Long Island City/Hunters Point,Boro Zone
1,2019-03-22 10:38:54,2019-03-22 11:17:52,1,4.9,1,N,230,112,1,26.0,2.5,0.5,6.0,0.0,0.3,35.3,2019_03,2019-03-22,2019-03-22,0.1699716713881019,Manhattan,Times Sq/Theatre District,Yellow Zone
1,2019-03-01 00:25:27,2019-03-01 00:36:37,2,3.7,1,N,95,130,1,13.0,0.5,0.5,0.7,0.0,0.3,15.0,2019_03,2019-03-01,2019-03-01,0.0466666666666666,Queens,Forest Hills,Boro Zone
1,2019-03-22 10:13:24,2019-03-22 10:30:54,1,1.8,1,N,186,50,1,12.0,2.5,0.5,3.05,0.0,0.3,18.35,2019_03,2019-03-22,2019-03-22,0.1662125340599455,Manhattan,Penn Station/Madison Sq West,Yellow Zone
1,2019-03-01 00:05:21,2019-03-01 00:38:23,1,14.1,1,N,249,28,1,41.0,3.0,0.5,10.1,5.76,0.3,60.66,2019_03,2019-03-01,2019-03-01,0.1665018133860863,Manhattan,West Village,Yellow Zone
1,2019-03-22 10:50:52,2019-03-22 11:04:47,1,0.7,1,N,68,246,1,9.5,2.5,0.5,2.55,0.0,0.3,15.35,2019_03,2019-03-22,2019-03-22,0.1661237785016286,Manhattan,East Chelsea,Yellow Zone
1,2019-03-01 00:48:55,2019-03-01 01:06:03,1,9.6,1,N,138,98,2,27.0,0.5,0.5,0.0,0.0,0.3,28.3,2019_03,2019-03-01,2019-03-01,0.0,Queens,LaGuardia Airport,Airports
1,2019-03-22 10:47:54,2019-03-22 10:53:21,1,0.7,1,N,125,231,2,5.5,2.5,0.5,0.0,0.0,0.3,8.8,2019_03,2019-03-22,2019-03-22,0.0,Manhattan,Hudson Sq,Yellow Zone
1,2019-03-01 00:11:42,2019-03-01 00:16:40,1,0.8,1,N,48,48,1,5.5,3.0,0.5,3.0,0.0,0.3,12.3,2019_03,2019-03-01,2019-03-01,0.2439024390243902,Manhattan,Clinton East,Yellow Zone


### Read from JDBC

In [0]:
# Target SQL Server database and the JDBC URL built from it.
database = "StackOverflow2010"
db_host_name = "sandbox-2-sqlserver.database.windows.net"
db_url = f"jdbc:sqlserver://{db_host_name};databaseName={database}"

# Credentials are fetched from the "demo" secret scope instead of being hard-coded.
db_user = dbutils.secrets.get("demo", "sql-user-stackoverflow")  # databricks
db_password = dbutils.secrets.get("demo", "sql-pwd-stackoverflow")  # databricks

table = "Users"

# Read the whole table through the generic "jdbc" source.
df = (
    spark.read
    .format("jdbc")
    .options(url=db_url, dbtable=table, user=db_user, password=db_password)
    .load()
)

display(df)

Id,AboutMe,Age,CreationDate,DisplayName,DownVotes,EmailHash,LastAccessDate,Location,Reputation,UpVotes,Views,WebsiteUrl,AccountId
-1,"Hi, I'm not really a person. I'm a background process that helps keep this site clean! I do things like Randomly poke old unanswered questions every hour so they get some attention Own community questions and answers so nobody gets unnecessary reputation from them Own downvotes on spam/evil posts that get permanently deleted Own suggested edits from anonymous users Remove abandoned questions",,2008-07-31T00:00:00.000+0000,Community,980920,,2008-08-26T00:16:53.810+0000,on the server farm,1,274835,649,http://meta.stackexchange.com/,-1
1,"Stack Overflow Valued Associate #00001 Wondering how our software development process works? Take a look! Find me on twitter, or read my blog. Don't say I didn't warn you because I totally did. However, I no longer work at Stack Exchange, Inc. I'll miss you all. Well, some of you, anyway. :)",,2008-07-31T14:22:31.287+0000,Jeff Atwood,1309,,2018-08-29T02:34:22.893+0000,"El Cerrito, CA",44300,3367,408587,http://www.codinghorror.com/blog/,1
2,Developer on the Stack Overflow team. Find me on Twitter Stack Overflow Valued Associate #00003,,2008-07-31T14:22:31.287+0000,Geoff Dalgas,88,,2018-08-23T17:31:56.427+0000,"Corvallis, OR",3491,650,23966,http://stackoverflow.com,2
3,"Developer on the Stack Overflow team. Was dubbed SALTY SAILOR by Jeff Atwood, as filth and flarn would oft-times fly when dealing with a particularly nasty bug! Twitter me: jarrod_dixon Email me: jarrod.m.dixon@gmail.com",,2008-07-31T14:22:31.287+0000,Jarrod Dixon,100,,2018-08-30T20:56:23.897+0000,"Raleigh, NC, United States",13418,7285,24396,http://jarroddixon.com,3
4,"I am: the co-founder and CEO of Stack Overflow the co-founder of Fog Creek Software the creator of Trello (now owned by Atlassian) You can find me on my rarely-updated blog, Joel on Software.",,2008-07-31T14:22:31.317+0000,Joel Spolsky,96,,2018-08-14T22:18:15.227+0000,"New York, NY",28768,797,73755,http://www.joelonsoftware.com/,4
5,"Technical Evangelist at Microsoft, specializing in ASP.NET MVC. I don't use this site anymore because the moderators close or delete far too many of the useful questions.",,2008-07-31T14:22:31.317+0000,Jon Galloway,34,,2018-08-29T16:48:35.993+0000,"San Diego, CA",39172,781,11700,http://weblogs.asp.net/jgalloway/,5
8,"This is a puppet test account I use to validate ""regular user"" stuff on the site -- Jeff Atwood",,2008-07-31T21:33:24.057+0000,Eggs McLaren,9,,2018-04-09T02:04:55.577+0000,,942,12,6372,,6
9,Independent software engineer,,2008-07-31T21:35:26.517+0000,Kevin Dente,4,,2018-08-30T18:18:03.423+0000,"Oakland, CA",14337,46,4949,http://weblogs.asp.net/kdente,7
10,"I'm not takin' my sneakers off! Actually, I'm a test account, used to help debug problems here on StackOverflow.",,2008-07-31T21:57:06.240+0000,Sneakers O'Toole,0,,2018-06-08T05:11:12.523+0000,"Morganton, North Carolina United States",101,0,3678,https://www.youtube.com/watch?v=OcSKd13mKUY,8
11,,,2008-08-01T00:59:11.147+0000,Anonymous User,0,,2008-08-01T00:59:11.147+0000,,1890,0,2123,,561854


### Read from SQL Server (special driver)

In [0]:
# On Databricks, add the library com.microsoft.azure:spark-mssql-connector_2.12:1.2.0 to the cluster and set up the secrets read below
# Target SQL Server database and the JDBC URL built from it.
database = "StackOverflow2010"
db_host_name = "sandbox-2-sqlserver.database.windows.net"
db_url = f"jdbc:sqlserver://{db_host_name};databaseName={database}"

# Credentials are fetched from the "demo" secret scope instead of being hard-coded.
db_user = dbutils.secrets.get("demo", "sql-user-stackoverflow")  # databricks
db_password = dbutils.secrets.get("demo", "sql-pwd-stackoverflow")  # databricks

table = "Users"

# Aggregate in the database: only locations with more than 50 rows come back.
sql = f"select Location, count(1) as record_count from {table} group by Location having count(1) > 50"

# The "query" option sends the statement above to the server.
# To read the full table instead, pass dbtable=table in place of query=sql.
df = (
    spark.read
    .format("com.microsoft.sqlserver.jdbc.spark")
    .options(url=db_url, query=sql, user=db_user, password=db_password)
    .load()
)

display(df)

Location,record_count
,169193
,15820
"Adelaide, Australia",124
"Ahmadabad, India",159
Alabama,52
"Amsterdam, Netherlands",295
"Amsterdam, The Netherlands",169
"Ankara, Turkey",66
"Ann Arbor, MI",97
"Antwerp, Belgium",55


### Read from Snowflake

In [0]:
# Credentials are fetched from the "demo" secret scope instead of being hard-coded.
user = dbutils.secrets.get("demo", "snowflake-user")
password = dbutils.secrets.get("demo", "snowflake-password")

# Account endpoint and the database objects to connect to.
snowflake_url = "https://ux22775.west-us-2.azure.snowflakecomputing.com/"
snowflake_database = "CITIBIKE"
snowflake_schema = "PUBLIC"
snowflake_cluster = "COMPUTE_WH"  # supplied as sfWarehouse below

# snowflake connection options
options = dict(
    sfUrl=snowflake_url,
    sfUser=user,
    sfPassword=password,
    sfDatabase=snowflake_database,
    sfSchema=snowflake_schema,
    sfWarehouse=snowflake_cluster,
)

# Apply the shared connection options, then name the table to read.
df = (
    spark.read
    .format("snowflake")
    .options(**options)
    .option("dbtable", "spark_test")
    .load()
)

display(df)

### Read with Pandas DataFrame

In [0]:
%pip install xlrd openpyxl

In [0]:
import pandas as pd

# The workbook is loaded with pandas first; the resulting pandas frame is then
# converted into a Spark DataFrame for display.
pandas_df = pd.read_excel(io="/dbfs/data/TaxiExample.xlsx")
spark_df = spark.createDataFrame(data=pandas_df)
display(spark_df)

LocationID,Borough,Zone
1,EWR,Newark Airport
2,Queens,Jamaica Bay
3,Bronx,Allerton/Pelham Gardens
4,Manhattan,Alphabet City


### Read from XML
I am sorry you have to deal with XML, but here is how.

*Requires installing com.databricks:spark-xml_2.12:0.15.0*

In [0]:
%scala
// Base name of the XML file; its lower-cased form is passed as the rootTag option.
val file = "Users"
val table = file.toLowerCase()

// Each <row> element becomes one record; the schema is inferred from a
// 0.1% sample of the data (samplingRatio = 0.001).
val df = spark.read
  .format("xml")
  .option("samplingRatio", 0.001)
  .option("rootTag", table)
  .option("rowTag", "row")
  .option("inferSchema", "true")
  .load(s"dbfs:/tmp/${file}.xml")