#### **Filtering Specific File Types**

- If you're interested in files of a **specific type**, for example, **.csv** files, you can filter the results:

In [0]:
from pyspark.sql.functions import col

In [0]:
%fs ls dbfs:/FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/Emp_Hash-1.csv,Emp_Hash-1.csv,3312,1733110041000
dbfs:/FileStore/tables/Emp_Hash-2.csv,Emp_Hash-2.csv,6365,1733125960000
dbfs:/FileStore/tables/Emp_Hash-3.csv,Emp_Hash-3.csv,6385,1733126482000
dbfs:/FileStore/tables/Emp_Hash.csv,Emp_Hash.csv,3310,1733108841000
dbfs:/FileStore/tables/Flatten Nested Array.json,Flatten Nested Array.json,3756,1718618620000
dbfs:/FileStore/tables/Generate_Random_Data/,Generate_Random_Data/,0,0
dbfs:/FileStore/tables/InterviewQuestions/,InterviewQuestions/,0,0
dbfs:/FileStore/tables/MarketPrice.csv,MarketPrice.csv,19528,1719656208000
dbfs:/FileStore/tables/MultiLineJSON.json/,MultiLineJSON.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON01.json/,MultiLineJSON01.json/,0,0


In [0]:
dbutils.fs.ls("dbfs:/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/Emp_Hash-1.csv', name='Emp_Hash-1.csv', size=3312, modificationTime=1733110041000),
 FileInfo(path='dbfs:/FileStore/tables/Emp_Hash-2.csv', name='Emp_Hash-2.csv', size=6365, modificationTime=1733125960000),
 FileInfo(path='dbfs:/FileStore/tables/Emp_Hash-3.csv', name='Emp_Hash-3.csv', size=6385, modificationTime=1733126482000),
 FileInfo(path='dbfs:/FileStore/tables/Emp_Hash.csv', name='Emp_Hash.csv', size=3310, modificationTime=1733108841000),
 FileInfo(path='dbfs:/FileStore/tables/Flatten Nested Array.json', name='Flatten Nested Array.json', size=3756, modificationTime=1718618620000),
 FileInfo(path='dbfs:/FileStore/tables/Generate_Random_Data/', name='Generate_Random_Data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/InterviewQuestions/', name='InterviewQuestions/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/MarketPrice.csv', name='MarketPrice.csv', size=19528, modificationTime=1719656208000),
 

why anyone wants to use the **dbutils package**, instead of the **Magic Command %fs** in a databricks notebook?
- dbutils package provides greater flexibility as it can be combined with other native programming languages like **python, R, SCALA**.

In [0]:
# output in tabular format
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/Emp_Hash-1.csv,Emp_Hash-1.csv,3312,1733110041000
dbfs:/FileStore/tables/Emp_Hash-2.csv,Emp_Hash-2.csv,6365,1733125960000
dbfs:/FileStore/tables/Emp_Hash-3.csv,Emp_Hash-3.csv,6385,1733126482000
dbfs:/FileStore/tables/Emp_Hash.csv,Emp_Hash.csv,3310,1733108841000
dbfs:/FileStore/tables/Flatten Nested Array.json,Flatten Nested Array.json,3756,1718618620000
dbfs:/FileStore/tables/Generate_Random_Data/,Generate_Random_Data/,0,0
dbfs:/FileStore/tables/InterviewQuestions/,InterviewQuestions/,0,0
dbfs:/FileStore/tables/MarketPrice.csv,MarketPrice.csv,19528,1719656208000
dbfs:/FileStore/tables/MultiLineJSON.json/,MultiLineJSON.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON01.json/,MultiLineJSON01.json/,0,0


**Method 1: Using Databricks Utilities (dbutils)**
- **dbutils.fs.ls()** to list files and filter **.csv** files.

In [0]:
# List all files in the directory
files = dbutils.fs.ls("dbfs:/FileStore/tables/")

In [0]:
# Filter only CSV files
csv_files = [file for file in files if file.name.endswith('.csv')]

for csv_file in csv_files:
    print(csv_file)

FileInfo(path='dbfs:/FileStore/tables/Emp_Hash-1.csv', name='Emp_Hash-1.csv', size=3312, modificationTime=1733110041000)
FileInfo(path='dbfs:/FileStore/tables/Emp_Hash-2.csv', name='Emp_Hash-2.csv', size=6365, modificationTime=1733125960000)
FileInfo(path='dbfs:/FileStore/tables/Emp_Hash-3.csv', name='Emp_Hash-3.csv', size=6385, modificationTime=1733126482000)
FileInfo(path='dbfs:/FileStore/tables/Emp_Hash.csv', name='Emp_Hash.csv', size=3310, modificationTime=1733108841000)
FileInfo(path='dbfs:/FileStore/tables/MarketPrice.csv', name='MarketPrice.csv', size=19528, modificationTime=1719656208000)
FileInfo(path='dbfs:/FileStore/tables/Question7.csv', name='Question7.csv', size=154, modificationTime=1725816645000)
FileInfo(path='dbfs:/FileStore/tables/RunningData_Rev03.csv', name='RunningData_Rev03.csv', size=1216, modificationTime=1719810946000)
FileInfo(path='dbfs:/FileStore/tables/SalesData_Rev02.csv', name='SalesData_Rev02.csv', size=472, modificationTime=1719810784000)
FileInfo(path

In [0]:
# Filter only CSV files
csv_files_path = [file.path for file in files if file.name.endswith(".csv")]
print(csv_files_path)

['dbfs:/FileStore/tables/Emp_Hash-1.csv', 'dbfs:/FileStore/tables/Emp_Hash-2.csv', 'dbfs:/FileStore/tables/Emp_Hash-3.csv', 'dbfs:/FileStore/tables/Emp_Hash.csv', 'dbfs:/FileStore/tables/MarketPrice.csv', 'dbfs:/FileStore/tables/Question7.csv', 'dbfs:/FileStore/tables/RunningData_Rev03.csv', 'dbfs:/FileStore/tables/SalesData_Rev02.csv', 'dbfs:/FileStore/tables/SalesData_Rev03.csv', 'dbfs:/FileStore/tables/Sales_Collect.csv', 'dbfs:/FileStore/tables/Sales_Collect_Rev03.csv', 'dbfs:/FileStore/tables/StringToMaptype-1.csv', 'dbfs:/FileStore/tables/StringToMaptype.csv', 'dbfs:/FileStore/tables/StructType-4.csv', 'dbfs:/FileStore/tables/StructType-5.csv', 'dbfs:/FileStore/tables/StructType.csv', 'dbfs:/FileStore/tables/booleantype-1.csv', 'dbfs:/FileStore/tables/booleantype-2.csv', 'dbfs:/FileStore/tables/booleantype-3.csv', 'dbfs:/FileStore/tables/booleantype-4.csv', 'dbfs:/FileStore/tables/booleantype-5.csv', 'dbfs:/FileStore/tables/booleantype.csv', 'dbfs:/FileStore/tables/cross_join-1.c

In [0]:
# Display CSV file paths
for csv_file in csv_files_path:
    print(csv_file)

dbfs:/FileStore/tables/Emp_Hash-1.csv
dbfs:/FileStore/tables/Emp_Hash-2.csv
dbfs:/FileStore/tables/Emp_Hash-3.csv
dbfs:/FileStore/tables/Emp_Hash.csv
dbfs:/FileStore/tables/MarketPrice.csv
dbfs:/FileStore/tables/Question7.csv
dbfs:/FileStore/tables/RunningData_Rev03.csv
dbfs:/FileStore/tables/SalesData_Rev02.csv
dbfs:/FileStore/tables/SalesData_Rev03.csv
dbfs:/FileStore/tables/Sales_Collect.csv
dbfs:/FileStore/tables/Sales_Collect_Rev03.csv
dbfs:/FileStore/tables/StringToMaptype-1.csv
dbfs:/FileStore/tables/StringToMaptype.csv
dbfs:/FileStore/tables/StructType-4.csv
dbfs:/FileStore/tables/StructType-5.csv
dbfs:/FileStore/tables/StructType.csv
dbfs:/FileStore/tables/booleantype-1.csv
dbfs:/FileStore/tables/booleantype-2.csv
dbfs:/FileStore/tables/booleantype-3.csv
dbfs:/FileStore/tables/booleantype-4.csv
dbfs:/FileStore/tables/booleantype-5.csv
dbfs:/FileStore/tables/booleantype.csv
dbfs:/FileStore/tables/cross_join-1.csv
dbfs:/FileStore/tables/cross_join.csv
dbfs:/FileStore/tables/cros

##### **Accessing Specific File Attributes**

- Each object returned by dbutils.fs.ls has attributes like **path, name, size, and modificationTime**. You can access these individually:

In [0]:
for file in files:
    print(f"Path: {file.path}, Name: {file.name}, Size: {file.size}, Modified: {file.modificationTime}")

Path: dbfs:/FileStore/tables/Emp_Hash-1.csv, Name: Emp_Hash-1.csv, Size: 3312, Modified: 1733110041000
Path: dbfs:/FileStore/tables/Emp_Hash-2.csv, Name: Emp_Hash-2.csv, Size: 6365, Modified: 1733125960000
Path: dbfs:/FileStore/tables/Emp_Hash-3.csv, Name: Emp_Hash-3.csv, Size: 6385, Modified: 1733126482000
Path: dbfs:/FileStore/tables/Emp_Hash.csv, Name: Emp_Hash.csv, Size: 3310, Modified: 1733108841000
Path: dbfs:/FileStore/tables/Flatten Nested Array.json, Name: Flatten Nested Array.json, Size: 3756, Modified: 1718618620000
Path: dbfs:/FileStore/tables/Generate_Random_Data/, Name: Generate_Random_Data/, Size: 0, Modified: 0
Path: dbfs:/FileStore/tables/InterviewQuestions/, Name: InterviewQuestions/, Size: 0, Modified: 0
Path: dbfs:/FileStore/tables/MarketPrice.csv, Name: MarketPrice.csv, Size: 19528, Modified: 1719656208000
Path: dbfs:/FileStore/tables/MultiLineJSON.json/, Name: MultiLineJSON.json/, Size: 0, Modified: 0
Path: dbfs:/FileStore/tables/MultiLineJSON01.json/, Name: Multi

**Method 2: Using PySpark (spark.read.format())**

In [0]:
# Read all files from directory
df = spark.read.format("csv").option("header", True).load("dbfs:/FileStore/tables/*")
display(df)

Series_reference0,Period1,Data_value2,STATUS3,UNITS4,Service,Department,description,industry,level,size,line_code,value,Business,Footnotes,Series_reference15,Period16,Data_value17,STATUS18,UNITS19,MAGNTUDE,Subject,Group,Series_title_1,Series_title_2,Series_title_3,Series_title_4,Series_title_5
CPIQ.SE9A,1914.06,12.86967374,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,total,0,6�19 employees,,35562,Number of businesses,1 and 2 and 3 and 4 and 6,RNAA.SG01NAC34B0101AA1,2000.03,327.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1914.09,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,total,0,20�49 employees,,8796,Number of businesses,1 and 2 and 3 and 4 and 6,RNAA.SG01NAC34B0101AA1,2001.03,459.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1914.12,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,total,0,50�99 employees,,2529,Number of businesses,1 and 2 and 3 and 4 and 6,RNAA.SG01NAC34B0101AA1,2002.03,510.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1915.03,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,total,0,100+ employees,,2100,Number of businesses,1 and 2 and 3 and 4 and 6,RNAA.SG01NAC34B0101AA1,2003.03,334.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1915.06,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,"Agriculture, forestry, & fishing",1,total,,3756,Number of businesses,1 and 2 and 3 and 6,RNAA.SG01NAC34B0101AA1,2004.03,372.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1915.09,13.92498695,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,Agriculture,2,total,,2460,Number of businesses,1 and 2 and 3 and 6,RNAA.SG01NAC34B0101AA1,2005.03,356.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1915.12,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,Commercial fishing,2,total,,57,Number of businesses,1 and 2 and 3 and 6,RNAA.SG01NAC34B0101AA1,2006.03,276.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1916.03,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,Forestry & logging,2,total,,258,Number of businesses,1 and 2 and 3 and 6,RNAA.SG01NAC34B0101AA1,2007.03,336.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1916.06,0.0,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,"Agriculture, forestry, & fishing support services",2,total,,981,Number of businesses,1 and 2 and 3 and 6,RNAA.SG01NAC34B0101AA1,2008.03,512.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland
CPIQ.SE9A,1916.09,15.03177896,FINAL,Index,CPI,CPI All Groups for New Zealand,Total number of businesses,Mining,1,total,,120,Number of businesses,1 and 2 and 3 and 5 and 6,RNAA.SG01NAC34B0101AA1,2009.03,368.0,FINAL,Dollars,6,Regional Gross Domestic Product - RNA,"Gross domestic product, by region and industry",Gross Domestic Product - production measure,Northland,Agriculture,"Forestry, Fishing, and Mining",Newzeland


In [0]:
from pyspark.sql.functions import input_file_name

# Add a column with file names
df = df.withColumn("file_path", input_file_name())

# Show unique file paths (CSV files)
df.select("file_path").distinct().display()

file_path
dbfs:/FileStore/tables/timestamp_millis-3.csv
dbfs:/FileStore/tables/titanic.csv
dbfs:/FileStore/tables/except.csv
dbfs:/FileStore/tables/MarketPrice.csv
dbfs:/FileStore/tables/Sales_Collect_Rev03.csv
dbfs:/FileStore/tables/iterate_columns.csv
dbfs:/FileStore/tables/except-2.csv
dbfs:/FileStore/tables/except-1.csv
dbfs:/FileStore/tables/syntax.jpg
dbfs:/FileStore/tables/to_json.csv


#### **databricks-datasets**

In [0]:
%fs ls

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,0
dbfs:/content/,content/,0,0
dbfs:/data/,data/,0,0
dbfs:/databricks/,databricks/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/local_disk0/,local_disk0/,0,0
dbfs:/user/,user/,0,0


In [0]:
%fs ls dbfs:/databricks-datasets/

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0


**a) airlines**

In [0]:
dbutils.fs.ls('dbfs:/databricks-datasets/airlines/')

[FileInfo(path='dbfs:/databricks-datasets/airlines/README.md', name='README.md', size=1089, modificationTime=1454697889000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1436493184000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/part-00000', name='part-00000', size=67108879, modificationTime=1436493184000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/part-00001', name='part-00001', size=67108862, modificationTime=1436493185000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/part-00002', name='part-00002', size=67108930, modificationTime=1436493185000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/part-00003', name='part-00003', size=67108804, modificationTime=1436493186000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/part-00004', name='part-00004', size=67108908, modificationTime=1436493186000),
 FileInfo(path='dbfs:/databricks-datasets/airlines/part-00005', name='part-00005', size=67108890, mo

In [0]:
for files in dbutils.fs.ls('dbfs:/databricks-datasets/airlines/'):
    print (files)

FileInfo(path='dbfs:/databricks-datasets/airlines/README.md', name='README.md', size=1089, modificationTime=1454697889000)
FileInfo(path='dbfs:/databricks-datasets/airlines/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1436493184000)
FileInfo(path='dbfs:/databricks-datasets/airlines/part-00000', name='part-00000', size=67108879, modificationTime=1436493184000)
FileInfo(path='dbfs:/databricks-datasets/airlines/part-00001', name='part-00001', size=67108862, modificationTime=1436493185000)
FileInfo(path='dbfs:/databricks-datasets/airlines/part-00002', name='part-00002', size=67108930, modificationTime=1436493185000)
FileInfo(path='dbfs:/databricks-datasets/airlines/part-00003', name='part-00003', size=67108804, modificationTime=1436493186000)
FileInfo(path='dbfs:/databricks-datasets/airlines/part-00004', name='part-00004', size=67108908, modificationTime=1436493186000)
FileInfo(path='dbfs:/databricks-datasets/airlines/part-00005', name='part-00005', size=67108890, modificationTime=

In [0]:
for files in dbutils.fs.ls('dbfs:/databricks-datasets/airlines/'):
    print (files.name)

README.md
_SUCCESS
part-00000
part-00001
part-00002
part-00003
part-00004
part-00005
part-00006
part-00007
part-00008
part-00009
part-00010
part-00011
part-00012
part-00013
part-00014
part-00015
part-00016
part-00017
part-00018
part-00019
part-00020
part-00021
part-00022
part-00023
part-00024
part-00025
part-00026
part-00027
part-00028
part-00029
part-00030
part-00031
part-00032
part-00033
part-00034
part-00035
part-00036
part-00037
part-00038
part-00039
part-00040
part-00041
part-00042
part-00043
part-00044
part-00045
part-00046
part-00047
part-00048
part-00049
part-00050
part-00051
part-00052
part-00053
part-00054
part-00055
part-00056
part-00057
part-00058
part-00059
part-00060
part-00061
part-00062
part-00063
part-00064
part-00065
part-00066
part-00067
part-00068
part-00069
part-00070
part-00071
part-00072
part-00073
part-00074
part-00075
part-00076
part-00077
part-00078
part-00079
part-00080
part-00081
part-00082
part-00083
part-00084
part-00085
part-00086
part-00087
part-00088
pa

**b) cctvVideos**

In [0]:
dbutils.fs.ls('dbfs:/databricks-datasets/cctvVideos/')

[FileInfo(path='dbfs:/databricks-datasets/cctvVideos/labels/', name='labels/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/databricks-datasets/cctvVideos/mp4/', name='mp4/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/databricks-datasets/cctvVideos/readme.md', name='readme.md', size=2411, modificationTime=1544123649000),
 FileInfo(path='dbfs:/databricks-datasets/cctvVideos/test/', name='test/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/databricks-datasets/cctvVideos/train/', name='train/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/databricks-datasets/cctvVideos/train_images/', name='train_images/', size=0, modificationTime=0)]

In [0]:
for files in dbutils.fs.ls('dbfs:/databricks-datasets/cctvVideos/'):
    if files.name.endswith('/'):
        print (files.name)

labels/
mp4/
test/
train/
train_images/
