In [0]:
# Bare expression: shows the `spark` object as this cell's output.
# `spark` is not created anywhere in this notebook — presumably it is the
# SparkSession pre-defined by the notebook runtime (TODO confirm).
spark

----------------- Read The File

In [0]:
# Load the sample comma-separated text file into a DataFrame.
# - 'delimiter' (alias of 'sep') sets the field separator. The key was
#   previously misspelled, so Spark ignored it and silently fell back to the
#   default ','.
# - 'header' = 'true' takes the column names from the first line.
# - 'mode' = 'PERMISSIVE' keeps malformed rows instead of failing the read.
write_df = (
    spark.read.format('csv')
    .option('delimiter', ',')
    .option('header', 'true')
    .option('mode', 'PERMISSIVE')
    .load('/FileStore/tables/datafile_for_write.txt')
)
write_df.show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  2|  Nikita| 23|100000|    USA|     f|
|  3|  Pritam| 22|150000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  8| Praveen| 28| 70000|  JAPAN|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 10|  Sherin| 16| 25000| RUSSIA|     f|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



------------------ Write File Data

In [0]:
# Write the DataFrame as CSV (with a header row), replacing whatever is
# already stored at the target directory.
(
    write_df.write
    .format('csv')
    .options(header='true')
    .mode('overwrite')
    .save('/FileStore/tables/csv_write')
)

In [0]:
%fs
ls /FileStore/tables/csv_write

path,name,size,modificationTime
dbfs:/FileStore/tables/csv_write/_committed_2080155463350979958,_committed_2080155463350979958,112,1726139659000
dbfs:/FileStore/tables/csv_write/_committed_3473671417335480626,_committed_3473671417335480626,210,1726139880000
dbfs:/FileStore/tables/csv_write/_committed_5602635085231312888,_committed_5602635085231312888,375,1726143196000
dbfs:/FileStore/tables/csv_write/_committed_6057950584906712320,_committed_6057950584906712320,548,1726141680000
dbfs:/FileStore/tables/csv_write/_committed_833114350144638594,_committed_833114350144638594,372,1726140045000
dbfs:/FileStore/tables/csv_write/_committed_vacuum3790993138825360321,_committed_vacuum3790993138825360321,96,1726141681000
dbfs:/FileStore/tables/csv_write/_started_5602635085231312888,_started_5602635085231312888,0,1726143196000
dbfs:/FileStore/tables/csv_write/_started_6057950584906712320,_started_6057950584906712320,0,1726141680000
dbfs:/FileStore/tables/csv_write/part-00000-tid-5602635085231312888-d8c406fe-35f6-4b9a-b802-361ce349a8e1-52-1-c000.csv,part-00000-tid-5602635085231312888-d8c406fe-35f6-4b9a-b802-361ce349a8e1-52-1-c000.csv,430,1726143196000
dbfs:/FileStore/tables/csv_write/transform_write_df.csv/,transform_write_df.csv/,0,0


--------------- Write File With Repartitioning

In [0]:
# Redistribute the rows across 3 partitions, then overwrite the same CSV
# output directory as the previous write.
# NOTE: `write_df` is rebound here, so every later cell that uses `write_df`
# operates on the repartitioned DataFrame.
write_df = write_df.repartition(3)
(
    write_df.write
    .format('csv')
    .options(header='true')
    .mode('overwrite')
    .save('/FileStore/tables/csv_write')
)

In [0]:
%fs
ls "/FileStore/tables/csv_write"

path,name,size,modificationTime
dbfs:/FileStore/tables/csv_write/_committed_2080155463350979958,_committed_2080155463350979958,112,1726139659000
dbfs:/FileStore/tables/csv_write/_committed_3473671417335480626,_committed_3473671417335480626,210,1726139880000
dbfs:/FileStore/tables/csv_write/_committed_5602635085231312888,_committed_5602635085231312888,375,1726143196000
dbfs:/FileStore/tables/csv_write/_committed_6057950584906712320,_committed_6057950584906712320,548,1726141680000
dbfs:/FileStore/tables/csv_write/_committed_6822810924080459208,_committed_6822810924080459208,375,1726143245000
dbfs:/FileStore/tables/csv_write/_committed_833114350144638594,_committed_833114350144638594,372,1726140045000
dbfs:/FileStore/tables/csv_write/_committed_vacuum3790993138825360321,_committed_vacuum3790993138825360321,96,1726141681000
dbfs:/FileStore/tables/csv_write/_started_5602635085231312888,_started_5602635085231312888,0,1726143196000
dbfs:/FileStore/tables/csv_write/_started_6057950584906712320,_started_6057950584906712320,0,1726141680000
dbfs:/FileStore/tables/csv_write/_started_6822810924080459208,_started_6822810924080459208,0,1726143245000


------------------- Data Partitioning

In [0]:
# Partition the output on a single column: one `address=<value>/`
# sub-directory is created per distinct address.
(
    write_df.write
    .format('csv')
    .options(header='true')
    .mode('overwrite')
    .partitionBy('address')
    .save('/FileStore/tables/partition_by_address')
)

In [0]:
%fs
ls /FileStore/tables/partition_by_address

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_address/_SUCCESS,_SUCCESS,0,1726144723000
dbfs:/FileStore/tables/partition_by_address/address=INDIA/,address=INDIA/,0,0
dbfs:/FileStore/tables/partition_by_address/address=JAPAN/,address=JAPAN/,0,0
dbfs:/FileStore/tables/partition_by_address/address=RUSSIA/,address=RUSSIA/,0,0
dbfs:/FileStore/tables/partition_by_address/address=USA/,address=USA/,0,0


In [0]:
# Read the address-partitioned output back from its root directory.
# The `address` column comes back as the last column in the output below;
# presumably it is rebuilt from the `address=<value>` directory names rather
# than from the CSV files themselves.
# NOTE(review): despite its name, `india_df` holds the rows of every address
# partition (see the output), not only INDIA.
india_df = (
    spark.read.format('csv')
    .options(inferschema='true', header='true', mode='PERMISSIVE')
    .load('/FileStore/tables/partition_by_address')
)
india_df.show()

+---+--------+---+------+------+-------+
| id|    name|age|salary|gender|address|
+---+--------+---+------+------+-------+
|  1|  Manish| 26| 75000|     m|  INDIA|
|  3|  Pritam| 22|150000|     m|  INDIA|
|  6|   Rahul| 55|300000|     m|  INDIA|
| 11|    Ragu| 12| 35000|     f|  INDIA|
| 12|   Sweta| 43|200000|     f|  INDIA|
| 15| Prakash| 52|750000|     m|  INDIA|
|  2|  Nikita| 23|100000|     f|    USA|
|  5|  Vikash| 31|300000|     m|    USA|
|  7|    Raju| 67|540000|     m|    USA|
| 13| Raushan| 48|650000|     m|    USA|
|  4|Prantosh| 17|200000|     m|  JAPAN|
|  8| Praveen| 28| 70000|     m|  JAPAN|
|  9|     Dev| 32|150000|     m|  JAPAN|
| 10|  Sherin| 16| 25000|     f| RUSSIA|
| 14|  Mukesh| 36| 95000|     m| RUSSIA|
+---+--------+---+------+------+-------+



In [0]:
# Partition on the `id` column only. Every id in this data set is unique, so
# each row ends up in its own `id=<value>/` directory — far too many tiny
# partitions, which is not good. For a column like this, bucketing should be
# used instead of partitioning.
(
    write_df.write
    .format('csv')
    .options(header='true')
    .mode('overwrite')
    .partitionBy('id')
    .save('/FileStore/tables/partition_by_id')
)

In [0]:
# List the output written with partitionBy('id'): the result shows one
# `id=<value>/` directory per id, plus the `_SUCCESS` marker file.
dbutils.fs.ls('/FileStore/tables/partition_by_id')

Out[38]: [FileInfo(path='dbfs:/FileStore/tables/partition_by_id/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1726147749000),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=1/', name='id=1/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=10/', name='id=10/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=11/', name='id=11/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=12/', name='id=12/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=13/', name='id=13/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=14/', name='id=14/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=15/', name='id=15/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_id/id=2/', name='id=2/', size=0, modificationTime=0),
 FileIn

In [0]:
# Partition on several columns at once. The directories are nested in the
# order the columns are given: `address=<value>/gender=<value>/`.
(
    write_df.write
    .format('csv')
    .options(header='true')
    .mode('overwrite')
    .partitionBy('address', 'gender')
    .save('/FileStore/tables/partition_by_address_gender')
)

In [0]:
# List the top level of the two-column partitioned output: only the
# `address=<value>/` directories (first partition column) appear here.
dbutils.fs.ls('/FileStore/tables/partition_by_address_gender')

Out[42]: [FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1726147903000),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/address=INDIA/', name='address=INDIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/address=JAPAN/', name='address=JAPAN/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/address=RUSSIA/', name='address=RUSSIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/address=USA/', name='address=USA/', size=0, modificationTime=0)]

In [0]:
# Drill into one address partition: the second partition column shows up as
# nested `gender=<value>/` directories.
dbutils.fs.ls('/FileStore/tables/partition_by_address_gender/address=INDIA')

Out[43]: [FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/address=INDIA/gender=f/', name='gender=f/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/partition_by_address_gender/address=INDIA/gender=m/', name='gender=m/', size=0, modificationTime=0)]

---------- Data Bucketing

In [0]:
# Bucket the data into 3 buckets on the `id` column.
# bucketBy() is used together with saveAsTable() here; the 'path' option
# stores the table's files under /FileStore/tables/bucket_by_id.
(
    write_df.write
    .format('csv')
    .options(header='true', path='/FileStore/tables/bucket_by_id')
    .mode('overwrite')
    .bucketBy(3, 'id')
    .saveAsTable('bucket_by_id_table')
)

In [0]:
%fs
ls /FileStore/tables/bucket_by_id/

path,name,size,modificationTime
dbfs:/FileStore/tables/bucket_by_id/_SUCCESS,_SUCCESS,0,1726148503000
dbfs:/FileStore/tables/bucket_by_id/_committed_3073423319325188797,_committed_3073423319325188797,309,1726148503000
dbfs:/FileStore/tables/bucket_by_id/_started_3073423319325188797,_started_3073423319325188797,0,1726148502000
dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-1_00000.c000.csv,part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-1_00000.c000.csv,239,1726148502000
dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-2_00001.c000.csv,part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-2_00001.c000.csv,172,1726148503000
dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-3_00002.c000.csv,part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-3_00002.c000.csv,87,1726148503000


In [0]:
# Same directory as the `%fs ls` cell above, listed through the dbutils API;
# the result is a Python list of FileInfo objects (see output).
dbutils.fs.ls('/FileStore/tables/bucket_by_id')

Out[51]: [FileInfo(path='dbfs:/FileStore/tables/bucket_by_id/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1726148503000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_by_id/_committed_3073423319325188797', name='_committed_3073423319325188797', size=309, modificationTime=1726148503000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_by_id/_started_3073423319325188797', name='_started_3073423319325188797', size=0, modificationTime=1726148502000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-1_00000.c000.csv', name='part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-1_00000.c000.csv', size=239, modificationTime=1726148502000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-2_00001.c000.csv', name='part-00000-tid-3073423319325188797-2626bfcf-c82b-4b74-8307-0fc7f23e0e0b-140-2_00001.c000.csv', siz

In [0]:
# Read back the data of a single bucket (bucket 0).
# The bucket number is the zero-padded `_0000N` suffix right before
# `.c000.csv` in each part-file name. The rest of the name contains a
# transaction id that changes on every overwrite, so the file is looked up
# here instead of being hard-coded (a hard-coded name breaks on re-run).
bucket_0_files = [
    info.path
    for info in dbutils.fs.ls('/FileStore/tables/bucket_by_id')
    if info.name.startswith('part-')
    and '_00000.' in info.name
    and info.name.endswith('.csv')
]
if not bucket_0_files:
    raise FileNotFoundError(
        'No bucket-0 CSV part file found under /FileStore/tables/bucket_by_id'
    )

bucket_df = (
    spark.read.format('csv')
    .option('inferschema', 'true')
    .option('header', 'true')
    .option('mode', 'PERMISSIVE')
    .load(bucket_0_files)  # load() accepts a list of paths
)
bucket_df.show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
+---+--------+---+------+-------+------+



------------- Learn Bucket Pruning