### READ AND WRITE OPERATIONS PART 2
####---------- TELECOM OPERATIONS --------------------

In [0]:
%python
# Compare the SparkSession that the notebook already exposes as `spark`
# with one requested explicitly through the builder API.
from pyspark.sql.session import SparkSession
print(spark)  # `spark` is already instantiated by Databricks
# getOrCreate() hands back an existing session when there is one, so this is
# expected to refer to the same session as `spark` - compare the two printouts.
spark1=SparkSession.builder.getOrCreate()
print(spark1)  # session obtained by this notebook through the builder

In [0]:
%sql
-- Create the Unity Catalog objects used as the landing area, from the
-- outermost container inwards: catalog, then schema, then volume.
-- IF NOT EXISTS makes every statement a no-op when the object already
-- exists, so the cell can be re-run.
create catalog if not exists telecom_catalog_assign;
create schema if not exists telecom_catalog_assign.landing_zone;
create volume if not exists telecom_catalog_assign.landing_zone.landing_vol;

In [0]:
# Create one landing directory per source dataset inside the landing volume.
landing_root = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol"
for dataset in ("customer", "usage", "tower"):
    # Each directory path keeps its trailing slash.
    folder = f"{landing_root}/{dataset}/"
    dbutils.fs.mkdirs(folder)


In [0]:
# Tower logs are split by region: create one sub-directory per region under
# the tower landing directory.
tower_root = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower"
for region in ("region1", "region2"):
    subfolder = f"{tower_root}/{region}"
    dbutils.fs.mkdirs(subfolder)

#### DBFS
1. DBFS / FileStore (Old approach)
**What it is**
DBFS (Databricks File System) is a workspace-level file abstraction
FileStore is a special folder inside DBFS whose files can be opened in a web browser through a workspace URL
Mainly designed for experimentation, demos, notebooks
**Key characteristics**
Not governed by Unity Catalog
No fine-grained access control (only workspace-level permissions)
No table-level or column-level lineage
FileStore files can be exposed via public URLs
Weak auditability
**Typical usage**
Temporary files
Sample datasets
Notebook outputs
Quick testing
**Why it’s not prod-ready**
❌ No centralized governance
❌ No row/column/file-level security
❌ Hard to audit “who accessed what”
❌ Not compliant for sensitive data

#### VOLUMES
**What it is**
Volumes are governed storage objects under Unity Catalog
They provide secure file storage similar to tables
Backed by cloud storage (ADLS / S3 / GCS)

**Key characteristics**
Fully integrated with Unity Catalog
Supports fine-grained access control
Audited and tracked
Secure, no public URLs
Clear ownership and lifecycle management
**Typical usage**
Ingestion landing zones
Raw / bronze data
ML artifacts
Regulated datasets
Production pipelines
**Why it is prod-ready**
✅ Central governance
✅ Strong security & compliance
✅ Auditing & lineage
✅ Works across all Databricks workspaces attached to the same Unity Catalog metastore

#### Why Volumes instead of DBFS?
DBFS/FileStore is meant for development and experimentation, while Volumes are Unity Catalog–governed, secure, auditable storage objects designed for production and regulated data. Production teams prefer Volumes because they provide fine-grained access control, auditability, and compliance that DBFS cannot offer.

##### b. Why do production teams prefer Volumes for regulated data?
**1. Regulated data needs governance**
Regulated data includes:
PII (Aadhaar, PAN, phone, email)
Financial data
Healthcare data
Customer records
Production teams must answer:
Who accessed this data?
When was it accessed?
Was access authorized?
➡️ DBFS cannot answer these questions reliably
➡️ Volumes can

**2. Fine-grained access control (critical)**
With Volumes, teams can:
GRANT READ VOLUME ON VOLUME main.sales.raw_data TO analyst_role;
This means:
Only authorized roles can read/write
Access can be revoked instantly
No accidental exposure

DBFS:
Either you have workspace access or you don’t
No file-level control

**3. Audit & compliance (non-negotiable)**
Regulators require:
Audit logs
Access history
Ownership tracking
Volumes provide:
✅ Who accessed which file
✅ Which pipeline wrote the data
✅ When access happened
DBFS:
❌ Weak or no audit trail

**4. Separation of concerns (clean architecture)**
Raw data  →  Processed data  →  Curated data
Volumes help enforce this:
Raw volumes (restricted)
Processed volumes (controlled)
Curated tables (consumer-facing)
DBFS mixes everything → chaos in prod.

**5. Future-proof & multi-workspace support**

Volumes:
Work across multiple Databricks workspaces
Central governance via Unity Catalog
Scales for enterprise growth

DBFS:
Tied to a single workspace
Legacy approach



---------------------------------------------------------------------------------

In [0]:
# Write the small sample datasets into the landing volume.
#
# dbutils.fs.put() creates (or overwrites) a text file from a string, which
# suits small datasets like these. overwrite is a safety flag: with
# overwrite=False the call fails when the file already exists, protecting
# against accidental data loss; with overwrite=True the existing file is
# replaced, which keeps this cell re-runnable.

# Customer records without a header line. The literal starts with a newline,
# so the written file begins with an empty line. Row 105 has an empty name and
# row 106 has a non-numeric age ("abc").
customer_csv = '''
101,Arun,31,Chennai,PREPAID
102,Meera,45,Bangalore,POSTPAID
103,Irfan,29,Hyderabad,PREPAID
104,Raj,52,Mumbai,POSTPAID
105,,27,Delhi,PREPAID
106,Sneha,abc,Pune,PREPAID
'''
dbutils.fs.put("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv",customer_csv,overwrite=True)

# TSV = tab-separated values; the first line is the header.
usage_tsv = '''customer_id\tvoice_mins\tdata_mb\tsms_count
101\t320\t1500\t20
102\t120\t4000\t5
103\t540\t600\t52
104\t45\t200\t2
105\t0\t0\t0
'''
dbutils.fs.put("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage.tsv",usage_tsv,overwrite=True)

# Pipe-delimited tower logs with a header line, one file per region.
tower_logs_region1 = '''event_id|customer_id|tower_id|signal_strength|timestamp
5001|101|TWR01|-80|2025-01-10 10:21:54
5004|104|TWR05|-75|2025-01-10 11:01:12
'''
dbutils.fs.put("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_logs_region1.csv",tower_logs_region1,overwrite=True)

tower_logs_region2 = '''event_id|customer_id|tower_id|signal_strength|timestamp
6001|102|TWR01|-90|2025-02-15 10:21:54
6004|106|TWR05|-55|2025-02-15 11:01:12
'''
# The region2 file must receive the region2 content; writing
# tower_logs_region1 here would duplicate region1's events.
dbutils.fs.put("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/tower_logs_region2.csv",tower_logs_region2,overwrite=True)





In [0]:
# Sanity check: list each landing directory and report how many entries
# dbutils.fs.ls() returns for it.
paths = [
    f"/Volumes/telecom_catalog_assign/landing_zone/landing_vol/{name}"
    for name in ("customer", "usage", "tower")
]

for path in paths:
    files = dbutils.fs.ls(path)
    # An empty listing is falsy, so it selects the "nothing found" message.
    outcome = f"{len(files)} files found" if files else "No files found!"
    print(f"{path} → {outcome}")

In [0]:
# Read all tower logs in three ways: a glob in the path with recursive lookup,
# an explicit list of file paths, and a pathGlobFilter on the file names.
#
# recursiveFileLookup: by default Spark only reads files in the top-level
# folder; setting the option to true also reads files in nested subfolders.
#
# The tower files are pipe-delimited and start with a header line, so every
# read passes header and sep. Without header=True each file's header line
# would be counted as a data row in the totals printed below.

# 1) Glob in the path + recursive lookup.
df=spark.read.csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region*",header=True,sep="|",recursiveFileLookup=True)
print(f"Total rows in all tower logs: {df.count()}")

# 2) Explicit list of file paths.
df=spark.read.option("recursiveFileLookup","true").csv(['/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_logs_region1.csv','/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/tower_logs_region2.csv'],header=True,sep="|")
print(f"Total rows in all tower logs: {df.count()}")

# 3) pathGlobFilter keeps only files whose name matches *.csv; the schema is
# inferred from the data.
df=spark.read.options(header=True,inferSchema=True,recursiveFileLookup=True,pathGlobFilter="*.csv",sep='|').format('csv').load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region*")
df.show(2)

In [0]:
# Exercise: try the Customer, Usage files with option()/options() and
# read.csv, comparing
#   header=false, inferSchema=false
# against
#   header=true, inferSchema=true
customer_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv"

# Read 1: header on, schema inference off.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", False)
    .csv(customer_path)
)
df.printSchema()
df.show()

# Read 2: header on, schema inference on.
df = spark.read.csv(customer_path, header=True, inferSchema=True)
df.printSchema()
df.show()

# Observations:
# - With inferSchema=False every column is read as a string.
# - The customer file has no header line, so with header=True its first data
#   row is used as the column names - check the column names in the output.
# - How did schema inference handle "abc" in age? The column is treated as a
#   string column instead of int.

In [0]:
# Apply column names with toDF() for the customer data. The file has no header
# line, so the types are inferred and the columns are named by position.
df=spark.read.csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv",header=False,inferSchema=True).toDF("customer_id","name","age","city","plan")
df.printSchema()
df.show()

# Apply column names and data types with a DDL-string schema for the usage
# data. The names mirror the header of usage.tsv and all values are whole
# numbers, so every column is declared as int.
schema_data="""
customer_id int,
voice_mins int,
data_mb int,
sms_count int
"""
# header=True is required here: the first line of usage.tsv holds the column
# names. With header=False that line would be read as a data row and show up
# in the DataFrame.
df=spark.read.schema(schema_data).csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage.tsv",header=True,sep="\t")
df.printSchema()
df.show()
# NOTE(review): no nullability is stated in the DDL string; confirm in the
# printSchema() output that the columns are reported as nullable.

In [0]:
# Apply column names and data types to the tower logs with an explicit
# StructType built from IntegerType, StringType and TimestampType fields.
# The wildcard import is kept because later cells may rely on names it brings
# into the notebook scope.
from pyspark.sql.types import *

# Field names follow the header of the tower files
# (event_id|customer_id|tower_id|signal_strength|timestamp), so this schema
# agrees with the column names produced by header-based reads of the same data.
schema_data=StructType([
    StructField("event_id",IntegerType(),True),
    StructField("customer_id",IntegerType(),True),
    StructField("tower_id",StringType(),True),
    StructField("signal_strength",IntegerType(),True),
    StructField("timestamp",TimestampType(),True)]
)

# The region* glob picks up both region folders; header=True skips each file's
# header line and sep="|" matches the pipe delimiter used in the logs.
df=spark.read.schema(schema_data).csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region*",header=True,sep="|")
df.printSchema()
df.show()
