In [0]:
# The `spark` object is pre-provided by the Databricks notebook; nothing has to be created here.
# `spark.version` reports the Spark runtime version of the attached cluster.
spark_runtime = spark.version
print("Hello Databricks from", spark_runtime)

In [0]:
# dbutils.fs is the Databricks File System utility.
# It lets you interact with files stored in:
#     DBFS (Databricks File System), Azure Data Lake (ADLS) or S3 buckets mounted in DBFS, local cluster storage paths, etc.
# Think of it as Unix `ls` or Windows Explorer, but inside the Databricks workspace.

# dbutils.fs.ls(path) -> lists all files and directories inside the given path.

# display() is a Databricks notebook function that renders structured output as a rich table rather than plain text.

# In Databricks, "/" is the root of DBFS (Databricks File System).
#     It is NOT the root of the cluster's local Linux filesystem.
#     DBFS is a virtual filesystem built on top of cloud storage.

# Listed in order: the DBFS root, then /user/ (the user-specific workspace directory).
for dbfs_path in ("/", "/user/"):
    display(dbutils.fs.ls(dbfs_path))

# The output tables help you understand the directory structure inside DBFS, where notebooks, datasets, mounts, and uploaded files live.


In [0]:
# /databricks-datasets/ is a special DBFS directory holding sample datasets provided by Databricks
# (e.g., airline data, retail data, public datasets).
#     It is often used for learning, demos, and experimentation.
sample_datasets_listing = dbutils.fs.ls("/databricks-datasets/")
display(sample_datasets_listing)


# Contents of /databricks-datasets/airlines/, one of the sample folders provided by Databricks.
airlines_listing = dbutils.fs.ls("/databricks-datasets/airlines/")
display(airlines_listing)

# What are the "part" entries in the output of this command?
#     Part files: in distributed computing environments like Databricks, large datasets are often written as
#     multiple smaller partitioned files (part files). Each one contains a subset of the data.
#     part-00000 is one subset of the airline data; all the parts combined represent the full dataset.

# Where do they come from?
#     Spark write operations: when Spark writes a dataset to distributed storage (like DBFS), it often splits the
#     output into several part files based on the number of partitions in the data. They are typically named
#     part-00000, part-00001, etc., and live in the directory where the data was saved.
#     Example use case: you might see part files in /databricks-datasets/airlines/ if someone used Spark to
#     process and save a large airline dataset in a partitioned manner.

In [0]:
# What is a DataFrame (df)?
#     A DataFrame in Spark is a distributed collection of data organized into named columns.
#     It is similar to a table in a relational database or a data frame in Pandas (for Python).
#     DataFrames in Spark are immutable (you can't modify them directly)
#         and distributed, meaning that they are partitioned across multiple nodes in a Spark cluster for parallel processing.


# Creating a DataFrame
#     You can create a DataFrame in multiple ways, such as reading data from external files (CSV, Parquet, etc.) or from a Spark SQL query.

# Reading a CSV file:
#     Read data from an existing dataset into a DataFrame.
my_first_df = spark.read.csv(
    "dbfs:/databricks-datasets/airlines/part-00000",
    header=True,
    inferSchema=True,
)

# Always verify the path or use an existing dataset.
#     read (DataFrameReader): provides methods to load data from various formats (CSV, JSON, Parquet, JDBC, etc.)
#     header=True (column names): interprets the first row as column headers, not data. If False, Spark assigns default column names like _c0, _c1, etc.
#     inferSchema=True (type detection): Spark scans the data to guess each column's type (e.g., integer, double, timestamp).
#         Without this, all columns are read as strings.

# Show the first 5 rows of the DataFrame
my_first_df.show(5)


#-------If we want to try and load the entire dataset instead of just one subpart - Takes a lot of time due to HUGE amount of data-----#
#   my_first_df1 = spark.read.csv("dbfs:/databricks-datasets/airlines/", header=True, inferSchema=True)
#   my_first_df1.show(5)


my_first_df.printSchema()  # Displays the structure of the DataFrame

# List of column names.
# A bare expression in the middle of a cell is evaluated but never displayed (only a cell's
# LAST expression is echoed), so the list has to be printed explicitly to be seen here.
print(my_first_df.columns)

my_first_df.describe().show()  # Summary statistics (like count, mean, stddev, min, max)

#-------------------------------------------------------------------------------------------------------

# Hope you understood the flow: File system → file → DataFrame → display.


In [0]:
# SELECTING COLUMNS......
# One column, picked by name
my_first_df.select("Year").show()

# Distinct values of a particular column
(
    my_first_df
    .select("ArrTime")
    .distinct()
    .show()
)

# Several columns at once
my_first_df.select("Year", "Origin").show()

# Select a column and rename it in the output with alias()
(
    my_first_df
    .select(my_first_df["Dest"].alias("Destination"))
    .show()
)

# my_first_df is still available here because notebook state is kept in the session's memory.
#     If you restart your cluster or session, that state is lost and you need to re-run the cell that initializes the DataFrame.
#     As long as the session is live, any variable (including DataFrames) created in one cell is available to other cells without re-running or redefining it.


# FILTERING DATA......
# Keep only the rows matching a condition, written as a Column expression
my_first_df.filter(my_first_df["DayofMonth"] > 10).show()

# The same condition written as a SQL expression string
my_first_df.filter("DayofMonth > 10").show()



# SORTING DATA.......
#     A DataFrame can be sorted on one or more columns.

# Ascending order (the default)
my_first_df.orderBy("DayofMonth").show()

# Descending order, requested through the `ascending` keyword
my_first_df.orderBy("DayofMonth", ascending=False).show()

# Multiple sort keys: DayofMonth ascending, then DayofWeek descending via Column.desc()
(
    my_first_df
    .orderBy("DayofMonth", my_first_df["DayofWeek"].desc())
    .show()
)