In [0]:
spark # spark object is pre defined here

<pyspark.sql.connect.session.SparkSession at 0xff751ab97050>

In [0]:
data = [("Pyspark","30 Days"),("Mysql","30 Days")]
df = spark.createDataFrame(data)
df.show()

+-------+-------+
|     _1|     _2|
+-------+-------+
|Pyspark|30 Days|
|  Mysql|30 Days|
+-------+-------+



In [0]:
# We are providing column name 
data = [("Pyspark","30 Days"),("Mysql","30 Days"),("Python","30 Days")]

# definiing column name here
columns = ["courseName","duration"] 
df = spark.createDataFrame(data,columns)
df.show()

+----------+--------+
|courseName|duration|
+----------+--------+
|   Pyspark| 30 Days|
|     Mysql| 30 Days|
|    Python| 30 Days|
+----------+--------+



In [0]:
# to See number of rows from Data
df.count()

# to see column name only 
df.columns

# to count number of oclumns 
len(df.columns)

# to see Schema of Dataframe
df.printSchema()

root
 |-- courseName: string (nullable = true)
 |-- duration: string (nullable = true)



In [0]:
# creating dataframe from python object

data = [{"coursename":"Pyspark","duration":"30 Days"},
        {"coursename":"Mysql","duration":"30 Days"},
        {"coursename":"Python","duration":"30 Days"}
        ]

# creating Dataframe

df = spark.createDataFrame(data)

df.show()

df.printSchema()

+----------+--------+
|coursename|duration|
+----------+--------+
|   Pyspark| 30 Days|
|     Mysql| 30 Days|
|    Python| 30 Days|
+----------+--------+

root
 |-- coursename: string (nullable = true)
 |-- duration: string (nullable = true)



In [0]:
# Create a temporary view on top of the DataFrame.
# This allows us to run SQL queries directly on the DataFrame.
df.createTempView("course_view")

# Use Spark SQL to query the temporary view and create a new DataFrame.
# Here we are selecting only the 'coursename' column from the view.
df2 = spark.sql("SELECT coursename FROM course_view")

# Display the result of the new DataFrame.
df2.show()


+----------+
|coursename|
+----------+
|   Pyspark|
|     Mysql|
|    Python|
+----------+



In [0]:
# Reading Data From Files

# File Path: /Volumes/workspace/default/my_data/employee_data.csv

# Read the CSV file into a DataFrame.
# header=True       → Treats the first row as column names.
# inferSchema=True  → Automatically detects and assigns correct data types.
# Without inferSchema, Spark reads all columns as STRING by default.
emp_data = (
    spark.read
         .csv(
             "/Volumes/workspace/default/my_data/employee_data.csv",
             header=True,
             inferSchema=True
         )
)

# Display the contents of the DataFrame.
emp_data.show()

# Print the schema of the DataFrame to verify inferred data types.
emp_data.printSchema()


+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- departme

In [0]:
# creating Dataframe using json file

# file path - /Volumes/workspace/default/my_data/employee_data_jsonlines.json
emp_json = spark \
            .read \
            .json("/Volumes/workspace/default/my_data/employee_data_jsonlines.json")

emp_json.show()

emp_json.printSchema()

+---------+----------+---+----------+------+------+
|  address|department| id|joineddate|  name|salary|
+---------+----------+---+----------+------+------+
|   Mumbai|        IT|  1|2021-01-15|  John| 50000|
|    Delhi|        HR|  2|2020-03-10|  Emma| 62000|
|     Pune|   Finance|  3|2022-07-20|   Raj| 45000|
|Bangalore|        IT|  4|2019-11-05| Priya| 70000|
|Hyderabad| Marketing|  5|2021-05-12|  Alex| 55000|
|  Chennai|   Finance|  6|2020-08-25|  Sara| 48000|
|Ahmedabad|        HR|  7|2022-02-14|Nikhil| 72000|
|  Kolkata|        IT|  8|2021-09-30| Aisha| 53000|
|    Surat| Marketing|  9|2023-01-18|   Tom| 60000|
|   Jaipur|   Finance| 10|2020-12-02|  Riya| 58000|
|   Nagpur|        HR| 11|2019-04-22|Vikram| 65000|
|   Indore|        IT| 12|2022-10-11|  Zara| 49000|
+---------+----------+---+----------+------+------+

root
 |-- address: string (nullable = true)
 |-- department: string (nullable = true)
 |-- id: long (nullable = true)
 |-- joineddate: string (nullable = true)
 |-- na

In [0]:
# Creating a DataFrame from a multiline JSON file

# File Path: /Volumes/workspace/default/my_data/employee_data.json

# Read the JSON file into a DataFrame.
# multiLine=True → Required when the JSON file is formatted as an array or
#                  spans multiple lines instead of JSON Lines format.
# Note: Spark automatically infers schema for JSON files (inferSchema=True by default).
emp_json = (
    spark.read
         .json(
             "/Volumes/workspace/default/my_data/employee_data.json",
             multiLine=True
         )
)

# Display the contents of the DataFrame.
emp_json.show()

# Print the schema to verify column names and data types.
emp_json.printSchema()

# Note: In JSON files, Spark treats inferSchema=True by default,
#       so you don't need to mention it explicitly.


+---------+----------+---+----------+------+------+
|  address|department| id|joineddate|  name|salary|
+---------+----------+---+----------+------+------+
|   Mumbai|        IT|  1|2021-01-15|  John| 50000|
|    Delhi|        HR|  2|2020-03-10|  Emma| 62000|
|     Pune|   Finance|  3|2022-07-20|   Raj| 45000|
|Bangalore|        IT|  4|2019-11-05| Priya| 70000|
|Hyderabad| Marketing|  5|2021-05-12|  Alex| 55000|
|  Chennai|   Finance|  6|2020-08-25|  Sara| 48000|
|Ahmedabad|        HR|  7|2022-02-14|Nikhil| 72000|
|  Kolkata|        IT|  8|2021-09-30| Aisha| 53000|
|    Surat| Marketing|  9|2023-01-18|   Tom| 60000|
|   Jaipur|   Finance| 10|2020-12-02|  Riya| 58000|
|   Nagpur|        HR| 11|2019-04-22|Vikram| 65000|
|   Indore|        IT| 12|2022-10-11|  Zara| 49000|
+---------+----------+---+----------+------+------+

root
 |-- address: string (nullable = true)
 |-- department: string (nullable = true)
 |-- id: long (nullable = true)
 |-- joineddate: string (nullable = true)
 |-- na

In [0]:
# Genric Approach to laod data from csv

df = spark \
    .read \
    .format("csv") \
    .option("header","True") \
    .option("inferSchema","True") \
    .load("/Volumes/workspace/default/my_data/employee_data.csv")

df.show()

+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+



In [0]:
# genric Approach to load data from json multiline files    

df_json = spark \
    .read \
    .format("json") \
    .option("inferSchema","True") \
    .option("multiline","True") \
    .load("/Volumes/workspace/default/my_data/employee_data.json")

df_json.show()

df_json.printSchema()

+---------+----------+---+----------+------+------+
|  address|department| id|joineddate|  name|salary|
+---------+----------+---+----------+------+------+
|   Mumbai|        IT|  1|2021-01-15|  John| 50000|
|    Delhi|        HR|  2|2020-03-10|  Emma| 62000|
|     Pune|   Finance|  3|2022-07-20|   Raj| 45000|
|Bangalore|        IT|  4|2019-11-05| Priya| 70000|
|Hyderabad| Marketing|  5|2021-05-12|  Alex| 55000|
|  Chennai|   Finance|  6|2020-08-25|  Sara| 48000|
|Ahmedabad|        HR|  7|2022-02-14|Nikhil| 72000|
|  Kolkata|        IT|  8|2021-09-30| Aisha| 53000|
|    Surat| Marketing|  9|2023-01-18|   Tom| 60000|
|   Jaipur|   Finance| 10|2020-12-02|  Riya| 58000|
|   Nagpur|        HR| 11|2019-04-22|Vikram| 65000|
|   Indore|        IT| 12|2022-10-11|  Zara| 49000|
+---------+----------+---+----------+------+------+

root
 |-- address: string (nullable = true)
 |-- department: string (nullable = true)
 |-- id: long (nullable = true)
 |-- joineddate: string (nullable = true)
 |-- na

### inferSchema=True → Spark automatically infers the data types of columns based on the data in the file.

⚠️ Problems with this Approach

- Spark must scan the data to infer data types.
-       If the file is small, this is fine — but for large datasets, scanning the entire file increases processing time.

- This can lead to performance issues, especially in production where data volumes are very high.

- There is also a risk of incorrect type inference.
-         For example, if initial rows contain null or inconsistent values, Spark may assume the wrong data type.

- Because of these limitations, inferSchema=True is not recommended in production workloads

- For this we can write our own Schema and provide it

## **TO Define Our Own Schema** 
- StructType - Represnt Schema
- StructFiled - Represnt Each Column Details - Column Details Means - column name , Datatype,nullable

Schema = StructType
    ([
    StructField("name",StringType(),True)
    ])



###**** InFering Schema For CSV Files

In [0]:
from  pyspark.sql.types import *

# define Schema
emp_schema = StructType([
    StructField("id",IntegerType(),True), # true means null value is allowed
    StructField("name",StringType(),True),
    StructField("salary",IntegerType(),True),
    StructField("address",StringType(),True),
    StructField("department",StringType(),True),
    StructField("joineddate",DateType(),True),
])

# Genric Approach to laod data from csv
df = spark \
    .read \
    .format("csv") \
    .option("header","True") \
    .schema(emp_schema) \
    .load("/Volumes/workspace/default/my_data/employee_data.csv")

df.show()
df.printSchema()

+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- departme

### **InFering Schema For Json Files**

In [0]:
# genric Approach to load data from json multiline files  

# define Schema 
emp_schema = StructType([
  StructField("id",IntegerType(),True),
  StructField("name",StringType(),True),
  StructField("salary",IntegerType(),True),
  StructField("address",StringType(),True),
  StructField("department",StringType(),True),
  StructField("joineddate",DateType(),True),
])  

df_json = spark \
    .read \
    .format("json") \
    .schema(emp_schema) \
    .option("multiline","True") \
    .load("/Volumes/workspace/default/my_data/employee_data.json")

df_json.show()

df_json.printSchema()

+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- departme

## Writing Data

### Wrtiting Data To Csv Files

In [0]:
from  pyspark.sql.types import *

# define Schema
emp_schema = StructType([
    StructField("id",IntegerType(),True), # true means null value is allowed
    StructField("name",StringType(),True),
    StructField("salary",IntegerType(),True),
    StructField("address",StringType(),True),
    StructField("department",StringType(),True),
    StructField("joineddate",DateType(),True),
])

# Genric Approach to laod data from csv
df = spark \
    .read \
    .format("csv") \
    .option("header","True") \
    .schema(emp_schema) \
    .load("/Volumes/workspace/default/my_data/employee_data.csv")

df.write.csv("/Volumes/workspace/default/process_data/csv2",header=True)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8139818508839875>, line 21[0m
[1;32m     13[0m [38;5;66;03m# Genric Approach to laod data from csv[39;00m
[1;32m     14[0m df [38;5;241m=[39m spark \
[1;32m     15[0m     [38;5;241m.[39mread \
[1;32m     16[0m     [38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mcsv[39m[38;5;124m"[39m) \
[1;32m     17[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;124m"[39m,[38;5;124m"[39m[38;5;124mTrue[39m[38;5;124m"[39m) \
[1;32m     18[0m     [38;5;241m.[39mschema(emp_schema) \
[1;32m     19[0m     [38;5;241m.[39mload([38;5;124m"[39m[38;5;124m/Volumes/workspace/default/my_data/employee_data.csv[39m[38;5;124m"[39m)
[0;32m---> 21[0m df[38;5;241m.[39mwrite[38;5;241m.[39mcsv([38;5;124m"[39m[38;5;124m/Volumes/workspace/default/proc

### Writing Data To Json Files

In [0]:

# genric Approach to load data from json multiline files  

# define Schema 
emp_schema = StructType([
  StructField("id",IntegerType(),True),
  StructField("name",StringType(),True),
  StructField("salary",IntegerType(),True),
  StructField("address",StringType(),True),
  StructField("department",StringType(),True),
  StructField("joineddate",DateType(),True),
])  

df_json = spark \
    .read \
    .format("json") \
    .schema(emp_schema) \
    .option("multiline","True") \
    .load("/Volumes/workspace/default/my_data/employee_data.json")

df_json.show()

# df_json.write.json("/Volumes/workspace/default/process_data/json") 



## Genric Approach To write Data

In [0]:
df_json.write \
    .format("csv") \
    .mode("overwrite") \
    .save("/Volumes/workspace/default/process_data/json")

In [0]:
df.write \
    .format("csv") \
    .mode("overwrite") \
    .option("header","True") \
    .save("/Volumes/workspace/default/process_data/csv2")

## File Format
### Row-based storage
- Data is stored row by row (examples: CSV, JSON).
- File size is larger because there is no efficient compression or encoding.
- Human readable.
- Not suitable for analytics or large-scale aggregations (slow for column-based operations).

### Columnar-based storage
- Data is stored column by column (example: Parquet).
- File size is smaller because it uses compression and encoding.
- Not human readable.
- Highly suitable for analytics (faster for aggregation, filtering, and scanning specific columns).


In [0]:
df.show()

+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+



In [0]:
# wrtiting data to csv 
df.write \
    .format("csv") \
    .mode("overwrite") \
    .option("header","True") \
    .save("/Volumes/workspace/default/process_data/csv2")

# writing data to parquet
df.write \
    .format("parquet") \
    .mode("overwrite") \
    .save("/Volumes/workspace/default/process_data/parquet")