In [1]:
from pyspark.sql import SparkSession
from datetime import date,datetime
# Create (or reuse) the SparkSession that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .appName("pyspark practise-1")
    .getOrCreate()
)

In [21]:
# Sample rows covering int, string, float, boolean, date and timestamp values.
data = [
    (1, 'alice', 100.78, True, date(2025, 5, 12), datetime(2025, 5, 12, 14, 30, 0)),
    (2, 'bob', 200.89, False, date(2025, 1, 1), datetime(2025, 1, 1, 14, 30, 0)),
    (3, 'charlie', 30.870, True, date(2026, 1, 1), datetime(2025, 1, 1, 14, 30, 0)),
]

# The schema is a DDL string. `float` is single precision, which is why
# head() reports 100.78 as 100.77999877929688 even though show() prints 100.78.
df1 = spark.createDataFrame(
    data,
    schema=(
        "id int,name string,salary float,"
        "self_employed boolean,start_date date,last_login timestamp"
    ),
)
df1.show()
df1.printSchema()
df1.head(2)

+---+-------+------+-------------+----------+-------------------+
| id|   name|salary|self_employed|start_date|         last_login|
+---+-------+------+-------------+----------+-------------------+
|  1|  alice|100.78|         true|2025-05-12|2025-05-12 14:30:00|
|  2|    bob|200.89|        false|2025-01-01|2025-01-01 14:30:00|
|  3|charlie| 30.87|         true|2026-01-01|2025-01-01 14:30:00|
+---+-------+------+-------------+----------+-------------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- self_employed: boolean (nullable = true)
 |-- start_date: date (nullable = true)
 |-- last_login: timestamp (nullable = true)



[Row(id=1, name='alice', salary=100.77999877929688, self_employed=True, start_date=datetime.date(2025, 5, 12), last_login=datetime.datetime(2025, 5, 12, 14, 30)),
 Row(id=2, name='bob', salary=200.88999938964844, self_employed=False, start_date=datetime.date(2025, 1, 1), last_login=datetime.datetime(2025, 1, 1, 14, 30))]

In [6]:
# Read a header-less CSV; column names and types come from the explicit schema.
df2 = spark.read.csv(
    './csvFiles/sample1.csv',
    schema='id int,event_date date,count int',
    header=False,
)

df2.printSchema()
df2.head(3)

root
 |-- id: integer (nullable = true)
 |-- event_date: date (nullable = true)
 |-- count: integer (nullable = true)



[Row(id=1, event_date=datetime.date(2025, 1, 1), count=2),
 Row(id=1, event_date=datetime.date(2025, 1, 2), count=3),
 Row(id=2, event_date=datetime.date(2025, 1, 2), count=5)]

In [7]:
# Read a CSV that has a header row, letting Spark infer the column types
# (no explicit schema is supplied in this cell).
df2 = spark.read.csv('./csvFiles/sample2.csv', header=True, inferSchema=True)
df2.printSchema()
df2.head(3)
# A separator can be set explicitly with .option('delimiter', ',') on the reader.

root
 |-- roll_no: integer (nullable = true)
 |-- admission_date: date (nullable = true)
 |-- marks: integer (nullable = true)



[Row(roll_no=1, admission_date=datetime.date(2025, 1, 1), marks=2),
 Row(roll_no=1, admission_date=datetime.date(2025, 1, 2), marks=3),
 Row(roll_no=2, admission_date=datetime.date(2025, 1, 2), marks=5)]

In [None]:
# Read a multi-line JSON file with an explicit schema.
# Leave out `schema` to let Spark infer the column types instead.
df = spark.read.json(
    './csvFiles/sample1.json',
    schema='name string,state string',
    multiLine=True,
)
df.printSchema()
df.head(2)



root
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)



[Row(name='dk', state='active')]

In [26]:
# Selecting columns and simple aggregations.
#
# `max`, `min` and `sum` are reached through the `F` alias rather than imported
# by name: a bare `from pyspark.sql.functions import max, min, sum` would replace
# Python's built-ins of the same name for the rest of the notebook session.
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, col, greatest, least

df1 = spark.createDataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    schema="col1 int,col2 int,col3 int",
)
print("Dataframe df1:")
#df1.show()
print("Selecting specific columns from df1:")
#df1.select('col1','col3').show()
# Using col() together with alias() to rename columns while selecting:
#df1.select(col('col1').alias('col1_alias'),col('col2').alias('col2_alias')).show()

# Column-wise aggregates: each expression collapses the whole frame to one value.
df1.select(
    F.max(col('col1')).alias('max_col1'),
    F.min(col('col2')).alias('min_col2'),
    avg(col('col3')).alias('avg_col3'),
    F.sum('col1'),
).show()

# Row-wise comparison: least/greatest pick the smallest/largest value among the
# listed columns within each row.
df1.select(
    least('col1', 'col2', 'col3'),
    greatest('col1', 'col2', 'col3'),
).show()

Dataframe df1:
Selecting specific columns from df1:
+--------+--------+--------+---------+
|max_col1|min_col2|avg_col3|sum(col1)|
+--------+--------+--------+---------+
|       7|       2|     6.0|       12|
+--------+--------+--------+---------+

+-----------------------+--------------------------+
|least(col1, col2, col3)|greatest(col1, col2, col3)|
+-----------------------+--------------------------+
|                      1|                         3|
|                      4|                         6|
|                      7|                         9|
+-----------------------+--------------------------+



In [48]:
# Adding new columns to an existing dataframe.
# `col` is imported here too, so the cell does not rely on an earlier cell's import.
from pyspark.sql.functions import col, greatest, least, lit, when

df1 = spark.createDataFrame(
    [
        ['student1', 45, 33, 24, 'good'],
        ['student2', 5, 3, 20, 'bad'],
        ['student3', 50, 50, 50, 'good'],
    ],
    schema='name string,maths int,science int,english int,remarks string',
)
df1.show()

# Simpler variants, kept for reference:
#   total marks only
# df2=df1.withColumn('total_marks',col('maths')+col('science')+col('english'))
#   total marks plus a plain percentage
# df2=df1.withColumn('total_marks',col('maths')+col('science')+col('english')).withColumn('percentage',(col('total_marks')*100)/150)

# 1) using withColumn to add one column at a time

MAX_TOTAL_MARKS = 150    # denominator of the percentage and cap on the boosted total
GOOD_REMARK_BONUS = 20   # marks added to the total when remarks == 'good'

# 'good' rows get the bonus (capped at MAX_TOTAL_MARKS) before the percentage is
# taken, 'bad' rows use the raw total, and any other remark yields 0.
df2 = (
    df1
    .withColumn('total_marks', col('maths') + col('science') + col('english'))
    .withColumn(
        'percentage',
        when(
            col('remarks') == 'good',
            (least(col('total_marks') + GOOD_REMARK_BONUS, lit(MAX_TOTAL_MARKS)) * 100) / MAX_TOTAL_MARKS,
        )
        .when(col('remarks') == 'bad', (col('total_marks') * 100) / MAX_TOTAL_MARKS)
        .otherwise(0),
    )
)
df2.show()
# note: least returns the smallest value among the columns passed to it.
# note: lit wraps a literal/constant value as a column.
# note: when/otherwise is the column-expression counterpart of if/else in Python.

# 2) using withColumns to add several columns in one call

# Highest and lowest marks among the three subjects, per row.
df3 = df2.withColumns({
    'highest_marks': greatest(col('maths'), col('science'), col('english')),
    'lowest_marks': least(col('maths'), col('science'), col('english')),
})
df3.show()

+--------+-----+-------+-------+-------+
|    name|maths|science|english|remarks|
+--------+-----+-------+-------+-------+
|student1|   45|     33|     24|   good|
|student2|    5|      3|     20|    bad|
|student3|   50|     50|     50|   good|
+--------+-----+-------+-------+-------+

+--------+-----+-------+-------+-------+-----------+------------------+
|    name|maths|science|english|remarks|total_marks|        percentage|
+--------+-----+-------+-------+-------+-----------+------------------+
|student1|   45|     33|     24|   good|        102| 81.33333333333333|
|student2|    5|      3|     20|    bad|         28|18.666666666666668|
|student3|   50|     50|     50|   good|        150|             100.0|
+--------+-----+-------+-------+-------+-----------+------------------+

+--------+-----+-------+-------+-------+-----------+------------------+-------------+------------+
|    name|maths|science|english|remarks|total_marks|        percentage|highest_marks|lowest_marks|
+--------

In [52]:
# Dropping columns from a dataframe
df1 = spark.createDataFrame([[1, 2, 3], [4, 5, 6]], schema="col1 int,col2 int,col3 int")
# Columns are named as strings: that form is accepted for any number of columns,
# whereas passing several Column objects at once is, as far as the API history
# goes, only supported on newer PySpark releases (verify against the version in use).
df1.drop('col2', 'col3').show()

+----+
|col1|
+----+
|   1|
|   4|
+----+



In [None]:
# BASIC FILTERING
from pyspark.sql.functions import col

df1 = spark.createDataFrame(
    [
        ['student1', 45, 33, 24, 'good'],
        ['student2', 5, 3, 20, 'bad'],
        ['student3', 50, 50, 50, 'good'],
    ],
    schema='name string,maths int,science int,english int,remarks string',
)
df1.show()

# Single condition.
df1.filter(col('remarks') == 'good').show()

# Combined conditions: wrap every comparison in its own parentheses, because
# & and | bind more tightly than == / > / < in Python.
df1.filter(
    (col('remarks') == 'good') & (col('science') > 45)
).show()

df1.filter(
    (col('remarks') == 'bad') | (col('english') < 25)
).show()

+--------+-----+-------+-------+-------+
|    name|maths|science|english|remarks|
+--------+-----+-------+-------+-------+
|student1|   45|     33|     24|   good|
|student2|    5|      3|     20|    bad|
|student3|   50|     50|     50|   good|
+--------+-----+-------+-------+-------+

+--------+-----+-------+-------+-------+
|    name|maths|science|english|remarks|
+--------+-----+-------+-------+-------+
|student1|   45|     33|     24|   good|
|student3|   50|     50|     50|   good|
+--------+-----+-------+-------+-------+

+--------+-----+-------+-------+-------+
|    name|maths|science|english|remarks|
+--------+-----+-------+-------+-------+
|student3|   50|     50|     50|   good|
+--------+-----+-------+-------+-------+

+--------+-----+-------+-------+-------+
|    name|maths|science|english|remarks|
+--------+-----+-------+-------+-------+
|student1|   45|     33|     24|   good|
|student2|    5|      3|     20|    bad|
+--------+-----+-------+-------+-------+



In [None]:
# STRING FILTERING
from pyspark.sql.functions import col

df1 = spark.createDataFrame(
    [['Ravi', 100], ['Rahul', 200], ['sunny', 40], ['R123', 100], ['xxxx', 100]],
    schema='name string,salary int',
)
df1.show()

# like() follows SQL LIKE: % stands for any run of characters, _ for exactly one.
print('names starting with R')
df1.filter(col('name').like('R%')).show()
print('names having 5 letters')
df1.filter(col('name').like('_____')).show()

# rlike() filters with a regular expression; each pair is (heading, pattern).
for heading, pattern in [
    ('names having 5 letters-using regex', '^.{5}$'),
    ('names having only small letters', '^[a-z]+$'),
    ('names starting with capital letter, and followed by 3 digits', '^[A-Z][0-9]{3}$'),
    ('names having exactly 4 small letters', '^[a-z]{4}$'),
]:
    print(heading)
    df1.filter(col('name').rlike(pattern)).show()

# Regex symbols used above:
#   ^    start of the string
#   $    end of the string
#   .    any single character
#   *    preceding element, zero or more times
#   +    preceding element, one or more times
#   {n}  preceding element, exactly n times
#   [ ]  any one of the characters listed inside the brackets
#   |    alternation (OR)


+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|Rahul|   200|
|sunny|    40|
| R123|   100|
| xxxx|   100|
+-----+------+

names starting with R
+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|Rahul|   200|
| R123|   100|
+-----+------+

names having 5 letters
+-----+------+
| name|salary|
+-----+------+
|Rahul|   200|
|sunny|    40|
+-----+------+

names having 5 letters-using regex
+-----+------+
| name|salary|
+-----+------+
|Rahul|   200|
|sunny|    40|
+-----+------+

names having only small letters
+-----+------+
| name|salary|
+-----+------+
|sunny|    40|
| xxxx|   100|
+-----+------+

names starting with capital letter, and followed by 3 digits
+----+------+
|name|salary|
+----+------+
|R123|   100|
+----+------+

names having exactly 4 small letters
+----+------+
|name|salary|
+----+------+
|xxxx|   100|
+----+------+



In [None]:
# Dealing with null values
from pyspark.sql.functions import col

df1 = spark.createDataFrame(
    [['Ravi', 100], ['Rahul', None], ['sunny', 40], ['kk', 100], ['xxxx', 100]],
    schema='name string,salary int',
)
df1.show()

# Selecting rows by null-ness of one column.
print("Filtering rows where salary is null")
df1.filter(col('salary').isNull()).show()
print("Filtering rows where salary is not null")
df1.filter(col('salary').isNotNull()).show()

# Dropping rows that contain nulls.
print("Dropping all rows with null values in any column")
df1.dropna().show()
print("Dropping any rows with null values in name column")
df1.dropna(subset=['name']).show()  # subset takes column names (strings), so don't use col() here

# Filling nulls with one value: it is only applied to columns of a matching type.
print("Filling all null values, for all columns with a specific value: 'N/A' for all columns  ")
df1.fillna('N/A').show()  # the null in the int salary column stays, since 'N/A' is a string

print("Filling all null values, for all columns with a specific value: 0 for all columns  ")
df1.fillna(0).show()  # the null in the int salary column is filled, since 0 is an int

# Same data, but with a null in the name column as well.
print('modified df:')
df1 = spark.createDataFrame(
    [['Ravi', 100], ['Rahul', None], [None, 40], ['kk', 100], ['xxxx', 100]],
    schema='name string,salary int',
)
df1.show()

# A dict maps each column to its own replacement value.
print("Filling null values in specific columns with specific values")
df1.fillna({
    'name': 'Unknown',
    'salary': 0,
}).show()


+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|Rahul|  NULL|
|sunny|    40|
|   kk|   100|
| xxxx|   100|
+-----+------+

iltering rows where salary is null
+-----+------+
| name|salary|
+-----+------+
|Rahul|  NULL|
+-----+------+

Filtering rows where salary is not null
+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|sunny|    40|
|   kk|   100|
| xxxx|   100|
+-----+------+

Dropping all rows with null values in any column
+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|sunny|    40|
|   kk|   100|
| xxxx|   100|
+-----+------+

Dropping any rows with null values in name column
+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|Rahul|  NULL|
|sunny|    40|
|   kk|   100|
| xxxx|   100|
+-----+------+

Filling all null values, for all columns with a specific value: 'N/A' for all columns  
+-----+------+
| name|salary|
+-----+------+
| Ravi|   100|
|Rahul|  NULL|
|sunny|    40|
|   kk|   100|
| xxxx|   100|
+-----+------+

Filling all 

In [19]:
# Removing duplicates and filtering by membership
from pyspark.sql.functions import col

df1 = spark.createDataFrame(
    [['rr', 100], ['rr', 90], ['sunny', 90], ['kk', 100], ['kk', 100]],
    schema='name string,salary int',
)
df1.show()

print("Removing duplicate rows from dataframe")
df1.distinct().show()

print("Removing duplicate rows based on specific column(s), ie name column")
# dropDuplicates gives finer control: duplicates are judged only on the columns
# listed in `subset`. Without `subset` it behaves like distinct().
df1.dropDuplicates(subset=['name']).show()

# Keep only the rows whose name is one of the given values.
df1.filter(col('name').isin('rr', 'sunny')).show()

+-----+------+
| name|salary|
+-----+------+
|   rr|   100|
|   rr|    90|
|sunny|    90|
|   kk|   100|
|   kk|   100|
+-----+------+

Removing duplicate rows from dataframe
+-----+------+
| name|salary|
+-----+------+
|   rr|   100|
|   rr|    90|
|sunny|    90|
|   kk|   100|
+-----+------+

Removing duplicate rows based on specific column(s), ie name column
+-----+------+
| name|salary|
+-----+------+
|   rr|   100|
|sunny|    90|
|   kk|   100|
+-----+------+

+-----+------+
| name|salary|
+-----+------+
|   rr|   100|
|   rr|    90|
|sunny|    90|
+-----+------+

