#### **BooleanType()**

 - BooleanType() helps you specify that a column should only have two values: **true or false**.

- Using the **right data type** can **optimize performance** because Spark knows exactly how to handle **boolean** operations efficiently.

- The BooleanType is a fundamental data type in Spark, used to represent Boolean values, which can be **either True or False**. This data type is commonly used when dealing with **binary decisions, filtering data based on conditions**, and performing logical operations in Spark.


In [0]:
a = True   # 1
b = False  # 0
c = a + b  # 1 + 0 = 1
print(c)

1


In [0]:
print(True + True)

2


In [0]:
print(False + False)

0


In [0]:
%fs ls /FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/Flatten Nested Array.json,Flatten Nested Array.json,3756,1718618620000
dbfs:/FileStore/tables/MarketPrice-1.csv,MarketPrice-1.csv,19528,1719656512000
dbfs:/FileStore/tables/MarketPrice.csv,MarketPrice.csv,19528,1719656208000
dbfs:/FileStore/tables/MultiLineJSON.json/,MultiLineJSON.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON01.json/,MultiLineJSON01.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON1.json/,MultiLineJSON1.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON123.json/,MultiLineJSON123.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON2.json/,MultiLineJSON2.json/,0,0
dbfs:/FileStore/tables/Question7.csv,Question7.csv,154,1725816645000
dbfs:/FileStore/tables/RunningData_Rev02.csv,RunningData_Rev02.csv,1222,1719810609000


In [0]:
df = spark.read.csv("dbfs:/FileStore/tables/booleantype-4.csv", header=True, inferSchema=True)
display(df.limit(10))

Name,Boolean,Sensex_Category,Course_Name,Effective_Date,Start_Date,Graduated,Salary
Amit,0,Top,python,6-Feb-23,14-Jan-23,True,1598000
Shikhar,1,Top,Data Science,6-Feb-23,14-Jan-23,False,98756078
Shivani,0,Top,Accounting,8-Jan-24,7-Oct-23,True,3550000
Pooja,1,Top,Soft skills,8-Jan-24,7-Oct-23,True,98760432
Vinay,0,Top,ADF,6-Mar-23,7-Feb-23,False,99803421
Sharma,1,Top,ADB,6-Mar-23,7-Feb-23,True,33240000
Sonu,1,Top,ML,6-Jan-25,9-Jan-24,True,15670000
Rakesh,0,Top,databricks,6-Jan-25,9-Jan-24,True,1598000
Roshan,1,Top,Soft skills,6-Apr-23,7-Mar-23,True,18894320
Rithvik,1,Top,python,6-Apr-23,7-Mar-23,True,3550000


In [0]:
from pyspark.sql.functions import col, when, desc, asc
from pyspark.sql.types import BooleanType
import pyspark.sql.functions as F

#### **How to count a boolean in grouped data**

In [0]:
df_agg = df.groupBy("Course_Name").agg(F.round(F.avg('Salary'),2), F.count('Graduated').alias('Count_Graduated'))
display(df_agg.orderBy(asc('Course_Name')))

Course_Name,"round(avg(Salary), 2)",Count_Graduated
ADB,62256968.67,3
ADF,74366340.33,3
Accounting,24617366.67,3
Azure,40324333.33,3
DEVOPS,224757500.0,4
Data Science,76918692.67,3
GIT,86602000.0,3
ML,67457833.33,3
PySpark,38142566.67,3
Soft skills,56979253.67,6


#### **1) convert integer (data type) to boolean (data type)**

In [0]:
df_cast = df.withColumn("Boolean", col("Boolean").cast(BooleanType()))
display(df_cast.limit(10))

Name,Boolean,Sensex_Category,Course_Name,Effective_Date,Start_Date,Graduated,Salary
Amit,False,Top,python,6-Feb-23,14-Jan-23,True,1598000
Shikhar,True,Top,Data Science,6-Feb-23,14-Jan-23,False,98756078
Shivani,False,Top,Accounting,8-Jan-24,7-Oct-23,True,3550000
Pooja,True,Top,Soft skills,8-Jan-24,7-Oct-23,True,98760432
Vinay,False,Top,ADF,6-Mar-23,7-Feb-23,False,99803421
Sharma,True,Top,ADB,6-Mar-23,7-Feb-23,True,33240000
Sonu,True,Top,ML,6-Jan-25,9-Jan-24,True,15670000
Rakesh,False,Top,databricks,6-Jan-25,9-Jan-24,True,1598000
Roshan,True,Top,Soft skills,6-Apr-23,7-Mar-23,True,18894320
Rithvik,True,Top,python,6-Apr-23,7-Mar-23,True,3550000


#### **2) Filter a dataframe by a boolean column**

In [0]:
df_filter = df_cast.filter(col('Graduated') == True)
display(df_filter.limit(10))

Name,Boolean,Sensex_Category,Course_Name,Effective_Date,Start_Date,Graduated,Salary
Amit,False,Top,python,6-Feb-23,14-Jan-23,True,1598000
Shivani,False,Top,Accounting,8-Jan-24,7-Oct-23,True,3550000
Pooja,True,Top,Soft skills,8-Jan-24,7-Oct-23,True,98760432
Sharma,True,Top,ADB,6-Mar-23,7-Feb-23,True,33240000
Sonu,True,Top,ML,6-Jan-25,9-Jan-24,True,15670000
Rakesh,False,Top,databricks,6-Jan-25,9-Jan-24,True,1598000
Roshan,True,Top,Soft skills,6-Apr-23,7-Mar-23,True,18894320
Rithvik,True,Top,python,6-Apr-23,7-Mar-23,True,3550000
Sreedhar,False,Top,Azure,6-Jul-23,7-Apr-23,True,15670000
Vikas,True,Social,Data Science,6-Jan-23,4-Jan-23,True,98760000


In [0]:
df_bool = df_cast.withColumn("Boolean01", F.lit(True))\
                 .withColumn("Boolean02", F.lit("True").cast('boolean'))\
                 .withColumn("Boolean03", F.lit(1).cast('boolean'))\
                 .withColumn("Boolean04", F.lit(0).cast('boolean'))\
                 .withColumn("Boolean05", F.lit("T").cast('boolean'))\
                 .withColumn("Boolean06", F.lit("t").cast('boolean'))

display(df_bool.limit(10))

Name,Boolean,Sensex_Category,Course_Name,Effective_Date,Start_Date,Graduated,Salary,Boolean01,Boolean02,Boolean03,Boolean04,Boolean05,Boolean06
Amit,False,Top,python,6-Feb-23,14-Jan-23,True,1598000,True,True,True,False,True,True
Shikhar,True,Top,Data Science,6-Feb-23,14-Jan-23,False,98756078,True,True,True,False,True,True
Shivani,False,Top,Accounting,8-Jan-24,7-Oct-23,True,3550000,True,True,True,False,True,True
Pooja,True,Top,Soft skills,8-Jan-24,7-Oct-23,True,98760432,True,True,True,False,True,True
Vinay,False,Top,ADF,6-Mar-23,7-Feb-23,False,99803421,True,True,True,False,True,True
Sharma,True,Top,ADB,6-Mar-23,7-Feb-23,True,33240000,True,True,True,False,True,True
Sonu,True,Top,ML,6-Jan-25,9-Jan-24,True,15670000,True,True,True,False,True,True
Rakesh,False,Top,databricks,6-Jan-25,9-Jan-24,True,1598000,True,True,True,False,True,True
Roshan,True,Top,Soft skills,6-Apr-23,7-Mar-23,True,18894320,True,True,True,False,True,True
Rithvik,True,Top,python,6-Apr-23,7-Mar-23,True,3550000,True,True,True,False,True,True


#### **3) convert boolean (data type) to integer (data type)**

In [0]:
from pyspark.sql.functions import when

# convert Boolean column to integer column
df_new = df_bool.withColumn('Graduated_int', when(df_bool.Graduated==True, 1).otherwise(0))\
                .select('Graduated', 'Graduated_int')

# view new DataFrame
display(df_new.limit(10))

Graduated,Graduated_int
True,1
False,0
True,1
True,1
False,0
True,1
True,1
True,1
True,1
True,1


In [0]:
# convert Boolean column to integer column
df_bool_status = df_cast.withColumn('Course_Name_int', F.when(F.col("Course_Name") == F.lit("python"), F.lit("true")).otherwise(F.lit("false")))

# view new DataFrame
display(df_bool_status.limit(10))

Name,Boolean,Sensex_Category,Course_Name,Effective_Date,Start_Date,Graduated,Salary,Course_Name_int
Amit,False,Top,python,6-Feb-23,14-Jan-23,True,1598000,True
Shikhar,True,Top,Data Science,6-Feb-23,14-Jan-23,False,98756078,False
Shivani,False,Top,Accounting,8-Jan-24,7-Oct-23,True,3550000,False
Pooja,True,Top,Soft skills,8-Jan-24,7-Oct-23,True,98760432,False
Vinay,False,Top,ADF,6-Mar-23,7-Feb-23,False,99803421,False
Sharma,True,Top,ADB,6-Mar-23,7-Feb-23,True,33240000,False
Sonu,True,Top,ML,6-Jan-25,9-Jan-24,True,15670000,False
Rakesh,False,Top,databricks,6-Jan-25,9-Jan-24,True,1598000,False
Roshan,True,Top,Soft skills,6-Apr-23,7-Mar-23,True,18894320,False
Rithvik,True,Top,python,6-Apr-23,7-Mar-23,True,3550000,True


In [0]:
# convert Boolean column to integer column
df_bool_status_st = df_cast.withColumn('Course_Name_int', F.col("Course_Name") == F.lit("python"))

# view new DataFrame
display(df_bool_status_st.limit(10))

Name,Boolean,Sensex_Category,Course_Name,Effective_Date,Start_Date,Graduated,Salary,Course_Name_int
Amit,False,Top,python,6-Feb-23,14-Jan-23,True,1598000,True
Shikhar,True,Top,Data Science,6-Feb-23,14-Jan-23,False,98756078,False
Shivani,False,Top,Accounting,8-Jan-24,7-Oct-23,True,3550000,False
Pooja,True,Top,Soft skills,8-Jan-24,7-Oct-23,True,98760432,False
Vinay,False,Top,ADF,6-Mar-23,7-Feb-23,False,99803421,False
Sharma,True,Top,ADB,6-Mar-23,7-Feb-23,True,33240000,False
Sonu,True,Top,ML,6-Jan-25,9-Jan-24,True,15670000,False
Rakesh,False,Top,databricks,6-Jan-25,9-Jan-24,True,1598000,False
Roshan,True,Top,Soft skills,6-Apr-23,7-Mar-23,True,18894320,False
Rithvik,True,Top,python,6-Apr-23,7-Mar-23,True,3550000,True
