# 0. **Install PySpark**:

In [8]:
!pip install pyspark



# 1. **Create SparkSession**:

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType

# Configure the builder with the application name, then obtain the session
# (an already-running session is reused instead of creating a second one).
session_builder = SparkSession.builder.appName('SparkByExamples.com')
spark = session_builder.getOrCreate()

- Initializes a Spark session with the application name 'SparkByExamples.com'.


# 2. **Sample Data and Schema**:


In [10]:
# Column names only -- no types are given, so Spark infers them from the data.
columns = ["firstname", "age", "isGraduated", "gender", "salary"]

# Every value below is a Python string (including age, isGraduated and
# salary), which is why all columns show up as `string` in the schema.
simpleData = [
    ("James", "34", "true", "M", "3000.6089"),
    ("Michael", "33", "true", "F", "3300.8067"),
    ("Robert", "37", "false", "M", "5000.5034"),
]

df = spark.createDataFrame(simpleData, columns)

# Show the inferred schema first, then the rows without truncating values.
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+---------+---+-----------+------+---------+
|firstname|age|isGraduated|gender|salary   |
+---------+---+-----------+------+---------+
|James    |34 |true       |M     |3000.6089|
|Michael  |33 |true       |F     |3300.8067|
|Robert   |37 |false      |M     |5000.5034|
+---------+---+-----------+------+---------+



- Defines the sample rows and the list of column names; every value is a string, so Spark infers all columns as `string`.
- Creates a DataFrame from the sample data and schema.
- Prints the schema and displays the DataFrame.


# 3. **Casting Salary to DoubleType**:


In [11]:
# Build the cast expression first: the target type is given by its
# string name ("double").
salary_as_double = df["salary"].cast("double")

# Replace the existing string `salary` column with the casted one.
df1 = df.withColumn("salary", salary_as_double)
df1.printSchema()
df1.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+---+-----------+------+---------+
|firstname|age|isGraduated|gender|salary   |
+---------+---+-----------+------+---------+
|James    |34 |true       |M     |3000.6089|
|Michael  |33 |true       |F     |3300.8067|
|Robert   |37 |false      |M     |5000.5034|
+---------+---+-----------+------+---------+



- Casts the `salary` column to `double` using the string type name.


# 4. **Casting Salary with the `DoubleType()` Object**:


In [12]:
# Same cast as before, but the target type is passed as a DoubleType
# instance instead of the string "double".
double_type = DoubleType()
df2 = df.withColumn("salary", df["salary"].cast(double_type))

df2.printSchema()
df2.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+---+-----------+------+---------+
|firstname|age|isGraduated|gender|salary   |
+---------+---+-----------+------+---------+
|James    |34 |true       |M     |3000.6089|
|Michael  |33 |true       |F     |3300.8067|
|Robert   |37 |false      |M     |5000.5034|
+---------+---+-----------+------+---------+



- Casts the `salary` column to `DoubleType()`.


# 5. **Casting with `col` and the 'double' Type Name**:


In [13]:
from pyspark.sql.functions import col,round,expr

# col("salary") refers to the column by name rather than through `df`;
# the cast itself again uses the string type name.
salary_column = col("salary")
df3 = df.withColumn("salary", salary_column.cast("double"))

df3.printSchema()
df3.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+---+-----------+------+---------+
|firstname|age|isGraduated|gender|salary   |
+---------+---+-----------+------+---------+
|James    |34 |true       |M     |3000.6089|
|Michael  |33 |true       |F     |3300.8067|
|Robert   |37 |false      |M     |5000.5034|
+---------+---+-----------+------+---------+



# 6. **Rounding the Salary to 2 Decimal Places**:


In [14]:
# Step 1: string -> double.
salary_double = col("salary").cast(DoubleType())

# Step 2: round to 2 decimal places. `round` here is the one imported from
# pyspark.sql.functions (it operates on Columns), not Python's builtin.
df4 = df.withColumn("salary", round(salary_double, 2))

df4.printSchema()
df4.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+---+-----------+------+-------+
|firstname|age|isGraduated|gender|salary |
+---------+---+-----------+------+-------+
|James    |34 |true       |M     |3000.61|
|Michael  |33 |true       |F     |3300.81|
|Robert   |37 |false      |M     |5000.5 |
+---------+---+-----------+------+-------+



- Casts the `salary` column to `DoubleType()` and rounds the values to 2 decimal places.


# 7. **Using selectExpr for Casting**:


In [15]:
# SQL expressions to project: two columns passed through unchanged and
# `salary` cast to double. Columns not listed here (age, gender) are dropped.
selected_expressions = [
    "firstname",
    "isGraduated",
    "cast(salary as double) as salary",
]
df5 = df.selectExpr(*selected_expressions)

df5.printSchema()
df5.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+-----------+---------+
|firstname|isGraduated|salary   |
+---------+-----------+---------+
|James    |true       |3000.6089|
|Michael  |true       |3300.8067|
|Robert   |false      |5000.5034|
+---------+-----------+---------+



# 8. **Using SQL for Casting**:


In [17]:
# Register the DataFrame under a name that SQL queries can reference.
df.createOrReplaceTempView("CastExample")

# Perform the cast in plain SQL against the temporary view.
cast_query = "SELECT firstname, isGraduated, CAST(salary AS DOUBLE) as salary FROM CastExample"
df6 = spark.sql(cast_query)

df6.printSchema()
df6.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+-----------+---------+
|firstname|isGraduated|salary   |
+---------+-----------+---------+
|James    |true       |3000.6089|
|Michael  |true       |3300.8067|
|Robert   |false      |5000.5034|
+---------+-----------+---------+

