In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
spark = SparkSession.builder\
                    .master("local[3]")\
                    .appName("case_otherwise")\
                    .getOrCreate()

23/12/03 12:09:36 WARN Utils: Your hostname, debian resolves to a loopback address: 127.0.1.1; using 192.168.1.18 instead (on interface wlp0s20f3)
23/12/03 12:09:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/12/03 12:09:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
opt = {"header":True, "inferSchema":True, "escape":"\"", "mode":"permissive"}
employee_df = spark.read.format("csv")\
                        .options(**opt)\
                        .load("employee_data.csv")

In [6]:
# employee_df.show()
# employee_df.count()
employee_df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- Name Prefix: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Middle Initial: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- E_Mail: string (nullable = true)
 |-- Father's_Name: string (nullable = true)
 |-- Mother's_Name: string (nullable = true)
 |-- Mother's_Maiden_Name: string (nullable = true)
 |-- Date_of_Birth: string (nullable = true)
 |-- Time_of_Birth: string (nullable = true)
 |-- Age_in_Yrs: double (nullable = true)
 |-- Weight_in_Kgs.: integer (nullable = true)
 |-- Date_of_Joining: string (nullable = true)
 |-- Quarter_of_Joining: string (nullable = true)
 |-- Half_of_Joining: string (nullable = true)
 |-- Year_of_Joining: integer (nullable = true)
 |-- Month_of_Joining: integer (nullable = true)
 |-- Month_Name_of_Joining: string (nullable = true)
 |-- Short_Month: string (nullable = true)
 |-- Day_of_Joining: integer (nullable = true)
 |-

In [7]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
employee_df.select("First Name", "Last Name", "Date_of_Birth")\
    .withColumn(
        "new_date", to_date(col("Date_of_Birth"), "MM/dd/yyyy")
    ).show()

+----------+---------+-------------+----------+
|First Name|Last Name|Date_of_Birth|  new_date|
+----------+---------+-------------+----------+
|  Serafina|Bumgarner|    9/21/1982|1982-09-21|
|  Juliette|     Rojo|   05-08-1967|      null|
|     Milan| Krawczyk|   04-04-1980|      null|
|     Elmer|    Jason|   04-09-1996|      null|
|     Zelda|   Forest|   11/27/1959|1959-11-27|
|     Rhett|      Wan|    7/14/1976|1976-07-14|
|       Hal|   Farrow|    3/15/1967|1967-03-15|
|       Del|Fernandez|    8/13/1991|1991-08-13|
|     Corey|  Jackman|   04-12-1959|      null|
|      Bibi|  Paddock|   10/20/1991|1991-10-20|
|      Eric|  Manning|   11-02-1980|      null|
|   Renetta|   Hafner|    1/29/1975|1975-01-29|
|       Paz|  Pearman|    2/28/1960|1960-02-28|
|    Ardath|   Forman|   11-12-1982|      null|
|     Nanci|   Osorio|   07-09-1982|      null|
|  Maricela|   Simard|    7/21/1988|1988-07-21|
|   Avelina|   Stoner|   10-01-1988|      null|
| Christene| Mattison|    9/14/1990|1990

In [15]:
df = employee_df.select("First Name","Date_of_Birth").withColumn(
    "formated_Date",
    when(col("Date_of_Birth").contains("/"),
         to_date(col("Date_of_Birth"), "MM/dd/yyyy")
         )
    .when(col("Date_of_Birth").contains("-"),
          to_date(col("Date_of_Birth"), "MM-dd-yyyy")
        )
    .otherwise("None")
        )
df.show()

+----------+-------------+-------------+
|First Name|Date_of_Birth|formated_Date|
+----------+-------------+-------------+
|  Serafina|    9/21/1982|   1982-09-21|
|  Juliette|   05-08-1967|   1967-05-08|
|     Milan|   04-04-1980|   1980-04-04|
|     Elmer|   04-09-1996|   1996-04-09|
|     Zelda|   11/27/1959|   1959-11-27|
|     Rhett|    7/14/1976|   1976-07-14|
|       Hal|    3/15/1967|   1967-03-15|
|       Del|    8/13/1991|   1991-08-13|
|     Corey|   04-12-1959|   1959-04-12|
|      Bibi|   10/20/1991|   1991-10-20|
|      Eric|   11-02-1980|   1980-11-02|
|   Renetta|    1/29/1975|   1975-01-29|
|       Paz|    2/28/1960|   1960-02-28|
|    Ardath|   11-12-1982|   1982-11-12|
|     Nanci|   07-09-1982|   1982-07-09|
|  Maricela|    7/21/1988|   1988-07-21|
|   Avelina|   10-01-1988|   1988-10-01|
| Christene|    9/14/1990|   1990-09-14|
|    Stefan|    3/23/1990|   1990-03-23|
|   Gillian|    1/17/1960|   1960-01-17|
+----------+-------------+-------------+
only showing top

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_utc_timestamp

# Create SparkSession
spark = SparkSession.builder \
    .appName("CST to IST Conversion") \
    .getOrCreate()

# Sample input data with CST timestamps
data = [("2024-03-28 10:00:00",),
        ("2024-03-29 15:30:00",),
        ("2024-03-30 18:45:00",)]

# Create DataFrame
df = spark.createDataFrame(data, ["cst_timestamp"])

# Convert CST to IST
result_df = df.withColumn("ist_timestamp", from_utc_timestamp("cst_timestamp", "IST"))

# Show input DataFrame
print("Input DataFrame:")
df.show()

# Show output DataFrame
print("Output DataFrame after converting CST to IST:")
result_df.show()


24/03/28 20:14:06 WARN Utils: Your hostname, debian resolves to a loopback address: 127.0.1.1; using 192.168.1.13 instead (on interface wlp0s20f3)
24/03/28 20:14:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/28 20:14:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Input DataFrame:


                                                                                

+-------------------+
|      cst_timestamp|
+-------------------+
|2024-03-28 10:00:00|
|2024-03-29 15:30:00|
|2024-03-30 18:45:00|
+-------------------+

Output DataFrame after converting CST to IST:
+-------------------+-------------------+
|      cst_timestamp|      ist_timestamp|
+-------------------+-------------------+
|2024-03-28 10:00:00|2024-03-28 15:30:00|
|2024-03-29 15:30:00|2024-03-29 21:00:00|
|2024-03-30 18:45:00|2024-03-31 00:15:00|
+-------------------+-------------------+

