#### **Question**
**How to convert custom date into timestamp?**

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import regexp_extract, col, from_unixtime, timestamp_millis, substring, length

In [0]:
# Epoch-millisecond values for the three date columns, one tuple per row.
epoch_millis_rows = [
    (1493596800000, 2840054400000, 1540857600000),
    (1537920000000, 2871676800000, 1540944000000),
    (1493510400000, 2871590400000, 1541376000000),
    (1522540800000, 1548028800000, 1541462400000),
    (1522540800000, 2840054400000, 1541548800000),
    (1493596800000, 2366755200000, 1541635200000),
]

# Wrap every value in the custom "/Date(...)/" envelope that this notebook parses,
# e.g. 1493596800000 -> "/Date(1493596800000)/".
data = [tuple(f"/Date({ms})/" for ms in row) for row in epoch_millis_rows]
columns = ["d1", "d2", "d3"]

df = spark.createDataFrame(data, schema=columns)
display(df)

d1,d2,d3
/Date(1493596800000)/,/Date(2840054400000)/,/Date(1540857600000)/
/Date(1537920000000)/,/Date(2871676800000)/,/Date(1540944000000)/
/Date(1493510400000)/,/Date(2871590400000)/,/Date(1541376000000)/
/Date(1522540800000)/,/Date(1548028800000)/,/Date(1541462400000)/
/Date(1522540800000)/,/Date(2840054400000)/,/Date(1541548800000)/
/Date(1493596800000)/,/Date(2366755200000)/,/Date(1541635200000)/


**Method 01**
- Using **from_unixtime + substring + cast**

In [0]:
# For each source column, keep only the digits between "/Date(" and ")/".
# The result is still a string; no numeric cast happens in this cell.
df_substr = df
for src_name in ("d1", "d2", "d3"):
    digits_only = substring(src_name, 7, length(src_name) - 8)
    df_substr = df_substr.withColumn(f"{src_name}_new", digits_only)

display(df_substr)

d1,d2,d3,d1_new,d2_new,d3_new
/Date(1493596800000)/,/Date(2840054400000)/,/Date(1540857600000)/,1493596800000,2840054400000,1540857600000
/Date(1537920000000)/,/Date(2871676800000)/,/Date(1540944000000)/,1537920000000,2871676800000,1540944000000
/Date(1493510400000)/,/Date(2871590400000)/,/Date(1541376000000)/,1493510400000,2871590400000,1541376000000
/Date(1522540800000)/,/Date(1548028800000)/,/Date(1541462400000)/,1522540800000,1548028800000,1541462400000
/Date(1522540800000)/,/Date(2840054400000)/,/Date(1541548800000)/,1522540800000,2840054400000,1541548800000
/Date(1493596800000)/,/Date(2366755200000)/,/Date(1541635200000)/,1493596800000,2366755200000,1541635200000


In [0]:
# Sanity-check the total string length used in the substring arithmetic.
# The variable is called `sample` rather than `input` so that the Python
# builtin input() is not shadowed for the rest of the notebook session.
sample = "/Date(1493596800000)/"
len(sample)

21

- Each row in data contains **3 string fields** like: "/Date(1493596800000)/"
- These strings are **timestamps in Unix Epoch milliseconds** format, **wrapped inside /Date(...)/**
      
       "/Date(1493596800000)/" → actual timestamp is 1493596800000

- **substring('d1', 7, length('d1') - 8)**
  - substring **extracts a portion of a string** from a column.


     "/Date(1493596800000)/"

      INDEX of full string    =>    1   2   3   4   5   6   7   8   9   10   11   12    13   14   15   16   17   18   19   20   21

                                    /   D   a   t   e   (   1   4   9    3    5    9     6    8    0    0    0    0    0    )    /

      INDEX of full substring =>                            1   2   3    4    5    6     7    8    9   10   11   12   13

- The **timestamp starts** at **position 7** right after **/Date(**

- The closing **)/** is at the **end**.
- The **total length** of the **string is 21**.
- **length('d1') - 8 = 21-8 = 13** ensures we exclude **)/** from the end.

**Why -8?**
- **length('d1')** gives **21** for **/Date(1493596800000)/**.
- We **start** extracting at **position 7**.
- The remaining portion to extract is **21 - 8 = 13** (which correctly gives **1493596800000**).

In [0]:
from pyspark.sql.functions import expr

# Method 01 (SQL-expression flavour): the digits are extracted and divided by
# 1000 inside a SQL string passed to expr(), then handed to from_unixtime.
df_epoch_wexpr = df

# 1) from_unixtime with no pattern -> string in the default format.
for n in (1, 2, 3):
    epoch_secs = expr(f"cast(substring(d{n}, 7, length(d{n})-8) as bigint)/1000")
    df_epoch_wexpr = df_epoch_wexpr.withColumn(
        f"dt{n}_default_str", from_unixtime(epoch_secs)
    )

# 2) The same default string, cast to a timestamp column.
for n in (1, 2, 3):
    epoch_secs = expr(f"cast(substring(d{n}, 7, length(d{n})-8) as bigint)/1000")
    df_epoch_wexpr = df_epoch_wexpr.withColumn(
        f"dt{n}_default_timestamp", from_unixtime(epoch_secs).cast("timestamp")
    )

# 3) from_unixtime with an explicit pattern -> custom-formatted string.
for n in (1, 2, 3):
    epoch_secs = expr(f"cast(substring(d{n}, 7, length(d{n})-8) as bigint) / 1000")
    df_epoch_wexpr = df_epoch_wexpr.withColumn(
        f"dt{n}_cust_str", from_unixtime(epoch_secs, "yyyy-MM-dd'T'HH:mm:ss'Z'")
    )

display(df_epoch_wexpr)

d1,d2,d3,dt1_default_str,dt2_default_str,dt3_default_str,dt1_default_timestamp,dt2_default_timestamp,dt3_default_timestamp,dt1_cust_str,dt2_cust_str,dt3_cust_str
/Date(1493596800000)/,/Date(2840054400000)/,/Date(1540857600000)/,2017-05-01 00:00:00,2059-12-31 00:00:00,2018-10-30 00:00:00,2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z,2017-05-01T00:00:00Z,2059-12-31T00:00:00Z,2018-10-30T00:00:00Z
/Date(1537920000000)/,/Date(2871676800000)/,/Date(1540944000000)/,2018-09-26 00:00:00,2060-12-31 00:00:00,2018-10-31 00:00:00,2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z,2018-09-26T00:00:00Z,2060-12-31T00:00:00Z,2018-10-31T00:00:00Z
/Date(1493510400000)/,/Date(2871590400000)/,/Date(1541376000000)/,2017-04-30 00:00:00,2060-12-30 00:00:00,2018-11-05 00:00:00,2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z,2017-04-30T00:00:00Z,2060-12-30T00:00:00Z,2018-11-05T00:00:00Z
/Date(1522540800000)/,/Date(1548028800000)/,/Date(1541462400000)/,2018-04-01 00:00:00,2019-01-21 00:00:00,2018-11-06 00:00:00,2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z,2018-04-01T00:00:00Z,2019-01-21T00:00:00Z,2018-11-06T00:00:00Z
/Date(1522540800000)/,/Date(2840054400000)/,/Date(1541548800000)/,2018-04-01 00:00:00,2059-12-31 00:00:00,2018-11-07 00:00:00,2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z,2018-04-01T00:00:00Z,2059-12-31T00:00:00Z,2018-11-07T00:00:00Z
/Date(1493596800000)/,/Date(2366755200000)/,/Date(1541635200000)/,2017-05-01 00:00:00,2044-12-31 00:00:00,2018-11-08 00:00:00,2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z,2017-05-01T00:00:00Z,2044-12-31T00:00:00Z,2018-11-08T00:00:00Z


- **from_unixtime** converts **Unix time** (seconds since the epoch) into a **human-readable** timestamp string.

      +----------+-------------------+
      |unix_time |timestamp          |
      +----------+-------------------+
      |1648974310|2022-04-03 08:25:10|
      +----------+-------------------+

- **Returns:** a **string**, formatted as **yyyy-MM-dd HH:mm:ss** by default (a different pattern can be passed as the second argument).

In [0]:
from pyspark.sql.functions import from_unixtime, substring, length, col

# Method 01 (Column-API flavour): same result as the expr() version, but the
# epoch-seconds value is built with Column methods instead of a SQL string.
source_numbers = (1, 2, 3)
epoch_secs_by_n = {
    n: substring(f"d{n}", 7, length(f"d{n}") - 8).cast("bigint") / 1000
    for n in source_numbers
}

df_epoch_woexpr = df

# 1) Default-format string.
for n in source_numbers:
    df_epoch_woexpr = df_epoch_woexpr.withColumn(
        f"dt{n}_default_str", from_unixtime(epoch_secs_by_n[n])
    )

# 2) Default-format string cast to timestamp.
for n in source_numbers:
    df_epoch_woexpr = df_epoch_woexpr.withColumn(
        f"dt{n}_default_timestamp",
        from_unixtime(epoch_secs_by_n[n]).cast("timestamp"),
    )

# 3) Custom-pattern string.
for n in source_numbers:
    df_epoch_woexpr = df_epoch_woexpr.withColumn(
        f"dt{n}_cust_str",
        from_unixtime(epoch_secs_by_n[n], "yyyy-MM-dd'T'HH:mm:ss'Z'"),
    )

display(df_epoch_woexpr)

d1,d2,d3,dt1_default_str,dt2_default_str,dt3_default_str,dt1_default_timestamp,dt2_default_timestamp,dt3_default_timestamp,dt1_cust_str,dt2_cust_str,dt3_cust_str
/Date(1493596800000)/,/Date(2840054400000)/,/Date(1540857600000)/,2017-05-01 00:00:00,2059-12-31 00:00:00,2018-10-30 00:00:00,2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z,2017-05-01T00:00:00Z,2059-12-31T00:00:00Z,2018-10-30T00:00:00Z
/Date(1537920000000)/,/Date(2871676800000)/,/Date(1540944000000)/,2018-09-26 00:00:00,2060-12-31 00:00:00,2018-10-31 00:00:00,2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z,2018-09-26T00:00:00Z,2060-12-31T00:00:00Z,2018-10-31T00:00:00Z
/Date(1493510400000)/,/Date(2871590400000)/,/Date(1541376000000)/,2017-04-30 00:00:00,2060-12-30 00:00:00,2018-11-05 00:00:00,2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z,2017-04-30T00:00:00Z,2060-12-30T00:00:00Z,2018-11-05T00:00:00Z
/Date(1522540800000)/,/Date(1548028800000)/,/Date(1541462400000)/,2018-04-01 00:00:00,2019-01-21 00:00:00,2018-11-06 00:00:00,2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z,2018-04-01T00:00:00Z,2019-01-21T00:00:00Z,2018-11-06T00:00:00Z
/Date(1522540800000)/,/Date(2840054400000)/,/Date(1541548800000)/,2018-04-01 00:00:00,2059-12-31 00:00:00,2018-11-07 00:00:00,2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z,2018-04-01T00:00:00Z,2059-12-31T00:00:00Z,2018-11-07T00:00:00Z
/Date(1493596800000)/,/Date(2366755200000)/,/Date(1541635200000)/,2017-05-01 00:00:00,2044-12-31 00:00:00,2018-11-08 00:00:00,2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z,2017-05-01T00:00:00Z,2044-12-31T00:00:00Z,2018-11-08T00:00:00Z


**Method 02**
- Using **timestamp_millis + substring + cast**

In [0]:
from pyspark.sql.functions import substring, length, col, timestamp_millis

# Method 02: keep the value in milliseconds and let timestamp_millis build the
# timestamp directly, so no division by 1000 is needed.
df_epoch_millis = df

# 1) Extract the digits and cast them to bigint (epoch milliseconds).
for src_name in ("d1", "d2", "d3"):
    millis_value = substring(src_name, 7, length(src_name) - 8).cast("bigint")
    df_epoch_millis = df_epoch_millis.withColumn(f"millis_{src_name}", millis_value)

# 2) Milliseconds -> timestamp column.
for n in (1, 2, 3):
    df_epoch_millis = df_epoch_millis.withColumn(
        f"dt{n}_default_timestamp", timestamp_millis(col(f"millis_d{n}"))
    )

# 3) Timestamp rendered as a string. Despite the "_cust_str" suffix, no custom
#    pattern is supplied here: this is the plain cast-to-string rendering.
for n in (1, 2, 3):
    df_epoch_millis = df_epoch_millis.withColumn(
        f"dt{n}_cust_str", timestamp_millis(col(f"millis_d{n}")).cast("string")
    )

display(df_epoch_millis)

d1,d2,d3,millis_d1,millis_d2,millis_d3,dt1_default_timestamp,dt2_default_timestamp,dt3_default_timestamp,dt1_cust_str,dt2_cust_str,dt3_cust_str
/Date(1493596800000)/,/Date(2840054400000)/,/Date(1540857600000)/,1493596800000,2840054400000,1540857600000,2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z,2017-05-01 00:00:00,2059-12-31 00:00:00,2018-10-30 00:00:00
/Date(1537920000000)/,/Date(2871676800000)/,/Date(1540944000000)/,1537920000000,2871676800000,1540944000000,2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z,2018-09-26 00:00:00,2060-12-31 00:00:00,2018-10-31 00:00:00
/Date(1493510400000)/,/Date(2871590400000)/,/Date(1541376000000)/,1493510400000,2871590400000,1541376000000,2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z,2017-04-30 00:00:00,2060-12-30 00:00:00,2018-11-05 00:00:00
/Date(1522540800000)/,/Date(1548028800000)/,/Date(1541462400000)/,1522540800000,1548028800000,1541462400000,2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z,2018-04-01 00:00:00,2019-01-21 00:00:00,2018-11-06 00:00:00
/Date(1522540800000)/,/Date(2840054400000)/,/Date(1541548800000)/,1522540800000,2840054400000,1541548800000,2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z,2018-04-01 00:00:00,2059-12-31 00:00:00,2018-11-07 00:00:00
/Date(1493596800000)/,/Date(2366755200000)/,/Date(1541635200000)/,1493596800000,2366755200000,1541635200000,2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z,2017-05-01 00:00:00,2044-12-31 00:00:00,2018-11-08 00:00:00


**Method 03**
- Using **regexp_extract + from_unixtime**

In [0]:
# Method 03: pull the first run of digits out of each value with a regex,
# convert milliseconds to seconds, and turn the result into a timestamp.
# The original columns d1..d3 are overwritten in the result (df itself is untouched).
df2 = df
for src_name in ("d1", "d2", "d3"):
    digits = regexp_extract(col(src_name), r"(\d+)", 1)
    epoch_secs = digits.cast("bigint") / 1000
    df2 = df2.withColumn(src_name, from_unixtime(epoch_secs).cast("timestamp"))

# Show the converted DataFrame
display(df2)

d1,d2,d3
2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z
2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z
2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z
2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z
2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z
2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z


- **regexp_extract(col("d1"), r"(\d+)", 1)** => Extracts the **numeric** part from the **string** (the first capture group of the first run of digits).

- **.cast("bigint")** => Converts it into a **long** integer.

- **/ 1000** => Converts **milliseconds to seconds**.

- **from_unixtime(...).cast("timestamp")** => Converts it into a proper **timestamp**.

**Method 04**
- Using **regexp_replace + cast**.
- Removes **all non-numeric characters** by replacing them with an empty string using **regexp_replace**.

In [0]:
from pyspark.sql.functions import regexp_replace

# Method 4: delete every character that is not a digit, then convert the
# remaining epoch-millisecond value to a timestamp.
df4 = df
for src_name in ("d1", "d2", "d3"):
    digits = regexp_replace(col(src_name), "[^0-9]", "")
    epoch_secs = digits.cast("bigint") / 1000
    df4 = df4.withColumn(src_name, from_unixtime(epoch_secs).cast("timestamp"))

display(df4)

d1,d2,d3
2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z
2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z
2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z
2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z
2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z
2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z


**Method 05**
- Using **translate + cast**
- Uses **translate()** to **remove unwanted characters** efficiently.

In [0]:
from pyspark.sql.functions import translate

# Method 5: translate() with an empty replacement string removes the listed
# wrapper characters (/, D, a, t, e, parentheses), leaving only the digits.
df3 = df
for src_name in ("d1", "d2", "d3"):
    digits = translate(col(src_name), "/Date()", "")
    epoch_secs = digits.cast("bigint") / 1000
    df3 = df3.withColumn(src_name, from_unixtime(epoch_secs).cast("timestamp"))

display(df3)

d1,d2,d3
2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z
2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z
2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z
2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z
2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z
2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z


**Method 06**
- Using **UDF (User-Defined Function)**

In [0]:
# Plain-Python preview of the parsing step used inside the UDF below.
# str.strip() treats its argument as a *set* of characters and removes any of
# them from both ends of the string; it does not match a literal prefix/suffix.
date_str = "/Date(1493596800000)/"
stripped_digits = date_str.strip("/Date()/")
millis = int(stripped_digits)
display(millis)

1493596800000

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import TimestampType
import datetime

def convert_to_timestamp(date_str):
    """Convert a "/Date(<epoch-millis>)/" string into a Python datetime.

    Args:
        date_str: A value such as "/Date(1493596800000)/", or None.

    Returns:
        A naive datetime.datetime as produced by datetime.fromtimestamp
        (i.e. expressed in the local time zone of the Python process), or
        None when date_str is None.

    Raises:
        ValueError: If the text left after stripping the wrapper characters
            is not a valid integer.
    """
    # A null column value reaches a Python UDF as None; pass it through
    # instead of failing the whole job with AttributeError on None.strip().
    if date_str is None:
        return None
    # strip() removes any of the characters / D a t e ( ) from both ends,
    # which leaves just the epoch-millisecond digits.
    millis = int(date_str.strip("/Date()/"))
    # fromtimestamp expects seconds, so scale milliseconds down by 1000.
    return datetime.datetime.fromtimestamp(millis / 1000)

# Wrap the Python function as a Spark UDF whose declared return type is TimestampType.
convert_udf = udf(convert_to_timestamp, TimestampType())

# Apply the UDF to each date column, overwriting d1..d3 in the result
# (the source DataFrame df is left unchanged).
df5 = df
for src_name in ("d1", "d2", "d3"):
    df5 = df5.withColumn(src_name, convert_udf(col(src_name)))

display(df5)

d1,d2,d3
2017-05-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-10-30T00:00:00.000Z
2018-09-26T00:00:00.000Z,2060-12-31T00:00:00.000Z,2018-10-31T00:00:00.000Z
2017-04-30T00:00:00.000Z,2060-12-30T00:00:00.000Z,2018-11-05T00:00:00.000Z
2018-04-01T00:00:00.000Z,2019-01-21T00:00:00.000Z,2018-11-06T00:00:00.000Z
2018-04-01T00:00:00.000Z,2059-12-31T00:00:00.000Z,2018-11-07T00:00:00.000Z
2017-05-01T00:00:00.000Z,2044-12-31T00:00:00.000Z,2018-11-08T00:00:00.000Z
