In [1]:
!pip install -q pyspark spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.5/579.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import sparknlp
from sparknlp.annotator import DocumentAssembler, DateMatcher, MultiDateMatcher
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline

spark = sparknlp.start()
spark

## Comparing DateMatcher and MultiDateMatcher

In [None]:
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

date = DateMatcher() \
.setInputCols("document") \
.setOutputCol("date") \
.setOutputFormat("yyyy/MM/dd")

multiDate = MultiDateMatcher() \
.setInputCols("document") \
.setOutputCol("multi_date") \
.setOutputFormat("MM/dd/yy")

pipeline = Pipeline().setStages([
    documentAssembler,
    date,
    multiDate
])

text_list = ["See you on next monday.",  "She was born on 02/03/1966.", "The project started yesterday and will finish next year.",
             "She will graduate by July 2023.", "She will visit doctor tomorrow and next month again."]


spark_df = spark.createDataFrame(text_list, StringType()).toDF("text")



In [None]:
result = pipeline.fit(spark_df).transform(spark_df)
result.selectExpr("text", "date.result as date", "multi_date.result as multi_date").show(truncate=False)

+--------------------------------------------------------+------------+--------------------+
|text                                                    |date        |multi_date          |
+--------------------------------------------------------+------------+--------------------+
|See you on next monday.                                 |[2024/09/23]|[09/23/24]          |
|She was born on 02/03/1966.                             |[1966/02/03]|[02/03/66]          |
|The project started yesterday and will finish next year.|[2025/09/16]|[09/16/25, 09/15/24]|
|She will graduate by July 2023.                         |[2023/07/01]|[07/01/23]          |
|She will visit doctor tomorrow and next month again.    |[2024/10/16]|[10/16/24, 09/17/24]|
+--------------------------------------------------------+------------+--------------------+



In [None]:
result = pipeline.fit(spark_df).transform(spark_df)
result.selectExpr("text", "date.result", "multi_date.result").show(truncate=False)

+--------------------------------------------------------+------------+--------------------+
|text                                                    |result      |result              |
+--------------------------------------------------------+------------+--------------------+
|See you on next monday.                                 |[2024/09/23]|[09/23/24]          |
|She was born on 02/03/1966.                             |[1966/02/03]|[02/03/66]          |
|The project started yesterday and will finish next year.|[2025/09/16]|[09/16/25, 09/15/24]|
|She will graduate by July 2023.                         |[2023/07/01]|[07/01/23]          |
|She will visit doctor tomorrow and next month again.    |[2024/10/16]|[10/16/24, 09/17/24]|
+--------------------------------------------------------+------------+--------------------+



In [None]:
result.select("date", "multi_date").show(truncate=False)

+-------------------------------------------------+----------------------------------------------------------------------------------------------+
|date                                             |multi_date                                                                                    |
+-------------------------------------------------+----------------------------------------------------------------------------------------------+
|[{date, 11, 18, 2024/09/23, {sentence -> 0}, []}]|[{date, 11, 18, 09/23/24, {sentence -> 0}, []}]                                               |
|[{date, 16, 25, 1966/02/03, {sentence -> 0}, []}]|[{date, 16, 25, 02/03/66, {sentence -> 0}, []}]                                               |
|[{date, 46, 54, 2025/09/16, {sentence -> 0}, []}]|[{date, 46, 54, 09/16/25, {sentence -> 0}, []}, {date, 20, 28, 09/15/24, {sentence -> 0}, []}]|
|[{date, 21, 29, 2023/07/01, {sentence -> 0}, []}]|[{date, 21, 29, 07/01/23, {sentence -> 0}, []}]                    

## Relative Dates

DateMatcher and MultiDateMatcher annotators return relative dates as actual dates. But in this situation, we need to provide a reference point for the date. To accomplish this, an anchor date should be set, so the actual date can be calculated. These reference date parameters can be set by setAnchorDateDay(), setAnchorDateMonth(), setAnchorDateYear().

If an anchor date parameter is not set, the current day or current month or current year will be set as the default value.

In [None]:
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

multiDate = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("multi_date") \
    .setOutputFormat("MM/dd/yyyy")\
    .setAnchorDateYear(2005)\
    .setAnchorDateMonth(1)\
    .setAnchorDateDay(13)\

multiDate_no_day = MultiDateMatcher() \
.setInputCols("document") \
.setOutputCol("multi_date_no_day") \
.setOutputFormat("MM/dd/yyyy") \
.setAnchorDateYear(2005) \
.setAnchorDateMonth(1) \

pipeline = Pipeline().setStages([
    documentAssembler,
    multiDate,
    multiDate_no_day])


text_list = ["See you on next monday.",  "She was born on 02/03/1966.", "The project started on yesterday and will finish next year.",
             "She will graduate by July 2023.", "She will visit doctor tomorrow and next month again."]

spark_df = spark.createDataFrame(text_list, StringType()).toDF("text")

In [None]:
result = pipeline.fit(spark_df).transform(spark_df)
result.selectExpr("text", "multi_date.result as multi_date", "multi_date_no_day.result as multi_date_no_day").show(truncate=False)

+-----------------------------------------------------------+------------------------+------------------------+
|text                                                       |multi_date              |multi_date_no_day       |
+-----------------------------------------------------------+------------------------+------------------------+
|See you on next monday.                                    |[01/17/2005]            |[01/17/2005]            |
|She was born on 02/03/1966.                                |[02/03/1966]            |[02/03/1966]            |
|The project started on yesterday and will finish next year.|[01/13/2006, 01/12/2005]|[01/16/2006, 01/15/2005]|
|She will graduate by July 2023.                            |[07/01/2023]            |[07/01/2023]            |
|She will visit doctor tomorrow and next month again.       |[02/13/2005, 01/14/2005]|[02/16/2005, 01/17/2005]|
+-----------------------------------------------------------+------------------------+------------------

## Date Formats

In [None]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

multiDate_1 = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("multi_date_1") \
    .setOutputFormat("MM/dd/yy")

multiDate_2 = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("multi_date_2") \
    .setOutputFormat("MMMM dd, yyyy")


multiDate_3 = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("multi_date_3") \
    .setInputFormats(["dd/MM/yyyy"]) \
    .setOutputFormat(", EEEEMM/dd/yyyy")

pipeline = Pipeline().setStages([
  documentAssembler,
  multiDate_1,
  multiDate_2,
  multiDate_3
])

text_list = ["See you on 1st December 2004.",  "She was born on 02/03/1966.", "The project started on yesterday and will finish next year.",
             "She will graduate by July 2023.", "She will visit doctor tomorrow and next month again."]

spark_df = spark.createDataFrame(text_list, StringType()).toDF("text")

In [None]:
result = pipeline.fit(spark_df).transform(spark_df)
result.selectExpr("text", "multi_date_1.result as multi_date_1", "multi_date_2.result as multi_date_2", "multi_date_3.result as multi_date_3").show(truncate=False)

+-----------------------------------------------------------+--------------------+----------------------------------------+-----------------------+
|text                                                       |multi_date_1        |multi_date_2                            |multi_date_3           |
+-----------------------------------------------------------+--------------------+----------------------------------------+-----------------------+
|See you on 1st December 2004.                              |[12/01/04]          |[December 01, 2004]                     |[]                     |
|She was born on 02/03/1966.                                |[02/03/66]          |[February 03, 1966]                     |[, Wednesday03/02/1966]|
|The project started on yesterday and will finish next year.|[09/16/25, 09/15/24]|[September 16, 2025, September 15, 2024]|[]                     |
|She will graduate by July 2023.                            |[07/01/23]          |[July 01, 2023]               

## Missing Days

Sometimes in a date expression, days are not specified. For example "She will graduate by July 2023". In this situation one can set a default day value for missing days using setDefaultDayWhenMissing. If it is not set, default value is 1.

In [4]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

multiDate = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("date")

multiDate_missing_day_set = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("date_missing_day_set") \
    .setDefaultDayWhenMissing(15)

pipeline = Pipeline().setStages([
    documentAssembler,
    multiDate,
    multiDate_missing_day_set
])

text_list = ["See you on 1st December 2004.",  "She was born on 02/03/1966.", "The project started on yesterday and will finish next year.",
             "She will graduate by July 2023.", "She will visit doctor tomorrow and next month again."]

spark_df = spark.createDataFrame(text_list, StringType()).toDF("text")

result = pipeline.fit(spark_df).transform(spark_df)
result.selectExpr("text", "date.result as date", "date_missing_day_set.result as date_missing_day_set").show(truncate=False)

+-----------------------------------------------------------+------------------------+------------------------+
|text                                                       |date                    |date_missing_day_set    |
+-----------------------------------------------------------+------------------------+------------------------+
|See you on 1st December 2004.                              |[2004/12/01]            |[2004/12/01]            |
|She was born on 02/03/1966.                                |[1966/02/03]            |[1966/02/03]            |
|The project started on yesterday and will finish next year.|[2025/09/17, 2024/09/16]|[2025/09/17, 2024/09/16]|
|She will graduate by July 2023.                            |[2023/07/01]            |[2023/07/15]            |
|She will visit doctor tomorrow and next month again.       |[2024/10/17, 2024/09/18]|[2024/10/17, 2024/09/18]|
+-----------------------------------------------------------+------------------------+------------------

## Other Languages
Date matchers can be used with other languages. Its default value is "en"-English.

In [5]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

multiDate = MultiDateMatcher() \
    .setInputCols("document") \
    .setOutputCol("multi_date") \
    .setOutputFormat("yyyy/MM/dd") \
    .setSourceLanguage("de")

pipeline = Pipeline().setStages([
    documentAssembler,
    multiDate
])

spark_df = spark.createDataFrame([["Das letzte zahlungsdatum dieser rechnung ist der 4. mai 1998."], ["Wir haben morgen eine prüfung."]]).toDF("text")

result = pipeline.fit(spark_df).transform(spark_df)
result.selectExpr("text", "multi_date.result as date").show(truncate=False)

+-------------------------------------------------------------+------------+
|text                                                         |date        |
+-------------------------------------------------------------+------------+
|Das letzte zahlungsdatum dieser rechnung ist der 4. mai 1998.|[1998/05/04]|
|Wir haben morgen eine prüfung.                               |[2024/09/18]|
+-------------------------------------------------------------+------------+



Date matchers can extract dates from other languages. In the above German example, the first row contains an actual date while the second one has a relative date (morgen means tomorrow in English). They are formatted in the desired output format.