##### locate()

- The **locate()** function in PySpark is used to find the **position of a substring** within a **string**.

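- It works like SQL's **INSTR()** or **POSITION()** functions. Note that **INSTR(str, substr)** takes its arguments in the opposite order from **locate(substr, str)**.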

- The returned position is **1-based**, not **zero-based**. It returns **0 if substr cannot be found in str**.

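- It locates the position of the **first occurrence** of substr in a string column, starting the search at position pos. The returned position is still counted from the beginning of the string, not from pos.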

- If there is **more than one occurrence** of substr in the string, the result is the **position** of the **first occurrence**.

##### Syntax

     locate(substr, str[, pos])

**substr:** the substring to find

**str:** the column where you want to search

**pos (optional):** the position to start searching from (1-based index; defaults to 1)

In [0]:
from pyspark.sql.functions import substring, concat, lit, col, expr, locate

In [0]:
# Sample rows as (Id, Type, Pattern) tuples. Each Pattern is a run of
# ';'-terminated key=value fields; the number of fields differs per row.
data = [
    (1, "Weekly", "Repeat=yearly;term=1;endtime=20240131T000000Z;pos=1;day=TH;month=2;"),
    (2, "Assurance test", "Repeat=yearly;term=1;monthday=12;month=1;endtime=20230112T000000Z;"),
    (3, "MSC Audit", "Repeat=yearly;term=2;COUNT=10;month=8;monthday=19;"),
    (4, "Regression Test", "Repeat=monthly;term=2;COUNT=2;monthday=9;"),
    (5, "Lab Safety", "Repeat=monthly;term=3;COUNT=10;"),
    (6, "Testing", "Repeat=monthly;term=6;"),
    (7, "test", "Repeat=monthly;term=1;"),
]

# Only column names are given here; the column types are inferred from the tuples.
schema = ["Id", "Type", "Pattern"]

df = spark.createDataFrame(data=data, schema=schema)

# Show the rows, then the inferred schema.
display(df)
df.printSchema()

Id,Type,Pattern
1,Weekly,Repeat=yearly;term=1;endtime=20240131T000000Z;pos=1;day=TH;month=2;
2,Assurance test,Repeat=yearly;term=1;monthday=12;month=1;endtime=20230112T000000Z;
3,MSC Audit,Repeat=yearly;term=2;COUNT=10;month=8;monthday=19;
4,Regression Test,Repeat=monthly;term=2;COUNT=2;monthday=9;
5,Lab Safety,Repeat=monthly;term=3;COUNT=10;
6,Testing,Repeat=monthly;term=6;
7,test,Repeat=monthly;term=1;


root
 |-- Id: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Pattern: string (nullable = true)



In [0]:
# Add "loc": the 1-based position of the first ';' in Pattern.
# No start position is passed, so the search begins at the start of the string.
df1 = df.withColumn(
    "loc",
    locate(";", col("Pattern")),
)
display(df1)

Id,Type,Pattern,loc
1,Weekly,Repeat=yearly;term=1;endtime=20240131T000000Z;pos=1;day=TH;month=2;,14
2,Assurance test,Repeat=yearly;term=1;monthday=12;month=1;endtime=20230112T000000Z;,14
3,MSC Audit,Repeat=yearly;term=2;COUNT=10;month=8;monthday=19;,14
4,Regression Test,Repeat=monthly;term=2;COUNT=2;monthday=9;,15
5,Lab Safety,Repeat=monthly;term=3;COUNT=10;,15
6,Testing,Repeat=monthly;term=6;,15
7,test,Repeat=monthly;term=1;,15


In [0]:
# Split Pattern at its first ';' into the leading field ("Repeat") and the
# remainder ("Frequency").
df_01 = (
    df
    # loc1: 1-based position of the first ';' in Pattern, searching from position 1.
    .withColumn("loc1", locate(";", col("Pattern"), 1))
    # Repeat: the loc1-1 characters in front of that ';'. SQL substring() is
    # 1-based, so the start is written as 1 rather than relying on Spark
    # treating a start position of 0 as the first character.
    .withColumn("Repeat", expr("substring(Pattern, 1, loc1 - 1)"))
    # Frequency: everything after the ';'. length(Pattern) is simply an upper
    # bound on how many characters can remain.
    .withColumn("Frequency", expr("substring(Pattern, loc1 + 1, length(Pattern))"))
)

# Show the source column, the located position and the two pieces side by side.
df_01_Repeat = df_01.select("Id", "Type", "Pattern", "loc1", "Repeat", "Frequency")
display(df_01_Repeat)
df_01_Repeat.printSchema()

Id,Type,Pattern,loc1,Repeat,Frequency
1,Weekly,Repeat=yearly;term=1;endtime=20240131T000000Z;pos=1;day=TH;month=2;,14,Repeat=yearly,term=1;endtime=20240131T000000Z;pos=1;day=TH;month=2;
2,Assurance test,Repeat=yearly;term=1;monthday=12;month=1;endtime=20230112T000000Z;,14,Repeat=yearly,term=1;monthday=12;month=1;endtime=20230112T000000Z;
3,MSC Audit,Repeat=yearly;term=2;COUNT=10;month=8;monthday=19;,14,Repeat=yearly,term=2;COUNT=10;month=8;monthday=19;
4,Regression Test,Repeat=monthly;term=2;COUNT=2;monthday=9;,15,Repeat=monthly,term=2;COUNT=2;monthday=9;
5,Lab Safety,Repeat=monthly;term=3;COUNT=10;,15,Repeat=monthly,term=3;COUNT=10;
6,Testing,Repeat=monthly;term=6;,15,Repeat=monthly,term=6;
7,test,Repeat=monthly;term=1;,15,Repeat=monthly,term=1;


root
 |-- Id: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Pattern: string (nullable = true)
 |-- loc1: integer (nullable = true)
 |-- Repeat: string (nullable = true)
 |-- Frequency: string (nullable = true)



In [0]:
# Split Frequency at its first ';' into the leading field ("term") and the
# remainder ("Frequency2").
df_02 = (
    df_01
    # loc2: 1-based position of the first ';' in Frequency.
    .withColumn("loc2", locate(";", col("Frequency"), 1))
    # term: the loc2-1 characters in front of that ';'. SQL substring() is
    # 1-based, so the start is written as 1 rather than relying on Spark
    # treating a start position of 0 as the first character.
    .withColumn("term", expr("substring(Frequency, 1, loc2 - 1)"))
    # Frequency2: everything after the ';'; length(Frequency) is just an upper bound.
    .withColumn("Frequency2", expr("substring(Frequency, loc2 + 1, length(Frequency))"))
)

# Show the input column, the located position and the two pieces side by side.
df_02_term = df_02.select("Id", "Type", "Frequency", "Repeat", "loc2", "term", "Frequency2")
display(df_02_term)
df_02_term.printSchema()

Id,Type,Frequency,Repeat,loc2,term,Frequency2
1,Weekly,term=1;endtime=20240131T000000Z;pos=1;day=TH;month=2;,Repeat=yearly,7,term=1,endtime=20240131T000000Z;pos=1;day=TH;month=2;
2,Assurance test,term=1;monthday=12;month=1;endtime=20230112T000000Z;,Repeat=yearly,7,term=1,monthday=12;month=1;endtime=20230112T000000Z;
3,MSC Audit,term=2;COUNT=10;month=8;monthday=19;,Repeat=yearly,7,term=2,COUNT=10;month=8;monthday=19;
4,Regression Test,term=2;COUNT=2;monthday=9;,Repeat=monthly,7,term=2,COUNT=2;monthday=9;
5,Lab Safety,term=3;COUNT=10;,Repeat=monthly,7,term=3,COUNT=10;
6,Testing,term=6;,Repeat=monthly,7,term=6,
7,test,term=1;,Repeat=monthly,7,term=1,


root
 |-- Id: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Frequency: string (nullable = true)
 |-- Repeat: string (nullable = true)
 |-- loc2: integer (nullable = true)
 |-- term: string (nullable = true)
 |-- Frequency2: string (nullable = true)



In [0]:
# Split Frequency2 at its first ';' into the leading field ("count") and the
# remainder ("Frequency3").
# NOTE: the split is positional. "count" receives whatever field comes third in
# Pattern, which is not always a COUNT=... field (see the displayed rows).
df_03 = (
    df_02
    # loc3: 1-based position of the first ';' in Frequency2; 0 when Frequency2
    # has no ';' left (e.g. when it is already empty).
    .withColumn("loc3", locate(";", col("Frequency2"), 1))
    # count: the loc3-1 characters in front of that ';'. SQL substring() is
    # 1-based, so the start is written as 1 rather than relying on Spark
    # treating a start position of 0 as the first character.
    .withColumn("count", expr("substring(Frequency2, 1, loc3 - 1)"))
    # Frequency3: everything after the ';'; length(Frequency2) is just an upper bound.
    .withColumn("Frequency3", expr("substring(Frequency2, loc3 + 1, length(Frequency2))"))
)

# Show the input column, the located position and the two pieces side by side.
df_03_term = df_03.select("Id", "Type", "Frequency2", "Repeat", "term", "loc3", "count", "Frequency3")
display(df_03_term)
df_03_term.printSchema()

Id,Type,Frequency2,Repeat,term,loc3,count,Frequency3
1,Weekly,endtime=20240131T000000Z;pos=1;day=TH;month=2;,Repeat=yearly,term=1,25,endtime=20240131T000000Z,pos=1;day=TH;month=2;
2,Assurance test,monthday=12;month=1;endtime=20230112T000000Z;,Repeat=yearly,term=1,12,monthday=12,month=1;endtime=20230112T000000Z;
3,MSC Audit,COUNT=10;month=8;monthday=19;,Repeat=yearly,term=2,9,COUNT=10,month=8;monthday=19;
4,Regression Test,COUNT=2;monthday=9;,Repeat=monthly,term=2,8,COUNT=2,monthday=9;
5,Lab Safety,COUNT=10;,Repeat=monthly,term=3,9,COUNT=10,
6,Testing,,Repeat=monthly,term=6,0,,
7,test,,Repeat=monthly,term=1,0,,


root
 |-- Id: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Frequency2: string (nullable = true)
 |-- Repeat: string (nullable = true)
 |-- term: string (nullable = true)
 |-- loc3: integer (nullable = true)
 |-- count: string (nullable = true)
 |-- Frequency3: string (nullable = true)



In [0]:
# Split Frequency3 at its first ';' into the leading field ("endtime") and the
# remainder ("Frequency4").
# NOTE: the split is positional. "endtime" receives whatever field comes fourth
# in Pattern, which is not always an endtime=... field (see the displayed rows).
df_04 = (
    df_03
    # loc4: 1-based position of the first ';' in Frequency3; 0 when Frequency3
    # has no ';' left (e.g. when it is already empty).
    .withColumn("loc4", locate(";", col("Frequency3"), 1))
    # endtime: the loc4-1 characters in front of that ';'. SQL substring() is
    # 1-based, so the start is written as 1 rather than relying on Spark
    # treating a start position of 0 as the first character.
    .withColumn("endtime", expr("substring(Frequency3, 1, loc4 - 1)"))
    # Frequency4: everything after the ';'; length(Frequency3) is just an upper bound.
    .withColumn("Frequency4", expr("substring(Frequency3, loc4 + 1, length(Frequency3))"))
)

# Show the input column, the located position and the two pieces side by side.
df_04_term = df_04.select("Id", "Type", "Frequency3", "Repeat", "term", "count", "loc4", "endtime", "Frequency4")
display(df_04_term)
df_04_term.printSchema()

Id,Type,Frequency3,Repeat,term,count,loc4,endtime,Frequency4
1,Weekly,pos=1;day=TH;month=2;,Repeat=yearly,term=1,endtime=20240131T000000Z,6,pos=1,day=TH;month=2;
2,Assurance test,month=1;endtime=20230112T000000Z;,Repeat=yearly,term=1,monthday=12,8,month=1,endtime=20230112T000000Z;
3,MSC Audit,month=8;monthday=19;,Repeat=yearly,term=2,COUNT=10,8,month=8,monthday=19;
4,Regression Test,monthday=9;,Repeat=monthly,term=2,COUNT=2,11,monthday=9,
5,Lab Safety,,Repeat=monthly,term=3,COUNT=10,0,,
6,Testing,,Repeat=monthly,term=6,,0,,
7,test,,Repeat=monthly,term=1,,0,,


root
 |-- Id: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Frequency3: string (nullable = true)
 |-- Repeat: string (nullable = true)
 |-- term: string (nullable = true)
 |-- count: string (nullable = true)
 |-- loc4: integer (nullable = true)
 |-- endtime: string (nullable = true)
 |-- Frequency4: string (nullable = true)



In [0]:
# Split Frequency4 at its first ';' into the leading field ("day") and the
# remainder ("month").
# NOTE: the split is positional, so "day" and "month" receive the fifth field
# and whatever follows it, regardless of the key. "month" is the unsplit
# remainder, so it keeps its trailing ';' (see the displayed rows).
df_05 = (
    df_04
    # loc5: 1-based position of the first ';' in Frequency4; 0 when Frequency4
    # has no ';' left (e.g. when it is already empty).
    .withColumn("loc5", locate(";", col("Frequency4"), 1))
    # day: the loc5-1 characters in front of that ';'. SQL substring() is
    # 1-based, so the start is written as 1 rather than relying on Spark
    # treating a start position of 0 as the first character.
    .withColumn("day", expr("substring(Frequency4, 1, loc5 - 1)"))
    # month: everything after the ';'; length(Frequency4) is just an upper bound.
    .withColumn("month", expr("substring(Frequency4, loc5 + 1, length(Frequency4))"))
)

# Show the input column, the located position and the two pieces side by side.
df_05_term = df_05.select("Id", "Type", "Frequency4", "Repeat", "term", "count", "endtime", "loc5", "day", "month")
display(df_05_term)
df_05_term.printSchema()

Id,Type,Frequency4,Repeat,term,count,endtime,loc5,day,month
1,Weekly,day=TH;month=2;,Repeat=yearly,term=1,endtime=20240131T000000Z,pos=1,7,day=TH,month=2;
2,Assurance test,endtime=20230112T000000Z;,Repeat=yearly,term=1,monthday=12,month=1,25,endtime=20230112T000000Z,
3,MSC Audit,monthday=19;,Repeat=yearly,term=2,COUNT=10,month=8,12,monthday=19,
4,Regression Test,,Repeat=monthly,term=2,COUNT=2,monthday=9,0,,
5,Lab Safety,,Repeat=monthly,term=3,COUNT=10,,0,,
6,Testing,,Repeat=monthly,term=6,,,0,,
7,test,,Repeat=monthly,term=1,,,0,,


root
 |-- Id: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Frequency4: string (nullable = true)
 |-- Repeat: string (nullable = true)
 |-- term: string (nullable = true)
 |-- count: string (nullable = true)
 |-- endtime: string (nullable = true)
 |-- loc5: integer (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)



In [0]:
# Keep only the identifying columns and the extracted fields; the helper
# position column (loc5) and the intermediate remainder (Frequency4) are left out.
df_final = df_05_term.select(
    [
        "Id",
        "Type",
        "Repeat",
        "term",
        "count",
        "endtime",
        "day",
        "month",
    ]
)
display(df_final)

Id,Type,Repeat,term,count,endtime,day,month
1,Weekly,Repeat=yearly,term=1,endtime=20240131T000000Z,pos=1,day=TH,month=2;
2,Assurance test,Repeat=yearly,term=1,monthday=12,month=1,endtime=20230112T000000Z,
3,MSC Audit,Repeat=yearly,term=2,COUNT=10,month=8,monthday=19,
4,Regression Test,Repeat=monthly,term=2,COUNT=2,monthday=9,,
5,Lab Safety,Repeat=monthly,term=3,COUNT=10,,,
6,Testing,Repeat=monthly,term=6,,,,
7,test,Repeat=monthly,term=1,,,,
