In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = SparkSession.builder.appName("UserSessions").getOrCreate()

# One row per (user, session day). Dates are supplied as 'yyyy-MM-dd' strings.
data = [
    ("U1", "2025-01-01"),
    ("U1", "2025-01-05"),
    ("U1", "2025-01-10"),
    ("U2", "2025-01-02"),
    ("U2", "2025-01-04")
]

columns = ["user_id", "session_date"]

df = spark.createDataFrame(data, columns)

# Per-user window ordered by session date, so lag() returns the previous session.
wind1 = Window.partitionBy("user_id").orderBy('session_date')

# Parse the string column into a real date *before* windowing. Otherwise the
# ordering is a lexicographic string sort and date_diff() has to rely on an
# implicit string-to-date cast; both only happen to work for zero-padded
# ISO-formatted input.
df1 = (
    df
    .withColumn('session_date', to_date(col('session_date')))
    .withColumn('lag1', lag('session_date').over(wind1))
)
df1.show()

# Difference in days between a session and the previous one for the same user.
# The first session of each user has no predecessor, so lag1 (and therefore the
# difference) is NULL for that row.
df2 = df1.withColumn('days_since_last_session', date_diff(col('session_date'), col('lag1')))
df2.show()

+-------+------------+----------+
|user_id|session_date|      lag1|
+-------+------------+----------+
|     U1|  2025-01-01|      NULL|
|     U1|  2025-01-05|2025-01-01|
|     U1|  2025-01-10|2025-01-05|
|     U2|  2025-01-02|      NULL|
|     U2|  2025-01-04|2025-01-02|
+-------+------------+----------+

+-------+------------+----------+-----------------------+
|user_id|session_date|      lag1|days_since_last_session|
+-------+------------+----------+-----------------------+
|     U1|  2025-01-01|      NULL|                   NULL|
|     U1|  2025-01-05|2025-01-01|                      4|
|     U1|  2025-01-10|2025-01-05|                      5|
|     U2|  2025-01-02|      NULL|                   NULL|
|     U2|  2025-01-04|2025-01-02|                      2|
+-------+------------+----------+-----------------------+



In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

# NOTE(review): the app name says "EmployeeData" but the rows below are customer
# purchases - confirm whether the name should be updated.
spark = (
    SparkSession.builder
    .appName("EmployeeData")
    .getOrCreate()
)

# (customer_id, purchase_date, amount) sample rows.
data = [
    ("C1", "2025-01-01", 200),
    ("C1", "2025-01-10", 300),
    ("C2", "2025-01-05", 150),
    ("C3", "2025-01-02", 100),
    ("C3", "2025-01-04", 200),
]
columns = ["customer_id", "purchase_date", "amount"]

df = spark.createDataFrame(data, schema=columns)

# Step 1: number of non-null `amount` values recorded for each customer.
df1 = (
    df
    .groupBy('customer_id')
    .agg(count(col('amount')).alias('amount_count'))
)
df1.show()

# Step 2: keep only the customers whose count is exactly one.
df2 = (
    df1
    .where(df1['amount_count'] == 1)
    .select('customer_id')
)
df2.show()

+-----------+------------+
|customer_id|amount_count|
+-----------+------------+
|         C1|           2|
|         C3|           2|
|         C2|           1|
+-----------+------------+

+-----------+
|customer_id|
+-----------+
|         C2|
+-----------+



In [40]:
# Expose `df` (created in an earlier cell) to Spark SQL under a view name.
# createOrReplaceTempView() returns None, so its result must not be assigned;
# the view is queried through spark.sql() instead.
df.createOrReplaceTempView('customer_table')

# Customers with exactly one recorded amount.
# `amount` is intentionally unquoted: in SQL, 'amount' (single quotes) is a
# string literal, so count('amount') counts a constant for every row and never
# looks at the column. Counting the column itself skips NULL amounts, which is
# what count('amount') does in the DataFrame API. `=` is the standard SQL
# equality operator.
result = spark.sql("""
select customer_id
from customer_table
group by customer_id
having count(amount) = 1
""")
result.show()

+-----------+
|customer_id|
+-----------+
|         C2|
+-----------+

