In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("TransactionsDF").getOrCreate()

# Transaction layout: the date is kept as a string, the amount is an integer,
# and every column is nullable.
schema = (
    StructType()
    .add("txn_date", StringType(), True)
    .add("txn_id", StringType(), True)
    .add("amount", IntegerType(), True)
)

# Three sample transactions for each of two dates.
data = [
    ("2025-01-01", "T1", 100),
    ("2025-01-01", "T2", 200),
    ("2025-01-01", "T3", 500),
    ("2025-01-02", "T4", 300),
    ("2025-01-02", "T5", 400),
    ("2025-01-02", "T6", 600),
]

df = spark.createDataFrame(data, schema=schema)

# Average amount per transaction date.
df_avg = df.groupBy("txn_date").agg(avg(col("amount")).alias("avg_amount"))

# Join on the column *name* rather than on an equality expression such as
# df['txn_date'] == df_avg['txn_date']: the expression form was noted to
# fail with an ambiguous-column error, while the name form keeps a single
# txn_date column in the result.
df1 = df.join(df_avg, ["txn_date"], "left")
df1.show()

# Keep only the transactions whose amount is strictly above the average
# for their date, and drop the helper avg_amount column.
df_re = (
    df1
    .where(col("amount") > col("avg_amount"))
    .select("txn_date", "txn_id", "amount")
)
df_re.show()

+----------+------+------+-----------------+
|  txn_date|txn_id|amount|       avg_amount|
+----------+------+------+-----------------+
|2025-01-01|    T1|   100|266.6666666666667|
|2025-01-01|    T2|   200|266.6666666666667|
|2025-01-01|    T3|   500|266.6666666666667|
|2025-01-02|    T4|   300|433.3333333333333|
|2025-01-02|    T5|   400|433.3333333333333|
|2025-01-02|    T6|   600|433.3333333333333|
+----------+------+------+-----------------+

+----------+------+------+
|  txn_date|txn_id|amount|
+----------+------+------+
|2025-01-01|    T3|   500|
|2025-01-02|    T6|   600|
+----------+------+------+



In [17]:
# Employee -> manager assignment history: one row per recorded change,
# with the change date stored as a string. All columns are nullable.
schema = (
    StructType()
    .add("emp_id", IntegerType(), True)
    .add("manager_id", IntegerType(), True)
    .add("change_date", StringType(), True)
)

# Sample rows: employee 1 keeps manager 10 across two records, employee 2
# moves from manager 11 to 12, and employee 3 has a single record.
data = [
    (1, 10, "2025-01-01"),
    (1, 10, "2025-02-01"),
    (2, 11, "2025-01-01"),
    (2, 12, "2025-03-01"),
    (3, 13, "2025-01-05"),
]

df_em = spark.createDataFrame(data, schema=schema)
df_em.show()

# Expose the DataFrame to Spark SQL under a view name.
df_em.createOrReplaceTempView('employee_manager')

# Employees whose rows reference exactly one distinct manager_id.
result = spark.sql("""select emp_id
from employee_manager
group by emp_id
having count(distinct manager_id) = 1 """)
result.show()

+------+----------+-----------+
|emp_id|manager_id|change_date|
+------+----------+-----------+
|     1|        10| 2025-01-01|
|     1|        10| 2025-02-01|
|     2|        11| 2025-01-01|
|     2|        12| 2025-03-01|
|     3|        13| 2025-01-05|
+------+----------+-----------+

+------+
|emp_id|
+------+
|     1|
|     3|
+------+

