In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("TransactionsDF").getOrCreate()

schema = StructType([
    StructField("product", StringType(), True),
    StructField("sales", IntegerType(), True)
])

data = [
    ("Laptop", 1200),
    ("Phone", 800),
    ("Tablet", 500),
    ("Desktop", 500)
]

df_sales = spark.createDataFrame(data, schema)

df_sales.show()
total_sales = df_sales.agg(sum('sales').alias("total_sales"))
total_sales.show()
total_sales = df_sales.agg(sum('sales').alias("total_sales")).collect()[0]['total_sales'] # 0= first row,

df_sales = df_sales.withColumn('percentage_contribution', round((col('sales')/total_sales)*100, 1))
df_sales.show()

+-------+-----+
|product|sales|
+-------+-----+
| Laptop| 1200|
|  Phone|  800|
| Tablet|  500|
|Desktop|  500|
+-------+-----+

+-----------+
|total_sales|
+-----------+
|       3000|
+-----------+

+-------+-----+-----------------------+
|product|sales|percentage_contribution|
+-------+-----+-----------------------+
| Laptop| 1200|                   40.0|
|  Phone|  800|                   26.7|
| Tablet|  500|                   16.7|
|Desktop|  500|                   16.7|
+-------+-----+-----------------------+



In [30]:
schema = StructType([
    StructField("txn_id", StringType(), True),
    StructField("txn_date", StringType(), True),
    StructField("amount", IntegerType(), True)
])

data = [
    ("T1", "2025-01-10", 100),
    ("T2", "2025-01-15", 200),
    ("T3", "2025-03-05", 300),
    ("T4", "2025-05-20", 400)
]

df_txn = spark.createDataFrame(data, schema)
df_txn = df_txn.withColumn("txn_date", to_date("txn_date", "yyyy-MM-dd"))
# df_txn.show()
txn_months = df_txn.select(date_format("txn_date", "yyyy-MM").alias('month')).distinct()
txn_months.show()

months = ["2025-01","2025-02","2025-03","2025-04","2025-05","2025-06",
          "2025-07","2025-08","2025-09","2025-10","2025-11","2025-12"]

all_months = spark.createDataFrame([(m,) for m in months], ["month"])
# all_months.show()

df1 = all_months.join(txn_months, on="month", how="left_anti").orderBy('month')
df1.show()

+-------+
|  month|
+-------+
|2025-01|
|2025-03|
|2025-05|
+-------+

+-------+
|  month|
+-------+
|2025-02|
|2025-04|
|2025-06|
|2025-07|
|2025-08|
|2025-09|
|2025-10|
|2025-11|
|2025-12|
+-------+



In [40]:
df_txn.createOrReplaceTempView("transactions")

spark.sql("""
create or replace temp view all_months as
select stack(12,
  '2025-01', '2025-02', '2025-03', '2025-04',
  '2025-05', '2025-06', '2025-07', '2025-08',
  '2025-09', '2025-10', '2025-11', '2025-12'
) as month
""")

missing_months = spark.sql("""
select m.month
from all_months m
left join (
    select distinct date_format(txn_date, 'yyyy-mm') as month
    from transactions
) t
on m.month = t.month
where t.month is null
order by m.month
""")

missing_months.show()

+-------+
|  month|
+-------+
|2025-01|
|2025-02|
|2025-03|
|2025-04|
|2025-05|
|2025-06|
|2025-07|
|2025-08|
|2025-09|
|2025-10|
|2025-11|
|2025-12|
+-------+

