In [4]:
'''
You are given a dataset containing transaction records with details about the transaction's country, state (approved/declined), amount, and date.

Write a PySpark program to compute the following for each month and country:

- The total number of transactions (trans_count)
- The total amount of transactions (trans_total_amount)
- The number of approved transactions (approved_count)
- The total amount of approved transactions (approved_total_amount)

Return the result sorted by month and country.

Input Data
The dataset contains the following columns:

Column Name	Type	Description
id	int	Unique transaction ID (Primary Key)
country	string	Country where the transaction occurred
state	string	Transaction status (approved, declined)
amount	int	Transaction amount
trans_date	date	Date when the transaction occurred
The state column only has values "approved" or "declined".

Example Input
Transactions Dataset
id	country	state	amount	trans_date
121	US	approved	1000	2018-12-18
122	US	declined	2000	2018-12-19
123	US	approved	2000	2019-01-01
124	DE	approved	2000	2019-01-07
125	US	approved	500	2018-12-20
126	US	declined	1500	2019-01-05
127	DE	approved	1800	2019-01-10
128	FR	declined	1200	2019-02-15
129	FR	approved	2500	2019-02-17
130	US	approved	3000	2019-02-20
131	DE	declined	2200	2019-03-05
132	FR	approved	1000	2019-03-10
Expected Output
month	country	trans_count	approved_count	trans_total_amount	approved_total_amount
2018-12	US	3	2	3500	1500
2019-01	DE	2	2	3800	3800
2019-01	US	2	1	3500	2000
2019-02	FR	2	1	3700	2500
2019-02	US	1	1	3000	3000
2019-03	DE	1	0	2200	0
2019-03	FR	1	1	1000	1000
Starter Code
Below is the starter code to create the input PySpark DataFrame:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col, to_date

# Initialize Spark session
spark = SparkSession.builder.appName("MonthlyTransactionSummary").getOrCreate()

# Define schema
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("country", StringType(), False),
    StructField("state", StringType(), False),
    StructField("amount", IntegerType(), False),
    StructField("trans_date", StringType(), False),  # Initially as String
])

# Sample data
data = [
    (121, "US", "approved", 1000, "2018-12-18"),
    (122, "US", "declined", 2000, "2018-12-19"),
    (123, "US", "approved", 2000, "2019-01-01"),
    (124, "DE", "approved", 2000, "2019-01-07"),
    (125, "US", "approved", 500, "2018-12-20"),
    (126, "US", "declined", 1500, "2019-01-05"),
    (127, "DE", "approved", 1800, "2019-01-10"),
    (128, "FR", "declined", 1200, "2019-02-15"),
    (129, "FR", "approved", 2500, "2019-02-17"),
    (130, "US", "approved", 3000, "2019-02-20"),
    (131, "DE", "declined", 2200, "2019-03-05"),
    (132, "FR", "approved", 1000, "2019-03-10"),
]

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)

# Convert trans_date column to DateType
df = df.withColumn("trans_date", to_date(col("trans_date"), "yyyy-MM-dd"))

# Show the DataFrame
df.show()
Constraints:
- The dataset is stored as a PySpark DataFrame.
- The trans_date column should be used to extract the month (formatted as YYYY-MM).
- The result should be grouped by month and country.
- Use display(df) to show the final DataFrame.
'''
# Initialize Spark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Obtain (or reuse) the Spark session for this notebook cell.
spark = SparkSession.builder.appName("Spark Playground").getOrCreate()

# Column layout of the raw records. Every field is declared non-nullable;
# trans_date is loaded as text and parsed into a date when `df` is built.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("country", StringType()),
        ("state", StringType()),
        ("amount", IntegerType()),
        ("trans_date", StringType()),
    )
])

# Sample transactions: (id, country, state, amount, trans_date).
data = [
    (121, "US", "approved", 1000, "2018-12-18"),
    (122, "US", "declined", 2000, "2018-12-19"),
    (123, "US", "approved", 2000, "2019-01-01"),
    (124, "DE", "approved", 2000, "2019-01-07"),
    (125, "US", "approved", 500, "2018-12-20"),
    (126, "US", "declined", 1500, "2019-01-05"),
    (127, "DE", "approved", 1800, "2019-01-10"),
    (128, "FR", "declined", 1200, "2019-02-15"),
    (129, "FR", "approved", 2500, "2019-02-17"),
    (130, "US", "approved", 3000, "2019-02-20"),
    (131, "DE", "declined", 2200, "2019-03-05"),
    (132, "FR", "approved", 1000, "2019-03-10"),
]

# Build the input DataFrame and, in the same step, replace the textual
# trans_date column with a parsed date column.
df = spark.createDataFrame(data, schema=schema).withColumn(
    "trans_date",
    F.to_date(F.col("trans_date"), "yyyy-MM-dd"),
)

# Monthly summary per country: transaction count/total plus the count/total
# restricted to approved transactions, sorted by month and then country.

# Month bucket rendered as text in yyyy-MM form.
month_col = F.date_format(F.col("trans_date"), "yyyy-MM")

# Predicate shared by both approved_* metrics.
is_approved = F.col("state") == "approved"

# NOTE(review): the Expected Output table in the task statement lists
# approved_count before trans_total_amount; the column order here follows the
# metric list at the top of the task instead -- confirm which one is required.
df_result = (
    df.withColumn("month", month_col)
    .groupBy("month", "country")
    .agg(
        F.count("*").alias("trans_count"),
        F.sum(F.col("amount")).alias("trans_total_amount"),
        # Summing a 1/0 flag counts the approved rows in each group.
        F.sum(F.when(is_approved, 1).otherwise(0)).alias("approved_count"),
        # Non-approved rows contribute 0, so a group with no approvals
        # yields 0 rather than null.
        F.sum(F.when(is_approved, F.col("amount")).otherwise(0)).alias(
            "approved_total_amount"
        ),
    )
    .orderBy("month", "country")
)

# Print the aggregated result.
df_result.show()

+-------+-------+-----------+------------------+--------------+---------------------+
|  month|country|trans_count|trans_total_amount|approved_count|approved_total_amount|
+-------+-------+-----------+------------------+--------------+---------------------+
|2018-12|     US|          3|              3500|             2|                 1500|
|2019-01|     DE|          2|              3800|             2|                 3800|
|2019-01|     US|          2|              3500|             1|                 2000|
|2019-02|     FR|          2|              3700|             1|                 2500|
|2019-02|     US|          1|              3000|             1|                 3000|
|2019-03|     DE|          1|              2200|             0|                    0|
|2019-03|     FR|          1|              1000|             1|                 1000|
+-------+-------+-----------+------------------+--------------+---------------------+

