In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("local[*]").appName("Etl Job").getOrCreate()

In [14]:
## Examples

In [18]:
transactions = spark.read.format("csv") \
.option('InferSchema','True') \
.option('Header','True') \
.load("transactions.csv")

In [19]:
transactions.printSchema()
print(transactions.count())

root
 |-- TRANSACTION_ID: string (nullable = true)
 |-- OUTLET_ID: string (nullable = true)
 |-- CARD_TYPE: string (nullable = true)
 |-- CURENCY: string (nullable = true)
 |-- TRANSACTION_DATE: string (nullable = true)
 |-- AMOUNT: double (nullable = true)
 |-- TRAN_TYPE: string (nullable = true)
 |-- ADDITIONAL_DATA: string (nullable = true)
 |-- CARD_TOKEN: string (nullable = true)

8


In [20]:
merchants = spark.read.format("csv") \
.option('InferSchema','True') \
.option('Header','True') \
.load("merchants.csv")

In [22]:
merchants.printSchema()
print(merchants.count())

root
 |-- OUTLET_ID: integer (nullable = true)
 |-- MERCHANT_NAME: string (nullable = true)
 |-- MERCHANT_ID: integer (nullable = true)
 |-- CITY: string (nullable = true)

5


In [None]:
#clean data and remove duplicates

In [24]:
from pyspark.sql.functions import *
clean = transactions\
        .dropDuplicates() \
        .filter('TRANSACTION_ID is not null and OUTLET_ID is not null and AMOUNT is not null') \
        .withColumn("PROCESS_DATE",current_date())

In [25]:
clean.show()

+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+------------+
|    TRANSACTION_ID|OUTLET_ID|CARD_TYPE|CURENCY|TRANSACTION_DATE|AMOUNT|TRAN_TYPE|     ADDITIONAL_DATA|    CARD_TOKEN|PROCESS_DATE|
+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+------------+
|804710388674334720|     2126|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|  2021-03-01|
|804710375248367616|     2124|     VISA|    USD|      2021-02-08|  96.2|        A|L1L2L3L41d03cakes...|650595****6341|  2021-03-01|
|804710388552699904|     2125|     VISA|    USD|      2021-02-08| 663.4|        A|L1L2L3L41d03cakes...|140595****2355|  2021-03-01|
|124567103886743341|     null|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|  2021-03-01|
|              null|     2126|  Maestro|    USD|      2021-02-08| 280.7|    

In [None]:
#enhance data

In [26]:
joined = clean.join(merchants,'OUTLET_ID')
joined.show(truncate =False)

+---------+------------------+---------+-------+----------------+------+---------+-----------------------------+--------------+------------+-------------+-----------+----------+
|OUTLET_ID|TRANSACTION_ID    |CARD_TYPE|CURENCY|TRANSACTION_DATE|AMOUNT|TRAN_TYPE|ADDITIONAL_DATA              |CARD_TOKEN    |PROCESS_DATE|MERCHANT_NAME|MERCHANT_ID|CITY      |
+---------+------------------+---------+-------+----------------+------+---------+-----------------------------+--------------+------------+-------------+-----------+----------+
|2126     |804710388674334720|Maestro  |USD    |2021-02-08      |280.7 |A        |L1L2L3L41d03cakesupplies01Ref|440836****4355|2021-03-01  |ALDI         |2          |BIRMINGHAM|
|2124     |804710375248367616|VISA     |USD    |2021-02-08      |96.2  |A        |L1L2L3L41d03cakesupplies01Ref|650595****6341|2021-03-01  |DELTA        |1          |LONDON    |
|2125     |804710388552699904|VISA     |USD    |2021-02-08      |663.4 |A        |L1L2L3L41d03cakesupplies01Re

In [None]:
#render total sales amount and total transactions per merchant

In [27]:
sales = joined.groupBy("MERCHANT_NAME") \
            .agg(sum("AMOUNT").alias("TOTAL_AMOUNT"), count("TRANSACTION_ID").alias("TOTAL_NR"))

In [28]:
sales.show()

+-------------+------------+--------+
|MERCHANT_NAME|TOTAL_AMOUNT|TOTAL_NR|
+-------------+------------+--------+
|         ALDI|       729.8|       3|
|        DELTA|       882.0|       3|
+-------------+------------+--------+



In [35]:
transaction_output = "output/transaction/"
sales_output = "output/sales/"

In [34]:
ls

[0m[01;32mexamples.ipynb[0m*  [01;32mmerchants.csv[0m*     [01;32mUntitled.ipynb[0m*
[01;32mflights.csv[0m*     [01;32mtransactions.csv[0m*


In [40]:
joined \
        .write \
        .mode("overwrite") \
        .partitionBy("PROCESS_DATE") \
        .parquet \
        (transaction_output)

In [39]:
sales \
        .write \
        .mode("overwrite") \
        .parquet \
        (sales_output)