In [45]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *

In [46]:
spark = SparkSession.builder.master('local[*]').appName('Exercise1').getOrCreate()

In [47]:
import os
os.path.abspath(os.getcwd())

'C:\\alexs_work\\courses\\M2_spark\\das_2021\\lesson4'

**1.Read data from a csv to a dataframe**

In [48]:
df_trx = spark.read.csv('../data/transactions.csv', header=True, inferSchema=True)

In [49]:
df_trx.show()

+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+
|    TRANSACTION_ID|OUTLET_ID|CARD_TYPE|CURENCY|TRANSACTION_DATE|AMOUNT|TRAN_TYPE|     ADDITIONAL_DATA|    CARD_TOKEN|
+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+
|804710375248367616|     2124|     VISA|    USD|      2021-02-08|  96.2|        A|L1L2L3L41d03cakes...|650595****6341|
|804710388523339776|     2124|     VISA|    USD|      2021-02-08| 122.4|        D|L1L2L3L41d03cakes...|440836****4355|
|804710388552699904|     2125|     VISA|    USD|      2021-02-08| 663.4|        A|L1L2L3L41d03cakes...|140595****2355|
|804710388653363200|     2126|  Maestro|    USD|      2021-02-08| 168.4|        A|L1L2L3L41d03cakes...|414836****4322|
|804710388674334720|     2126|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|
|804710388674334720|     2126|  Maestro|    USD|

In [50]:
df_trx.printSchema()

root
 |-- TRANSACTION_ID: string (nullable = true)
 |-- OUTLET_ID: string (nullable = true)
 |-- CARD_TYPE: string (nullable = true)
 |-- CURENCY: string (nullable = true)
 |-- TRANSACTION_DATE: string (nullable = true)
 |-- AMOUNT: double (nullable = true)
 |-- TRAN_TYPE: string (nullable = true)
 |-- ADDITIONAL_DATA: string (nullable = true)
 |-- CARD_TOKEN: string (nullable = true)



**2.clean data and remove duplicates**

In [51]:
df_trx.count()

8

In [52]:
if df_trx.distinct().count() < df_trx.count():
    df_trx = df_trx.dropDuplicates()

In [53]:
df_trx.show()

+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+
|    TRANSACTION_ID|OUTLET_ID|CARD_TYPE|CURENCY|TRANSACTION_DATE|AMOUNT|TRAN_TYPE|     ADDITIONAL_DATA|    CARD_TOKEN|
+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+
|804710388674334720|     2126|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|
|804710375248367616|     2124|     VISA|    USD|      2021-02-08|  96.2|        A|L1L2L3L41d03cakes...|650595****6341|
|804710388552699904|     2125|     VISA|    USD|      2021-02-08| 663.4|        A|L1L2L3L41d03cakes...|140595****2355|
|124567103886743341|     null|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|
|              null|     2126|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|
|804710388523339776|     2124|     VISA|    USD|

In [54]:
df_trx = df_trx.filter((col('TRANSACTION_ID') != 'null') & (col('OUTLET_ID') != 'null'))

In [55]:
df_trx.show()

+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+
|    TRANSACTION_ID|OUTLET_ID|CARD_TYPE|CURENCY|TRANSACTION_DATE|AMOUNT|TRAN_TYPE|     ADDITIONAL_DATA|    CARD_TOKEN|
+------------------+---------+---------+-------+----------------+------+---------+--------------------+--------------+
|804710388674334720|     2126|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|
|804710375248367616|     2124|     VISA|    USD|      2021-02-08|  96.2|        A|L1L2L3L41d03cakes...|650595****6341|
|804710388552699904|     2125|     VISA|    USD|      2021-02-08| 663.4|        A|L1L2L3L41d03cakes...|140595****2355|
|804710388523339776|     2124|     VISA|    USD|      2021-02-08| 122.4|        D|L1L2L3L41d03cakes...|440836****4355|
|804710388653363200|     2126|  Maestro|    USD|      2021-02-08| 168.4|        A|L1L2L3L41d03cakes...|414836****4322|
+------------------+---------+---------+-------+

**3.Enrich transaction data using merchant data**

In [56]:
df_m = spark.read.csv('../data/merchants.csv', header=True, inferSchema=True)

In [57]:
df_m.show()

+---------+-------------+-----------+----------+
|OUTLET_ID|MERCHANT_NAME|MERCHANT_ID|      CITY|
+---------+-------------+-----------+----------+
|     2124|        DELTA|          1|    LONDON|
|     2125|        DELTA|          1|    LONDON|
|     2126|         ALDI|          2|BIRMINGHAM|
|     2127|       OFFICE|          3| SHEFFIELD|
|     3000|         LIDL|          4|MANCHESTER|
+---------+-------------+-----------+----------+



In [58]:
df_trx_1 = df_trx.join(df_m, 'OUTLET_ID', 'inner')

In [59]:
df_trx_1.show()

+---------+------------------+---------+-------+----------------+------+---------+--------------------+--------------+-------------+-----------+----------+
|OUTLET_ID|    TRANSACTION_ID|CARD_TYPE|CURENCY|TRANSACTION_DATE|AMOUNT|TRAN_TYPE|     ADDITIONAL_DATA|    CARD_TOKEN|MERCHANT_NAME|MERCHANT_ID|      CITY|
+---------+------------------+---------+-------+----------------+------+---------+--------------------+--------------+-------------+-----------+----------+
|     2126|804710388674334720|  Maestro|    USD|      2021-02-08| 280.7|        A|L1L2L3L41d03cakes...|440836****4355|         ALDI|          2|BIRMINGHAM|
|     2124|804710375248367616|     VISA|    USD|      2021-02-08|  96.2|        A|L1L2L3L41d03cakes...|650595****6341|        DELTA|          1|    LONDON|
|     2125|804710388552699904|     VISA|    USD|      2021-02-08| 663.4|        A|L1L2L3L41d03cakes...|140595****2355|        DELTA|          1|    LONDON|
|     2124|804710388523339776|     VISA|    USD|      2021-02-08

**4.Render total sales amount and total transactions per merchant**

In [60]:
df_trx_1.groupby('OUTLET_ID').agg({'AMOUNT':'SUM', '*':'COUNT'}).sort(asc('OUTLET_ID')).withColumnRenamed('sum(AMOUNT)', 'SUM_AMOUNT').withColumnRenamed('count(1)', 'TOTAL_TRX').show()

+---------+------------------+---------+
|OUTLET_ID|        SUM_AMOUNT|TOTAL_TRX|
+---------+------------------+---------+
|     2124|218.60000000000002|        2|
|     2125|             663.4|        1|
|     2126|             449.1|        2|
+---------+------------------+---------+



**5.Persist data to storage parquet format/ partition transactions using a date**

In [61]:
df_trx_1.coalesce(1).write.parquet('./out/total_sales_per_merchant')