<a href="https://colab.research.google.com/github/eder1985/pismo_recruiting_technical_case/blob/main/work/notebooks/Colab_Pismo_Recruiting_Technical_Case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><center>Pismo Recruiting Technical Case</center></h1>

---



## 1. Pre-requisites

### Installing Spark

Install Dependencies:


1.   Java 8
2.   Apache Spark with Hadoop
3.   Findspark (used to locate the Spark installation on the system)


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

Set Environment Variables:

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [4]:
!ls

sample_data  spark-3.1.1-bin-hadoop3.2	spark-3.1.1-bin-hadoop3.2.tgz  work


In [5]:
# findspark.init() runs before the pyspark imports below.
# NOTE(review): presumably it relies on the SPARK_HOME value set in the earlier
# environment-variable cell -- confirm if that cell is ever moved.
import findspark
findspark.init()

from pyspark.sql import SparkSession
# NOTE: this wildcard import is what puts `col`, `count`, `max`, `to_date`,
# `to_timestamp` and `from_json` into the notebook namespace; later cells call them
# unqualified. In particular `max` then refers to the Spark column function,
# not the Python builtin.
from pyspark.sql.functions import *

# Create (or reuse) a session that runs against a local master.
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
# Bare expression: displays the session object as the cell output.
spark

In [6]:
!python --version

Python 3.10.6


### Creating data folders

In [7]:
!mkdir -p work/data/raw/events/
!mkdir -p work/data/staging/events/
!mkdir -p work/data/processed/events/
!mkdir -p work/data/trusted/events/

## 2. Generate Fake Data

### Installing libs

In [8]:
!pip install -q faker

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.7 MB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m27.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### Imports

In [9]:
from faker import Faker
from faker.providers import BaseProvider
from datetime import datetime
from json import dumps
import pandas as pd
import random
import collections
import glob
import os

### Generating fake `event_id`: random UUIDs for unique and non-repeated event_id values

In [10]:
fake = Faker()
Faker.seed(random.randrange(0, 99999999999999999999, 1))
fake_event_id = fake.uuid4()
print(fake_event_id)

dd003587-c9d9-47ae-8a7e-f3fb41c83d2b


### Generating custom fake `event_id`: UUIDs drawn from a small fixed list to simulate duplicate `event_id` values

In [11]:
class CustomUUIDProvider(BaseProvider):
    """Faker provider that serves event ids from a tiny, fixed pool.

    Only three ids exist, so repeated calls return the same values again and
    again; this is used to produce the duplicate-event scenario.
    """

    def custom_uuid(self):
        """Return one of the three hard-coded UUID strings, chosen at random."""
        pool = (
            '1a1a1a1a-1a1a-1a1a-1a1a-1a1a1a1a1a1a',
            '2b2b2b2b-2b2b-2b2b-2b2b-2b2b2b2b2b2b',
            '3c3c3c3c-3c3c-3c3c-3c3c-3c3c3c3c3c3c',
        )
        return random.choice(pool)

### Generating fake `timestamp`: random timestamps within the last 3 years

In [12]:
fake_timestamp = datetime.strftime(fake.date_time_between(start_date='-3y', end_date='now'),"%Y-%m-%dT%H:%M:%S")
print(fake_timestamp)

2023-04-28T21:39:21


### Generating custom fake `event_type`: random values based on list


In [13]:
class EventTypeProvider(BaseProvider):
    """Faker provider for the two event types generated in this notebook."""

    def event_type(self):
        """Return 'account-status-change' or 'transaction-new-value' at random."""
        choices = ('account-status-change', 'transaction-new-value')
        return random.choice(choices)

fake.add_provider(EventTypeProvider)

fake_event_type = fake.event_type()
print(fake_event_type)

account-status-change


### Generating fake `status`: random values based on list

In [14]:
class StatusTypeProvider(BaseProvider):
    """Faker provider for account status values."""

    def status_type(self):
        """Return one of the five known status strings, chosen at random."""
        choices = ('ACTIVE', 'INACTIVE', 'SUSPENDED', 'BLOCKED', 'DELETED')
        return random.choice(choices)

fake.add_provider(StatusTypeProvider)

fake_status_type = fake.status_type()
print(fake_status_type)

SUSPENDED


### Generating custom fake `data`: values based on dict

In [15]:
def custom_data(fake):
  """Build one candidate `data` payload for each supported event type.

  Args:
    fake: object exposing `random_number`, `status_type`, `sentence`,
      `pyfloat` and `currency_code` (a Faker instance with the
      status provider registered).

  Returns:
    dict mapping the event-type name to an OrderedDict payload. Both
    payloads are always generated; the caller picks the one it needs.
  """
  # The account payload is generated before the transaction payload, field by
  # field, so the generator is consumed in a fixed order.
  account_payload = collections.OrderedDict()
  account_payload['id'] = fake.random_number(digits=6)
  account_payload['old_status'] = fake.status_type()
  account_payload['new_status'] = fake.status_type()
  account_payload['reason'] = fake.sentence(nb_words=5)

  transaction_payload = collections.OrderedDict()
  transaction_payload['id'] = fake.random_number(digits=6)
  transaction_payload['account_orig_id'] = fake.random_number(digits=6)
  transaction_payload['account_dest_id'] = fake.random_number(digits=6)
  transaction_payload['amount'] = fake.pyfloat(positive=True)
  transaction_payload['currency'] = fake.currency_code()

  return {
      "account-status-change": account_payload,
      "transaction-new-value": transaction_payload,
  }

fake_custom_data = custom_data(fake).get(fake.event_type())
print(fake_custom_data)

OrderedDict([('id', 915423), ('account_orig_id', 624054), ('account_dest_id', 111988), ('amount', 8290345.63304253), ('currency', 'GIP')])


### Defining `write_fake_data` and `read_fake_data` functions

In [16]:
def write_fake_data(fake, length, destination_path, unique_uuid = True):
    """Generate `length` fake events and dump them to one JSON file.

    Args:
        fake: Faker instance with the event-type provider registered (and the
            custom UUID provider when `unique_uuid` is False).
        length: number of events to generate.
        destination_path: directory that receives the file. It is created when
            missing, and a trailing separator is optional.
        unique_uuid: when True every event gets a fresh `fake.uuid4()`;
            when False ids come from `fake.custom_uuid()`.

    The file is named `fake_events_<YYYYmmddHHMMSS>.json`. The name only has
    one-second resolution, so two calls within the same second write to the
    same file and the later call overwrites the earlier one.
    """
    current_time = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = 'fake_events_' + current_time + '.json'

    database = []
    for _ in range(length):
        event_id = fake.uuid4() if unique_uuid else fake.custom_uuid()
        event_type = fake.event_type()
        # The domain is the first dash-separated token of the event type,
        # e.g. 'account-status-change' -> 'account'.
        project_domain_name = event_type.split('-')[0]

        database.append(collections.OrderedDict([
            ('event_id', event_id),
            ('timestamp', datetime.strftime(fake.date_time_between(start_date='-3y', end_date='now'),"%Y-%m-%dT%H:%M:%S")),
            ('domain', project_domain_name),
            ('event_type', event_type),
            ('data', custom_data(fake).get(event_type))
        ]))

    # Join with os.path so a directory passed without a trailing slash still
    # yields a file *inside* it rather than a sibling with a glued-on name.
    if destination_path:
        os.makedirs(destination_path, exist_ok=True)
    output_path = os.path.join(destination_path, filename)

    with open(output_path, 'w') as output:
        output.write(dumps(database, indent=4, sort_keys=False, default=str))

    print("Done.")

def read_fake_data(json_filepath):
    """Load every JSON file matching a glob pattern into one pandas DataFrame.

    Args:
        json_filepath: glob pattern, e.g. 'work/data/raw/events/*.json'.

    Returns:
        The concatenation of the per-file frames, in sorted path order. Each
        file keeps its own row index, so index values can repeat across
        files. An empty DataFrame is returned when nothing matches.
    """
    # Sorting makes the row order independent of directory listing order.
    json_files = sorted(os.path.normpath(path) for path in glob.glob(json_filepath))
    if not json_files:
        # pd.concat raises on an empty list; an empty frame is friendlier.
        return pd.DataFrame()
    return pd.concat([pd.read_json(path) for path in json_files])

def run(length, unique_uuid = True):
    """Generate a new raw events file, then print everything in the raw folder.

    Args:
        length: number of fake events to write.
        unique_uuid: forwarded to `write_fake_data`; False draws event ids
            from the fixed pool so duplicates appear.

    Note that the read-back uses a `*.json` glob over the raw folder, so the
    printed frame contains every file there, not only the one just written.
    """
    generator = Faker()
    Faker.seed(random.randrange(0, 99999999999999999999, 1))
    for provider in (StatusTypeProvider, CustomUUIDProvider, EventTypeProvider):
        generator.add_provider(provider)

    raw_events_dir = 'work/data/raw/events/'
    write_fake_data(generator, length, raw_events_dir, unique_uuid)

    print(read_fake_data(raw_events_dir + '*.json'))

### Writing and reading fake data

> Writing json file with 1000 unique events

In [17]:
run(1000)

Done.
                                 event_id           timestamp       domain  \
0    d7e041fc-c1be-4d39-9a70-b7d703edcc44 2022-04-05 18:08:56      account   
1    5bf712fb-8da4-4c97-804e-3c6cf5cf32d2 2021-01-16 17:10:26  transaction   
2    c0f5aded-0413-4f7d-8895-4d73b866c8be 2022-03-16 15:36:01      account   
3    9cdbd576-d267-4902-a6d2-5064fecfca49 2021-12-13 10:27:59      account   
4    8b11fb0d-6fc7-42d2-9591-d194c69868f4 2022-07-08 17:17:09      account   
..                                    ...                 ...          ...   
995  0e645f8c-c32f-47be-9006-9dfab3187f05 2020-11-15 23:59:01  transaction   
996  b101430d-f5d2-46c0-a6fc-7c8643a3a153 2020-08-31 07:47:53  transaction   
997  21c84f50-7da5-4971-8088-ca3c718dcd1f 2023-03-21 16:51:18      account   
998  b13c3103-a682-4e47-b75f-9abbc18cbff7 2023-01-08 18:22:10  transaction   
999  d3cf5e32-163e-4d23-8ede-5475c416dbb2 2021-03-01 00:20:05  transaction   

                event_type                               

> Writing json file with 100 events with duplicate rows

In [18]:
run(100,unique_uuid = False)

Done.
                                event_id           timestamp       domain  \
0   d7e041fc-c1be-4d39-9a70-b7d703edcc44 2022-04-05 18:08:56      account   
1   5bf712fb-8da4-4c97-804e-3c6cf5cf32d2 2021-01-16 17:10:26  transaction   
2   c0f5aded-0413-4f7d-8895-4d73b866c8be 2022-03-16 15:36:01      account   
3   9cdbd576-d267-4902-a6d2-5064fecfca49 2021-12-13 10:27:59      account   
4   8b11fb0d-6fc7-42d2-9591-d194c69868f4 2022-07-08 17:17:09      account   
..                                   ...                 ...          ...   
95  2b2b2b2b-2b2b-2b2b-2b2b-2b2b2b2b2b2b 2021-12-27 14:26:26      account   
96  1a1a1a1a-1a1a-1a1a-1a1a-1a1a1a1a1a1a 2023-01-18 12:39:45  transaction   
97  1a1a1a1a-1a1a-1a1a-1a1a-1a1a1a1a1a1a 2021-09-18 16:25:49  transaction   
98  2b2b2b2b-2b2b-2b2b-2b2b-2b2b2b2b2b2b 2022-08-30 22:23:56  transaction   
99  3c3c3c3c-3c3c-3c3c-3c3c-3c3c3c3c3c3c 2022-10-28 09:54:09  transaction   

               event_type                                            

## 3. Raw -> Staging: Read raw json files and write parquet files into staging area

### Steps:

- List raw json files
- Define `raw_events_schema` using only `StringType` so that no data is lost when reading the raw files
- Load raw_events dataframe
- Write raw data into staging area in parquet format

In [19]:
!ls work/data/raw/events/ -la

total 428
drwxr-xr-x 2 root root   4096 Jul 27 20:25 .
drwxr-xr-x 3 root root   4096 Jul 27 20:20 ..
-rw-r--r-- 1 root root 387083 Jul 27 20:25 fake_events_20230727202516.json
-rw-r--r-- 1 root root  38719 Jul 27 20:25 fake_events_20230727202521.json


In [20]:

from pyspark.sql.types import StructType,StructField, StringType

# Every raw field is declared as a nullable string; typing happens in later steps.
raw_events_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("data", "domain", "event_id", "event_type", "timestamp")
])

In [21]:
raw_events = spark.read.option("multiline","true").schema(raw_events_schema).json('work/data/raw/events/')
raw_events.show(5, truncate = False)

+---------------------------------------------------------------------------------------------------------+-----------+------------------------------------+---------------------+-------------------+
|data                                                                                                     |domain     |event_id                            |event_type           |timestamp          |
+---------------------------------------------------------------------------------------------------------+-----------+------------------------------------+---------------------+-------------------+
|{"id":140096,"old_status":"DELETED","new_status":"DELETED","reason":"Still hand card."}                  |account    |d7e041fc-c1be-4d39-9a70-b7d703edcc44|account-status-change|2022-04-05T18:08:56|
|{"id":247213,"account_orig_id":797264,"account_dest_id":26309,"amount":2.17856064888852,"currency":"THB"}|transaction|5bf712fb-8da4-4c97-804e-3c6cf5cf32d2|transaction-new-value|2021-01-16T17:10:26|
|{"id

In [22]:
raw_events.count()

1100

In [23]:
raw_events.printSchema()

root
 |-- data: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [48]:
raw_events.write \
    .mode("overwrite") \
    .parquet("work/data/staging/events/")

In [26]:
staging_events = spark.read.parquet('work/data/staging/events/')
staging_events.show()

+--------------------+-----------+--------------------+--------------------+-------------------+
|                data|     domain|            event_id|          event_type|          timestamp|
+--------------------+-----------+--------------------+--------------------+-------------------+
|{"id":140096,"old...|    account|d7e041fc-c1be-4d3...|account-status-ch...|2022-04-05T18:08:56|
|{"id":247213,"acc...|transaction|5bf712fb-8da4-4c9...|transaction-new-v...|2021-01-16T17:10:26|
|{"id":417243,"old...|    account|c0f5aded-0413-4f7...|account-status-ch...|2022-03-16T15:36:01|
|{"id":221633,"old...|    account|9cdbd576-d267-490...|account-status-ch...|2021-12-13T10:27:59|
|{"id":139369,"old...|    account|8b11fb0d-6fc7-42d...|account-status-ch...|2022-07-08T17:17:09|
|{"id":467381,"old...|    account|7f408e72-953d-443...|account-status-ch...|2020-12-01T14:56:19|
|{"id":181151,"acc...|transaction|f2527564-153c-4bb...|transaction-new-v...|2022-05-01T04:04:13|
|{"id":63972,"acco...|transact

In [27]:
staging_events.printSchema()

root
 |-- data: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



## 4. Applying transformations

### Columns `timestamp` and `day` transformations

In [28]:
# Replace the string `timestamp` with a parsed timestamp, then derive `day` from it.
partial_events = (
    staging_events
    .withColumn("timestamp", to_timestamp(col("timestamp")))
    .withColumn("day", to_date(col("timestamp")))
)

In [29]:
partial_events.printSchema()

root
 |-- data: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- day: date (nullable = true)



### Drop duplicated events

In [30]:
from pyspark.sql.functions import countDistinct

# Count
partial_events.select(countDistinct("event_id", "event_type").alias("distinct_events")).show()


+---------------+
|distinct_events|
+---------------+
|           1006|
+---------------+



In [31]:
# Latest timestamp per (event_id, event_type). `max` is the Spark aggregate that
# the wildcard import from pyspark.sql.functions put in scope, not the builtin.
grouped_events = (
    partial_events
    .groupBy("event_id", "event_type")
    .agg(max("timestamp"))
)

grouped_events.show(truncate=False)

+------------------------------------+---------------------+-------------------+
|event_id                            |event_type           |max(timestamp)     |
+------------------------------------+---------------------+-------------------+
|131516b4-8700-4a86-af18-cefd070e3632|transaction-new-value|2021-07-23 16:18:39|
|0857511d-4f44-4094-9c4a-6cf757fe064d|account-status-change|2022-06-17 17:13:20|
|55b16a54-8033-4f8d-b87a-3cedbdf24bcb|transaction-new-value|2021-07-01 05:38:41|
|5c428b2c-33c6-4a0b-8935-c01fc135a462|account-status-change|2022-10-12 09:06:09|
|f8c6f537-eec5-4d21-8437-2613f240a362|account-status-change|2022-07-23 07:21:58|
|cdbb2c77-ce63-4403-9a69-d734637d1f00|account-status-change|2022-12-11 21:45:19|
|fca646e5-eea2-4462-99d1-60e47a32196a|transaction-new-value|2021-11-30 06:25:06|
|e7a3567e-3bbc-4699-801b-6cb21e4df87a|account-status-change|2021-07-02 16:46:38|
|ebb249c1-0d5b-47c6-b01b-14cd272a997c|account-status-change|2022-04-25 04:38:01|
|c267e5fc-e9f9-4bae-8245-f0e

In [32]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Keep, for every (event_id, event_type) pair, the single row that carries the
# latest timestamp. Ranking rows inside a window keeps `data`, `domain` and `day`
# from that same row. Joining the max timestamp back onto all duplicates and then
# calling dropDuplicates would instead pair the newest timestamp with the payload
# and day of an arbitrary duplicate.
latest_first = Window \
    .partitionBy("event_id", "event_type") \
    .orderBy(col("timestamp").desc())

trusted_events = partial_events \
    .withColumn("_row_rank", row_number().over(latest_first)) \
    .filter(col("_row_rank") == 1) \
    .drop("_row_rank") \
    .select("event_id", "event_type", "timestamp", "data", "domain", "day")
trusted_events.show()

+--------------------+--------------------+-------------------+--------------------+-----------+----------+
|            event_id|          event_type|          timestamp|                data|     domain|       day|
+--------------------+--------------------+-------------------+--------------------+-----------+----------+
|0857511d-4f44-409...|account-status-ch...|2022-06-17 17:13:20|{"id":400923,"old...|    account|2022-06-17|
|131516b4-8700-4a8...|transaction-new-v...|2021-07-23 16:18:39|{"id":866148,"acc...|transaction|2021-07-23|
|55b16a54-8033-4f8...|transaction-new-v...|2021-07-01 05:38:41|{"id":23589,"acco...|transaction|2021-07-01|
|5c428b2c-33c6-4a0...|account-status-ch...|2022-10-12 09:06:09|{"id":771939,"old...|    account|2022-10-12|
|cdbb2c77-ce63-440...|account-status-ch...|2022-12-11 21:45:19|{"id":302324,"old...|    account|2022-12-11|
|e7a3567e-3bbc-469...|account-status-ch...|2021-07-02 16:46:38|{"id":187466,"old...|    account|2021-07-02|
|f8c6f537-eec5-4d2...|accoun

In [33]:
trusted_events.count()

1006

### Flatten `data` column by event-type

> Each event-type has its own schema

In [34]:
trusted_events.groupBy('event_type').agg(count(col('event_type'))).show(truncate=False)

+---------------------+-----------------+
|event_type           |count(event_type)|
+---------------------+-----------------+
|account-status-change|515              |
|transaction-new-value|491              |
+---------------------+-----------------+



In [36]:
def distinct_values(df, col):
    """Return the distinct values of column `col` of `df` as a Python list.

    Args:
        df: frame exposing `select(...).distinct().toPandas()`.
        col: column name (a string; inside this function it shadows the
            Spark `col` helper).
    """
    unique_rows = df.select(col).distinct()
    as_pandas = unique_rows.toPandas()
    return as_pandas[col].to_list()

def flatten_df(df):
    """Expand the JSON string column `data` into top-level columns.

    The JSON schema is inferred from the `data` values of `df` itself (via the
    notebook-level `spark` session), the column is parsed with that schema, and
    each parsed field is selected as `data.<field>` next to the other columns.

    Args:
        df: Spark DataFrame with a string column named `data`.

    Returns:
        A DataFrame with the non-`data` columns first (original order), followed
        by one column per field found in `data`.
    """
    payload_strings = df.rdd.map(lambda record: record.data)
    payload_schema = spark.read.json(payload_strings).schema
    parsed = df.withColumn("data", from_json("data", payload_schema))

    # Every column except the struct itself ...
    outer_columns = parsed.columns
    del outer_columns[outer_columns.index("data")]
    # ... followed by each struct field, addressed through its parent.
    nested_columns = ["data." + field for field in parsed.select("data.*").columns]

    return parsed.select(*(outer_columns + nested_columns)).drop("data")

def write_trusted_data(df, output_path="work/data/trusted/events/"):
    """Print the schema of `df` and append it to the trusted parquet dataset.

    Args:
        df: DataFrame to persist; must contain `event_type` and `day`, which
            are used as partition columns.
        output_path: root folder of the parquet dataset. Defaults to the
            notebook's trusted events folder.

    The write uses append mode, so calling this again with the same rows adds
    them a second time.
    """
    df.printSchema()
    (
        df.write
        .partitionBy("event_type", "day")
        .mode("append")
        .parquet(output_path)
    )

## 5. Staging -> Trusted: Write trusted/transformed data in parquet format





In [37]:
# Process one event type at a time: the rows of that type are filtered out of
# trusted_events, their `data` column is flattened, and the result is written
# to the trusted area.
for value in distinct_values(trusted_events, 'event_type'):
    filtered_df = trusted_events.filter(col('event_type') == value)
    # The schema is inferred from this subset only, so each event type gets its own columns.
    flattened_df = flatten_df(filtered_df)
    write_trusted_data(flattened_df)

root
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- domain: string (nullable = true)
 |-- day: date (nullable = true)
 |-- id: long (nullable = true)
 |-- new_status: string (nullable = true)
 |-- old_status: string (nullable = true)
 |-- reason: string (nullable = true)

root
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- domain: string (nullable = true)
 |-- day: date (nullable = true)
 |-- account_dest_id: long (nullable = true)
 |-- account_orig_id: long (nullable = true)
 |-- amount: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- id: long (nullable = true)



## 6. Read transformed data in parquet format

In [38]:
!ls -la work/data/trusted/events/

total 52
drwxr-xr-x   4 root root  4096 Jul 27 20:39  .
drwxr-xr-x   3 root root  4096 Jul 27 20:20  ..
drwxr-xr-x 421 root root 20480 Jul 27 20:39 'event_type=account-status-change'
drwxr-xr-x 394 root root 20480 Jul 27 20:39 'event_type=transaction-new-value'
-rw-r--r--   1 root root     0 Jul 27 20:39  _SUCCESS
-rw-r--r--   1 root root     8 Jul 27 20:39  ._SUCCESS.crc


In [39]:
trusted_transaction_events = spark.read.parquet('work/data/trusted/events/event_type=transaction-new-value')
trusted_transaction_events.show()

+--------------------+-------------------+-----------+---------------+---------------+-------------------+--------+------+----------+
|            event_id|          timestamp|     domain|account_dest_id|account_orig_id|             amount|currency|    id|       day|
+--------------------+-------------------+-----------+---------------+---------------+-------------------+--------+------+----------+
|8e087607-df4b-493...|2021-05-08 01:26:15|transaction|         183270|         476985|     206949.1867367|     GMD|602324|2021-05-08|
|13087a41-df29-474...|2022-09-03 05:18:05|transaction|         795371|         145173|     4.575345477454|     PLN|620976|2022-09-03|
|5edcb757-4c8c-42d...|2022-09-03 09:05:12|transaction|         710084|         453984|     84628.35971386|     CUC|510148|2022-09-03|
|7859f2d9-60b7-4c9...|2022-04-29 04:56:14|transaction|          36133|         989141| 7.47582603844621E7|     FKP|601859|2022-04-29|
|b5cc94bc-fec4-437...|2022-06-22 11:16:13|transaction|        

In [40]:
trusted_transaction_events.count()

491

In [41]:
trusted_account_events = spark.read.parquet('work/data/trusted/events/event_type=account-status-change')
trusted_account_events.show()

+--------------------+-------------------+-------+------+----------+----------+--------------------+----------+
|            event_id|          timestamp| domain|    id|new_status|old_status|              reason|       day|
+--------------------+-------------------+-------+------+----------+----------+--------------------+----------+
|1e80c9c4-8231-468...|2020-08-30 17:31:04|account|922386| SUSPENDED| SUSPENDED|Strategy reduce p...|2020-08-30|
|42c4acbe-6396-423...|2022-09-07 08:32:11|account|529262| SUSPENDED| SUSPENDED|Country north sev...|2022-09-07|
|6ed47f6d-56c0-411...|2022-03-27 11:56:45|account|326138| SUSPENDED|   BLOCKED|Letter commercial...|2022-03-27|
|f97a4fc8-2ea6-469...|2021-04-26 10:29:52|account|134429|   BLOCKED| SUSPENDED|Manage hard movem...|2021-04-26|
|5032ab26-2e7e-4bd...|2021-05-24 22:52:42|account| 96668|   DELETED|   BLOCKED|Garden interview ...|2021-05-24|
|5041ff99-e471-4e4...|2021-10-07 00:45:05|account|467243|  INACTIVE|   DELETED|Performance provi...|2021

In [42]:
trusted_account_events.count()

515

## 7. Move raw data to processed data folder and clean staging data

In [None]:
!mv work/data/raw/events/* work/data/processed/events/
!ls work/data/processed/events/ -la

total 428
drwxr-xr-x 2 root root   4096 Jul 27 17:59 .
drwxr-xr-x 3 root root   4096 Jul 27 16:28 ..
-rw-r--r-- 1 root root 387821 Jul 27 16:28 fake_events_20230727162819.json
-rw-r--r-- 1 root root  38965 Jul 27 16:30 fake_events_20230727163006.json


In [51]:
!rm -rf work/data/staging/events/*
!ls work/data/staging/events