<a href="https://colab.research.google.com/github/eder1985/pismo_recruiting_technical_case/blob/main/work/notebooks/Colab_Pismo_Recruiting_Technical_Case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><center>Pismo Recruiting Technical Case</center></h1>

---



## 1. Pre-requisites

### Installing Spark

Install Dependencies:


1.   Java 8
2.   Apache Spark with hadoop and
3.   Findspark (used to locate the spark in the system)


In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

Set Environment Variables:

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [3]:
!ls

sample_data  spark-3.1.1-bin-hadoop3.2	spark-3.1.1-bin-hadoop3.2.tgz


In [8]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [9]:
!python --version

Python 3.10.6


### Creating data folders

In [26]:
!mkdir -p work/data/raw/events/
!mkdir -p work/data/processed/events/
!mkdir -p work/data/trusted/events/

## 2. Generate Fake Data

### Installing libs

In [10]:
!pip install -q faker

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.7 MB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m1.1/1.7 MB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m1.5/1.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

### Imports

In [12]:
from faker import Faker
from faker.providers import BaseProvider
from datetime import datetime
from json import dumps
import pandas as pd
import random
import collections
import glob
import os

### Generating fake `event_id`: random UUIDs for unique and non-repeated event_id values

In [13]:
fake = Faker()
Faker.seed(random.randrange(0, 99999999999999999999, 1))
fake_event_id = fake.uuid4()
print(fake_event_id)

32e7615a-0d59-4494-a175-4b93f2082a64


### Generating custom fake `event_id`: random UUIDs for duplicate event_id scenario

In [16]:
class CustomUUIDProvider(BaseProvider):
    def custom_uuid(self):
        list_uuids = [
            '1a1a1a1a-1a1a-1a1a-1a1a-1a1a1a1a1a1a',
            '2b2b2b2b-2b2b-2b2b-2b2b-2b2b2b2b2b2b',
            '3c3c3c3c-3c3c-3c3c-3c3c-3c3c3c3c3c3c'
            ]
        return random.choice(list_uuids)

### Generating fake `timestamp`: random timestamps with values until 3 years ago

In [17]:
fake_timestamp = datetime.strftime(fake.date_time_between(start_date='-3y', end_date='now'),"%Y-%m-%dT%H:%M:%S")
print(fake_timestamp)

2022-12-22T08:53:28


### Generating custom fake `event_type`: random values based on list


In [19]:
class EventTypeProvider(BaseProvider):
    def event_type(self):
        list_event_types = ['account-status-change','transaction-new-value']
        return random.choice(list_event_types)

fake.add_provider(EventTypeProvider)

fake_event_type = fake.event_type()
print(fake_event_type)

account-status-change


### Generating fake `status`: random values based on list

In [20]:
class StatusTypeProvider(BaseProvider):
    def status_type(self):
        list_status_types = ['ACTIVE','INACTIVE','SUSPENDED','BLOCKED', 'DELETED']
        return random.choice(list_status_types)

fake.add_provider(StatusTypeProvider)

fake_status_type = fake.status_type()
print(fake_status_type)

ACTIVE


### Generating custom fake `data`: values based on dict

In [21]:
class CustomDataProvider(BaseProvider):
      def custom_data(self):
        dict_data = {
            "account-status-change": collections.OrderedDict([
                ('id', fake.random_number(digits=6)),
                ('old_status', fake.status_type()),
                ('new_status', fake.status_type()),
                ('reason', fake.sentence(nb_words=5))
            ]),
            "transaction-new-value": collections.OrderedDict([
                ('id', fake.random_number(digits=6)),
                ('account_orig_id', fake.random_number(digits=6)),
                ('account_dest_id', fake.random_number(digits=6)),
                ('amount', fake.pyfloat(positive=True)),
                ('currency', fake.currency_code())
            ])
        }
        return dict_data

fake.add_provider(CustomDataProvider)

fake_custom_data = fake.custom_data().get(fake.event_type())
print(fake_custom_data)

OrderedDict([('id', 63888), ('account_orig_id', 838769), ('account_dest_id', 187050), ('amount', 891287578.480053), ('currency', 'SOS')])


### Defining `write_fake_data` and `read_fake_data` functions

In [25]:
def write_fake_data(fake, length, destination_path, unique_uuid = True):

    database = []
    current_time = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = 'fake_events_'+current_time

    for x in range(length):
        uuid = fake.uuid4() if unique_uuid else fake.custom_uuid()
        event_type = fake.event_type()
        project_domain_name = event_type.split('-')[0]

        database.append(collections.OrderedDict([
            ('event_id', uuid),
            ('timestamp', datetime.strftime(fake.date_time_between(start_date='-3y', end_date='now'),"%Y-%m-%dT%H:%M:%S")),
            ('domain', project_domain_name),
            ('event_type', event_type),
            ('data', fake.custom_data().get(event_type))
        ]))

    with open('%s%s.json' % (destination_path, filename), 'w') as output:
        output.write(dumps(database, indent=4, sort_keys=False, default=str))

    print("Done.")

def read_fake_data(json_filepath):
    json_files = [os.path.normpath(i) for i in glob.glob(json_filepath)]
    df = pd.concat([pd.read_json(f) for f in json_files])
    return df

def run(length, unique_uuid = True):
    fake = Faker()
    Faker.seed(random.randrange(0, 99999999999999999999, 1))
    fake.add_provider(StatusTypeProvider)
    fake.add_provider(CustomUUIDProvider)
    fake.add_provider(EventTypeProvider)
    fake.add_provider(CustomDataProvider)

    destination_path = 'work/data/raw/events/'
    write_fake_data(fake, length, destination_path,unique_uuid)

    json_filepath = destination_path+'*.json'
    fake_data = read_fake_data(json_filepath)
    print(fake_data)

### Writing and reading fake data

> Writing json file with 1000 unique events

In [27]:
run(1000)

Done.
                                 event_id           timestamp       domain  \
0    d9ddf046-05ab-48a5-b411-02dd9955a2c5 2021-08-20 10:02:39  transaction   
1    318cbfd1-5bbb-45f3-8874-22e1a7fc386e 2021-05-05 13:53:16      account   
2    03732535-f661-46e3-84f5-9e541bb461f2 2023-06-26 10:35:34  transaction   
3    d3096c5d-7c93-4170-b87a-1a6377100b36 2020-12-22 13:53:11      account   
4    dff9946e-d91e-4616-a881-06a4eb29089f 2020-12-23 00:17:02  transaction   
..                                    ...                 ...          ...   
995  a493c58d-05b6-415b-b6c0-c63b99cd6924 2021-09-06 23:41:55  transaction   
996  6a6c4c3d-87bc-4642-b7c6-04e278f7eda7 2023-07-27 04:11:42  transaction   
997  ac40af3a-662c-406f-9508-a1a186520c76 2021-02-03 21:08:35  transaction   
998  04e6a811-7eba-4848-8e49-e94666f5fc31 2021-04-17 07:32:35      account   
999  f59ea7fa-8edc-4ed4-94fe-53a3a685639f 2021-12-24 02:57:06  transaction   

                event_type                               

> Writing json file with 100 events with duplicate rows

In [28]:
run(100,unique_uuid = False)

Done.
                                 event_id           timestamp       domain  \
0    1a1a1a1a-1a1a-1a1a-1a1a-1a1a1a1a1a1a 2021-09-11 21:14:46      account   
1    1a1a1a1a-1a1a-1a1a-1a1a-1a1a1a1a1a1a 2022-11-01 21:36:41      account   
2    2b2b2b2b-2b2b-2b2b-2b2b-2b2b2b2b2b2b 2020-09-03 05:12:39      account   
3    3c3c3c3c-3c3c-3c3c-3c3c-3c3c3c3c3c3c 2021-10-24 01:31:38  transaction   
4    3c3c3c3c-3c3c-3c3c-3c3c-3c3c3c3c3c3c 2021-08-05 08:49:07  transaction   
..                                    ...                 ...          ...   
995  a493c58d-05b6-415b-b6c0-c63b99cd6924 2021-09-06 23:41:55  transaction   
996  6a6c4c3d-87bc-4642-b7c6-04e278f7eda7 2023-07-27 04:11:42  transaction   
997  ac40af3a-662c-406f-9508-a1a186520c76 2021-02-03 21:08:35  transaction   
998  04e6a811-7eba-4848-8e49-e94666f5fc31 2021-04-17 07:32:35      account   
999  f59ea7fa-8edc-4ed4-94fe-53a3a685639f 2021-12-24 02:57:06  transaction   

                event_type                               

## 3. Exploring the Raw Dataframe

### Loading the Dataframe:

- List raw json files
- Define raw_events_schema with only StringType for not lost data
- Load raw_events dataframe

In [29]:
!ls work/data/raw/events/ -la

total 428
drwxr-xr-x 2 root root   4096 Jul 27 16:30 .
drwxr-xr-x 3 root root   4096 Jul 27 16:28 ..
-rw-r--r-- 1 root root 387821 Jul 27 16:28 fake_events_20230727162819.json
-rw-r--r-- 1 root root  38965 Jul 27 16:30 fake_events_20230727163006.json


In [30]:

from pyspark.sql.types import StructType,StructField, StringType

raw_events_schema = StructType([
      StructField("data",StringType(),True),
      StructField("domain",StringType(),True),
      StructField("event_id",StringType(),True),
      StructField("event_type",StringType(),True),
      StructField("timestamp",StringType(),True)
  ])

In [31]:
raw_events = spark.read.option("multiline","true").schema(raw_events_schema).json('work/data/raw/events/')
raw_events.show(5, truncate = False)

+-------------------------------------------------------------------------------------------------------------+-----------+------------------------------------+---------------------+-------------------+
|data                                                                                                         |domain     |event_id                            |event_type           |timestamp          |
+-------------------------------------------------------------------------------------------------------------+-----------+------------------------------------+---------------------+-------------------+
|{"id":727466,"account_orig_id":915391,"account_dest_id":465171,"amount":734.448561822683,"currency":"CNY"}   |transaction|d9ddf046-05ab-48a5-b411-02dd9955a2c5|transaction-new-value|2021-08-20T10:02:39|
|{"id":559660,"old_status":"BLOCKED","new_status":"BLOCKED","reason":"Usually truth card east."}              |account    |318cbfd1-5bbb-45f3-8874-22e1a7fc386e|account-status-change|2021-0

In [32]:
raw_events.count()

1100

### Dataframe Raw Schema

In [33]:
raw_events.printSchema()

root
 |-- data: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



## 4. Applying transformations

### Columns `timestamp` and `day` transformations

In [34]:
partial_events = raw_events\
  .withColumn("timestamp",to_timestamp("timestamp"))\
  .withColumn("day",to_date("timestamp"))

In [35]:
partial_events.printSchema()

root
 |-- data: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- day: date (nullable = true)



### Drop duplicated events

In [36]:
from pyspark.sql.functions import countDistinct

# Count
partial_events.select(countDistinct("event_id", "event_type").alias("distinct_events")).show()


+---------------+
|distinct_events|
+---------------+
|           1006|
+---------------+



In [37]:
grouped_events = partial_events \
  .groupBy( \
      col("event_id"), \
      col("event_type")) \
  .agg( \
      max(col("timestamp")))

grouped_events.show(truncate = False)

+------------------------------------+---------------------+-------------------+
|event_id                            |event_type           |max(timestamp)     |
+------------------------------------+---------------------+-------------------+
|c8b7a073-2967-42a2-b1a2-57932bac2d80|transaction-new-value|2023-01-12 09:34:38|
|d5ee70c5-72e2-4dfc-b044-8a02b6495af9|transaction-new-value|2023-04-19 07:46:50|
|52e834f4-1569-440c-acdd-970ab5a268ad|account-status-change|2023-03-21 08:23:33|
|a3115854-83f0-4dca-a592-dcd99541631e|account-status-change|2023-03-01 03:13:25|
|c0226b43-9864-4fdb-8060-53d465cf5108|transaction-new-value|2022-11-18 00:20:19|
|5e04f614-6c0a-49ba-8863-276a955b4256|transaction-new-value|2020-09-29 02:48:30|
|1f6c4ab7-27ec-4f11-a0ee-ebd4ce14ea6b|transaction-new-value|2020-10-04 18:12:24|
|c5e337bc-9f41-4d69-af2b-d0552e6c0011|transaction-new-value|2022-12-12 21:27:54|
|1447f2d2-7a1b-469f-942a-aa3589e26a5b|account-status-change|2023-04-22 18:32:47|
|d0590eb9-c506-454e-9705-227

In [38]:
final_events = grouped_events \
    .join(partial_events, ["event_id","event_type"]) \
    .dropDuplicates(["event_id","event_type"]) \
    .drop("timestamp") \
    .withColumnRenamed("max(timestamp)", "timestamp")
final_events.show()

+--------------------+--------------------+-------------------+--------------------+-----------+----------+
|            event_id|          event_type|          timestamp|                data|     domain|       day|
+--------------------+--------------------+-------------------+--------------------+-----------+----------+
|52e834f4-1569-440...|account-status-ch...|2023-03-21 08:23:33|{"id":625198,"old...|    account|2023-03-21|
|a3115854-83f0-4dc...|account-status-ch...|2023-03-01 03:13:25|{"id":132643,"old...|    account|2023-03-01|
|c0226b43-9864-4fd...|transaction-new-v...|2022-11-18 00:20:19|{"id":398056,"acc...|transaction|2022-11-18|
|c8b7a073-2967-42a...|transaction-new-v...|2023-01-12 09:34:38|{"id":31953,"acco...|transaction|2023-01-12|
|d5ee70c5-72e2-4df...|transaction-new-v...|2023-04-19 07:46:50|{"id":494669,"acc...|transaction|2023-04-19|
|1447f2d2-7a1b-469...|account-status-ch...|2023-04-22 18:32:47|{"id":958560,"old...|    account|2023-04-22|
|1f6c4ab7-27ec-4f1...|transa

In [39]:
final_events.count()

1006

### Flatten `data` column by event-type

> Each event-type has its own schema

In [40]:
final_events.groupBy('event_type').agg(count(col('event_type'))).show(truncate=False)

+---------------------+-----------------+
|event_type           |count(event_type)|
+---------------------+-----------------+
|account-status-change|483              |
|transaction-new-value|523              |
+---------------------+-----------------+



In [41]:
def distinct_values(df, col):
    list_distinct_values = df.select(col).distinct().toPandas()[col].to_list()
    return list_distinct_values

def flatten_df(df):
    json_schema = spark.read.json(df.rdd.map(lambda row: row.data)).schema
    df2 = df.withColumn("data", from_json("data", json_schema))
    col1 = df2.columns
    col1.remove("data")
    col2 = df2.select("data.*").columns
    append_str ="data."
    col3 = [append_str + val for val in col2]
    col_list = col1 + col3
    df3 = df2.select(*col_list).drop("data")
    return df3

def write_parquet_data(df):
    df.printSchema()
    df.write \
    .partitionBy("event_type", "day") \
    .mode("append") \
    .parquet("work/data/trusted/events/")

## 5. Write transformed data in parquet format





In [42]:
for value in distinct_values(final_events, 'event_type'):
    filtered_df = final_events.filter(col('event_type') == value)
    flattened_df = flatten_df(filtered_df)
    write_parquet_data(flattened_df)

root
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- domain: string (nullable = true)
 |-- day: date (nullable = true)
 |-- id: long (nullable = true)
 |-- new_status: string (nullable = true)
 |-- old_status: string (nullable = true)
 |-- reason: string (nullable = true)

root
 |-- event_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- domain: string (nullable = true)
 |-- day: date (nullable = true)
 |-- account_dest_id: long (nullable = true)
 |-- account_orig_id: long (nullable = true)
 |-- amount: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- id: long (nullable = true)



## 6. Read transformed data in parquet format

In [43]:
!ls -la work/data/trusted/events/

total 52
drwxr-xr-x   4 root root  4096 Jul 27 17:58  .
drwxr-xr-x   3 root root  4096 Jul 27 16:28  ..
drwxr-xr-x 392 root root 20480 Jul 27 17:57 'event_type=account-status-change'
drwxr-xr-x 406 root root 20480 Jul 27 17:58 'event_type=transaction-new-value'
-rw-r--r--   1 root root     0 Jul 27 17:58  _SUCCESS
-rw-r--r--   1 root root     8 Jul 27 17:58  ._SUCCESS.crc


In [44]:
trusted_transaction_events = spark.read.parquet('work/data/trusted/events/event_type=transaction-new-value')
trusted_transaction_events.show()

+--------------------+-------------------+-----------+---------------+---------------+-------------------+--------+------+----------+
|            event_id|          timestamp|     domain|account_dest_id|account_orig_id|             amount|currency|    id|       day|
+--------------------+-------------------+-----------+---------------+---------------+-------------------+--------+------+----------+
|5e04f614-6c0a-49b...|2020-09-29 02:48:30|transaction|         263662|         759139| 8.17173960051861E9|     AED|237285|2020-09-29|
|7312e48d-158b-432...|2020-09-29 17:38:56|transaction|         885368|         727721| 3.41947704352005E9|     PGK| 57630|2020-09-29|
|e1d73bb0-2a74-450...|2020-09-29 17:31:25|transaction|         534542|         694999|   5332419.77476907|     ZAR|346215|2020-09-29|
|dc63e448-4908-453...|2022-07-16 23:57:53|transaction|          57176|         786839|   138531.541359864|     RWF|370925|2022-07-16|
|6331dd00-a620-470...|2021-10-22 01:24:00|transaction|        

In [45]:
trusted_transaction_events.count()

523

In [46]:
trusted_account_events = spark.read.parquet('work/data/trusted/events/event_type=account-status-change')
trusted_account_events.show()

+--------------------+-------------------+-------+------+----------+----------+--------------------+----------+
|            event_id|          timestamp| domain|    id|new_status|old_status|              reason|       day|
+--------------------+-------------------+-------+------+----------+----------+--------------------+----------+
|b7c638b2-5cb1-440...|2021-01-01 00:10:00|account|989694|   BLOCKED| SUSPENDED|Former small poli...|2021-01-01|
|2b58a0d5-459b-434...|2021-01-23 18:28:00|account|145771|   BLOCKED|    ACTIVE|Both relationship...|2021-01-23|
|397ce6c9-8d98-486...|2022-10-18 17:57:53|account|540100| SUSPENDED|   BLOCKED|Exist agreement m...|2022-10-18|
|251b3d2b-1e75-4cd...|2020-11-27 07:26:53|account|917054|   DELETED| SUSPENDED|Computer garden e...|2020-11-27|
|ae1a431e-3c58-498...|2022-12-14 13:21:13|account|874415| SUSPENDED|  INACTIVE|Officer often fou...|2022-12-14|
|fca5219e-8933-43f...|2023-07-05 10:41:27|account|789096|  INACTIVE|  INACTIVE|Specific product ...|2023

In [47]:
trusted_account_events.count()

483

## 7. Move raw data to processed data folder

In [48]:
!mv work/data/raw/events/* work/data/processed/events/
!ls work/data/processed/events/ -la

total 428
drwxr-xr-x 2 root root   4096 Jul 27 17:59 .
drwxr-xr-x 3 root root   4096 Jul 27 16:28 ..
-rw-r--r-- 1 root root 387821 Jul 27 16:28 fake_events_20230727162819.json
-rw-r--r-- 1 root root  38965 Jul 27 16:30 fake_events_20230727163006.json
