In [1]:
from zipfile import ZipFile

# extract zip data
with ZipFile("../data/raw/PS_20174392719_1491204439457_log.csv (1).zip", 'r') as zObject:
    zObject.extractall(path="../data/raw/")

In [2]:
import os
import sys


os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession \
        .builder.appName('PreprocessData') \
        .master("local[*]")\
        .getOrCreate()

In [5]:
df = spark.read \
    .option("header", "true") \
    .option("index", "true") \
    .csv('../data/raw/PS_20174392719_1491204439457_log.csv')

In [6]:
df.printSchema()

root
 |-- step: string (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: string (nullable = true)
 |-- newbalanceOrig: string (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: string (nullable = true)
 |-- newbalanceDest: string (nullable = true)
 |-- isFraud: string (nullable = true)
 |-- isFlaggedFraud: string (nullable = true)



In [7]:
df.show(10)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [8]:
from pyspark.sql.functions import col,isnan, when, count

# check missing values
df.select(\
    [count( \
        when( isnan(c) | col(c).isNull(), c) \
    ).alias(c) for c in df.columns])\
   .show()

+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+
|step|type|amount|nameOrig|oldbalanceOrg|newbalanceOrig|nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+
|   0|   0|     0|       0|            0|             0|       0|             0|             0|      0|             0|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+



In [8]:
from datetime import datetime, timedelta
from pyspark.sql import functions as F

dt = datetime(2023, 5, 1, 0)
# print(dt + timedelta(days = 4, hours=700))

def step_to_date(step, dt = dt):
    step = int(step)
    new_dt = dt + timedelta(hours=step)

    new_dt_str = new_dt.__str__()
    return new_dt_str

In [56]:
from pyspark.sql.types import TimestampType, StringType, IntegerType, LongType

step_to_date_UDF = F.udf(lambda z: step_to_date(z),StringType())

In [10]:
# df.select(F.col("step"), \
#     step_to_date_UDF(F.col("step")).alias("datetime") ) \
#     .withColumn("datetime", F.to_timestamp("datetime"))\
#    .show()

df_transform = df.withColumn('datetime',\
              F.to_timestamp(\
                  step_to_date_UDF(F.col('step'))\
              ))\
             .select(
                 'datetime',
                 'type',
                'amount',
                'nameOrig',
                'oldbalanceOrg',
                'newbalanceOrig',
                'nameDest',
                'oldbalanceDest',
                'newbalanceDest',
                'isFraud',
                'isFlaggedFraud',
                )

df_transform.show()

+-------------------+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|           datetime|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+-------------------+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|2023-05-01 01:00:00| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|2023-05-01 01:00:00| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|2023-05-01 01:00:00|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|2023-05-01 01:00:00|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|          

In [49]:
df_transform = df_transform.withColumn("id", F.monotonically_increasing_id().cast(IntegerType()))

df_transform.show()

+-------------------+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---+------------+
|           datetime|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud| id|     id_date|
+-------------------+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---+------------+
|2023-05-01 01:00:00| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|  0| 20230501010|
|2023-05-01 01:00:00| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|  1| 20230501011|
|2023-05-01 01:00:00|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|  2| 20230501012|
|2023-05-01 01:0

In [50]:
df_transform.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: string (nullable = true)
 |-- newbalanceOrig: string (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: string (nullable = true)
 |-- newbalanceDest: string (nullable = true)
 |-- isFraud: string (nullable = true)
 |-- isFlaggedFraud: string (nullable = true)
 |-- id: integer (nullable = false)
 |-- id_date: string (nullable = true)



In [62]:
df_transform = df_transform.withColumn('id_date', F.date_format(F.col('datetime'), "yyyyMMdd").cast(IntegerType()))

df_transform.show()

+-------------------+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---+--------+
|           datetime|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud| id| id_date|
+-------------------+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---+--------+
|2023-05-01 01:00:00| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|  0|20230501|
|2023-05-01 01:00:00| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|  1|20230501|
|2023-05-01 01:00:00|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|  2|20230501|
|2023-05-01 01:00:00|CASH_OUT|    181.0|

In [63]:
df_transform.tail(1)

[Row(datetime=datetime.datetime(2023, 5, 31, 23, 0), type='CASH_OUT', amount='850002.52', nameOrig='C1280323807', oldbalanceOrg='850002.52', newbalanceOrig='0.0', nameDest='C873221189', oldbalanceDest='6510099.11', newbalanceDest='7360101.63', isFraud='1', isFlaggedFraud='0', id=1546705, id_date=20230531)]

In [54]:
df_transform.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: string (nullable = true)
 |-- newbalanceOrig: string (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: string (nullable = true)
 |-- newbalanceDest: string (nullable = true)
 |-- isFraud: string (nullable = true)
 |-- isFlaggedFraud: string (nullable = true)
 |-- id: integer (nullable = false)
 |-- id_date: string (nullable = true)



In [40]:
# df_transform.write.options(header='True', delimiter=',') \
#  .csv("../data/preprocessed/PS_20174392719_1491204439457_log.csv")

df_transform.write.parquet("../data/preprocessed/online_transactions.parquet")

In [47]:
spark.stop()