In [1]:
import os
import sys


# Point both the PySpark workers and the driver at the interpreter running this notebook.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session for this notebook, running with a local master.
spark = (
    SparkSession.builder
    .appName('PreprocessData')
    .master("local[*]")
    .getOrCreate()
)

In [5]:
# Load the raw transactions parquet dataset into a Spark DataFrame.
df = spark.read.parquet('../data/raw/online_transaction.parquet')

In [6]:
# Preview the first rows of the raw data as a text table.
df.show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|M123070170

In [8]:
from datetime import datetime, timedelta
from pyspark.sql import functions as F

# Base instant that step 0 maps to; every `step` is added to it as a number of hours.
dt = datetime(2023, 5, 1, 0)


def step_to_date(step, dt = dt):
    """Convert an hourly step offset into a timestamp string.

    Args:
        step: Number of hours to add to ``dt``; any value ``int()`` accepts
            (int, numeric string, ...). ``None`` is returned unchanged rather
            than raising inside ``int()``, so a missing step yields a missing
            result instead of an exception.
        dt: Base ``datetime`` the offset is added to. Defaults to the
            module-level ``dt`` (2023-05-01 00:00).

    Returns:
        ``str(dt + timedelta(hours=int(step)))``, e.g. ``'2023-05-01 01:00:00'``,
        or ``None`` when ``step`` is ``None``.

    Raises:
        ValueError, TypeError: if ``step`` is not ``None`` and cannot be
            converted by ``int()``.
    """
    if step is None:
        return None

    shifted = dt + timedelta(hours=int(step))
    return str(shifted)

In [9]:
from pyspark.sql.types import TimestampType, StringType

# Expose step_to_date as a Spark UDF that produces a string column. The function is
# passed directly (the former lambda only forwarded its single argument); when the UDF
# is applied to one column, step_to_date's `dt` default is used.
step_to_date_UDF = F.udf(step_to_date, StringType())

In [12]:
# Add a 'datetime' column: the UDF renders each step as a timestamp string, which
# to_timestamp then parses into a timestamp value.
# NOTE(review): this goes through a Python UDF plus a string round-trip; a native
# Spark interval expression could probably replace it — confirm timezone behaviour first.
step_as_text = step_to_date_UDF(F.col('step'))
df_transform = df.withColumn('datetime', F.to_timestamp(step_as_text))

df_transform.show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+-------------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|           datetime|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+-------------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|2023-05-01 01:00:00|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|2023-05-01 01:00:00|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|2023-05-01 01:00:00|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       211

In [16]:
# 'DiffOrg' is the change in the originator's balance, rounded to 2 decimals:
# new - old for CASH_IN rows, old - new for every other transaction type.
balance_before = F.col('oldbalanceOrg')
balance_after = F.col('newbalanceOrig')
is_cash_in = F.col('type') == 'CASH_IN'

df_transform = df_transform.withColumn(
    'DiffOrg',
    F.when(is_cash_in, F.round(balance_after - balance_before, 2))
     .otherwise(F.round(balance_before - balance_after, 2)),
)

In [17]:
# 'DiffOrgStatus' flags rows whose transaction amount equals the balance change:
# 1 when 'amount' == 'DiffOrg', 0 in every other case.
amount_matches_diff = F.col('amount') == F.col('DiffOrg')

df_transform = df_transform.withColumn(
    'DiffOrgStatus',
    F.when(amount_matches_diff, 1).otherwise(0),
)

In [18]:
# Remove the destination-account columns from the working frame.
dest_columns = ('nameDest', 'oldbalanceDest', 'newbalanceDest')
df_transform = df_transform.drop(*dest_columns)

In [20]:
# Preview the transformed frame (new datetime / DiffOrg / DiffOrgStatus columns,
# destination columns dropped).
df_transform.show()

+----+--------+---------+-----------+-------------+--------------+-------+--------------+-------------------+--------+-------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|isFraud|isFlaggedFraud|           datetime| DiffOrg|DiffOrgStatus|
+----+--------+---------+-----------+-------------+--------------+-------+--------------+-------------------+--------+-------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|      0|             0|2023-05-01 01:00:00| 9839.64|            1|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|      0|             0|2023-05-01 01:00:00| 1864.28|            1|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0|      1|             0|2023-05-01 01:00:00|   181.0|            1|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|      1|             0|2023-05-01 01:00:00|   181.0|            1|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|   

In [None]:
# Persist the preprocessed frame as parquet. mode("overwrite") replaces output left by
# a previous run; without it the writer raises when the target path already exists,
# so the notebook could only be run end-to-end once.
df_transform.write.mode("overwrite").parquet("../data/preprocessed/online_transactions.parquet")

In [7]:
# Import the class (not the module) so the name `datetime` keeps referring to the
# datetime class, as in the earlier `from datetime import datetime, timedelta` cell.
from datetime import datetime

# Current local time as a compact YYYYmmddHHMMSS string.
# NOTE(review): presumably meant as a unique suffix for output paths — confirm intent.
datetime.now().strftime('%Y%m%d%H%M%S')

'20230721114921'