In [1]:
import os

from src.config.spark import get_spark_session
from src.extract.extract_src_data import extract_src_db, extract_src_csv

# Point this process at the Hadoop distribution and the PostgreSQL JDBC driver
# that live under `library/`. Both paths are resolved against the current
# working directory, so the notebook has to be started from the directory that
# contains `library/`.
PARENT_DIR = os.getcwd()
HADOOP_PATH = os.path.join(PARENT_DIR, "library/hadoop")
POSTGRES_DRIVER_PATH = os.path.join(PARENT_DIR, "library/postgre/postgresql-42.7.5.jar")

os.environ["HADOOP_HOME"] = HADOOP_PATH

# Add Hadoop's bin directory to PATH only when it is not already listed, so
# re-running this cell in a live kernel does not keep appending duplicates.
# PATH is read with .get() so an unset PATH does not raise KeyError.
HADOOP_BIN_PATH = os.path.join(HADOOP_PATH, "bin")
_current_path = os.environ.get("PATH", "")
if HADOOP_BIN_PATH not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        _current_path + os.pathsep + HADOOP_BIN_PATH if _current_path else HADOOP_BIN_PATH
    )

In [2]:
# Create the Spark session that every extract in this notebook shares.
# NOTE(review): the first argument is presumably the Spark application name and
# the second the JDBC driver jar to load; confirm against
# src.config.spark.get_spark_session.
APP_NAME = "week6-profiling"
spark_session = get_spark_session(APP_NAME, POSTGRES_DRIVER_PATH)

In [3]:
# Read the three lookup/campaign tables through extract_src_db, one result per
# table (presumably Spark DataFrames; confirm against src.extract).
def _load_table(table_name):
    """Fetch a single table by name using the shared Spark session."""
    return extract_src_db(spark_session, table_name)


education_status_df = _load_table("education_status")
marital_status_df = _load_table("marital_status")
marketing_campaign_df = _load_table("marketing_campaign_deposit")

In [4]:
# Load the bank-transaction CSV directory and print the first five rows as a
# quick sanity check of the columns and raw value formats.
CSV_SOURCE_DIR = "data/new_bank_transaction_csv/"
csv_data_df = extract_src_csv(spark_session, CSV_SOURCE_DIR)
csv_data_df.show(5)

+-------------+----------+-----------+----------+------------+------------------+---------------+---------------+-----------------------+
|TransactionID|CustomerID|CustomerDOB|CustGender|CustLocation|CustAccountBalance|TransactionDate|TransactionTime|TransactionAmount (INR)|
+-------------+----------+-----------+----------+------------+------------------+---------------+---------------+-----------------------+
|      T401396|  C1010024|    21/6/65|         M|     KOLKATA|          87058.65|        18/8/16|         141103|                 5000.0|
|      T303294|  C1010068|    14/7/76|         M|     GURGAON|          46741.73|        10/8/16|         101617|                  546.0|
|      T347496|  C1010081|     1/5/89|         M|   GHAZIABAD|           1584.18|        14/8/16|         144742|                  429.0|
|      T329017|C1010081_2|     2/9/77|         F|   PANCHKULA|          23319.04|        15/8/16|         172658|                 1699.0|
|      T113706|C1010081_3|    11/2

In [5]:
# Split the flat CSV extract into two projections: customer attributes and
# transaction facts. CustomerID appears in both selections.
customer_columns = [
    "CustomerID",
    "CustomerDOB",
    "CustGender",
    "CustLocation",
    "CustAccountBalance",
]
transaction_columns = [
    "TransactionID",
    "CustomerID",
    "TransactionDate",
    "TransactionTime",
    "TransactionAmount (INR)",
]

customers_df = csv_data_df.select(*customer_columns)
transactions_df = csv_data_df.select(*transaction_columns)

# Preview only the transaction projection.
transactions_df.show(5)

+-------------+----------+---------------+---------------+-----------------------+
|TransactionID|CustomerID|TransactionDate|TransactionTime|TransactionAmount (INR)|
+-------------+----------+---------------+---------------+-----------------------+
|      T401396|  C1010024|        18/8/16|         141103|                 5000.0|
|      T303294|  C1010068|        10/8/16|         101617|                  546.0|
|      T347496|  C1010081|        14/8/16|         144742|                  429.0|
|      T329017|C1010081_2|        15/8/16|         172658|                 1699.0|
|      T113706|C1010081_3|         6/8/16|         145828|                 2856.0|
+-------------+----------+---------------+---------------+-----------------------+
only showing top 5 rows



In [6]:
# Preview the transaction rows, then print per-column summary statistics.
# NOTE(review): the recorded output of this cell includes a `summary` table
# that show(5) on its own cannot produce, so the statistics call was missing
# from the cell source. describe() is assumed here; switch to summary() if the
# original output also listed quartiles.
transactions_df.show(5)
transactions_df.describe().show()

+-------------+----------+---------------+---------------+-----------------------+
|TransactionID|CustomerID|TransactionDate|TransactionTime|TransactionAmount (INR)|
+-------------+----------+---------------+---------------+-----------------------+
|      T401396|  C1010024|        18/8/16|         141103|                 5000.0|
|      T303294|  C1010068|        10/8/16|         101617|                  546.0|
|      T347496|  C1010081|        14/8/16|         144742|                  429.0|
|      T329017|C1010081_2|        15/8/16|         172658|                 1699.0|
|      T113706|C1010081_3|         6/8/16|         145828|                 2856.0|
+-------------+----------+---------------+---------------+-----------------------+
only showing top 5 rows

+-------+-------------+----------+---------------+------------------+-----------------------+
|summary|TransactionID|CustomerID|TransactionDate|   TransactionTime|TransactionAmount (INR)|
+-------+-------------+----------+------