In [1]:
import pyspark
import pyspark.sql.functions as F
import mysql.connector as mydbconnection
from mysql.connector import Error
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws
from credentials import mysql_username, mysql_password

# Start the Spark session used by every later cell; getOrCreate() hands back
# an already-running session if one exists instead of building a second one.
session_builder = SparkSession.builder.appName('creditcard-pyspark')
spark = session_builder.getOrCreate()

In [2]:
# Read the raw credit-card transactions (JSON source) into a Spark DataFrame
# and preview the first rows.
credit_json_path = "json_source_data/cdw_sapp_credit.json"
df_credit = spark.read.json(credit_json_path)
df_credit.show()

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|        114|4210653349028689|123459988| 14|    2|             1|       Education|             78.9|2018|
|         35|4210653349028689|123459988| 20|    3|             2|   Entertainment|            14.24|2018|
|        160|4210653349028689|123459988|  8|    7|             3|         Grocery|             56.7|2018|
|        114|4210653349028689|123459988| 19|    4|             4|   Entertainment|            59.73|2018|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|
|        164|4210653349028689|123459988| 28|    5|             6|       Education|             6.89|2018|
|        119|4210653349028689|123459988| 19|  

In [3]:
# Quick profile of the raw data: column names, inferred schema, summary
# statistics and a sample of rows.
# The column list is printed explicitly: a bare expression is only echoed by
# the notebook when it is the last statement of the cell, so the original
# `df_credit.columns` line produced no output at all.
print(df_credit.columns)
df_credit.printSchema()
df_credit.describe().show()
df_credit.show()

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: long (nullable = true)
 |-- MONTH: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: long (nullable = true)

+-------+------------------+--------------------+-------------------+-----------------+------------------+------------------+----------------+------------------+--------------------+
|summary|       BRANCH_CODE|      CREDIT_CARD_NO|           CUST_SSN|              DAY|             MONTH|    TRANSACTION_ID|TRANSACTION_TYPE| TRANSACTION_VALUE|                YEAR|
+-------+------------------+--------------------+-------------------+-----------------+------------------+------------------+----------------+------------------+--------------------+
|  count|             46694|               46694|              46694|        

In [4]:
# Build the TIMEID column (YYYYMMDD) from the YEAR, MONTH and DAY columns.
# MONTH and DAY are left-padded with '0' to two digits, so with a four-digit
# year every TIMEID is eight characters long (the width of the VARCHAR(8)
# TIMEID column in the target table).
# The source columns are longs; they are cast to string explicitly rather than
# relying on Spark's implicit numeric -> string coercion inside lpad/concat_ws.
# Column names use the same upper-case spelling as the schema and the later
# select('TIMEID'), so the cell does not depend on case-insensitive resolution.
df_credit = df_credit.withColumn(
    "TIMEID",
    concat_ws(
        '',
        F.col('YEAR').cast('string'),
        F.lpad(F.col('MONTH').cast('string'), 2, '0'),
        F.lpad(F.col('DAY').cast('string'), 2, '0'),
    ),
)

In [5]:
# Keep only the columns needed for the load, arranged in the same order as the
# columns of the cdw_sapp_credit_card table definition (YEAR/MONTH/DAY are
# dropped because TIMEID now carries the date).
target_columns = [
    'CREDIT_CARD_NO',
    'TIMEID',
    'CUST_SSN',
    'BRANCH_CODE',
    'TRANSACTION_TYPE',
    'TRANSACTION_VALUE',
    'TRANSACTION_ID',
]
df_credit = df_credit.select(*target_columns)
df_credit.show()

+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|  CREDIT_CARD_NO|  TIMEID| CUST_SSN|BRANCH_CODE|TRANSACTION_TYPE|TRANSACTION_VALUE|TRANSACTION_ID|
+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|4210653349028689|20180214|123459988|        114|       Education|             78.9|             1|
|4210653349028689|20180320|123459988|         35|   Entertainment|            14.24|             2|
|4210653349028689|20180708|123459988|        160|         Grocery|             56.7|             3|
|4210653349028689|20180419|123459988|        114|   Entertainment|            59.73|             4|
|4210653349028689|20181010|123459988|         93|             Gas|             3.59|             5|
|4210653349028689|20180528|123459988|        164|       Education|             6.89|             6|
|4210653349028689|20180519|123459988|        119|   Entertainment|            43.39|             7|


In [6]:
# Rename the card-number column to the name used by the target table.
df_credit = df_credit.withColumnRenamed(
    existing="CREDIT_CARD_NO",
    new="CUST_CC_NO",
)
df_credit.show()


+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|      CUST_CC_NO|  TIMEID| CUST_SSN|BRANCH_CODE|TRANSACTION_TYPE|TRANSACTION_VALUE|TRANSACTION_ID|
+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|4210653349028689|20180214|123459988|        114|       Education|             78.9|             1|
|4210653349028689|20180320|123459988|         35|   Entertainment|            14.24|             2|
|4210653349028689|20180708|123459988|        160|         Grocery|             56.7|             3|
|4210653349028689|20180419|123459988|        114|   Entertainment|            59.73|             4|
|4210653349028689|20181010|123459988|         93|             Gas|             3.59|             5|
|4210653349028689|20180528|123459988|        164|       Education|             6.89|             6|
|4210653349028689|20180519|123459988|        119|   Entertainment|            43.39|             7|


In [7]:
# Convert the PySpark DataFrame to a pandas DataFrame for the row-wise MySQL load.
# The name df_credit is reused for the pandas frame, so the conversion is
# guarded: on a second run of this cell df_credit is already a pandas frame
# (which has no toPandas method) and the cell becomes a no-op instead of
# raising AttributeError.
if hasattr(df_credit, "toPandas"):
    df_credit = df_credit.toPandas()

In [8]:
# DDL for the cdw_sapp_credit_card table.
# Each entry below is one clause of the CREATE TABLE body; the clauses are
# joined with commas (no whitespace) to form the final statement.
# TRANSACTION_ID is the primary key and BRANCH_CODE references the
# cdw_sapp_branch table.
column_clauses = (
    "`CUST_CC_NO` VARCHAR(16) NOT NULL",
    "`TIMEID` VARCHAR(8) NOT NULL",
    "`CUST_SSN` VARCHAR(9) NOT NULL",
    "`BRANCH_CODE` INT NOT NULL",
    "`TRANSACTION_TYPE` VARCHAR(50) NOT NULL",
    "`TRANSACTION_VALUE` DOUBLE NOT NULL",
    "`TRANSACTION_ID` INT NOT NULL PRIMARY KEY",
    "FOREIGN KEY (`BRANCH_CODE`) REFERENCES `cdw_sapp_branch`(`BRANCH_CODE`)",
)
creditcard_table = (
    "CREATE TABLE IF NOT EXISTS `cdw_sapp_credit_card` ("
    + ",".join(column_clauses)
    + ")"
)


In [9]:
# Create the cdw_sapp_credit_card table (if missing) and bulk-load the pandas
# frame df_credit into it inside a single transaction.
#
# conn/cursor are initialised to None *before* the try block so the finally
# clause can safely test them: if connect() itself raises, the names would
# otherwise be unbound (NameError that hides the real error) or, worse, still
# point at a connection left over from an earlier run of this cell.
conn = None
cursor = None
try:
    conn = mydbconnection.connect(database='creditcard_capstone', user=mysql_username, password=mysql_password)
    cursor = conn.cursor()

    # Create the table
    print('Creating cdw_sapp_credit_card table....')
    cursor.execute(creditcard_table)
    print("cdw_sapp_credit_card table is created....")

    # Prepare the data for batch insertion.
    # itertuples(index=False, name=None) yields one plain tuple per row without
    # building an intermediate Series for every row as iterrows() does.
    data_to_insert = list(df_credit.itertuples(index=False, name=None))

    # Name the target columns explicitly (taken from the frame itself) so each
    # value lands in the column of the same name, regardless of the physical
    # column order of a table that may already exist.
    column_list = ", ".join(f"`{column}`" for column in df_credit.columns)
    placeholders = ",".join(["%s"] * len(df_credit.columns))
    sql = (
        "INSERT INTO creditcard_capstone.cdw_sapp_credit_card "
        f"({column_list}) VALUES ({placeholders})"
    )

    # Batch insert the data
    cursor.executemany(sql, data_to_insert)
    print(f"All {len(data_to_insert)} Creditcard Records inserted")

    # Commit the transaction
    conn.commit()
    print("Creditcard data fully loaded")

except Error as e:
    # Covers connection, DDL and insert failures alike, so the message is not
    # limited to "connecting".
    print("Error while loading data into MySQL", e)

finally:
    # Ensure the connection is closed (only if it was actually opened)
    if conn is not None and conn.is_connected():
        if cursor is not None:
            cursor.close()
        conn.close()
        print("MySQL connection is closed.")

Creating cdw_sapp_credit_card table....
cdw_sapp_credit_card table is created....
All 46694 Creditcard Records inserted
Creditcard data fully loaded
MySQL connection is closed.
