In [9]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import*
from cap_secrets import username
from cap_secrets import password

In [10]:
# Step 2: obtain the Spark session used by every later cell.
# getOrCreate() hands back an already-running session when one exists,
# otherwise it starts a new one under the application name 'creditinfo1'.
spark = (
    SparkSession.builder
    .appName('creditinfo1')
    .getOrCreate()
)

In [11]:
# Load cdw_sapp_credit.json into a DataFrame; Spark infers the schema.
df_credit = spark.read.format("json").load("cdw_sapp_credit.json")

In [12]:
# Preview the first five rows of the freshly loaded data.
df_credit.show(n=5)

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|        114|4210653349028689|123459988| 14|    2|             1|       Education|             78.9|2018|
|         35|4210653349028689|123459988| 20|    3|             2|   Entertainment|            14.24|2018|
|        160|4210653349028689|123459988|  8|    7|             3|         Grocery|             56.7|2018|
|        114|4210653349028689|123459988| 19|    4|             4|   Entertainment|            59.73|2018|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
only showing top 5 rows



In [13]:
# Turn the three date parts into strings:
#   * YEAR is cast to string as-is,
#   * MONTH and DAY are left-padded with '0' to a width of two characters,
#     so single-digit values become e.g. '02' or '08'.
df_credit = (
    df_credit
    .withColumn('YEAR', col('YEAR').cast('string'))
    .withColumn('MONTH', lpad(col('MONTH'), 2, '0'))
    .withColumn('DAY', lpad(col('DAY'), 2, '0'))
)
# Confirm that DAY, MONTH and YEAR are now reported as string columns.
df_credit.printSchema()

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: string (nullable = true)



In [14]:
# Add TIMEID as the string concatenation YEAR + MONTH + DAY
# (e.g. '2018' + '02' + '14' -> '20180214').
# concat() yields NULL for a row if any of the three parts is NULL.
df_credit = df_credit.withColumn(
    'TIMEID',
    concat(col('YEAR'), col('MONTH'), col('DAY')),
)


In [15]:
# Preview the first five rows, now including the TIMEID column.
df_credit.show(n=5)

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+--------+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|  TIMEID|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+--------+
|        114|4210653349028689|123459988| 14|   02|             1|       Education|             78.9|2018|20180214|
|         35|4210653349028689|123459988| 20|   03|             2|   Entertainment|            14.24|2018|20180320|
|        160|4210653349028689|123459988| 08|   07|             3|         Grocery|             56.7|2018|20180708|
|        114|4210653349028689|123459988| 19|   04|             4|   Entertainment|            59.73|2018|20180419|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|20181010|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+--------+

In [16]:
# Remove the separate DAY / MONTH / YEAR columns; TIMEID remains as the
# only date column. Then preview the first five rows of the result.
df_credit = df_credit.drop(
    'DAY',
    'MONTH',
    'YEAR',
)
df_credit.show(n=5)

+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|  TIMEID|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|        114|4210653349028689|123459988|             1|       Education|             78.9|20180214|
|         35|4210653349028689|123459988|             2|   Entertainment|            14.24|20180320|
|        160|4210653349028689|123459988|             3|         Grocery|             56.7|20180708|
|        114|4210653349028689|123459988|             4|   Entertainment|            59.73|20180419|
|         93|4210653349028689|123459988|             5|             Gas|             3.59|20181010|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
only showing top 5 rows



In [17]:
# Rename CREDIT_CARD_NO to CUST_CC_NO.
# NOTE(review): presumably this matches the target table's column name — confirm.
df_credit = df_credit.withColumnRenamed(
    existing='CREDIT_CARD_NO',
    new='CUST_CC_NO',
)

In [18]:
# Print the final schema to verify the column names and types before the
# DataFrame is written out.
df_credit.printSchema()

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CUST_CC_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- TIMEID: string (nullable = true)



In [8]:
# Write the transformed DataFrame to the CDW_SAPP_CREDIT_CARD table of the
# local MySQL database `creditcard_capstone` over JDBC. The user name and
# password are the values imported from cap_secrets.
# NOTE(review): mode "append" adds the rows again on every execution, so
# re-running this cell will likely duplicate data unless the table enforces
# a unique key — confirm before re-running.
(
    df_credit.write
    .format("jdbc")
    .mode("append")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="CDW_SAPP_CREDIT_CARD",
        user=username,
        password=password,
    )
    .save()
)

In [6]:
# Read CDW_SAPP_CREDIT_CARD back from MySQL over JDBC, using the same
# connection settings as the write, and preview five rows as a sanity check.
df_new = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="CDW_SAPP_CREDIT_CARD",
        user=username,
        password=password,
    )
    .load()
)
df_new.show(n=5)

+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|BRANCH_CODE|      CUST_CC_NO| CUST_SSN|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|  TIMEID|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|        156|4210653312478046|123455692|         22562|         Grocery|            91.08|20180813|
|        114|4210653349028689|123459988|             1|       Education|             78.9|20180214|
|        180|4210653342242023|123451310|         45069|           Bills|            77.79|20180315|
|        114|4210653312478046|123455692|         22563|           Bills|             22.2|20180626|
|        107|4210653342242023|123451310|         45070|      Healthcare|            20.47|20180419|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
only showing top 5 rows

