In [1]:
# Initialization
import os
import sys

# Point this kernel at the local Spark install and its bundled Python libraries.
os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

os.environ.update({
    # Python interpreters used by PySpark.
    # Use /usr/bin/python2.7 for both entries if you want to run on Python 2.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    # NOTE: list every extra package you need here, comma-separated after --packages.
    # Currently loaded: spark-xml 0.6.0 and spark-avro 2.4.3 (Scala 2.11 builds).
    # Earlier variants of this setting loaded only spark-xml_2.11:0.6.0, only
    # spark-avro_2.11:2.4.0, or spark-xml_2.11:0.6.0 together with spark-avro_2.11:2.4.0.
    "PYSPARK_SUBMIT_ARGS": "--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell",
})

# Prepend both archives to the import path in one step; pyspark.zip ends up
# first and py4j second, the same order two successive insert(0, ...) calls give.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

In [2]:
# Entry point for Spark 2.x (SparkSession)
from pyspark.sql import SparkSession
# Build (or reuse) a Hive-enabled SparkSession for this notebook.
# To run on YARN, add .master("yarn") to the chain before .getOrCreate():
#   SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
from pyspark.sql.types import *

# Field names use "_" in place of spaces so they match the Parquet file format.
# Every field is nullable. Expires, Credit_Limit and Acct_Open_Date are read as
# plain strings here and converted to proper types in the cleaning step.
raw_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("User", IntegerType()),
        ("CARD_INDEX", IntegerType()),
        ("Card_Brand", StringType()),
        ("Card_Type", StringType()),
        ("Card_Number", StringType()),
        ("Expires", StringType()),
        ("CVV", StringType()),
        ("Has_Chip", StringType()),
        ("Cards_Issued", IntegerType()),
        ("Credit_Limit", StringType()),
        ("Acct_Open_Date", StringType()),
        ("Year_PIN_Last_Changed", IntegerType()),
        ("Card_on_Dark_Web", StringType()),
    ]
])


In [7]:
# Source CSV, addressed with an explicit file:// URI.
path = "file:///home/talentum/shared/Project/sd254_cards.csv"

# Apply the hand-written schema instead of inferring one; header=True makes
# Spark treat the first line as a header row rather than as data.
df_raw = (
    spark.read
    .schema(raw_schema)
    .option("header", True)
    .csv(path)
)
df_raw.show()

+----+----------+----------+---------------+----------------+-------+---+--------+------------+------------+--------------+---------------------+----------------+
|User|CARD_INDEX|Card_Brand|      Card_Type|     Card_Number|Expires|CVV|Has_Chip|Cards_Issued|Credit_Limit|Acct_Open_Date|Year_PIN_Last_Changed|Card_on_Dark_Web|
+----+----------+----------+---------------+----------------+-------+---+--------+------------+------------+--------------+---------------------+----------------+
|   0|         0|      Visa|          Debit|4344676511950444|12/2022|623|     YES|           2|      $24295|       09/2002|                 2008|              No|
|   0|         1|      Visa|          Debit|4956965974959986|12/2020|393|     YES|           2|      $21968|       04/2014|                 2014|              No|
|   0|         2|      Visa|          Debit|4582313478255491|02/2024|719|     YES|           2|      $46414|       07/2003|                 2004|              No|
|   0|         3|     

In [16]:
from pyspark.sql import functions as F

def _yes_no_to_bool(column_name, yes_value, no_value):
    """Build a boolean Column from a yes/no string column.

    Args:
        column_name: Name of the string column to convert.
        yes_value: Exact string that maps to True (comparison is case-sensitive).
        no_value: Exact string that maps to False (comparison is case-sensitive).

    Returns:
        A Column that is True for yes_value, False for no_value and null for
        anything else (including null input).
    """
    flag = F.col(column_name)
    return F.when(flag == yes_value, True).when(flag == no_value, False).otherwise(None)


def _month_year_to_date(column_name):
    """Build a date Column from a "MM/yyyy" string column (e.g. "09/2002").

    A "01/" day prefix is prepended so the value can be parsed with the
    "dd/MM/yyyy" pattern; the result is the first day of that month.

    Args:
        column_name: Name of the string column holding "MM/yyyy" values.

    Returns:
        A Column produced by to_date over the prefixed string.
    """
    return F.to_date(F.concat(F.lit("01/"), F.col(column_name)), "dd/MM/yyyy")


# Derive the cleaned frame from df_raw in a single chain, so re-running this
# cell always starts from the raw data.
df_clean = (
    df_raw
    # Strip the "$" sign and cast to double. The column is referenced with the
    # exact casing declared in the schema ("Credit_Limit"), so the lookup does
    # not depend on Spark's case-insensitive column resolution.
    .withColumn("Credit_Limit", F.regexp_replace("Credit_Limit", "\\$", "").cast("double"))
    # The two flag columns use different casing in the data ("YES"/"NO" vs "Yes"/"No").
    .withColumn("Has_Chip", _yes_no_to_bool("Has_Chip", "YES", "NO"))
    .withColumn("Card_on_Dark_Web", _yes_no_to_bool("Card_on_Dark_Web", "Yes", "No"))
    # "12/2022" -> 2022-12-01 and "09/2002" -> 2002-09-01.
    .withColumn("Expires", _month_year_to_date("Expires"))
    .withColumn("Acct_Open_Date", _month_year_to_date("Acct_Open_Date"))
)
df_clean.show()

+----+----------+----------+---------------+----------------+----------+---+--------+------------+------------+--------------+---------------------+----------------+
|User|CARD_INDEX|Card_Brand|      Card_Type|     Card_Number|   Expires|CVV|Has_Chip|Cards_Issued|Credit_Limit|Acct_Open_Date|Year_PIN_Last_Changed|Card_on_Dark_Web|
+----+----------+----------+---------------+----------------+----------+---+--------+------------+------------+--------------+---------------------+----------------+
|   0|         0|      Visa|          Debit|4344676511950444|2022-12-01|623|    true|           2|     24295.0|    2002-09-01|                 2008|           false|
|   0|         1|      Visa|          Debit|4956965974959986|2020-12-01|393|    true|           2|     21968.0|    2014-04-01|                 2014|           false|
|   0|         2|      Visa|          Debit|4582313478255491|2024-02-01|719|    true|           2|     46414.0|    2003-07-01|                 2004|           false|
|   

In [17]:
# Print df_clean's schema (column names, types, nullability) as a tree.
df_clean.printSchema()

root
 |-- User: integer (nullable = true)
 |-- CARD_INDEX: integer (nullable = true)
 |-- Card_Brand: string (nullable = true)
 |-- Card_Type: string (nullable = true)
 |-- Card_Number: string (nullable = true)
 |-- Expires: date (nullable = true)
 |-- CVV: string (nullable = true)
 |-- Has_Chip: boolean (nullable = true)
 |-- Cards_Issued: integer (nullable = true)
 |-- Credit_Limit: double (nullable = true)
 |-- Acct_Open_Date: date (nullable = true)
 |-- Year_PIN_Last_Changed: integer (nullable = true)
 |-- Card_on_Dark_Web: boolean (nullable = true)



In [18]:
# Display the leading rows of df_clean without truncating long cell values.
df_clean.show(truncate=False)

+----+----------+----------+---------------+----------------+----------+---+--------+------------+------------+--------------+---------------------+----------------+
|User|CARD_INDEX|Card_Brand|Card_Type      |Card_Number     |Expires   |CVV|Has_Chip|Cards_Issued|Credit_Limit|Acct_Open_Date|Year_PIN_Last_Changed|Card_on_Dark_Web|
+----+----------+----------+---------------+----------------+----------+---+--------+------------+------------+--------------+---------------------+----------------+
|0   |0         |Visa      |Debit          |4344676511950444|2022-12-01|623|true    |2           |24295.0     |2002-09-01    |2008                 |false           |
|0   |1         |Visa      |Debit          |4956965974959986|2020-12-01|393|true    |2           |21968.0     |2014-04-01    |2014                 |false           |
|0   |2         |Visa      |Debit          |4582313478255491|2024-02-01|719|true    |2           |46414.0     |2003-07-01    |2004                 |false           |
|0  

In [23]:
# Persist the cleaned frame as Parquet in two staging locations, replacing
# whatever a previous run left there.
# 1) Explicit file:// URI.
df_clean.write.parquet("file:///home/talentum/projectMaster/dataStaging/", mode="overwrite")
# 2) Scheme-less path, resolved against the default filesystem
#    (presumably HDFS on this setup - confirm).
df_clean.write.parquet('/user/talentum/projectMaster/dataStaging/', mode="overwrite")
