In [25]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import length
from pyspark.sql.functions import concat, lit
from pyspark.sql.functions import substring, upper, col
from pyspark.sql.functions import initcap
from pyspark.sql.functions import lower
import random
from credentials import mysql_username, mysql_password

# create the SparkSession
spark = SparkSession.builder.appName('customer-pyspark').getOrCreate()

In [52]:
df_customer = spark.read.json("json_source_data/cdw_sapp_customer.json")
df_customer.show()

+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+
|APT_NO|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|          CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME|
+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+
|   656|4210653310061055|     Natchez|United States| AHooper@example.com|   1237818|        MS|   39120|      Alec|   Hooper|2018-04-21T12:49:...|         Wm|123456100|Main Street North|
|   829|4210653310102868|Wethersfield|United States| EHolman@example.com|   1238933|        CT|   06109|      Etta|   Holman|2018-04-21T12:49:...|    Brendan|123453023|    Redwood Drive|
|   683|4210653310116272|     Huntley|United States| WDunham@exam

In [48]:
df_customer.columns
df_customer.printSchema()
df_customer.describe().show()


root
 |-- SSN: long (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- MIDDLE_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- FULL_STREET_ADDRESS: string (nullable = true)
 |-- CUST_CITY: string (nullable = true)
 |-- CUST_STATE: string (nullable = true)
 |-- CUST_COUNTRY: string (nullable = true)
 |-- CUST_ZIP: string (nullable = true)
 |-- CUST_PHONE: string (nullable = false)
 |-- CUST_EMAIL: string (nullable = true)
 |-- LAST_UPDATED: string (nullable = true)

+-------+--------------------+----------+-----------+---------+--------------------+-------------------+---------+----------+-------------+------------------+-------------+--------------------+--------------------+
|summary|                 SSN|FIRST_NAME|MIDDLE_NAME|LAST_NAME|      CREDIT_CARD_NO|FULL_STREET_ADDRESS|CUST_CITY|CUST_STATE| CUST_COUNTRY|          CUST_ZIP|   CUST_PHONE|          CUST_EMAIL|        LAST_UPDATED|
+-------+---

In [28]:
df_customer = df_customer.select('SSN', 'FIRST_NAME', 'MIDDLE_NAME', 'LAST_NAME',  \
            'CREDIT_CARD_NO', 'APT_NO', 'STREET_NAME', 'CUST_CITY', 'CUST_STATE',  \
            'CUST_COUNTRY', 'CUST_ZIP', 'CUST_PHONE', 'CUST_EMAIL', 'LAST_UPDATED')

df_customer.show()

+---------+----------+-----------+---------+----------------+------+-----------------+------------+----------+-------------+--------+----------+--------------------+--------------------+
|      SSN|FIRST_NAME|MIDDLE_NAME|LAST_NAME|  CREDIT_CARD_NO|APT_NO|      STREET_NAME|   CUST_CITY|CUST_STATE| CUST_COUNTRY|CUST_ZIP|CUST_PHONE|          CUST_EMAIL|        LAST_UPDATED|
+---------+----------+-----------+---------+----------------+------+-----------------+------------+----------+-------------+--------+----------+--------------------+--------------------+
|123456100|      Alec|         Wm|   Hooper|4210653310061055|   656|Main Street North|     Natchez|        MS|United States|   39120|   1237818| AHooper@example.com|2018-04-21T12:49:...|
|123453023|      Etta|    Brendan|   Holman|4210653310102868|   829|    Redwood Drive|Wethersfield|        CT|United States|   06109|   1238933| EHolman@example.com|2018-04-21T12:49:...|
|123454487|    Wilber|   Ezequiel|   Dunham|4210653310116272|   6

In [29]:
# separate city names with a space.
df_customer = df_customer.withColumn(
    'CUST_CITY',
    F.regexp_replace(df_customer['CUST_CITY'], "(?<=.)([A-Z])", ' $1')
)

df_customer.select('CUST_CITY').show()

+-------------+
|    CUST_CITY|
+-------------+
|      Natchez|
| Wethersfield|
|      Huntley|
|   New Berlin|
|      El Paso|
|North Olmsted|
|       Vienna|
|       Duarte|
|       Owosso|
|         Zion|
|   Youngstown|
|  Summerville|
|      El Paso|
|       Fenton|
|   Grandville|
|    Yuba City|
|   Cape Coral|
|   Brookfield|
|     Richmond|
| West Chester|
+-------------+
only showing top 20 rows



In [30]:
df_customer.select(F.countDistinct("CREDIT_CARD_NO")).show()
df_customer.select(F.countDistinct("SSN")).show()
df_customer.select(F.countDistinct("CUST_PHONE")).show()
df_customer.select(F.countDistinct("CUST_EMAIL")).show()

+------------------------------+
|count(DISTINCT CREDIT_CARD_NO)|
+------------------------------+
|                           952|
+------------------------------+

+-------------------+
|count(DISTINCT SSN)|
+-------------------+
|                952|
+-------------------+

+--------------------------+
|count(DISTINCT CUST_PHONE)|
+--------------------------+
|                       901|
+--------------------------+

+--------------------------+
|count(DISTINCT CUST_EMAIL)|
+--------------------------+
|                       928|
+--------------------------+



In [31]:
df_customer.select('STREET_NAME','APT_NO').show(10)

+-----------------+------+
|      STREET_NAME|APT_NO|
+-----------------+------+
|Main Street North|   656|
|    Redwood Drive|   829|
| 12th Street East|   683|
|Country Club Road|   253|
|   Madison Street|   301|
|   Colonial Drive|     3|
|   Belmont Avenue|    84|
|     Oxford Court|   728|
|    Forest Street|    81|
|     Court Street|   561|
+-----------------+------+
only showing top 10 rows



In [32]:
df_customer = df_customer.withColumn("FULL_STREET_ADDRESS", concat(df_customer["APT_NO"], lit(", "), df_customer["STREET_NAME"]))
df_customer.show()

+---------+----------+-----------+---------+----------------+------+-----------------+-------------+----------+-------------+--------+----------+--------------------+--------------------+--------------------+
|      SSN|FIRST_NAME|MIDDLE_NAME|LAST_NAME|  CREDIT_CARD_NO|APT_NO|      STREET_NAME|    CUST_CITY|CUST_STATE| CUST_COUNTRY|CUST_ZIP|CUST_PHONE|          CUST_EMAIL|        LAST_UPDATED| FULL_STREET_ADDRESS|
+---------+----------+-----------+---------+----------------+------+-----------------+-------------+----------+-------------+--------+----------+--------------------+--------------------+--------------------+
|123456100|      Alec|         Wm|   Hooper|4210653310061055|   656|Main Street North|      Natchez|        MS|United States|   39120|   1237818| AHooper@example.com|2018-04-21T12:49:...|656, Main Street ...|
|123453023|      Etta|    Brendan|   Holman|4210653310102868|   829|    Redwood Drive| Wethersfield|        CT|United States|   06109|   1238933| EHolman@example.co

In [33]:
df_customer = df_customer.select('SSN', 'FIRST_NAME', 'MIDDLE_NAME', 'LAST_NAME',  \
            'CREDIT_CARD_NO', 'FULL_STREET_ADDRESS', 'CUST_CITY', 'CUST_STATE',  \
            'CUST_COUNTRY', 'CUST_ZIP', 'CUST_PHONE', 'CUST_EMAIL', 'LAST_UPDATED')

df_customer.show()

+---------+----------+-----------+---------+----------------+--------------------+-------------+----------+-------------+--------+----------+--------------------+--------------------+
|      SSN|FIRST_NAME|MIDDLE_NAME|LAST_NAME|  CREDIT_CARD_NO| FULL_STREET_ADDRESS|    CUST_CITY|CUST_STATE| CUST_COUNTRY|CUST_ZIP|CUST_PHONE|          CUST_EMAIL|        LAST_UPDATED|
+---------+----------+-----------+---------+----------------+--------------------+-------------+----------+-------------+--------+----------+--------------------+--------------------+
|123456100|      Alec|         Wm|   Hooper|4210653310061055|656, Main Street ...|      Natchez|        MS|United States|   39120|   1237818| AHooper@example.com|2018-04-21T12:49:...|
|123453023|      Etta|    Brendan|   Holman|4210653310102868|  829, Redwood Drive| Wethersfield|        CT|United States|   06109|   1238933| EHolman@example.com|2018-04-21T12:49:...|
|123454487|    Wilber|   Ezequiel|   Dunham|4210653310116272|683, 12th Street ..

In [34]:
# Define the list of area codes
area_codes = ["212", "213", "214", "215", "216", "217", "218", "219", "220", "221"
              "312", "313", "314", "315", "316", "317", "318", "319", "320", "321"
              "412", "413", "414", "415", "416", "417", "418", "419", "420", "421"
              "512", "513", "514", "515", "516", "517", "518", "519", "520", "521"
              "712", "713", "714", "715", "716", "717", "718", "719", "720", "721"]  

# Define the UDF
def random_area_code(phone):
    phone = str(phone)
    if len(phone) == 7:
        return random.choice(area_codes) + phone
    elif len(phone) == 10:
        return phone
    else:
        return phone

# Register the UDF with Spark
random_area_code = udf(random_area_code, StringType())

# Apply the UDF to the dataframe
#df_customer = df_customer.withColumn("CUST_PHONE", random_area_code("CUST_PHONE"))
df_customer = df_customer.withColumn("CUST_PHONE", 
                   random_area_code("CUST_PHONE")).filter(length(df_customer['CUST_PHONE']) <= 10)

valid_phone = df_customer.filter(length(df_customer['CUST_PHONE']) == 10)


In [35]:
invalid_count = df_customer.count() - valid_phone.count()
print(invalid_count)


81


In [36]:
invalid_phone_df = df_customer.filter(length(df_customer['CUST_PHONE']) != 10)
invalid_phone_df.show()

+---------+----------+-----------+----------+----------------+--------------------+----------------+----------+-------------+--------+-------------+--------------------+--------------------+
|      SSN|FIRST_NAME|MIDDLE_NAME| LAST_NAME|  CREDIT_CARD_NO| FULL_STREET_ADDRESS|       CUST_CITY|CUST_STATE| CUST_COUNTRY|CUST_ZIP|   CUST_PHONE|          CUST_EMAIL|        LAST_UPDATED|
+---------+----------+-----------+----------+----------------+--------------------+----------------+----------+-------------+--------+-------------+--------------------+--------------------+
|123452373|    Amalia|  Heriberto|   Ballard|4210653311229354|   71, Warren Street|      Grandville|        MI|United States|   49418|5217121242113|ABallard@example.com|2018-04-21T12:49:...|
|123455343|     Patty|   Angelita|    Thomas|4210653311652836|   195, Jones Street|       Yuba City|        CA|United States|   95993|   5141239888| PThomas@example.com|2018-04-21T12:49:...|
|123454933|   Marcelo|    Gonzalo|   Emerson|

In [37]:
# convert phone numbers to this (000)000-0000 format 
df_customer = df_customer.withColumn('CUST_PHONE',
            F.format_string("(%s)%s-%s", 
                            F.substring(df_customer['CUST_PHONE'], 1, 3), 
                            F.substring(df_customer['CUST_PHONE'], 4, 3), 
                            F.substring(df_customer['CUST_PHONE'], 7, 4)))

In [38]:
# verify if phone was converted to required format
df_customer.select('CUST_PHONE').show()

+-------------+
|   CUST_PHONE|
+-------------+
|(721)123-7818|
|(212)123-8933|
|(212)124-3018|
|(215)124-3215|
|(420)124-2074|
|(414)124-2570|
|(218)123-9685|
|(221)312-1238|
|(220)124-0689|
|(320)123-5222|
|(513)124-1363|
|(316)123-6228|
|(416)123-8165|
|(321)412-1234|
|(519)124-2113|
|(514)123-9888|
|(520)124-0158|
|(513)124-1408|
|(519)123-8390|
|(213)123-5067|
+-------------+
only showing top 20 rows



In [39]:
df_customer.groupBy('CUST_PHONE').count().orderBy(F.col('count').desc()).show()


+-------------+-----+
|   CUST_PHONE|count|
+-------------+-----+
|(421)512-1237|    7|
|(321)412-1235|    6|
|(521)712-1238|    6|
|(521)712-1242|    5|
|(321)412-1236|    4|
|(221)312-1237|    4|
|(321)412-1239|    4|
|(421)512-1240|    4|
|(421)512-1243|    3|
|(221)312-1236|    3|
|(521)712-1239|    3|
|(521)712-1241|    3|
|(221)312-1238|    3|
|(521)712-1240|    3|
|(321)412-1241|    3|
|(221)312-1235|    2|
|(521)712-1237|    2|
|(221)312-1240|    2|
|(321)412-1240|    2|
|(321)412-1242|    2|
+-------------+-----+
only showing top 20 rows



In [40]:
# Group by CUST_PHONE and count the occurrences
phone_duplicates = df_customer.groupBy("CUST_PHONE").agg(F.count("CUST_PHONE").alias("count"))
phone_duplicates.show()

#Filter for phone numbers that appear more than once
#phone_duplicates = df_customer.filter(df_customer["count"] >= 1)

#phone_duplicates.show()


+-------------+-----+
|   CUST_PHONE|count|
+-------------+-----+
|(413)124-2311|    1|
|(513)123-7245|    1|
|(215)124-0724|    1|
|(317)123-9668|    1|
|(216)124-1962|    1|
|(715)123-7172|    1|
|(413)123-9063|    1|
|(321)412-1236|    2|
|(213)123-8430|    1|
|(214)124-1308|    1|
|(713)123-5511|    1|
|(718)124-1268|    1|
|(421)512-1235|    4|
|(321)412-1240|    1|
|(214)124-2942|    1|
|(518)124-2269|    1|
|(216)124-3343|    1|
|(517)124-2509|    1|
|(516)123-5913|    1|
|(420)123-7727|    1|
+-------------+-----+
only showing top 20 rows



In [41]:
# convert middle names to lower case.
df_customer = df_customer.withColumn('MIDDLE_NAME', lower(df_customer['MIDDLE_NAME']))
df_customer.select('MIDDLE_NAME').show()

+-----------+
|MIDDLE_NAME|
+-----------+
|         wm|
|    brendan|
|   ezequiel|
|      trina|
|        may|
|    ambrose|
|      larry|
|        ora|
|     tracie|
|    mitchel|
|      denny|
|   isabelle|
|      henry|
|     rickey|
|  heriberto|
|   angelita|
|   dorothea|
|  jefferson|
|     maximo|
|    arnulfo|
+-----------+
only showing top 20 rows



In [42]:
# convert first letter of first and last names to capital letters
df_customer = df_customer.withColumn('FIRST_NAME', initcap(df_customer['FIRST_NAME']))
df_customer = df_customer.withColumn('LAST_NAME', initcap(df_customer['LAST_NAME']))

In [50]:
df_customer.show()
df_customer.printSchema()


+---------+----------+-----------+---------+----------------+--------------------+-------------+----------+-------------+--------+-------------+--------------------+--------------------+
|      SSN|FIRST_NAME|MIDDLE_NAME|LAST_NAME|  CREDIT_CARD_NO| FULL_STREET_ADDRESS|    CUST_CITY|CUST_STATE| CUST_COUNTRY|CUST_ZIP|   CUST_PHONE|          CUST_EMAIL|        LAST_UPDATED|
+---------+----------+-----------+---------+----------------+--------------------+-------------+----------+-------------+--------+-------------+--------------------+--------------------+
|123456100|      Alec|         wm|   Hooper|4210653310061055|656, Main Street ...|      Natchez|        MS|United States|   39120|(418)123-7818| AHooper@example.com|2018-04-21T12:49:...|
|123453023|      Etta|    brendan|   Holman|4210653310102868|  829, Redwood Drive| Wethersfield|        CT|United States|   06109|(318)123-8933| EHolman@example.com|2018-04-21T12:49:...|
|123454487|    Wilber|   ezequiel|   Dunham|4210653310116272|683,

In [51]:
df_customer.write.format("jdbc") \
  .mode("append") \
  .option("url", "jdbc:mysql://localhost:3306/creditcard_capstone") \
  .option("dbtable", "creditcard_capstone.cdw_sapp_customer") \
  .option("user", mysql_username) \
  .option("password", mysql_password) \
  .save()

Py4JJavaError: An error occurred while calling o443.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 98.0 failed 1 times, most recent failure: Lost task 0.0 in stage 98.0 (TID 66) (LAPTOP-IQMPCJRO executor driver): java.sql.BatchUpdateException: Data truncation: Data too long for column 'CUST_COUNTRY' at row 1
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:53)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
	at com.mysql.cj.util.Util.handleNewInstance(Util.java:192)
	at com.mysql.cj.util.Util.getInstance(Util.java:167)
	at com.mysql.cj.util.Util.getInstance(Util.java:174)
	at com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:853)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:435)
	at com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:794)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:740)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:891)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1009)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1009)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: com.mysql.cj.jdbc.exceptions.MysqlDataTruncation: Data truncation: Data too long for column 'CUST_COUNTRY' at row 1
	at com.mysql.cj.jdbc.exceptions.SQLExceptionsMapping.translateException(SQLExceptionsMapping.java:104)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:953)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeUpdateInternal(ClientPreparedStatement.java:1092)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:832)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1009)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1007)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:890)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:70)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.sql.BatchUpdateException: Data truncation: Data too long for column 'CUST_COUNTRY' at row 1
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:53)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
	at com.mysql.cj.util.Util.handleNewInstance(Util.java:192)
	at com.mysql.cj.util.Util.getInstance(Util.java:167)
	at com.mysql.cj.util.Util.getInstance(Util.java:174)
	at com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:853)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:435)
	at com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:794)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:740)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:891)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1009)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1009)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: com.mysql.cj.jdbc.exceptions.MysqlDataTruncation: Data truncation: Data too long for column 'CUST_COUNTRY' at row 1
	at com.mysql.cj.jdbc.exceptions.SQLExceptionsMapping.translateException(SQLExceptionsMapping.java:104)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:953)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeUpdateInternal(ClientPreparedStatement.java:1092)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:832)
	... 17 more
