## Bank Demo—Marketing Analysis

#### Starting Point: SparkSession

In [58]:
import org.apache.spark.sql.SparkSession

In [76]:
// Entry point for all DataFrame / SQL work in this notebook.
// The session is configured step by step, then obtained via getOrCreate().
val spark = {
  val sessionBuilder = SparkSession.builder()
    .appName("Banking")
    .master("local")
    .config("spark.sql.warehouse.dir", "tmp/sparksql")

  sessionBuilder.getOrCreate()
}

#### Creating DataFrames

In [77]:
 // Load the bank marketing CSV: the first line is a header row, fields are
 // separated by ';', and column types are inferred from the data.
 val df = spark.read
   .options(Map(
     "header"      -> "true",
     "delimiter"   -> ";",
     "inferSchema" -> "true"
   ))
   .csv("data/bank-full.csv")

In [78]:
// Print the first five rows of the DataFrame to stdout as a quick sanity check
df.show(numRows = 5)

+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management|married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician| single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur|married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar|married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
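| 33|     unknown| single|  unknown|     no|      1|     no|  no|unknown|  5|  may|     198|       1|   -1|       0| unknown| no|
+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
only showing top 5 rows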

In [79]:
// Render the inferred schema (column name, type, nullability) as a tree on stdout
println(df.schema.treeString)

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



#### This import is needed to use the $-notation

In [80]:
import spark.implicits._

In [81]:
// Marketing success rate = number of clients who subscribed (y == "yes")
// divided by the total number of records.
val success_rate = {
  val subscribed = df.where($"y" === "yes").count()
  val contacted  = df.count()
  subscribed.toDouble / contacted.toDouble
}

print(s"Marketing Success Rate: $success_rate")


Marketing Success Rate: 0.11698480458295547

In [91]:
// Maximum, minimum and mean age of the targeted customers.
// The median is not computed here; it is obtained with SQL PERCENTILE(age, 0.50)
// once the DataFrame has been registered as a temporary view.

df.agg(max($"age"), min($"age"), mean($"age")).show()

+--------+--------+-----------------+
|max(age)|min(age)|         avg(age)|
+--------+--------+-----------------+
|      95|      18|40.93621021432837|
+--------+--------+-----------------+



#### Register the DataFrame as a SQL temporary view

In [86]:
// Make df queryable from spark.sql(...) under the table name "bank"
df.createOrReplaceTempView("bank")

In [88]:
// Median age = 50th percentile of the age column, computed through Spark SQL
val medianAgeSql = "SELECT PERCENTILE(age, 0.50) FROM bank"
spark.sql(medianAgeSql).show()

+----------------------------------------+
|percentile(age, CAST(0.50 AS DOUBLE), 1)|
+----------------------------------------+
|                                    39.0|
+----------------------------------------+



In [114]:
// Max, min, mean and median age gathered in a single SQL query
val ageSummarySql = "select max(age), min(age), avg(age) , percentile(age, 0.50) from bank"
spark.sql(ageSummarySql).show()

+--------+--------+-----------------+----------------------------------------+
|max(age)|min(age)|         avg(age)|percentile(age, CAST(0.50 AS DOUBLE), 1)|
+--------+--------+-----------------+----------------------------------------+
|      95|      18|40.93621021432837|                                    39.0|
+--------+--------+-----------------+----------------------------------------+



In [111]:
// Gauge client quality through account balance: mean balance next to the
// median (50th percentile) balance.

val balanceSummarySql = "SELECT AVG(balance) , PERCENTILE(balance, 0.50) FROM bank"
spark.sql(balanceSummarySql).show()

+------------------+--------------------------------------------+
|      avg(balance)|percentile(balance, CAST(0.50 AS DOUBLE), 1)|
+------------------+--------------------------------------------+
|1362.2720576850766|                                       448.0|
+------------------+--------------------------------------------+



In [90]:
// Does age matter for subscribing to the deposit?
// Compare the average age of subscribers (y = yes) and non-subscribers (y = no).

df.groupBy($"y").avg("age").show()

+---+------------------+
|  y|          avg(age)|
+---+------------------+
| no| 40.83898602274435|
|yes|41.670069956513515|
+---+------------------+



In [92]:
// Number of records with a non-null marital value for each subscription outcome.
// NOTE(review): this does not split the counts by marital status, so on its own it
// cannot show whether marital status mattered; that needs a grouping on
// ("marital", "y").

df.groupBy("y").agg("marital" -> "count").show()

+---+--------------+
|  y|count(marital)|
+---+--------------+
| no|         39922|
|yes|          5289|
+---+--------------+



In [93]:
// Check whether marital status mattered for subscription to the deposit scheme:
// record count for every (marital status, outcome) combination.
// Age is not part of this grouping.

df.groupBy($"marital", $"y").count().show()

+--------+---+-----+
| marital|  y|count|
+--------+---+-----+
|divorced|yes|  622|
|  single| no|10878|
|  single|yes| 1912|
|divorced| no| 4585|
| married|yes| 2755|
| married| no|24459|
+--------+---+-----+



In [98]:
// Record count for every (age, outcome) pair; show() prints only the first 20 rows
df.groupBy($"age", $"y").count().show()

+---+---+-----+
|age|  y|count|
+---+---+-----+
| 20| no|   35|
| 78| no|   16|
| 56|yes|   68|
| 28|yes|  162|
| 29|yes|  171|
| 71| no|   29|
| 86|yes|    4|
| 57| no|  750|
| 79|yes|   10|
| 22|yes|   40|
| 42| no| 1131|
| 31|yes|  206|
| 59|yes|   88|
| 87|yes|    3|
| 25| no|  414|
| 34|yes|  198|
| 23|yes|   44|
| 63| no|   47|
| 24| no|  234|
| 64| no|   39|
+---+---+-----+
only showing top 20 rows



In [102]:
// Feature engineering: bucket the raw age into a categorical "age_cat" column
// so the effect of age on the campaign is easier to read.
//   age < 25  -> "young"
//   age > 60  -> "old"
//   otherwise -> "mid_age" (this includes ages 25 and 60 themselves)

val df_new = df.withColumn(
  "age_cat",
  when($"age" < 25, "young")
    .when($"age" > 60, "old")
    .otherwise("mid_age")
)

In [116]:
// Subscription outcome counts within each engineered age category
df_new.groupBy($"age_cat", $"y").count().show()

+-------+---+-----+
|age_cat|  y|count|
+-------+---+-----+
|mid_age| no|38634|
|mid_age|yes| 4580|
|  young| no|  602|
|    old|yes|  502|
|    old| no|  686|
|  young|yes|  207|
+-------+---+-----+

