## 4. Data Scientist - Create ML models with Spark

In [120]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

In [121]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('Spark - Data Scientist Demo') \
.config('spark.jars', '/usr/lib/spark/jars/spark-bigquery-latest.jar') \
.config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.18.0") \
.getOrCreate()

 # --properties spark:spark.jars=gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.18.0.jar

In [122]:
spark.conf.get("spark.app.id")
spark.sparkContext._jvm.scala.util.Properties.versionString()

'version 2.12.12'

In [123]:
#!ls /usr/lib/spark/jars/

Create a Spark DataFrame from hive table

In [124]:
table = "datalake-vol2:datalake_vol2_raw.banking_marketing_train"
df_banking_marketing_train = spark.read \
.format("bigquery") \
.option("table", table) \
.load()

In [125]:
df_banking_marketing_train.createOrReplaceTempView("bank_marketing_train_view")

In [126]:
data = spark.sql("""
SELECT * 
FROM bank_marketing_train_view
""").cache()

Cache the DataFrame in memory 

In [127]:
data.cache()

DataFrame[call_id: string, Age: bigint, Job: string, MaritalStatus: string, Education: string, Default: boolean, Balance: bigint, Housing: boolean, Loan: boolean, Contact: string, Day: bigint, Month: string, Duration: bigint, Campaign: bigint, PDays: bigint, Previous: bigint, POutcome: string, Deposit: bigint]

In [128]:
data.columns

['call_id',
 'Age',
 'Job',
 'MaritalStatus',
 'Education',
 'Default',
 'Balance',
 'Housing',
 'Loan',
 'Contact',
 'Day',
 'Month',
 'Duration',
 'Campaign',
 'PDays',
 'Previous',
 'POutcome',
 'Deposit']

In [129]:
data.groupBy("Deposit").count().show()

+-------+-----+
|Deposit|count|
+-------+-----+
|      1|36009|
|      2| 4771|
+-------+-----+



### Split training and test data

In [130]:
(train_data, test_data) = data.randomSplit([0.7, 0.3], seed=42)

In [131]:
train_data.groupBy("Deposit").count().show()

+-------+-----+
|Deposit|count|
+-------+-----+
|      1|25254|
|      2| 3370|
+-------+-----+



In [132]:
train_data.count()

28624

In [133]:
test_data.count()

12156

## Create Spark ML Pipeline

Train a RandomForestClassifier model

In [155]:
train_data = train_data.drop('call_id')
train_data
#test_data = test_data.drop('call_id')

DataFrame[Age: bigint, Job: string, MaritalStatus: string, Education: string, Default: boolean, Balance: bigint, Housing: boolean, Loan: boolean, Contact: string, Day: bigint, Month: string, Duration: bigint, Campaign: bigint, PDays: bigint, Previous: bigint, POutcome: string, Deposit: bigint]

In [156]:

spark.version   
#predictions.select("call_id").show(5)

'3.0.1'

In [157]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.classification import RandomForestClassifier


categorical_cols = [field for (field, data_type) in train_data.dtypes 
                    if ((data_type == "string") & (field != 'Deposit'))]

index_output_cols = [x + "_Index" for x in categorical_cols]

ohe_output_cols = [x + "_OHE" for x in categorical_cols]

categorical_string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=index_output_cols,
    handleInvalid="skip")

ohe_encoder = OneHotEncoder(
    inputCols=index_output_cols,
    outputCols=ohe_output_cols)

numeric_cols = [field for (field, data_type) in train_data.dtypes 
                if (((data_type == "double") | (data_type == "int") | (data_type == "bigint"))
                  & (field != 'Deposit'))]

assembler_inputs = ohe_output_cols + numeric_cols

vec_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features")

label_string_indexer = StringIndexer(). \
  setInputCol("Deposit"). \
  setOutputCol("label")

# Train a RandomForestClassifier model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[
    categorical_string_indexer,
    ohe_encoder,
    vec_assembler,
    label_string_indexer,
    rf
])

# Train model on training data
pipeline_model = pipeline.fit(train_data)

# Make predictions on test.
tests = pipeline_model.transform(test_data)

# Select example rows to display.
tests.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(39,[1,11,14,16,2...|
|       0.0|  1.0|(39,[1,13,19,29,3...|
|       0.0|  0.0|(39,[3,13,17,18,2...|
|       0.0|  0.0|(39,[1,11,13,17,1...|
|       0.0|  0.0|(39,[2,11,14,16,1...|
+----------+-----+--------------------+
only showing top 5 rows



In [158]:
test_data.show(5)

+--------------------+---+----------+-------------+---------+-------+-------+-------+-----+---------+---+-----+--------+--------+-----+--------+--------+-------+
|             call_id|Age|       Job|MaritalStatus|Education|Default|Balance|Housing| Loan|  Contact|Day|Month|Duration|Campaign|PDays|Previous|POutcome|Deposit|
+--------------------+---+----------+-------------+---------+-------+-------+-------+-----+---------+---+-----+--------+--------+-----+--------+--------+-------+
|000458ba-5ab1-4f7...| 54|management|      married| tertiary|  false|   7249|   true| true| cellular|  4|  feb|     102|       2|   77|       1| failure|      1|
|000d9fb1-7eeb-46a...| 58|management|     divorced|secondary|  false|   3161|  false|false|telephone| 30|  jul|     542|       2|   -1|       0| unknown|      2|
|0010dfae-c527-462...| 53|    admin.|     divorced|secondary|  false|    315|   true|false|  unknown|  5|  may|     181|       2|   -1|       0| unknown|      1|
|001218b4-4a10-446...| 31|ma

As the dataset is imbalanced a good metric is AUC: Area Under the ROC Curve. [Learn more about AUC here.](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#AUC)

In [159]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

binaryEvaluator = BinaryClassificationEvaluator(labelCol="label")

auc = binaryEvaluator.evaluate(tests, {binaryEvaluator.metricName: "areaUnderROC"})
print(auc)

0.8851243267494067


In [160]:
tests_np = np.array((tests.select("label","prediction").collect()))
tests_np

array([[0., 0.],
       [1., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [161]:
tests_np = np.array((tests.select("label","prediction").collect()))

np_acc = accuracy_score(tests_np[:,0], tests_np[:,1])
np_f1 = f1_score(tests_np[:,0], tests_np[:,1])
np_precision = precision_score(tests_np[:,0], tests_np[:,1])
np_recall = recall_score(tests_np[:,0], tests_np[:,1])
np_auc = roc_auc_score(tests_np[:,0], tests_np[:,1])

print("f1:", np_f1)
print("precision:", np_precision)
print("recall:", np_recall)

f1: 0.009936124911284601
precision: 0.875
recall: 0.004996431120628123


In [162]:
# import package that will generate the confusion matrix scores
from sklearn.metrics import confusion_matrix
# import packages that will help display the scores
import pandas as pd

confusion_matrix_scores = confusion_matrix(tests_np[:,0], 
                                           tests_np[:,1], 
                                           labels=[1, 0])

# display scores as a heatmap
df = pd.DataFrame(confusion_matrix_scores, 
                  columns = ["Predicted True", "Predicted Not True"],
                  index = ["Actually True", "Actually Not True"])


df.head()

Unnamed: 0,Predicted True,Predicted Not True
Actually True,7,1394
Actually Not True,1,10754


### Save model_pipeline

In [163]:
from pyspark.ml import Pipeline, PipelineModel

model_path = 'gs://datalake-vol2-data/'

pipeline_model.write().overwrite().save(model_path)

In [164]:
loaded_pipeline_model = PipelineModel.load(model_path)

In [165]:
# Make predictions using loaded model

tests = loaded_pipeline_model.transform(test_data)

tests.show(5)

+--------------------+---+----------+-------------+---------+-------+-------+-------+-----+---------+---+-----+--------+--------+-----+--------+--------+-------+-----------+---------+-------------------+--------------+-------------+---------------+-------------+--------------+-------------+-------------+-----------------+--------------+--------------------+-----+--------------------+--------------------+----------+
|             call_id|Age|       Job|MaritalStatus|Education|Default|Balance|Housing| Loan|  Contact|Day|Month|Duration|Campaign|PDays|Previous|POutcome|Deposit|Month_Index|Job_Index|MaritalStatus_Index|POutcome_Index|Contact_Index|Education_Index| POutcome_OHE|       Job_OHE|  Contact_OHE|Education_OHE|MaritalStatus_OHE|     Month_OHE|            features|label|       rawPrediction|         probability|prediction|
+--------------------+---+----------+-------------+---------+-------+-------+-------+-----+---------+---+-----+--------+--------+-----+--------+--------+-------+-

In [169]:
bq_table_path = 'datalake_vol2_annotated.bank_test'
schema_inline = tests.schema.simpleString().replace('struct<', '').replace('>', '').replace('int', 'int64').replace('bigint64', 'int64').replace('double', 'numeric').replace('vector', 'STRING')

!bq mk --table \
{bq_table_path} \
{schema_inline}

Table 'datalake-vol2:datalake_vol2_annotated.bank_test' successfully created.


In [174]:
tests.write \
.format("bigquery") \
.option("table", 'datalake-vol2:datalake_vol2_annotated.bank_test') \
.option("temporaryGcsBucket", "datalake-vol2-data") \
.mode('overwrite') \
.save()

In [175]:
%%bigquery
SELECT * FROM datalake_vol2_annotated.INFORMATION_SCHEMA.TABLES;

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 381.40query/s]                          
Downloading: 100%|██████████| 3/3 [00:01<00:00,  2.09rows/s]


Unnamed: 0,table_catalog,table_schema,table_name,table_type,is_insertable_into,is_typed,creation_time
0,datalake-vol2,datalake_vol2_annotated,banking_test,BASE TABLE,YES,NO,2021-01-14 12:17:50.130000+00:00
1,datalake-vol2,datalake_vol2_annotated,banking_marketing_predict,BASE TABLE,YES,NO,2021-01-11 16:47:22.859000+00:00
2,datalake-vol2,datalake_vol2_annotated,bank_test,BASE TABLE,YES,NO,2021-01-14 12:17:25.430000+00:00


### Predict Results

In [201]:
path_to_predict_csv = "gs://datalake-vol2-data/banking_predict_set.csv"
df_bank_predict_from_csv = spark \
.read \
.option("inferSchema" , "true") \
.option("header" , "true") \
.csv(path_to_predict_csv)
df_bank_predict_from_csv.printSchema()

root
 |-- call_id: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Job: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Default: boolean (nullable = true)
 |-- Balance: integer (nullable = true)
 |-- Housing: boolean (nullable = true)
 |-- Loan: boolean (nullable = true)
 |-- Contact: string (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- Campaign: integer (nullable = true)
 |-- PDays: integer (nullable = true)
 |-- Previous: integer (nullable = true)
 |-- POutcome: string (nullable = true)
 |-- Deposit: integer (nullable = true)



In [202]:
# Make predictions on test.
predictions = loaded_pipeline_model.transform(df_bank_predict_from_csv)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(39,[0,11,15,16,2...|
|       0.0|  0.0|(39,[3,12,13,16,2...|
|       0.0|  0.0|(39,[0,12,13,16,2...|
|       0.0|  0.0|(39,[5,11,15,16,2...|
|       0.0|  0.0|(39,[1,12,14,16,2...|
+----------+-----+--------------------+
only showing top 5 rows



In [203]:
predictions.show(5)

+--------------------+---+-----------+-------------+---------+-------+-------+-------+-----+--------+---+-----+--------+--------+-----+--------+--------+-------+-----------+---------+-------------------+--------------+-------------+---------------+-------------+--------------+-------------+-------------+-----------------+--------------+--------------------+-----+--------------------+--------------------+----------+
|             call_id|Age|        Job|MaritalStatus|Education|Default|Balance|Housing| Loan| Contact|Day|Month|Duration|Campaign|PDays|Previous|POutcome|Deposit|Month_Index|Job_Index|MaritalStatus_Index|POutcome_Index|Contact_Index|Education_Index| POutcome_OHE|       Job_OHE|  Contact_OHE|Education_OHE|MaritalStatus_OHE|     Month_OHE|            features|label|       rawPrediction|         probability|prediction|
+--------------------+---+-----------+-------------+---------+-------+-------+-------+-----+--------+---+-----+--------+--------+-----+--------+--------+-------+-

### Join Data

In [204]:
path_to_join_csv = "gs://datalake-vol2-data/banking_join.csv"
df_bank_join_from_csv = spark \
.read \
.option("inferSchema" , "true") \
.option("header" , "true") \
.csv(path_to_join_csv)
df_bank_join_from_csv.printSchema()

root
 |-- call_id: string (nullable = true)
 |-- account_number: integer (nullable = true)



In [209]:
predictions = predictions.toPandas()
df_bank_join_from_csv = df_bank_join_from_csv.toPandas()
result = pd.merge(predictions, df_bank_join_from_csv, on=['call_id','call_id'])

In [210]:
result

Unnamed: 0,call_id,Age,Job,MaritalStatus,Education,Default,Balance,Housing,Loan,Contact,...,Contact_OHE,Education_OHE,MaritalStatus_OHE,Month_OHE,features,label,rawPrediction,probability,prediction,account_number
0,2d33a1aa-fcb2-4745-a4b7-e1450a975988,32,blue-collar,married,primary,False,-56,True,True,cellular,...,"(1.0, 0.0)","(0.0, 0.0, 1.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[16.219303142520648, 3.7806968574793562]","[0.8109651571260322, 0.18903484287396777]",0.0,54609395
1,5b85bc40-b1fb-4eca-a42a-d0743b480ac7,32,admin.,single,secondary,False,103,True,False,cellular,...,"(1.0, 0.0)","(1.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[16.820702893966505, 3.1792971060335025]","[0.841035144698325, 0.15896485530167506]",0.0,61634206
2,8692a0b6-b4bd-470f-8cfa-65595ceb43ad,29,blue-collar,single,secondary,False,314,True,False,cellular,...,"(1.0, 0.0)","(1.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[17.000265662116227, 2.9997343378837775]","[0.8500132831058113, 0.14998671689418885]",0.0,23075661
3,74c1a1a5-eaee-40a8-a64e-7be48c5457f1,60,retired,married,primary,False,0,True,False,cellular,...,"(1.0, 0.0)","(0.0, 0.0, 1.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.0,"[17.141563878271093, 2.8584361217289147]","[0.8570781939135543, 0.1429218060864457]",0.0,25150644
4,d74e8230-30eb-4f54-a29d-4b6a03239acd,33,management,single,tertiary,False,1423,True,False,cellular,...,"(1.0, 0.0)","(0.0, 1.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[13.753890287719315, 6.246109712280687]","[0.6876945143859657, 0.3123054856140343]",0.0,57323608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4554,2df245d2-8152-4141-92c9-89f9c79701a5,19,student,single,secondary,False,88,False,False,cellular,...,"(1.0, 0.0)","(1.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[15.748316175079738, 4.251683824920261]","[0.7874158087539869, 0.21258419124601305]",0.0,91123689
4555,c49a76d0-94fd-4451-9b30-0810533aeff5,65,retired,married,primary,False,308,False,False,cellular,...,"(1.0, 0.0)","(0.0, 0.0, 1.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1.0,"[15.512640992203687, 4.487359007796313]","[0.7756320496101844, 0.22436795038981563]",0.0,42013046
4556,3c104f26-7702-4e06-b01f-e4aba3b2df13,42,management,single,tertiary,False,0,False,False,cellular,...,"(1.0, 0.0)","(0.0, 1.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[17.468086461920187, 2.5319135380798112]","[0.8734043230960093, 0.12659567690399057]",0.0,57833130
4557,c165e67f-f867-4268-9772-0cccef472781,45,management,married,tertiary,False,786,False,False,cellular,...,"(1.0, 0.0)","(0.0, 1.0, 0.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"[16.799695936124497, 3.200304063875503]","[0.8399847968062248, 0.16001520319377516]",0.0,28880299


### Store result to Enriched Zone

In [216]:
result = spark.createDataFrame(result)

In [219]:
bq_result_table_path = 'datalake-vol2:datalake_vol2_enriched.bank_result'
schema_inline = result.schema.simpleString().replace('struct<', '').replace('>', '').replace('int', 'int64').replace('bigint64', 'int64').replace('double', 'numeric').replace('vector', 'STRING')

!bq mk --table \
{bq_result_table_path} \
{schema_inline}

Table 'datalake-vol2:datalake_vol2_enriched.bank_result' successfully created.


In [220]:
result.write \
.format("bigquery") \
.option("table", bq_result_table_path) \
.option("temporaryGcsBucket", "datalake-vol2-data") \
.mode('overwrite') \
.save()

In [222]:
%%bigquery
SELECT * FROM datalake_vol2_enriched.INFORMATION_SCHEMA.TABLES;

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 360.61query/s]                          
Downloading: 100%|██████████| 1/1 [00:02<00:00,  2.61s/rows]


Unnamed: 0,table_catalog,table_schema,table_name,table_type,is_insertable_into,is_typed,creation_time
0,datalake-vol2,datalake_vol2_enriched,bank_result,BASE TABLE,YES,NO,2021-01-14 12:51:00.955000+00:00
