## 3. Data Scientist - Create ML models with Spark

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook. Both the local
# connector jar path and the Maven coordinate of the BigQuery connector are
# passed to the session configuration.
spark = (
    SparkSession.builder
    .appName('Spark - Data Scientist Demo')
    .config('spark.jars', '/usr/lib/spark/jars/spark-bigquery-latest.jar')
    .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.18.0")
    .getOrCreate()
)

In [None]:
# Show the Spark application id and the Scala version of the JVM side.
# A notebook cell only renders its last expression, so both values are
# gathered into one dict instead of leaving the first one undisplayed.
{
    "spark.app.id": spark.conf.get("spark.app.id"),
    "scala.version": spark.sparkContext._jvm.scala.util.Properties.versionString(),
}

In [None]:
project_id = !gcloud config list --format 'value(core.project)' 2>/dev/null
bq_raw_dataset_name = project_id[0] + '-raw'
bq_raw_dataset_name = bq_raw_dataset_name.replace('-', '_')
bq_raw_table_path = project_id[0] + ':' + bq_raw_dataset_name + '.transaction_data_train' 
bq_raw_table_path

#### Load Training Data using Spark

In [None]:
# Read the raw training table through the "bigquery" data source.
bigquery_reader = spark.read.format("bigquery").option("table", bq_raw_table_path)
df_transaction_data_train = bigquery_reader.load()

In [None]:
# Register the training DataFrame as a temporary view so it can be queried
# with Spark SQL; an existing view with the same name is replaced.
view_name = "bank_transaction_view"
df_transaction_data_train.createOrReplaceTempView(view_name)

In [None]:
# Pull every column of the registered temp view into the working DataFrame.
select_all_query = """
SELECT * 
FROM bank_transaction_view
"""
data = spark.sql(select_all_query)

In [None]:
# Remove the transactionID column before the feature columns are derived
# from the schema (presumably an identifier with no predictive value).
data = data.drop('transactionID')
# Mark the frame for caching; it is reused when the pipeline is defined and fitted.
data.cache()

#### Create a pyspark ML pipeline 

The pipeline will transform the features and train a Decision Tree classifier 

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

# Build the feature-engineering + Decision Tree pipeline from the schema of `data`.
label_col = 'isFraud'
numeric_types = ("double", "int", "bigint")

# String-typed columns are the categorical features; the label is never a feature.
categorical_cols = [field for (field, data_type) in data.dtypes
                    if data_type == "string" and field != label_col]

ohe_output_cols = [x + "_OHE" for x in categorical_cols]

# One StringIndexer / OneHotEncoder pair per categorical column, so that every
# "<col>_OHE" name handed to the assembler is really produced by a stage.
# Indexing only 'type' while assembling all string columns would make the
# assembler reference missing columns as soon as a second string column exists.
# The indexers are left unfitted: Pipeline.fit fits them on the training data.
string_indexers = [StringIndexer(inputCol=x, outputCol=x + "_Index")
                   for x in categorical_cols]

one_hot_encoders = [OneHotEncoder(inputCol=x + "_Index", outputCol=x + "_OHE")
                    for x in categorical_cols]

# Numeric feature columns, again excluding the label.
numeric_cols = [field for (field, data_type) in data.dtypes
                if data_type in numeric_types and field != label_col]

assembler_inputs = ohe_output_cols + numeric_cols

# Concatenate the encoded categoricals and the numeric columns into "features".
vec_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features")


dtc = DecisionTreeClassifier(labelCol=label_col, featuresCol="features", maxDepth=3, maxBins=12)


# Stage order matters: index -> one-hot encode -> assemble -> classify.
pipeline = Pipeline(stages=string_indexers + one_hot_encoders + [
    vec_assembler,
    dtc
])

#### Train the model 

In [None]:
# Fit all pipeline stages on the training DataFrame; the result is the fitted pipeline model.
model = pipeline.fit(data)

#### Persist the model to GCS 

In [None]:
from pyspark.ml import Pipeline, PipelineModel

# The model is stored under the "model/" prefix of the "<project>-data" bucket.
gcs_bucket = f'{project_id[0]}-data'
model_path = 'gs://' + gcs_bucket + '/model/'

# Save the fitted pipeline, replacing anything already stored at that path.
model_writer = model.write().overwrite()
model_writer.save(model_path)

#### Predict on test data 
**TODO**
* Provide path_to_predict_csv

In [None]:
# TODO: replace <gcs-path> with the location of the test CSV.
path_to_predict_csv = "<gcs-path>/transaction_data_test.csv"

# Read the CSV with a header row and let Spark infer the column types.
df_transaction_data_predict_from_csv = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv(path_to_predict_csv)
)
df_transaction_data_predict_from_csv.printSchema()

Load the saved model 

In [None]:
# Reload the persisted pipeline model from the GCS path it was saved to.
loaded_pipeline_model = PipelineModel.load(model_path)

In [None]:
# Run the reloaded pipeline over the test data read from CSV to obtain predictions.
predictions = loaded_pipeline_model.transform(df_transaction_data_predict_from_csv)

In [None]:
# Preview five rows of the scored DataFrame.
predictions.show(5)

In [None]:
# Show five example rows with the predicted value next to the isFraud label.
predictions.select("prediction", "isFraud").show(5)

### Evaluate the model

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the predictions against the isFraud label with Spark's binary evaluator.
binaryEvaluator = BinaryClassificationEvaluator(labelCol="isFraud")

# Request the area under the ROC curve via a per-call param override.
roc_metric_override = {binaryEvaluator.metricName: "areaUnderROC"}
auc = binaryEvaluator.evaluate(predictions, roc_metric_override)
print(auc)

In [None]:
# Bring the (isFraud, prediction) pairs to the driver and wrap them in a
# 2-column NumPy array: column 0 is the label, column 1 the prediction.
label_prediction_rows = predictions.select("isFraud", "prediction").collect()
tests_np = np.array(label_prediction_rows)
tests_np

In [None]:
# Score the predictions with scikit-learn. `tests_np` was collected in the
# previous cell (column 0 = isFraud, column 1 = prediction), so it is reused
# here rather than collecting the whole frame to the driver a second time.
y_true = tests_np[:, 0]
y_pred = tests_np[:, 1]

np_acc = accuracy_score(y_true, y_pred)
np_f1 = f1_score(y_true, y_pred)
np_precision = precision_score(y_true, y_pred)
np_recall = recall_score(y_true, y_pred)
# NOTE(review): this AUC is computed from the hard class predictions, not from
# scores/probabilities, so it may differ from the Spark evaluator's value — confirm
# which one should be reported.
np_auc = roc_auc_score(y_true, y_pred)

# Report every metric that was computed (accuracy and AUC were previously dropped).
print("accuracy:", np_acc)
print("f1:", np_f1)
print("precision:", np_precision)
print("recall:", np_recall)
print("auc:", np_auc)

#### Create confusion matrix

In [None]:
# confusion_matrix computes the scores; pandas is used to label and display them
from sklearn.metrics import confusion_matrix
import pandas as pd

# Column 0 of tests_np holds the labels, column 1 the predictions.
actual_labels = tests_np[:, 0]
predicted_labels = tests_np[:, 1]

# labels=[1, 0] puts the positive (fraud) class in the first row/column.
confusion_matrix_scores = confusion_matrix(actual_labels, predicted_labels, labels=[1, 0])

# wrap the scores in a labelled table (rows = actual, columns = predicted)
df = pd.DataFrame(
    confusion_matrix_scores,
    index=["Actually True", "Actually Not True"],
    columns=["Predicted True", "Predicted Not True"],
)

df.head()

In [None]:
# "<project>_annotated.<table>" with every hyphen replaced by an underscore.
bq_annotated_table_name = 'transaction_data_predictions'
bq_annotated_table_path = f"{project_id[0]}_annotated.{bq_annotated_table_name}".replace('-', '_')
bq_annotated_table_path

#### Persist predictions as an annotated dataset

In [None]:
schema_inline = predictions.schema.simpleString().replace('struct<', '').replace('>', '').replace('int', 'int64').replace('double', 'float64').replace('bigint64', 'int64').replace('vector', 'STRING')

!bq mk --table \
{bq_annotated_table_path} \
{schema_inline}

In [None]:
# Write the predictions to the annotated BigQuery table, replacing existing content.
annotated_table_ref = project_id[0]  + ':' + bq_annotated_table_path
temporary_bucket = project_id[0]  + '-data'
(
    predictions.write
    .format("bigquery")
    .option("table", annotated_table_ref)
    .option("temporaryGcsBucket", temporary_bucket)
    .mode('overwrite')
    .save()
)

In [None]:
# Dataset name to paste into the query below: "<project>_annotated", hyphens -> underscores.
annotated_dataset_name = f"{project_id[0]}_annotated".replace('-', '_')
annotated_dataset_name

**TODO** 
* Add annotated_dataset_name in the FROM clause below

In [None]:
%%bigquery
SELECT * FROM <annotated_dataset_name>.INFORMATION_SCHEMA.TABLES;

#### Join business data to enrich the dataset

**TODO** 
* Provide the path to the join csv

In [None]:
# TODO: replace <enriched_dataset_name> with the bucket that holds the join CSV.
path_to_join_csv = "gs://<enriched_dataset_name>/transaction_data_join.csv"

# Read the CSV with a header row and let Spark infer the column types.
df_transaction_data_join_from_csv = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv(path_to_join_csv)
)
df_transaction_data_join_from_csv.printSchema()

**TODO** (Challenge 2)
* Join the 2 spark dataframes (predictions & df_transaction_data_join_from_csv) on transactionID field 

In [None]:
# Enrich the predictions with the business data; rows are matched on transactionID
# and only transactions present in both frames are kept (inner join).
joined_result = predictions.join(
    df_transaction_data_join_from_csv,
    on="transactionID",
    how="inner",
)

In [None]:
# Preview five rows of the joined (enriched) result.
joined_result.show(5)

In [None]:
# Number of rows in the joined result.
joined_result.count()

#### Persist result as an enriched dataset

In [None]:
# "<project>:<project>_enriched.<table>"; hyphens are replaced with underscores
# in the dataset/table part only, the leading project id is kept as-is.
bq_enriched_table_name = 'transaction_analysis_enriched'
enriched_dataset_and_table = f"{project_id[0]}_enriched.{bq_enriched_table_name}".replace('-', '_')
bq_enriched_table_path = f"{project_id[0]}:{enriched_dataset_and_table}"
bq_enriched_table_path

In [None]:
schema_inline = joined_result.schema.simpleString().replace('struct<', '').replace('>', '').replace('int', 'int64').replace('bigint64', 'int64').replace('double', 'float64').replace('vector', 'STRING')

!bq mk --table \
{bq_enriched_table_path} \
{schema_inline}

In [None]:
# Write the enriched result to its BigQuery table, replacing existing content.
temporary_bucket = project_id[0]  + '-data'
(
    joined_result.write
    .format("bigquery")
    .option("table", bq_enriched_table_path)
    .option("temporaryGcsBucket", temporary_bucket)
    .mode('overwrite')
    .save()
)

In [None]:
# Dataset name to paste into the query below: "<project>_enriched", hyphens -> underscores.
enriched_dataset_name = f"{project_id[0]}_enriched".replace('-', '_')
enriched_dataset_name

**TODO**
* Provide the enriched_dataset_name in the FROM clause

In [None]:
%%bigquery
SELECT * FROM <enriched_dataset_name>.INFORMATION_SCHEMA.TABLES;

**TODO**
* Query the enriched table

In [None]:
%%bigquery 
<insert-code-here>
LIMIT 10

**TODO** (Optional: Challenge 3)
* Improve the ML pipeline
    * Try out different ML models [[doc]](https://spark.apache.org/docs/latest/ml-pipeline.html)
    * Explore hyperparameter tuning 
    * How would you split the data when there is class imbalance? 