## 5. Data Scientist - Continued

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, with the BigQuery
# connector configured on the session.
# NOTE(review): the connector is requested twice - once as a jar from GCS
# ('spark.jars') and once as a Maven package pinned to 0.18.0 for Scala 2.12
# ('spark.jars.packages'). Confirm both are really needed and compatible.
spark = (
    SparkSession.builder
    .appName('Spark - Data Scientist Demo')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest.jar')
    .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.18.0")
    .getOrCreate()
)

In [None]:
# A notebook cell only echoes its *last* expression, so the application id on
# the first line was silently discarded. Print both values explicitly.
print("Spark application id:", spark.conf.get("spark.app.id"))
# Scala version of the running JVM, read through the py4j gateway.
print("Scala version:", spark.sparkContext._jvm.scala.util.Properties.versionString())

#### Refactor Table - add a date (day) field
Transform the step-column [1,742] to days [1,31]


In [None]:
project_id = !gcloud config list --format 'value(core.project)' 2>/dev/null
bq_enriched_dataset_name = project_id[0] + '-enriched'
bq_enriched_dataset_name = bq_enriched_dataset_name.replace('-', '_')
bq_enriched_table_path = project_id[0] + ':' + bq_enriched_dataset_name + '.transaction_analysis_enriched' 
bq_enriched_table_path

In [None]:
# Read the enriched transaction table from BigQuery into a Spark DataFrame.
data = (
    spark.read
    .format("bigquery")
    .option("table", bq_enriched_table_path)
    .load()
)

In [None]:
# Remove the columns that are not needed for the analysis below
# (presumably artefacts of an earlier ML pipeline step - TODO confirm).
data = data.drop('type_OHE','features','rawPrediction','probability')
# Mark the DataFrame for caching so later actions can reuse it; as the last
# expression of the cell, this also echoes the DataFrame's summary.
data.cache()

In [None]:
# Print the first five rows as a quick sanity check of the loaded data.
data.show(5)

In [None]:
# Collect the full Spark DataFrame into a pandas DataFrame on the driver;
# the whole table has to fit into the driver's memory.
pandas_df = data.toPandas()

**TODO** (Challenge 3)
* Convert the column steps into days
* Each step corresponds to one hour. The dataset was created over the span of a month. There are 742 steps which should be converted to 31 days

In [None]:
# Challenge 3 exercise: replace <enter-code-here> with an expression that maps
# each hourly step (1..742) to its day of the month (1..31).
pandas_df['days'] = <enter-code-here>

In [None]:
# Sanity check: the distinct day values, in ascending order, should be 1 through 31.
sorted(pandas_df['days'].unique().tolist())

In [None]:
# Store the day numbers as integers (the table created later is
# range-partitioned on this column - presumably why an integer type is needed).
pandas_df['days'] = pandas_df['days'].astype(int)

In [None]:
# Echo the DataFrame to inspect the result, including the new 'days' column.
pandas_df

#### Do some further analysis, including visualization

In [None]:
# Bar chart: how many transactions there are of each transaction type.
ax = pandas_df['type'].value_counts().plot.bar(title="Number per Typ")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Frequency")

In [None]:
# Bar chart: number of transactions per day; value_counts() orders the bars
# from the busiest day to the quietest one.
ax = pandas_df['days'].value_counts().plot.bar(title="Busy day in a month")
ax.set_xlabel("Day")
ax.set_ylabel("Frequency")

In [None]:
# Pie chart: share of the total transferred amount per transaction type.
# Only 'amount' is plotted, so select it *before* summing. Summing every
# column is wasted work, and on recent pandas versions non-numeric columns are
# either concatenated or raise during the aggregation.
pandas_df.groupby('type')[['amount']].sum().plot(kind='pie', y='amount')

#### Store the table in a performance-optimized way

In [None]:
# Convert the pandas DataFrame (now including 'days') back into a Spark
# DataFrame so it can be written to BigQuery.
df = spark.createDataFrame(pandas_df)

In [None]:
# Name of the partitioned/clustered copy of the transaction table.
bq_optimized_table_name = 'transaction_data_optimized'
# "<dataset>.<table>" reference, where the dataset is "<project>_enriched"
# with every hyphen turned into an underscore.
bq_optimized_table_path = '{}_enriched.{}'.format(project_id[0], bq_optimized_table_name).replace('-', '_')
bq_optimized_table_path

In [None]:
# Build the inline "name:type,name:type,..." schema string handed to `bq mk`.
#
# The types are translated field by field instead of with substring replaces on
# the whole schema string: blanket replaces also rewrite column *names* that
# happen to contain "int", "double" or "vector", and turn smallint/tinyint into
# the invalid "smallint64"/"tinyint64".
# Spark SQL type name -> type name used in the bq schema; any type not listed
# here is passed through unchanged.
spark_to_bq_type = {
    'tinyint': 'int64',
    'smallint': 'int64',
    'int': 'int64',
    'bigint': 'int64',
    'double': 'float64',
    'vector': 'STRING',
}
schema_inline = ','.join(
    '{}:{}'.format(
        field.name,
        spark_to_bq_type.get(field.dataType.simpleString(), field.dataType.simpleString()),
    )
    for field in df.schema.fields
)

In [None]:
!bq mk  \
--range_partitioning=days,1,31,1 \
--clustering_fields=days \
{bq_optimized_table_path} \
{schema_inline}

In [None]:
# Write the DataFrame into the optimized BigQuery table, replacing any existing
# contents; the "<project>-data" GCS bucket is used as temporary staging area.
(
    df.write
    .format("bigquery")
    .option("table", project_id[0]  + ':' + bq_optimized_table_path)
    .option("temporaryGcsBucket", project_id[0]  + '-data')
    .mode('overwrite')
    .save()
)

#### Measure performance

In [None]:
# Fully qualified "<project>:<dataset>.<table>" reference to the optimized table.
bq_partition_table_path = '{}:{}.transaction_data_optimized'.format(project_id[0], bq_enriched_dataset_name)
bq_partition_table_path

In [None]:
# Read the optimized table, restricted through the connector's "filter" option
# to days 5 (inclusive) up to 25 (exclusive). The filter is on the partitioning
# column, presumably so that only the matching partitions are scanned.
partitionset = (
    spark.read
    .format("bigquery")
    .option("table", bq_partition_table_path)
    .option("filter", 'days >= 5 AND days < 25')
    .load()
)

In [None]:
# Keep only the two columns the aggregation below needs.
partitionset = partitionset.select("days", "amount")

In [None]:
# Total transaction amount per day, exposed as column 'total_amount'.
aggregation_optimized = partitionset.groupBy('days').agg(F.sum('amount').alias('total_amount'))

In [None]:
%%timeit -r 10
aggregation_optimized.orderBy('total_amount', ascending=False)