In [1]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
from datetime import datetime, timedelta, timezone
from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator
import os
import subprocess
import joblib
import pandas as pd
from chemotools.baseline import LinearCorrection
from chemotools.smooth import SavitzkyGolayFilter
from chemotools.derivative import NorrisWilliams
from chemotools.feature_selection import RangeCut
from sklearn.preprocessing import StandardScaler

In [2]:
# --- GCP settings -----------------------------------------------------------
PROJECT_ID = 'intricate-reef-411403'
# NOTE(review): the bucket suffix ("41103") differs from PROJECT_ID ("411403") - confirm this is intentional
BUCKET = "test_bucket-intricate-reef-41103"
# Project root under the user's home directory (this may need to be updated when dockerized)
PROJECT_HOME = os.environ.get("HOME") + "/data-engineering-zoomcamp/project"
credentials_location = f"{PROJECT_HOME}/.google/credentials/gcp.json"

# --- File layout ------------------------------------------------------------
# Local paths (these will need to change when dockerized)
local_data_path = "/.project/data/raw/Mendeley_data/"
temp_path = "/.project/data/raw/temp/"
local_data_file = "100_Batches_IndPenSim_V3.csv"
path_to_local_home = os.getenv("AIRFLOW_HOME", "/opt/airflow/")
# Object prefixes inside the GCS bucket
gcs_input_path = "raw/"
gcs_output_path = "processed/raman_context/"
# Comma-separated list of connector jars handed to Spark via "spark.jars"
spark_jar_path = ",".join([
    f"{PROJECT_HOME}/lib/gcs-connector-hadoop3-2.2.5.jar",
    f"{PROJECT_HOME}/lib/spark-3.5-bigquery-0.37.0.jar",
])

# --- PLS model --------------------------------------------------------------
# NOTE(review): the extension is "pk1" (digit one) - confirm the file really has this name
model_path = f"{PROJECT_HOME}/data/processed/model/raman_pls_model.pk1"
# joblib.load unpickles the file; only load model files from a trusted location
pls_model = joblib.load(model_path)

In [3]:
# Start a Spark standalone instance (master + one worker) on this machine.
start_spark_master = "cd $SPARK_HOME && ./sbin/start-master.sh --port 7078"
start_spark_worker = "cd $SPARK_HOME && ./sbin/start-worker.sh spark://127.0.0.1:7078"


def _launch_spark_daemon(command):
    """Run a Spark sbin launch script through the shell and surface failures.

    Parameters
    ----------
    command : str
        Shell command line to execute.

    Returns
    -------
    tuple
        ``(process, stdout_bytes, stderr_bytes)`` where ``process`` is the
        finished ``subprocess.Popen`` object.

    A non-zero exit status is printed as a warning instead of being raised, so
    re-running the cell stays best-effort (presumably the script can exit
    non-zero when the daemon is already running - verify against your Spark
    version) while a genuinely failed launch is no longer silent.
    """
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = process.communicate()
    if process.returncode != 0:
        details = (error or output).decode(errors="replace").strip()
        print(f"WARNING: `{command}` exited with status {process.returncode}: {details}")
    return process, output, error


# The master must be launched first so the worker has something to register with.
start_master_process, start_master_output, start_master_error = _launch_spark_daemon(start_spark_master)
start_worker_process, start_worker_output, start_worker_error = _launch_spark_daemon(start_spark_worker)

In [4]:
# Spark configuration: connect to the local standalone master, ship the
# GCS/BigQuery connector jars and authenticate with the service-account key file.
conf = (
    SparkConf()
    .setMaster("spark://127.0.0.1:7078")
    .setAppName("predict_concentration_data")
    .setAll([
        ("spark.jars", spark_jar_path),
        ("spark.hadoop.google.cloud.auth.service.account.enable", "true"),
        ("spark.executor.memory", "4g"),
        ("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location),
    ])
)

In [5]:
# Create the Spark context and register the GCS filesystem implementation and
# service-account credentials on its Hadoop configuration.
sc = SparkContext(conf=conf)
hadoop_conf = sc._jsc.hadoopConfiguration()

gcs_hadoop_settings = (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", credentials_location),
    ("fs.gs.auth.service.account.enable", "true"),
)
for gcs_key, gcs_value in gcs_hadoop_settings:
    hadoop_conf.set(gcs_key, gcs_value)

24/04/14 23:04:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Build (or reuse) the Spark session on top of the standalone-cluster context,
# passing along the configuration the context was created with.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

In [7]:
# Find the most recent existing record in T_RAMAN_PREDICTION inside the
# look-back window that ends at `current_time`. If none is found (or the table
# cannot be read), fall back to the start of the window.
TS_FORMAT = "%Y-%m-%d %H:%M:%S"
LOOKBACK_MINUTES = 7

# Fixed timestamp used for this run; switch to the live clock for production:
#current_time = datetime.utcnow()
current_time = datetime.strptime("2024-04-14 10:08:55", TS_FORMAT)
back_time = current_time - timedelta(minutes=LOOKBACK_MINUTES)
current_ts = current_time.strftime(TS_FORMAT)
back_ts = back_time.strftime(TS_FORMAT)
date_range = (current_ts,back_ts)
where_clause = """sample_ts BETWEEN TO_TIMESTAMP('{1}','yyyy-MM-dd HH:mm:ss') AND TO_TIMESTAMP('{0}','yyyy-MM-dd HH:mm:ss')""".format(*date_range)

# NOTE(review): the frame written to t_raman_prediction later in this notebook
# has no sample_ts column, so this filter may fail once the table exists -
# the error is now printed below instead of being silently swallowed.
try:
    most_recent_prediction = (
        spark.read.format("bigquery")
        .option("project",PROJECT_ID)
        .option("dataset","test_schema")
        .option("table","t_raman_prediction")
        .load()
        .where(where_clause)
        .agg(F.max("sample_ts"))
        .collect()[0][0]
    )
except Exception as e:
    # Best effort: a missing table on the first run must not stop the notebook,
    # but report what went wrong so real failures are visible.
    print(f"Could not read existing predictions: {e!r}")
    most_recent_prediction = None

if most_recent_prediction is None:
    # max() over an empty selection yields NULL/None, which previously crashed
    # on .strftime below; treat it the same as "nothing found".
    print(f"No existing records found within {LOOKBACK_MINUTES} minutes of current time")
    most_recent_prediction = back_time
else:
    print("Some existing records found - starting after most recent existing prediction time")
most_recent_ts = most_recent_prediction.strftime(TS_FORMAT)

No existing records found within 7 minutes of current time


In [8]:
# Select the sample-context rows (id + measured concentration) whose sample_ts
# lies between the most recent existing prediction and the current time, i.e.
# the rows not yet present in T_RAMAN_PREDICTION.
date_range = (current_ts, most_recent_ts)
where_clause = (
    f"sample_ts BETWEEN TO_TIMESTAMP('{most_recent_ts}','yyyy-MM-dd HH:mm:ss') "
    f"AND TO_TIMESTAMP('{current_ts}','yyyy-MM-dd HH:mm:ss')"
)
raman_context_ids = (
    spark.read.format("bigquery")
    .option("project", PROJECT_ID)
    .option("dataset", "test_schema")
    .option("table", "t_sample_context")
    .load()
    .select("id", "penicillin_concentration_g_l")
    .where(where_clause)
)

In [70]:
# Join the new context rows to their Raman spectra.
# Read the raw parquet files from the configured bucket/prefix (built from
# BUCKET and gcs_input_path so the location is defined in one place only).
df_raw_values = spark.read.parquet(f"gs://{BUCKET}/{gcs_input_path}*.parquet")
# Columns to keep: the row id plus the spectral columns "2088" down to "689"
raman_cols = ["id"] + [str(i) for i in range(2088, 688, -1)]
# Keep only the Raman measurement columns; rename the key so it does not clash
# with raman_context_ids.id in the join below
df_raman = df_raw_values.select(raman_cols) \
    .withColumnRenamed("id","id_raman")
# Inner join against raman_context_ids filters out old data; the duplicate key
# column is dropped afterwards
df_raman_new = df_raman.join(raman_context_ids, df_raman.id_raman == raman_context_ids.id, "inner").drop("id_raman")

In [71]:
# Inspect the column names and types of the joined frame (spectral columns, id, measured concentration)
df_raman_new.printSchema()

root
 |-- 2088: double (nullable = true)
 |-- 2087: double (nullable = true)
 |-- 2086: double (nullable = true)
 |-- 2085: double (nullable = true)
 |-- 2084: double (nullable = true)
 |-- 2083: double (nullable = true)
 |-- 2082: double (nullable = true)
 |-- 2081: double (nullable = true)
 |-- 2080: double (nullable = true)
 |-- 2079: double (nullable = true)
 |-- 2078: double (nullable = true)
 |-- 2077: double (nullable = true)
 |-- 2076: double (nullable = true)
 |-- 2075: double (nullable = true)
 |-- 2074: double (nullable = true)
 |-- 2073: double (nullable = true)
 |-- 2072: double (nullable = true)
 |-- 2071: double (nullable = true)
 |-- 2070: double (nullable = true)
 |-- 2069: double (nullable = true)
 |-- 2068: double (nullable = true)
 |-- 2067: double (nullable = true)
 |-- 2066: double (nullable = true)
 |-- 2065: double (nullable = true)
 |-- 2064: double (nullable = true)
 |-- 2063: double (nullable = true)
 |-- 2062: double (nullable = true)
 |-- 2061: double (null

In [72]:
# Collect the joined Spark DataFrame to the driver as a pandas DataFrame for the
# pandas/scikit-learn based preprocessing and prediction steps that follow
df_raman_predict = df_raman_new.toPandas()

                                                                                

In [73]:
# Sanity check: (rows, columns) of the collected frame
df_raman_predict.shape

(7000, 1402)

In [74]:
# Spectral preprocessing: range cut -> linear baseline correction ->
# Savitzky-Golay smoothing -> Norris-Williams first derivative.

# Work on a separate frame holding only the spectral columns, so that
# df_raman_predict keeps its "id" and measured-concentration columns.
df_raman_spectra = df_raman_predict.drop(columns=["id", "penicillin_concentration_g_l"])
df_raman_test = df_raman_spectra

# Range cut spanning 0 .. number of spectral columns
rcbw = RangeCut(0, df_raman_test.shape[1])
data = rcbw.fit_transform(df_raman_test)
raman_df = pd.DataFrame(data)

# Baseline correction
lc = LinearCorrection()
spectra_baseline = lc.fit_transform(raman_df)

# Smoothing
sgf = SavitzkyGolayFilter(window_size=15, polynomial_order=2)
spectra_norm = sgf.fit_transform(spectra_baseline)

# First derivative
nw = NorrisWilliams(window_size=15, gap_size=3, derivative_order=1)
spectra_derivative = pd.DataFrame(nw.fit_transform(spectra_norm))

In [75]:
# Standardise the derivative spectra before feeding them to the PLS model.
# NOTE(review): the scaler is fitted on the data being predicted; presumably the
# model was trained with its own scaler - confirm whether that one should be reused.
x = spectra_derivative

scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [76]:
# Predict with the PLS model loaded via joblib, using the scaled derivative spectra as input
predicted_concentration = pls_model.predict(x_scaled)

In [81]:
# Put the measured and the predicted concentration side by side, keyed by sample id.
measured_columns = df_raman_predict[["id", "penicillin_concentration_g_l"]]
predicted_column = pd.DataFrame({'predicted_penicillin_concentration': predicted_concentration})
# Column-wise concatenation aligns the two frames on their index
df_raman_compare = pd.concat([measured_columns, predicted_column], axis=1)

In [85]:
# Convert the pandas comparison frame back into a Spark DataFrame for the BigQuery write
# (no explicit schema is passed here)
df_raman_feature = spark.createDataFrame(df_raman_compare)

In [87]:
# Schema for T_RAMAN_PREDICT: sample id, measured concentration, predicted concentration
predict_schema = (
    T.StructType()
    .add("id", T.IntegerType())
    .add("penicillin_concentration_g_l", T.DoubleType())
    .add("predicted_penicillin_concentration", T.DoubleType())
)

In [88]:
# Append the new predictions to T_RAMAN_PREDICTION in BigQuery, staging the
# data through the temporary GCS bucket.
# The log message previously named T_SAMPLE_CONTEXT, which is not the table written here.
print("About to insert new data to T_RAMAN_PREDICTION")
# The save mode ("append") is what selects append-vs-overwrite for this write;
# the former "writeDisposition" = "WRITE_TRUNCATE" option contradicted it and
# the incremental look-back logic of this notebook, so it has been removed.
df_raman_feature.write.format("bigquery") \
    .option("temporaryGcsBucket", BUCKET) \
    .option("table", PROJECT_ID+".test_schema.t_raman_prediction") \
    .option("createDisposition", "CREATE_IF_NEEDED") \
    .option("schema", predict_schema.json()) \
    .mode("append") \
    .save()

About to insert new data to T_SAMPLE_CONTEXT


                                                                                

In [89]:
# Stop Local Standalone cluster
!cd $SPARK_HOME && ./sbin/stop-master.sh

IOStream.flush timed out
stopping org.apache.spark.deploy.master.Master


In [90]:
# Stop the worker of the local standalone cluster via Spark's stop-worker.sh script
!cd $SPARK_HOME && ./sbin/stop-worker.sh

24/04/14 23:41:47 WARN StandaloneAppClient$ClientEndpoint: Connection to 127.0.0.1:7078 failed; waiting for master to reconnect...
24/04/14 23:41:47 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
24/04/14 23:41:47 WARN StandaloneAppClient$ClientEndpoint: Connection to 127.0.0.1:7078 failed; waiting for master to reconnect...


IOStream.flush timed out
stopping org.apache.spark.deploy.worker.Worker


24/04/14 23:42:02 ERROR TaskSchedulerImpl: Lost executor 0 on 10.168.0.7: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/04/14 23:42:02 WARN BlockManagerMasterEndpoint: No more replicas available for broadcast_4_python !
