
# Battery Failure Prediction – Databricks Lakehouse Demo

This notebook demonstrates an end-to-end mini-pipeline for **lithium-ion battery test data** using Databricks Lakehouse:

1. Load CSV test data (field/vendor returns)
2. Clean & cast columns
3. Save as Delta tables (Bronze → Silver; a Gold layer is not built in this demo)
4. Train a simple classification model to predict `FailFlag`
5. Log metrics with MLflow and visualize predictions


In [None]:

from pyspark.sql.functions import col

# Location of the uploaded CSV export (change this to match your workspace)
raw_path = "/FileStore/tables/sample_battery_data.csv"

# Treat the first line as column names and let Spark infer the column types
csv_reader = spark.read.options(header="true", inferSchema="true")
raw_df = csv_reader.csv(raw_path)

# Preview the raw rows, then report how many were loaded
display(raw_df)
print("Row count:", raw_df.count())


In [None]:

# Clean and cast data.
# Target type for each column kept for modelling; dict order fixes column order.
column_types = {
    "Voltage": "double",
    "Temperature": "double",
    "Capacity": "double",
    "FailFlag": "int",
}

# Map snake_case source headers onto the names used by the rest of the notebook.
# withColumnRenamed is a no-op when the source column is absent.
renamed_df = (
    raw_df
    .withColumnRenamed("voltage", "Voltage")
    .withColumnRenamed("temperature", "Temperature")
    .withColumnRenamed("capacity", "Capacity")
    .withColumnRenamed("fail_flag", "FailFlag")
)

# Keep only the modelling columns and cast each one in a single projection
df = renamed_df.select(
    *[col(name).cast(dtype).alias(name) for name, dtype in column_types.items()]
)

# Summary statistics of the cleaned columns
display(df.summary())


In [None]:

# Persist both layers as Delta tables:
#   Bronze = the CSV data exactly as loaded, Silver = the cleaned/cast columns.
bronze_path = "/mnt/battery_demo/bronze/battery_raw_delta"
silver_path = "/mnt/battery_demo/silver/battery_clean_delta"

# "overwrite" replaces whatever an earlier run left at each location
raw_df.write.save(bronze_path, format="delta", mode="overwrite")
df.write.save(silver_path, format="delta", mode="overwrite")



### Register Delta Tables in Databricks SQL
Use this SQL in a `%sql` cell:
```sql
-- Schema that holds the demo tables
CREATE DATABASE IF NOT EXISTS battery_demo;

-- Bronze: (re)register the raw Delta data at its storage location
DROP TABLE IF EXISTS battery_demo.battery_raw_delta;
CREATE TABLE battery_demo.battery_raw_delta
USING DELTA
LOCATION '/mnt/battery_demo/bronze/battery_raw_delta';

-- Silver: (re)register the cleaned Delta data at its storage location
DROP TABLE IF EXISTS battery_demo.battery_clean_delta;
CREATE TABLE battery_demo.battery_clean_delta
USING DELTA
LOCATION '/mnt/battery_demo/silver/battery_clean_delta';

-- Quick check that the cleaned table is queryable
SELECT * FROM battery_demo.battery_clean_delta LIMIT 20;
```


In [None]:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import mlflow
import mlflow.spark

# Train, evaluate and log a logistic-regression model that predicts FailFlag
# from the three numeric measurements.
feature_cols = ["Voltage", "Temperature", "Capacity"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Drop incomplete rows BEFORE assembling. VectorAssembler defaults to
# handleInvalid="error", so a null feature (e.g. from a failed cast) raises
# when the plan executes; filtering after transform() is too late.
assembled_df = assembler.transform(df.na.drop())

# Fixed seed keeps the 80/20 split reproducible between runs
train_df, test_df = assembled_df.randomSplit([0.8, 0.2], seed=42)

lr = LogisticRegression(featuresCol="features", labelCol="FailFlag")
lr_model = lr.fit(train_df)

# Score the held-out rows and show inputs next to the model output
predictions = lr_model.transform(test_df)
display(predictions.select("Voltage","Temperature","Capacity","FailFlag","probability","prediction"))

# Area under the ROC curve on the test split
evaluator = BinaryClassificationEvaluator(labelCol="FailFlag", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print("Test AUC:", auc)

# Keep the experiment under the current user's workspace folder
current_user = spark.sql("SELECT current_user()").collect()[0][0]
mlflow.set_experiment(f"/Users/{current_user}/battery_failure_demo")

# Record the feature list, the test metric and the fitted model in one run
with mlflow.start_run(run_name="battery_lr_model"):
    mlflow.log_param("features", ",".join(feature_cols))
    mlflow.log_metric("test_auc", auc)
    mlflow.spark.log_model(lr_model, "model")



## Next Steps
- Add additional features (e.g., site ID, battery model, cycle count)
- Try more advanced models (Random Forest, Gradient Boosted Trees)
- Build a dashboard in Databricks SQL to visualize results
- Schedule daily retraining jobs or integrate with APIs for real-time scoring
