In [0]:
from pyspark.sql.functions import when, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession


# Reuse the active Spark session; on Databricks one is created automatically.
spark = SparkSession.builder.getOrCreate()

# Read the Unity Catalog table, keeping only rows with fiscal year after 2020.
df = (
    spark.read
    .table("workspace.default.preprossesed_mines_data")
    .filter("fiscal_yr > 2020")
)

# Narrow the frame down to the columns the rest of the notebook works with.
selected_columns_df = df.select(
    [
        "mine_id", "current_mine_type", "primary_sic", "subunit",
        "weather_summary", "shift_type", "mining_equip", "occupation",
        "time_diff", "narrative",
        "last_maintenance_dt", "accident_dt",
        "tavg", "prcp", "snow",
    ]
)
display(selected_columns_df)


In [0]:
from pyspark.sql.functions import lit, rand
from pyspark.sql.functions import col, datediff, row_number
from pyspark.sql.window import Window


# Label every selected row twice: once as an accident (1), once as a
# non-accident (0).
# NOTE(review): the negatives are described as "shuffled/corrupted", but no such
# step is applied here -- both halves carry identical feature values, so the
# label cannot be learned from the features. Confirm the intended
# negative-sampling strategy.
accidents = selected_columns_df.withColumn("accident", lit(1))
non_accidents = selected_columns_df.withColumn("accident", lit(0))

# Stack both halves, then derive the number of days between the last
# maintenance date and the accident date.
balanced_df = (
    accidents
    .union(non_accidents)
    .withColumn("days_since_maint", datediff(col("accident_dt"), col("last_maintenance_dt")))
)

In [0]:

# Inspect the column dtypes of the combined (labelled) frame before encoding.
print(balanced_df.dtypes)

In [0]:
from pyspark.sql.types import StringType

# Collect the names of all string-typed columns of balanced_df.
# The check uses isinstance instead of comparing str(dataType) with
# 'StringType()': the textual form of a DataType differs between PySpark
# releases ('StringType' vs 'StringType()'), in which case the comparison would
# silently produce an empty list.
string_cols = [
    field.name
    for field in balanced_df.schema.fields
    if isinstance(field.dataType, StringType)
]
print(string_cols)

In [0]:
import zlib


def _stable_category_code(value):
    """Map one value to a non-negative 31-bit integer that is identical in every batch."""
    return zlib.crc32(str(value).encode("utf-8")) & 0x7FFFFFFF


# Batch-wise integer encoding of string columns, for use with mapInPandas.
def encode_pandas(df_iter, columns=None):
    """Add an int32 ``<column>_encoded`` column for each string column.

    The function receives the data one pandas batch at a time, so codes derived
    from the categories present in a batch (e.g. ``cat.codes``) would assign
    different integers to the same string in different batches. A stateless
    CRC-32 based code is used instead, which gives every string the same code
    no matter which batch it appears in.

    Args:
        df_iter: iterator of pandas DataFrames.
        columns: names of the columns to encode. Defaults to the notebook-level
            ``string_cols`` list.

    Yields:
        Each input DataFrame with the ``*_encoded`` columns added (the batch is
        modified in place). Missing values are encoded as -1; every other value
        gets a code in ``[0, 2**31 - 1]``, which fits a Spark IntegerType.
        Distinct strings can collide, but only with negligible probability for
        typical category counts.
    """
    target_cols = string_cols if columns is None else columns
    for pdf in df_iter:
        for col_name in target_cols:
            source = pdf[col_name]
            encoded_name = col_name + "_encoded"
            pdf[encoded_name] = [
                -1 if is_missing else _stable_category_code(value)
                for value, is_missing in zip(source, source.isna())
            ]
            # Pin the dtype so it always matches the declared integer schema,
            # including for empty batches.
            pdf[encoded_name] = pdf[encoded_name].astype("int32")
        yield pdf



In [0]:
from pyspark.sql.types import StructField, IntegerType, StructType

# Output schema of the encoding step: every existing field of balanced_df
# followed by one nullable integer field per encoded string column.
# A fresh StructType is built on purpose: StructType.add() mutates the object it
# is called on, so calling it on balanced_df.schema can alter the schema object
# cached on balanced_df and append the same fields again each time this cell is
# re-run.
new_schema = StructType(
    list(balanced_df.schema.fields)
    + [StructField(f"{col_name}_encoded", IntegerType()) for col_name in string_cols]
)

# Apply the batch-wise encoder; the result carries the extra *_encoded columns.
df_encoded = balanced_df.mapInPandas(encode_pandas, schema=new_schema)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# Names of the integer columns produced for each string column.
encoded_cols = [f"{name}_encoded" for name in string_cols]

# Numeric inputs that are used as they are.
# NOTE(review): 'equipment_age_yrs' and 'failure_probability' are not among the
# columns selected from the source table earlier in this notebook -- confirm
# they are available upstream.
numeric_cols = [
    'time_diff',
    'tavg',
    'prcp',
    'snow',
    'days_since_maint',
    'equipment_age_yrs',
    'failure_probability',
]

# Complete feature list: encoded categoricals first, numeric columns after.
feature_cols = [*encoded_cols, *numeric_cols]


In [0]:
import pandas as pd
import numpy as np

def assemble_features(pdf: pd.DataFrame) -> pd.DataFrame:
    """Pack the model inputs of one pandas batch into a ``features`` list column.

    Takes a single pandas DataFrame (not an iterator of them). For each
    categorical input the ``<name>_encoded`` column is used when the batch has
    one, because the raw string values cannot be cast to float; otherwise the
    raw column is used. The input frame is left unmodified.

    Args:
        pdf: batch containing the feature columns and the ``accident`` label.

    Returns:
        DataFrame with two columns: ``features`` (one list of floats per row,
        categorical inputs first, numeric inputs after) and ``accident``.

    Raises:
        KeyError: if any required feature column is absent from ``pdf``.

    NOTE(review): missing numeric values become NaN after the float cast --
    confirm that the downstream models tolerate that.
    """
    categorical_inputs = ['mine_id', 'current_mine_type', 'primary_sic', 'subunit',
                          'weather_summary', 'shift_type', 'mining_equip', 'occupation']
    numeric_inputs = ['time_diff', 'tavg', 'prcp', 'snow',
                      'days_since_maint', 'equipment_age_yrs', 'failure_probability']

    # Resolve each categorical input to its encoded counterpart when available.
    resolved_cols = [
        f"{name}_encoded" if f"{name}_encoded" in pdf.columns else name
        for name in categorical_inputs
    ] + numeric_inputs

    # Fail with an explicit message instead of an opaque pandas lookup error.
    missing = [name for name in resolved_cols if name not in pdf.columns]
    if missing:
        raise KeyError(f"assemble_features: batch is missing feature columns {missing}")

    # One list of floats per row, in the order of resolved_cols.
    feature_rows = pdf[resolved_cols].astype(float).values.tolist()

    return pd.DataFrame({
        'features': pd.Series(feature_rows, index=pdf.index, dtype=object),
        'accident': pdf['accident'],
    })

def _assemble_batches(batches):
    """Apply assemble_features to every pandas batch supplied by mapInPandas."""
    for batch in batches:
        yield assemble_features(batch)


# mapInPandas calls its function with an ITERATOR of pandas DataFrames and
# expects an iterator back, whereas assemble_features works on one DataFrame;
# the generator above adapts between the two.
assembled_df = df_encoded.mapInPandas(_assemble_batches, schema="features array<double>, accident int")
assembled_df.show()


In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.functions import array_to_vector

# Input: `assembled_df` with a `features` (array<double>) and an `accident`
# column. Caching it (assembled_df.cache()) is optional and only affects speed.

# ----------------------
# Step 0: Convert features to an ML vector
# ----------------------
# Spark ML predictors require the features column to be a vector type; a plain
# array<double> column is rejected when fit() validates the schema.
model_input_df = assembled_df.withColumn("features", array_to_vector(col("features")))

# ----------------------
# Step 1: Train/test split
# ----------------------
# Fixed seed so the 80/20 split is repeatable.
train_df, test_df = model_input_df.randomSplit([0.8, 0.2], seed=42)

# ----------------------
# Step 2: Define Model
# ----------------------
lr = LogisticRegression(featuresCol="features", labelCol="accident", maxIter=10)

# ----------------------
# Step 3: Train Model
# ----------------------
lr_model = lr.fit(train_df)

# ----------------------
# Step 4: Predictions
# ----------------------
predictions = lr_model.transform(test_df)
predictions.select("accident", "probability", "prediction").show(10, truncate=False)

# ----------------------
# Step 5: Evaluate
# ----------------------
evaluator = BinaryClassificationEvaluator(labelCol="accident", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {roc_auc:.4f}")



In [0]:
from pyspark.ml.functions import array_to_vector
from pyspark.sql.types import ArrayType

# Random forest on the same training split as the logistic regression.
# `features` is already assembled per row, so the pipeline needs no
# VectorAssembler stage and is fitted directly on `train_df`.
rf = RandomForestClassifier(labelCol="accident", featuresCol="features", numTrees=100)
pipeline = Pipeline(stages=[rf])

# Spark ML needs a vector column; convert only when `features` is still a plain
# array so the cell works with either representation.
rf_train_df = train_df
if isinstance(train_df.schema["features"].dataType, ArrayType):
    rf_train_df = train_df.withColumn("features", array_to_vector(col("features")))

model = pipeline.fit(rf_train_df)
