In [0]:
from pyspark.sql.functions import when, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession


# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

# Read the Unity Catalog table, keep fiscal years after 2020, and cap the
# working set at 1000 rows.
# NOTE(review): limit() without an explicit ordering does not pin down
# which 1000 rows are returned - confirm that is acceptable here.
df = (
    spark.read
    .table("workspace.default.preprossesed_mines_data")
    .filter("fiscal_yr > 2020")
    .limit(1000)
)

# Columns carried forward into the modelling steps.
selected_column_names = [
    "mine_id",
    "current_mine_type",
    "primary_sic",
    "subunit",
    "weather_summary",
    "shift_type",
    "mining_equip",
    "occupation",
    "time_diff",
    "narrative",
    "last_maintenance_dt",
    "accident_dt",
    "tavg",
    "prcp",
    "snow",
    "equipment_age_yrs",
    "failure_probability",
]
selected_columns_df = df.select(*selected_column_names)
display(selected_columns_df)


In [0]:
from pyspark.sql.functions import lit, rand
from pyspark.sql.functions import col, datediff, row_number
from pyspark.sql.window import Window


def _with_accident_label(frame, label):
    """Return `frame` with a constant integer `accident` column set to `label`."""
    return frame.withColumn("accident", lit(label))


# Positive class: every selected row, labelled accident = 1.
accidents = _with_accident_label(selected_columns_df, 1)

# Negative class: the very same rows, labelled accident = 0.
# NOTE(review): no feature is shuffled or perturbed here, so each row appears
# once per label with identical features. A classifier cannot separate the
# two classes from this data - confirm how negatives should be generated.
non_accidents = _with_accident_label(selected_columns_df, 0)

# Stack both classes, then derive the number of days between the last
# maintenance date and the accident date.
balanced_df = accidents.union(non_accidents).withColumn(
    "days_since_maint",
    datediff(col("accident_dt"), col("last_maintenance_dt")),
)

In [0]:
# Number of rows per label, collected to the driver as {label: row_count}.
class_counts = balanced_df.groupBy("accident").count().collect()
counts = dict((entry['accident'], entry['count']) for entry in class_counts)

# Target size of the combined sample (half per class).
sample_size = 1000

# Per-class sampling fraction, capped at 1.0 when a class has fewer rows
# than its half of the target.
fractions = {
    label: min(1.0, sample_size * 0.5 / counts[label])
    for label in (0, 1)
}

# Stratified sample by label; the fixed seed makes the draw repeatable.
sampled_df = balanced_df.stat.sampleBy("accident", fractions=fractions, seed=42)

display(sampled_df)


In [0]:
# ---------------New Approach----------------
# 1. Import libraries
import pyspark.pandas as ps
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# # Load the table from Unity Catalog with a filter for fiscal year > 2020
# df = spark.read.table("workspace.default.preprossesed_mines_data").filter("fiscal_yr > 2020")

# 3. Convert Spark -> pandas directly.
#    The sample is small enough to collect to the driver, so the intermediate
#    pandas-on-Spark hop (deprecated `to_pandas_on_spark`) is not needed.
df_pd = sampled_df.toPandas()

# 4. Drop every row that has a null in any column
df_pd = df_pd.dropna()

# 5. Encode categorical variables
#    Only nominal columns are one-hot encoded. The measurement columns
#    (tavg, prcp, snow, days_since_maint, equipment_age_yrs,
#    failure_probability) stay as plain columns: one-hot encoding them would
#    create one dummy column per distinct value.
#    NOTE(review): time_diff is presumably a numeric duration and is treated
#    the same way - confirm its dtype.
categorical_cols = ['mine_id', 'current_mine_type', 'primary_sic', 'subunit',
                    'weather_summary', 'shift_type', 'mining_equip', 'occupation']

# drop_first=True drops one dummy level per encoded column.
df_encoded = pd.get_dummies(df_pd, columns=categorical_cols, drop_first=True)



In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 6. Define features and label ('accident' is the binary label)
X = df_encoded.drop(columns=['accident'])
y = df_encoded['accident']

# 7. Train/test split on the raw narrative text.
#    Splitting BEFORE vectorising keeps the test narratives out of the
#    TF-IDF vocabulary and IDF weights, so the evaluation is not inflated
#    by information leaked from the test set.
#    Only the 'narrative' column is used as model input; the other columns
#    of X are not fed to the classifier.
narrative_train, narrative_test, y_train, y_test = train_test_split(
    X['narrative'], y, test_size=0.2, random_state=42
)

# Convert text to TF-IDF features: fit on the training narratives only,
# then apply the fitted vocabulary to the test narratives.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(narrative_train)
X_test = vectorizer.transform(narrative_test)

# 8. Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 9. Predict & evaluate on the held-out split
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
