In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [0]:
spark = SparkSession.builder \
    .appName("Logistic Regression Example") \
    .getOrCreate()

In [0]:
data_path = "/FileStore/tables/train_processed.csv"  
data = spark.read.csv(data_path, header=True, inferSchema=True)

In [0]:
data.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- 1: integer (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)
 |-- 13: double (nullable = true)
 |-- 14: double (nullable = true)
 |-- 15: double (nullable = true)
 |-- 16: double (nullable = true)
 |-- 17: double (nullable = true)
 |-- 18: double (nullable = true)
 |-- 19: double (nullable = true)
 |-- 20: double (nullable = true)
 |-- 21: integer (nullable = true)
 |-- 22: integer (nullable = true)
 |-- 23: double (nullable = true)
 |-- 24: double (nullable = true)
 |-- 25: double (nullable = true)



In [0]:
print("Preview of the training data:")
data.show(5)  

Preview of the training data:
+---+---+-------+-------+-----+------+------+-------+-------+-----+-----+------+-------+-------+---+-----+------+-------+-------+------+----+---+----+-----+-----+-------+
|  0|  1|      2|      3|    4|     5|     6|      7|      8|    9|   10|    11|     12|     13| 14|   15|    16|     17|     18|    19|  20| 21|  22|   23|   24|     25|
+---+---+-------+-------+-----+------+------+-------+-------+-----+-----+------+-------+-------+---+-----+------+-------+-------+------+----+---+----+-----+-----+-------+
|  1|  1|-7.0E-4|-4.0E-4|100.0|518.67|641.82| 1589.7| 1400.6|14.62|21.61|554.36|2388.06|9046.19|1.3|47.47|521.66|2388.02|8138.62|8.4195|0.03|392|2388|100.0|39.06| 23.419|
|  1|  2| 0.0019|-3.0E-4|100.0|518.67|642.15|1591.82|1403.14|14.62|21.61|553.75|2388.04|9044.07|1.3|47.49|522.28|2388.07|8131.49|8.4318|0.03|392|2388|100.0| 39.0|23.4236|
|  1|  3|-0.0043| 3.0E-4|100.0|518.67|642.35|1587.99| 1404.2|14.62|21.61|554.26|2388.08|9052.94|1.3|47.27|522.42|23

In [0]:
column_names = ['label', 'engine', 'cycle', 'setting1', 'setting2', 'setting3',
                'sensor1', 'sensor2', 'sensor3', 'sensor4', 'sensor5',
                'sensor6', 'sensor7', 'sensor8', 'sensor9', 'sensor10',
                'sensor11', 'sensor12', 'sensor13', 'sensor14', 'sensor15',
                'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20',
                'sensor21']

# Rename the columns in the DataFrame
for old_name, new_name in zip(data.columns, column_names):
    data = data.withColumnRenamed(old_name, new_name)

In [0]:
data.show(5)  

+-----+------+-------+--------+--------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|label|engine|  cycle|setting1|setting2|setting3|sensor1|sensor2|sensor3|sensor4|sensor5|sensor6|sensor7|sensor8|sensor9|sensor10|sensor11|sensor12|sensor13|sensor14|sensor15|sensor16|sensor17|sensor18|sensor19|sensor20|
+-----+------+-------+--------+--------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|    1|     1|-7.0E-4| -4.0E-4|   100.0|  518.67| 641.82| 1589.7| 1400.6|  14.62|  21.61| 554.36|2388.06|9046.19|    1.3|   47.47|  521.66| 2388.02| 8138.62|  8.4195|    0.03|     392|    2388|   100.0|   39.06|  23.419|
|    1|     2| 0.0019| -3.0E-4|   100.0|  518.67| 642.15|1591.82|1403.14|  14.62|  21.61| 553.75|2388.04|9044.07|   

In [0]:

# Prepare the feature vector and label
feature_columns = ['engine', 'cycle', 'setting1', 'setting2', 'setting3',
                   'sensor1', 'sensor2', 'sensor3', 'sensor4', 'sensor5',
                   'sensor6', 'sensor7', 'sensor8', 'sensor9', 'sensor10',
                   'sensor11', 'sensor12', 'sensor13', 'sensor14', 'sensor15',
                   'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20']
# Combine features into a single feature vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

In [0]:
final_data = data.select("features", "label")  
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
lr = LogisticRegression(labelCol="label", featuresCol="features")
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)
predictions.select("features", "label", "prediction").show(10)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy = {accuracy:.2f}")

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[1.0,-0.005,3.0E-...|   81|      63.0|
|[1.0,-0.0037,2.0E...|   80|      77.0|
|[1.0,-0.0034,3.0E...|   88|      45.0|
|[1.0,-0.0033,3.0E...|  100|      63.0|
|[1.0,-0.0021,-4.0...|   25|      37.0|
|[1.0,-0.0021,3.0E...|   33|      53.0|
|[1.0,-0.002,-3.0E...|   53|      40.0|
|[1.0,-0.0018,6.0E...|    2|      65.0|
|[1.0,-0.0016,5.0E...|   18|      45.0|
|[1.0,-0.0014,-4.0...|   96|      57.0|
+--------------------+-----+----------+
only showing top 10 rows

Test set accuracy = 0.09


In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=100)
pipeline = Pipeline(stages=[indexer, rf])

rf_model = pipeline.fit(train_data)
rf_predictions = rf_model.transform(test_data)
rf_predictions.select("features", "label", "indexedLabel", "prediction").show(10)

# Evaluate the model
rf_evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
rf_accuracy = rf_evaluator.evaluate(rf_predictions)

print(f"Random Forest Test set accuracy = {rf_accuracy:.2f}")


+--------------------+-----+------------+----------+
|            features|label|indexedLabel|prediction|
+--------------------+-----+------------+----------+
|[1.0,-0.005,3.0E-...|   81|        19.0|       1.0|
|[1.0,-0.0037,2.0E...|   80|        70.0|       7.0|
|[1.0,-0.0034,3.0E...|   88|        26.0|       5.0|
|[1.0,-0.0033,3.0E...|  100|        56.0|       0.0|
|[1.0,-0.0021,-4.0...|   25|        31.0|      71.0|
|[1.0,-0.0021,3.0E...|   33|        59.0|      21.0|
|[1.0,-0.002,-3.0E...|   53|        62.0|       3.0|
|[1.0,-0.0018,6.0E...|    2|         4.0|       4.0|
|[1.0,-0.0016,5.0E...|   18|        41.0|       8.0|
|[1.0,-0.0014,-4.0...|   96|         2.0|      33.0|
+--------------------+-----+------------+----------+
only showing top 10 rows

Random Forest Test set accuracy = 0.11
