# Fetal Health Classification
### Justin Farnsworth (farnswj1@tcnj.edu)

In [1]:
# Make PySpark importable
import findspark
findspark.init()

In [2]:
# Imported libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, RobustScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

In [3]:
# Set up the Spark session
spark = SparkSession.builder.appName("Fetal Health Classification").getOrCreate()
sc = spark.sparkContext

In [4]:
# Load the dataset
df = spark.read.csv("fetal_health.csv", inferSchema=True, header=True)
df.toPandas()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0


In [5]:
# Show the schema
df.printSchema()

root
 |-- baseline value: double (nullable = true)
 |-- accelerations: double (nullable = true)
 |-- fetal_movement: double (nullable = true)
 |-- uterine_contractions: double (nullable = true)
 |-- light_decelerations: double (nullable = true)
 |-- severe_decelerations: double (nullable = true)
 |-- prolongued_decelerations: double (nullable = true)
 |-- abnormal_short_term_variability: double (nullable = true)
 |-- mean_value_of_short_term_variability: double (nullable = true)
 |-- percentage_of_time_with_abnormal_long_term_variability: double (nullable = true)
 |-- mean_value_of_long_term_variability: double (nullable = true)
 |-- histogram_width: double (nullable = true)
 |-- histogram_min: double (nullable = true)
 |-- histogram_max: double (nullable = true)
 |-- histogram_number_of_peaks: double (nullable = true)
 |-- histogram_number_of_zeroes: double (nullable = true)
 |-- histogram_mode: double (nullable = true)
 |-- histogram_mean: double (nullable = true)
 |-- histogram_medi

In [6]:
# Check for null values
df.toPandas().isnull().sum()

baseline value                                            0
accelerations                                             0
fetal_movement                                            0
uterine_contractions                                      0
light_decelerations                                       0
severe_decelerations                                      0
prolongued_decelerations                                  0
abnormal_short_term_variability                           0
mean_value_of_short_term_variability                      0
percentage_of_time_with_abnormal_long_term_variability    0
mean_value_of_long_term_variability                       0
histogram_width                                           0
histogram_min                                             0
histogram_max                                             0
histogram_number_of_peaks                                 0
histogram_number_of_zeroes                                0
histogram_mode                          

In [7]:
# Show more information about the dataset
df.toPandas().describe()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
count,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,...,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0
mean,133.303857,0.003178,0.009481,0.004366,0.001889,3e-06,0.000159,46.990122,1.332785,9.84666,...,93.579492,164.0254,4.068203,0.323612,137.452023,134.610536,138.09031,18.80809,0.32032,1.304327
std,9.840844,0.003866,0.046666,0.002946,0.00296,5.7e-05,0.00059,17.192814,0.883241,18.39688,...,29.560212,17.944183,2.949386,0.706059,16.381289,15.593596,14.466589,28.977636,0.610829,0.614377
min,106.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.2,0.0,...,50.0,122.0,0.0,0.0,60.0,73.0,77.0,0.0,-1.0,1.0
25%,126.0,0.0,0.0,0.002,0.0,0.0,0.0,32.0,0.7,0.0,...,67.0,152.0,2.0,0.0,129.0,125.0,129.0,2.0,0.0,1.0
50%,133.0,0.002,0.0,0.004,0.0,0.0,0.0,49.0,1.2,0.0,...,93.0,162.0,3.0,0.0,139.0,136.0,139.0,7.0,0.0,1.0
75%,140.0,0.006,0.003,0.007,0.003,0.0,0.0,61.0,1.7,11.0,...,120.0,174.0,6.0,0.0,148.0,145.0,148.0,24.0,1.0,1.0
max,160.0,0.019,0.481,0.015,0.015,0.001,0.005,87.0,7.0,91.0,...,159.0,238.0,18.0,10.0,187.0,182.0,186.0,269.0,1.0,3.0


In [8]:
# Get the counts of the labels
df.toPandas()["fetal_health"].value_counts()

1.0    1655
2.0     295
3.0     176
Name: fetal_health, dtype: int64

In [9]:
# Create a train set and a test set
train_set, test_set = df.randomSplit([0.7, 0.3], seed=3)

In [10]:
# Combine the features into a vector to feed into the ML algorithm
assembler = VectorAssembler(
    inputCols=list(filter(lambda c: c != "fetal_health", df.columns)),
    outputCol="features"
)

In [11]:
# Scale the features
scaler = RobustScaler(inputCol="features", outputCol="scaled_features")

In [12]:
# Create the random forest classifier
rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="fetal_health")

In [13]:
# Make a param grid
grid = ParamGridBuilder()\
       .addGrid(rf.maxDepth, [2, 5, 8])\
       .addGrid(rf.maxBins, [5, 10, 15, 20])\
       .addGrid(rf.numTrees, [10, 30, 50])\
       .build()

In [14]:
# Use the evaluator to measure the performance of the model
evaluator = BinaryClassificationEvaluator(labelCol="fetal_health", rawPredictionCol="prediction")

In [15]:
# Set up cross-validation
cv = CrossValidator(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator, numFolds=5)

In [16]:
# Create a pipeline
pipeline = Pipeline(stages=[assembler, scaler, cv])

In [17]:
# Train the model (this could take several minutes)
model = pipeline.fit(train_set)

In [18]:
# Make the predictions
predictions = model.transform(test_set).select(col("fetal_health"), col("prediction"))
predictions.toPandas()

Unnamed: 0,fetal_health,prediction
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,1.0,1.0
...,...,...
691,2.0,2.0
692,2.0,1.0
693,2.0,1.0
694,1.0,1.0


In [19]:
# Compute the accuracy
evaluator.evaluate(predictions)

1.0

In [20]:
# Show the confusion matrix
metrics = MulticlassMetrics(predictions.rdd)
metrics.confusionMatrix().toArray()

array([[544.,  68.],
       [  3.,  26.]])

In [21]:
# Terminate the Spark session
spark.stop()