In [1]:
# Install PySpark if it is not already available (uncomment the next line to run it)
# %pip install pyspark

In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer,StandardScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

In [3]:
# Build the notebook's SparkSession, reusing an existing one when getOrCreate() finds it.
spark = (
    SparkSession.builder
    .appName("NasaPredictor")
    .master("local")
    .getOrCreate()
)

In [4]:
# Load the dataset
# The CSV location can be overridden through the NASA_CSV_PATH environment variable so the
# notebook is not tied to a single machine; when the variable is unset, the original
# location is used unchanged.
import os

NASA_CSV_PATH = os.environ.get(
    "NASA_CSV_PATH",
    "C:/Users/EZENWAJIAKU CHINEDU/Desktop/Course Work/Big Data/nasa.csv",
)

# header=True takes column names from the first line; inferSchema=True lets Spark pick column types.
data = spark.read.csv(NASA_CSV_PATH, header=True, inferSchema=True)

In [5]:
import pyspark.sql.functions as fc

# Dataset shape as a (rows, columns) tuple
row_total = data.count()
column_total = len(data.columns)
print((row_total, column_total))

# Summary statistics for every column
data.describe().show()

# Count the null entries of each column in a single aggregation.
# when(<is null>, name) receives the column *name* as a plain Python string, so it yields a
# non-null literal for null rows (and null otherwise); count() then tallies only the null rows.
null_counters = [
    fc.count(fc.when(fc.isnull(name), name)).alias(name)
    for name in data.columns
]
data_null = data.agg(*null_counters)
data_null.show()   # original author's observation: no null values

(4687, 40)
+-------+-----------------+-----------------+------------------+-------------------+------------------+------------------+------------------+---------------------+---------------------+--------------------+--------------------+-------------------+-------------------------+----------------------------+---------------------------+------------------+-----------------------+-----------------+---------------------+--------------------+-------------+------------------+------------------------+------------------+--------------------------+---------------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------+
|summary| Neo Reference ID|             Name|Absolute Magnitude| Est Dia in KM(min)|Est Dia in KM(max)| Est Dia in M(min)| Est Dia in M(max)|Est Dia in Miles(min)|Est Dia in Miles(max)|E

In [6]:
# Drop columns not necessary for classification
unused_columns = [
    'Neo Reference ID',
    'Name',
    'Orbit ID',
    'Close Approach Date',
    'Epoch Date Close Approach',
    'Orbit Determination Date',
]
new_data = data.drop(*unused_columns)

In [7]:
#new_data.dtypes
#new_data.summary
#new_data.describe().show()

In [8]:
# Encode the Hazardous column as an integer target: 1 where it is True, 0 otherwise.
# The original column is removed once the encoded one exists.
from pyspark.sql.functions import when

# `== True` builds a Spark Column expression; it is not a Python truthiness test.
is_hazardous = new_data.Hazardous == True
new_data = (
    new_data
    .withColumn('Hazardous_Encoded', when(is_hazardous, 1).otherwise(0))
    .drop("Hazardous")
)

In [9]:
# Class distribution of the encoded target, ordered by ascending row count
class_counts = new_data.groupBy('Hazardous_Encoded').count()
class_counts.orderBy('count').show()

+-----------------+-----+
|Hazardous_Encoded|count|
+-----------------+-----+
|                1|  755|
|                0| 3932|
+-----------------+-----+



In [10]:
# Shape of the working DataFrame as (rows, columns)
current_shape = (new_data.count(), len(new_data.columns))
print(current_shape)

(4687, 34)


In [11]:
#new_data.dtypes

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# Label-encode the categorical string columns: each one gets a numeric "<name>_encoded" column.
categorical_cols = ["Orbiting Body", "Equinox"]

# Fit one StringIndexer per categorical column; label_encoders holds the fitted models.
label_encoders = []
for cat_col in categorical_cols:
    indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + "_encoded")
    label_encoders.append(indexer.fit(new_data))

# Chain the fitted indexers and apply them in order to add the encoded columns.
pipeline = Pipeline(stages=label_encoders)
fitted_pipeline = pipeline.fit(new_data)
new_data = fitted_pipeline.transform(new_data)

In [13]:
# Remove the raw string columns now that their "_encoded" counterparts have been added.
raw_categorical = ("Orbiting Body", "Equinox")
new_data = new_data.drop(*raw_categorical)

In [14]:
# Inspect the (column name, Spark type) pairs after encoding and dropping the raw categorical columns.
new_data.dtypes

[('Absolute Magnitude', 'double'),
 ('Est Dia in KM(min)', 'double'),
 ('Est Dia in KM(max)', 'double'),
 ('Est Dia in M(min)', 'double'),
 ('Est Dia in M(max)', 'double'),
 ('Est Dia in Miles(min)', 'double'),
 ('Est Dia in Miles(max)', 'double'),
 ('Est Dia in Feet(min)', 'double'),
 ('Est Dia in Feet(max)', 'double'),
 ('Relative Velocity km per sec', 'double'),
 ('Relative Velocity km per hr', 'double'),
 ('Miles per hour', 'double'),
 ('Miss Dist(Astronomical)', 'double'),
 ('Miss Dist(lunar)', 'double'),
 ('Miss Dist(kilometers)', 'double'),
 ('Miss Dist(miles)', 'double'),
 ('Orbit Uncertainity', 'int'),
 ('Minimum Orbit Intersection', 'double'),
 ('Jupiter Tisserand Invariant', 'double'),
 ('Epoch Osculation', 'double'),
 ('Eccentricity', 'double'),
 ('Semi Major Axis', 'double'),
 ('Inclination', 'double'),
 ('Asc Node Longitude', 'double'),
 ('Orbital Period', 'double'),
 ('Perihelion Distance', 'double'),
 ('Perihelion Arg', 'double'),
 ('Aphelion Dist', 'double'),
 ('Perihe

In [15]:
#new_data.show(5)

In [16]:
#print((new_data.count(), len(new_data.columns)))
# Feature-only view of the data: everything except the encoded target column.
label_column = "Hazardous_Encoded"
features_data = new_data.drop(label_column)

In [17]:
#features_data.dtypes

In [18]:
# Every remaining non-target column is used as a model input.
features_col = features_data.columns
print(features_col)

# Pack the input columns into a single vector column named "Vfeatures",
# then keep only that vector and the encoded target.
assembler = VectorAssembler(outputCol="Vfeatures", inputCols=features_col)
new_data = assembler.transform(new_data).select("Vfeatures", "Hazardous_Encoded")


['Absolute Magnitude', 'Est Dia in KM(min)', 'Est Dia in KM(max)', 'Est Dia in M(min)', 'Est Dia in M(max)', 'Est Dia in Miles(min)', 'Est Dia in Miles(max)', 'Est Dia in Feet(min)', 'Est Dia in Feet(max)', 'Relative Velocity km per sec', 'Relative Velocity km per hr', 'Miles per hour', 'Miss Dist(Astronomical)', 'Miss Dist(lunar)', 'Miss Dist(kilometers)', 'Miss Dist(miles)', 'Orbit Uncertainity', 'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant', 'Epoch Osculation', 'Eccentricity', 'Semi Major Axis', 'Inclination', 'Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion', 'Orbiting Body_encoded', 'Equinox_encoded']


In [19]:
# Preview the first five assembled rows
new_data.show(n=5)

+--------------------+-----------------+
|           Vfeatures|Hazardous_Encoded|
+--------------------+-----------------+
|[21.6,0.127219879...|                1|
|[21.3,0.146067964...|                0|
|[20.3,0.231502122...|                1|
|[27.4,0.008801465...|                0|
|[21.6,0.127219879...|                1|
+--------------------+-----------------+
only showing top 5 rows



In [20]:
# Scale the assembled vector into a new "features" column.
# Despite its name, scaled_data holds the StandardScaler estimator, not a DataFrame.
# NOTE(review): the scaler is fitted on the full DataFrame before the train/test split that
# happens in a later cell, so test rows contribute to the scaling statistics. Consider
# fitting on the training split only.
scaled_data = StandardScaler(inputCol="Vfeatures", outputCol="features")
scaler_model = scaled_data.fit(new_data)
new_data = scaler_model.transform(new_data)

In [21]:
# Preview the first twenty rows, now including the scaled "features" column
new_data.show(n=20)

+--------------------+-----------------+--------------------+
|           Vfeatures|Hazardous_Encoded|            features|
+--------------------+-----------------+--------------------+
|[21.6,0.127219879...|                1|[7.47153547294760...|
|[21.3,0.146067964...|                0|[7.36776414693444...|
|[20.3,0.231502122...|                1|[7.02185972689057...|
|[27.4,0.008801465...|                0|[9.47778110920205...|
|[21.6,0.127219879...|                1|[7.47153547294760...|
|[19.6,0.319561887...|                0|[6.77972663285986...|
|[19.6,0.319561887...|                0|[6.77972663285986...|
|[19.2,0.384197891...|                0|[6.64136486484231...|
|[17.8,0.732073989...|                0|[6.15709867678089...|
|[21.5,0.133215567...|                1|[7.43694503094321...|
|[22.4,0.088014652...|                0|[7.74825900898269...|
|[25.8,0.018388867...|                0|[8.92433403713185...|
|[25.0,0.02658,0.0...|                0|[8.64761050109676...|
|[19.1,0

In [22]:
# Keep only the scaled feature vector and the target, exposing the target under the name "label".
new_data = (
    new_data
    .select("features", "Hazardous_Encoded")
    .withColumnRenamed("Hazardous_Encoded", "label")
)

In [23]:
# Split into training and test sets with weights 0.8 / 0.2; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = new_data.randomSplit(split_weights, seed=42)

In [24]:
# Logistic Regression
# Column names are spelled out; they are the same names the estimator uses by default.
log_reg_estimator = LogisticRegression(featuresCol="features", labelCol="label")
log_reg = log_reg_estimator.fit(train_data)

# Score the held-out test rows with the fitted model
predictions = log_reg.transform(test_data)

# Evaluators created here are reused by the later model cells.
multi_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")

# Metrics for evaluation
# NOTE(review): "weightedRecall" / "weightedPrecision" are support-weighted averages over both
# classes. Weighted recall coincides with accuracy (the printed values are identical), so
# neither number isolates performance on the minority hazardous class.
lr_scores = {
    metric: multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: metric})
    for metric in ("weightedRecall", "accuracy", "weightedPrecision")
}
recall = lr_scores["weightedRecall"]
accuracy = lr_scores["accuracy"]
precision = lr_scores["weightedPrecision"]
auc = evaluator.evaluate(predictions)

for metric_caption, metric_value in (
    ("AUC-ROC: ", auc),
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
):
    print(metric_caption, metric_value)

AUC-ROC:  0.9893499518670519
Accuracy:  0.9619686800894854
Precision:  0.9617393373722547
Recall:  0.9619686800894854


In [25]:
# Display a preview of the Logistic Regression predictions on the test set
predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[4.92221989722427...|    0|[19.4199158753412...|[0.99999999631839...|       0.0|
|[5.05020453264050...|    0|[30.6125153605044...|[0.99999999999994...|       0.0|
|[5.25774718466683...|    0|[17.2604100615157...|[0.99999996809203...|       0.0|
|[5.32692806867560...|    0|[59.6852214742181...|           [1.0,0.0]|       0.0|
|[5.43069939468876...|    0|[21.1345218259053...|[0.99999999933718...|       0.0|
|[5.49988027869754...|    0|[53.2905727665533...|           [1.0,0.0]|       0.0|
|[5.53447072070192...|    0|[21.7162694607758...|[0.99999999962953...|       0.0|
|[5.55176594170412...|    0|[15.1787306623236...|[0.99999974416443...|       0.0|
|[5.63824204671508...|    0|[21.9944213910237...|[0.99999999971949...|       0.0|
|[5.638242046715

Gradient Boost

In [26]:
# Gradient Boost Classifier
gradient_boost_class = GBTClassifier(featuresCol="features", labelCol="label")
model = gradient_boost_class.fit(train_data)

# Score the test split with the fitted gradient-boosted model
predictionGBT = model.transform(test_data)

# Metrics for evaluation (multi_evaluator and evaluator come from the logistic-regression cell).
# The weighted precision/recall values are averages over both classes, not hazardous-class scores.
gbt_scores = {
    metric: multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: metric})
    for metric in ("weightedRecall", "accuracy", "weightedPrecision")
}
recall = gbt_scores["weightedRecall"]
accuracy = gbt_scores["accuracy"]
precision = gbt_scores["weightedPrecision"]
auc = evaluator.evaluate(predictionGBT)

for metric_caption, metric_value in (
    ("AUC-ROC: ", auc),
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
):
    print(metric_caption, metric_value)

AUC-ROC:  0.990768607184476
Accuracy:  0.9888143176733781
Precision:  0.9887748275545065
Recall:  0.9888143176733781


In [27]:
# Display a preview of the gradient-boosted model's predictions on the test set
predictionGBT.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[4.92221989722427...|    0|[-2.0422353592564...|[0.01655341795893...|       1.0|
|[5.05020453264050...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.25774718466683...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.32692806867560...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.43069939468876...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.49988027869754...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.53447072070192...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.55176594170412...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.63824204671508...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[5.638242046715

RANDOM FOREST CLASSIFIER

In [28]:
# Random Forest Classifier
# The seed is pinned (same value as the train/test split) so the forest's random row and
# feature sampling is repeatable across kernel restarts. Without it, PySpark falls back to a
# default seed that may differ between Python sessions, so the reported metrics could drift.
random_forest = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
model = random_forest.fit(train_data)

# Get predictions for the Random Forest model on the test split
predictionRDF = model.transform(test_data)

# Metrics for evaluation (multi_evaluator and evaluator come from the logistic-regression cell).
# The weighted precision/recall values are averages over both classes, not hazardous-class scores.
recall = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictionRDF)
print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

AUC-ROC:  0.9980949485737447
Accuracy:  0.9888143176733781
Precision:  0.9887476632251421
Recall:  0.9888143176733781


In [29]:
# Display a preview of the Random Forest predictions on the test set
predictionRDF.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[4.92221989722427...|    0|[17.1794630695966...|[0.85897315347983...|       0.0|
|[5.05020453264050...|    0|[17.2528608431575...|[0.86264304215787...|       0.0|
|[5.25774718466683...|    0|[17.6979661596972...|[0.88489830798486...|       0.0|
|[5.32692806867560...|    0|[17.4961229546474...|[0.87480614773237...|       0.0|
|[5.43069939468876...|    0|[18.3567331781069...|[0.91783665890534...|       0.0|
|[5.49988027869754...|    0|[18.6846337085077...|[0.93423168542538...|       0.0|
|[5.53447072070192...|    0|[17.4970925001467...|[0.87485462500733...|       0.0|
|[5.55176594170412...|    0|[17.2528608431575...|[0.86264304215787...|       0.0|
|[5.63824204671508...|    0|[16.9414183610351...|[0.84707091805175...|       0.0|
|[5.638242046715