In [1]:
#installing pyspark
#pip install pyspark

In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer,StandardScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

In [3]:
# Create a SparkSession
spark = SparkSession.builder.appName("NPredictor").master("local").getOrCreate()

In [4]:
# Load the dataset
data = spark.read.csv("C:/Users/EZENWAJIAKU CHINEDU/Desktop/Course Work/Big Data/nasa.csv", header=True, inferSchema=True)

In [5]:
import pyspark.sql.functions as fc
print((data.count(), len(data.columns)))
data.describe().show()


# null values in each column
data_null = data.agg(*[fc.count(fc.when(fc.isnull(c), c)).alias(c) for c in data.columns])
data_null.show()   # no null values

(4687, 40)
+-------+-----------------+-----------------+------------------+-------------------+------------------+------------------+------------------+---------------------+---------------------+--------------------+--------------------+-------------------+-------------------------+----------------------------+---------------------------+------------------+-----------------------+-----------------+---------------------+--------------------+-------------+------------------+------------------------+------------------+--------------------------+---------------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------+
|summary| Neo Reference ID|             Name|Absolute Magnitude| Est Dia in KM(min)|Est Dia in KM(max)| Est Dia in M(min)| Est Dia in M(max)|Est Dia in Miles(min)|Est Dia in Miles(max)|E

In [6]:
# Drop columns not neccessary for classification
new_data = data.drop('Neo Reference ID', 'Name', 'Orbit ID', 'Close Approach Date',
                        'Epoch Date Close Approach', 'Orbit Determination Date')

In [7]:
#new_data.dtypes
#new_data.summary
#new_data.describe().show()

In [8]:
# Changing the Boolean type of Harzadous column to String
from pyspark.sql.functions import when

new_data = new_data.withColumn('Hazardous_Encoded', when(new_data.Hazardous==True, 1).otherwise(0))
new_data = new_data.drop("Hazardous")

In [9]:
new_data.groupBy('Hazardous_Encoded').count().orderBy('count').show()

+-----------------+-----+
|Hazardous_Encoded|count|
+-----------------+-----+
|                1|  755|
|                0| 3932|
+-----------------+-----+



In [10]:
from pyspark.sql.functions import col

major_df = new_data.filter(col("Hazardous_Encoded") == 0)
minor_df = new_data.filter(col("Hazardous_Encoded") == 1)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))
sampled_majority_df = major_df.sample(False, 1/ratio)
new_data = sampled_majority_df.unionAll(minor_df)

ratio: 5


In [11]:
print((new_data.count(), len(new_data.columns)))

(1520, 34)


In [12]:
new_data.groupBy('Hazardous_Encoded').count().orderBy('count').show()

+-----------------+-----+
|Hazardous_Encoded|count|
+-----------------+-----+
|                1|  755|
|                0|  765|
+-----------------+-----+



In [13]:
#new_data.dtypes

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

#label encoding of categorical columns
categorical_cols = ["Orbiting Body","Equinox"]
label_encoders = [StringIndexer(inputCol=col, outputCol=col + "_encoded").fit(new_data) for col in categorical_cols]
pipeline = Pipeline(stages=label_encoders)
new_data = pipeline.fit(new_data).transform(new_data)

In [15]:
new_data = new_data.drop("Orbiting Body","Equinox")

In [16]:
new_data.dtypes

[('Absolute Magnitude', 'double'),
 ('Est Dia in KM(min)', 'double'),
 ('Est Dia in KM(max)', 'double'),
 ('Est Dia in M(min)', 'double'),
 ('Est Dia in M(max)', 'double'),
 ('Est Dia in Miles(min)', 'double'),
 ('Est Dia in Miles(max)', 'double'),
 ('Est Dia in Feet(min)', 'double'),
 ('Est Dia in Feet(max)', 'double'),
 ('Relative Velocity km per sec', 'double'),
 ('Relative Velocity km per hr', 'double'),
 ('Miles per hour', 'double'),
 ('Miss Dist(Astronomical)', 'double'),
 ('Miss Dist(lunar)', 'double'),
 ('Miss Dist(kilometers)', 'double'),
 ('Miss Dist(miles)', 'double'),
 ('Orbit Uncertainity', 'int'),
 ('Minimum Orbit Intersection', 'double'),
 ('Jupiter Tisserand Invariant', 'double'),
 ('Epoch Osculation', 'double'),
 ('Eccentricity', 'double'),
 ('Semi Major Axis', 'double'),
 ('Inclination', 'double'),
 ('Asc Node Longitude', 'double'),
 ('Orbital Period', 'double'),
 ('Perihelion Distance', 'double'),
 ('Perihelion Arg', 'double'),
 ('Aphelion Dist', 'double'),
 ('Perihe

In [17]:
#print((new_data.count(), len(new_data.columns)))
features_data = new_data.drop("Hazardous_Encoded")

In [18]:
#features_data.dtypes

In [19]:
features_col = features_data.columns
print(features_col)
assembler = VectorAssembler(inputCols=features_col, outputCol="Vfeatures")
new_data = assembler.transform(new_data)
new_data = new_data.select("Vfeatures", "Hazardous_Encoded")


['Absolute Magnitude', 'Est Dia in KM(min)', 'Est Dia in KM(max)', 'Est Dia in M(min)', 'Est Dia in M(max)', 'Est Dia in Miles(min)', 'Est Dia in Miles(max)', 'Est Dia in Feet(min)', 'Est Dia in Feet(max)', 'Relative Velocity km per sec', 'Relative Velocity km per hr', 'Miles per hour', 'Miss Dist(Astronomical)', 'Miss Dist(lunar)', 'Miss Dist(kilometers)', 'Miss Dist(miles)', 'Orbit Uncertainity', 'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant', 'Epoch Osculation', 'Eccentricity', 'Semi Major Axis', 'Inclination', 'Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion', 'Orbiting Body_encoded', 'Equinox_encoded']


In [20]:
new_data.show(20)

+--------------------+-----------------+
|           Vfeatures|Hazardous_Encoded|
+--------------------+-----------------+
|[21.3,0.146067964...|                0|
|[17.8,0.732073989...|                0|
|[20.0,0.2658,0.59...|                0|
|[22.7,0.076657557...|                0|
|[23.9,0.04411182,...|                0|
|[26.9,0.011080388...|                0|
|[25.6,0.020162992...|                0|
|[16.7,1.214940408...|                0|
|[23.7,0.048367649...|                0|
|[25.8,0.018388867...|                0|
|[20.5,0.211132445...|                0|
|[18.8,0.46190746,...|                0|
|[18.2,0.608912622...|                0|
|[23.0,0.066765941...|                0|
|[25.187,0.0243868...|                0|
|[19.6,0.319561887...|                0|
|[18.9,0.4411182,0...|                0|
|[25.3,0.023150212...|                0|
|[23.5,0.053034072...|                0|
|[18.3,0.58150704,...|                0|
+--------------------+-----------------+
only showing top

In [21]:
scaled_data = StandardScaler(inputCol="Vfeatures", outputCol="features")
new_data = scaled_data.fit(new_data).transform(new_data)

In [22]:
new_data.show(20)

+--------------------+-----------------+--------------------+
|           Vfeatures|Hazardous_Encoded|            features|
+--------------------+-----------------+--------------------+
|[21.3,0.146067964...|                0|[8.14194861894796...|
|[17.8,0.732073989...|                0|[6.80406973790017...|
|[20.0,0.2658,0.59...|                0|[7.64502217741593...|
|[22.7,0.076657557...|                0|[8.67710017136708...|
|[23.9,0.04411182,...|                0|[9.13580150201203...|
|[26.9,0.011080388...|                0|[10.2825548286244...|
|[25.6,0.020162992...|                0|[9.78562838709239...|
|[16.7,1.214940408...|                0|[6.38359351814230...|
|[23.7,0.048367649...|                0|[9.05935128023787...|
|[25.8,0.018388867...|                0|[9.86207860886655...|
|[20.5,0.211132445...|                0|[7.83614773185132...|
|[18.8,0.46190746,...|                0|[7.18632084677097...|
|[18.2,0.608912622...|                0|[6.95697018144849...|
|[23.0,0

In [23]:
new_data = new_data.select("features", "Hazardous_Encoded")
new_data = new_data.withColumnRenamed("Hazardous_Encoded","label")

In [24]:
#splitting into test and train data
train_data, test_data = new_data.randomSplit([0.8, 0.2], seed=42)

In [25]:
# Logistic Regression
log_reg=LogisticRegression().fit(train_data)

#Get Predictions for Logistic Regression Model
predictions = log_reg.transform(test_data)
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

#Metrics for evaluation
recall = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictions)
print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

AUC-ROC:  0.9756097560975622
Accuracy:  0.9357142857142857
Precision:  0.935691266994869
Recall:  0.9357142857142857


In [26]:
#Display the Logistic Regresssion predictions
predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[5.92489218749734...|    0|[21.0299442734465...|[0.99999999926411...|       0.0|
|[6.11601774193274...|    0|[43.1610241417290...|           [1.0,0.0]|       0.0|
|[6.34536840725522...|    0|[7.41176307155662...|[0.99939626004682...|       0.0|
|[6.53649396169062...|    0|[35.1005072799646...|[0.99999999999999...|       0.0|
|[6.61294418346478...|    0|[41.6718766270735...|           [1.0,0.0]|       0.0|
|[6.76584462701309...|    0|[38.1473913914099...|           [1.0,0.0]|       0.0|
|[6.84229484878725...|    0|[20.9337595667173...|[0.99999999918981...|       0.0|
|[6.91874507056141...|    0|[41.6851164701622...|           [1.0,0.0]|       0.0|
|[6.95697018144849...|    0|[6.20589291360671...|[0.99798655559206...|       0.0|
|[6.995195292335

Gradient Boost

In [27]:
# Gradient Boost Classifier
gradient_boost_class = GBTClassifier(labelCol="label", featuresCol="features")
model = gradient_boost_class.fit(train_data)

#Get predictions for Gradient Boost model
predictionGBT = model.transform(test_data)

#Metrics for evaluation
recall = multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictionGBT)
print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

AUC-ROC:  0.9956242556056135
Accuracy:  0.9857142857142858
Precision:  0.9857142857142858
Recall:  0.9857142857142858


In [28]:
predictionGBT.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[5.92489218749734...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.11601774193274...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.34536840725522...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.53649396169062...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.61294418346478...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.76584462701309...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.84229484878725...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.91874507056141...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.95697018144849...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.995195292335

RANDOM FOREST CLASSIFIER

In [29]:
random_forest = RandomForestClassifier(labelCol="label", featuresCol="features")
model = random_forest.fit(train_data)

#Get predictions for Gradient Boost model
predictionRDF = model.transform(test_data)
recall = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictionRDF)
print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

AUC-ROC:  1.0
Accuracy:  0.9928571428571429
Precision:  0.9929469901168014
Recall:  0.9928571428571429


In [30]:
predictionRDF.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[5.92489218749734...|    0|[15.5070331899271...|[0.77535165949635...|       0.0|
|[6.11601774193274...|    0|[15.5070331899271...|[0.77535165949635...|       0.0|
|[6.34536840725522...|    0|[14.1950454204384...|[0.70975227102192...|       0.0|
|[6.53649396169062...|    0|[15.3161854132821...|[0.76580927066410...|       0.0|
|[6.61294418346478...|    0|[14.8607873259687...|[0.74303936629843...|       0.0|
|[6.76584462701309...|    0|[15.5070331899271...|[0.77535165949635...|       0.0|
|[6.84229484878725...|    0|[14.5759147434043...|[0.72879573717021...|       0.0|
|[6.91874507056141...|    0|[17.9919142246040...|[0.89959571123020...|       0.0|
|[6.95697018144849...|    0|[14.6731190156815...|[0.73365595078407...|       0.0|
|[6.995195292335