In [31]:
#installing pyspark
#pip install pyspark

In [32]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer,StandardScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

In [33]:
# Create a SparkSession
spark = SparkSession.builder.appName("NPredictor").master("local").getOrCreate()

In [34]:
# Load the dataset
data = spark.read.csv("C:/Users/EZENWAJIAKU CHINEDU/Desktop/Course Work/Big Data/nasa.csv", header=True, inferSchema=True)

In [35]:
import pyspark.sql.functions as fc
print((data.count(), len(data.columns)))
data.describe().show()


# null values in each column
data_null = data.agg(*[fc.count(fc.when(fc.isnull(c), c)).alias(c) for c in data.columns])
data_null.show()   # no null values

(4687, 40)
+-------+-----------------+-----------------+------------------+-------------------+------------------+------------------+------------------+---------------------+---------------------+--------------------+--------------------+-------------------+-------------------------+----------------------------+---------------------------+------------------+-----------------------+-----------------+---------------------+--------------------+-------------+------------------+------------------------+------------------+--------------------------+---------------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------+
|summary| Neo Reference ID|             Name|Absolute Magnitude| Est Dia in KM(min)|Est Dia in KM(max)| Est Dia in M(min)| Est Dia in M(max)|Est Dia in Miles(min)|Est Dia in Miles(max)|E

In [36]:
# Drop columns not neccessary for classification
new_data = data.drop('Neo Reference ID', 'Name', 'Orbit ID','Orbiting Body','Equinox','Miss Dist(Astronomical)', 'Miss Dist(miles)', 'Miss Dist(Astronomical)')
new_data = new_data.drop('Est Dia in KM(min)','Est Dia in M(min)','Est Dia in Miles(min)','Est Dia in Feet(min)','Relative Velocity km per hr')

In [37]:
#new_data.dtypes
#new_data.summary
#new_data.describe().show()

In [38]:
# Changing the Boolean type of Harzadous column to String
from pyspark.sql.functions import when

new_data = new_data.withColumn('Hazardous_Encoded', when(new_data.Hazardous==True, 1).otherwise(0))
new_data = new_data.drop("Hazardous")

In [39]:
new_data.groupBy('Hazardous_Encoded').count().orderBy('count').show()

+-----------------+-----+
|Hazardous_Encoded|count|
+-----------------+-----+
|                1|  755|
|                0| 3932|
+-----------------+-----+



In [40]:
from pyspark.sql.functions import col

major_df = new_data.filter(col("Hazardous_Encoded") == 0)
minor_df = new_data.filter(col("Hazardous_Encoded") == 1)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))
sampled_majority_df = major_df.sample(False, 1/ratio)
new_data = sampled_majority_df.unionAll(minor_df)

ratio: 5


In [41]:
print((new_data.count(), len(new_data.columns)))

(1540, 28)


In [42]:
new_data.groupBy('Hazardous_Encoded').count().orderBy('count').show()

+-----------------+-----+
|Hazardous_Encoded|count|
+-----------------+-----+
|                1|  755|
|                0|  785|
+-----------------+-----+



In [43]:
#new_data.dtypes

In [44]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

#label encoding of categorical columns
categorical_cols = ['Close Approach Date', 'Orbit Determination Date']
label_encoders = [StringIndexer(inputCol=col, outputCol=col + "_encoded").fit(new_data) for col in categorical_cols]
pipeline = Pipeline(stages=label_encoders)
new_data = pipeline.fit(new_data).transform(new_data)

In [45]:
new_data = new_data.drop('Close Approach Date', 'Orbit Determination Date')

In [46]:
new_data.dtypes

[('Absolute Magnitude', 'double'),
 ('Est Dia in KM(max)', 'double'),
 ('Est Dia in M(max)', 'double'),
 ('Est Dia in Miles(max)', 'double'),
 ('Est Dia in Feet(max)', 'double'),
 ('Epoch Date Close Approach', 'double'),
 ('Relative Velocity km per sec', 'double'),
 ('Miles per hour', 'double'),
 ('Miss Dist(lunar)', 'double'),
 ('Miss Dist(kilometers)', 'double'),
 ('Orbit Uncertainity', 'int'),
 ('Minimum Orbit Intersection', 'double'),
 ('Jupiter Tisserand Invariant', 'double'),
 ('Epoch Osculation', 'double'),
 ('Eccentricity', 'double'),
 ('Semi Major Axis', 'double'),
 ('Inclination', 'double'),
 ('Asc Node Longitude', 'double'),
 ('Orbital Period', 'double'),
 ('Perihelion Distance', 'double'),
 ('Perihelion Arg', 'double'),
 ('Aphelion Dist', 'double'),
 ('Perihelion Time', 'double'),
 ('Mean Anomaly', 'double'),
 ('Mean Motion', 'double'),
 ('Hazardous_Encoded', 'int'),
 ('Close Approach Date_encoded', 'double'),
 ('Orbit Determination Date_encoded', 'double')]

In [47]:
#print((new_data.count(), len(new_data.columns)))
features_data = new_data.drop("Hazardous_Encoded")

In [48]:
#features_data.dtypes

In [49]:
features_col = features_data.columns
print(features_col)
assembler = VectorAssembler(inputCols=features_col, outputCol="Vfeatures")
new_data = assembler.transform(new_data)
new_data = new_data.select("Vfeatures", "Hazardous_Encoded")


['Absolute Magnitude', 'Est Dia in KM(max)', 'Est Dia in M(max)', 'Est Dia in Miles(max)', 'Est Dia in Feet(max)', 'Epoch Date Close Approach', 'Relative Velocity km per sec', 'Miles per hour', 'Miss Dist(lunar)', 'Miss Dist(kilometers)', 'Orbit Uncertainity', 'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant', 'Epoch Osculation', 'Eccentricity', 'Semi Major Axis', 'Inclination', 'Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion', 'Close Approach Date_encoded', 'Orbit Determination Date_encoded']


In [50]:
new_data.show(20)

+--------------------+-----------------+
|           Vfeatures|Hazardous_Encoded|
+--------------------+-----------------+
|[19.6,0.714562102...|                0|
|[22.3,0.206081961...|                0|
|[23.9,0.098637028...|                0|
|[23.8,0.103285648...|                0|
|[18.8,1.032856481...|                0|
|[24.0,0.094197631...|                0|
|[26.2,0.034201093...|                0|
|[18.1,1.425738833...|                0|
|[23.7,0.108153351...|                0|
|[20.3,0.517654482...|                0|
|[20.5,0.472106499...|                0|
|[18.3,1.30028927,...|                0|
|[24.4,0.078350176...|                0|
|[19.5,0.748238376...|                0|
|[24.5,0.074823838...|                0|
|[18.9,0.986370281...|                0|
|[25.3,0.051765448...|                0|
|[19.9,0.622357573...|                0|
|[21.0,0.375007522...|                0|
|[18.3,1.30028927,...|                0|
+--------------------+-----------------+
only showing top

In [51]:
scaled_data = StandardScaler(inputCol="Vfeatures", outputCol="features")
new_data = scaled_data.fit(new_data).transform(new_data)

In [52]:
new_data.show(20)

+--------------------+-----------------+--------------------+
|           Vfeatures|Hazardous_Encoded|            features|
+--------------------+-----------------+--------------------+
|[19.6,0.714562102...|                0|[7.47784101517662...|
|[22.3,0.206081961...|                0|[8.50795176726727...|
|[23.9,0.098637028...|                0|[9.11838776850618...|
|[23.8,0.103285648...|                0|[9.08023551842875...|
|[18.8,1.032856481...|                0|[7.17262301455716...|
|[24.0,0.094197631...|                0|[9.15654001858361...|
|[26.2,0.034201093...|                0|[9.99588952028711...|
|[18.1,1.425738833...|                0|[6.90555726401514...|
|[23.7,0.108153351...|                0|[9.04208326835132...|
|[20.3,0.517654482...|                0|[7.74490676571864...|
|[20.5,0.472106499...|                0|[7.82121126587350...|
|[18.3,1.30028927,...|                0|[6.98186176417000...|
|[24.4,0.078350176...|                0|[9.30914901889334...|
|[19.5,0

In [53]:
new_data = new_data.select("features", "Hazardous_Encoded")
new_data = new_data.withColumnRenamed("Hazardous_Encoded","label")

In [54]:
#splitting into test and train data
train_data, test_data = new_data.randomSplit([0.8, 0.2], seed=42)

In [55]:
# Logistic Regression
log_reg=LogisticRegression().fit(train_data)

#Get Predictions for Logistic Regression Model
predictions = log_reg.transform(test_data)
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

#Metrics for evaluation
recall = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictions)
f1_Score = (2*precision*recall)/(precision+recall)

print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-Score: ",f1_Score)

AUC-ROC:  0.978496815286624
Accuracy:  0.950354609929078
Precision:  0.9503654892028647
Recall:  0.950354609929078
F-Score:  0.9503600495348361


In [56]:
#Display the Logistic Regresssion predictions
predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[5.87544651192448...|    0|[60.2679814487933...|           [1.0,0.0]|       0.0|
|[6.18066451254394...|    0|[8.62296131637674...|[0.99982010572249...|       0.0|
|[6.37142576293109...|    0|[6.15540512008450...|[0.99788251211540...|       0.0|
|[6.48588251316339...|    0|[38.1965052273155...|           [1.0,0.0]|       0.0|
|[6.56218701331825...|    0|[1.50376744651921...|[0.81813570533181...|       0.0|
|[6.67664376355055...|    0|[13.5970654902531...|[0.99999875586087...|       0.0|
|[6.79110051378284...|    0|[-1.6482806487278...|[0.16134146023173...|       1.0|
|[6.90555726401514...|    0|[0.32884002783634...|[0.58147711100781...|       0.0|
|[6.98186176417000...|    0|[-9.215845481071,...|[9.94411121067952...|       1.0|
|[6.981861764170

Gradient Boost

In [57]:
# Gradient Boost Classifier
gradient_boost_class = GBTClassifier(labelCol="label", featuresCol="features")
model = gradient_boost_class.fit(train_data)

#Get predictions for Gradient Boost model
predictionGBT = model.transform(test_data)

#Metrics for evaluation
recall = multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictionGBT, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictionGBT)
f1_Score = (2*precision*recall)/(precision+recall)

print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-Score: ",f1_Score)

AUC-ROC:  0.9899363057324841
Accuracy:  0.9787234042553191
Precision:  0.9788552683488345
Recall:  0.9787234042553191
F-Score:  0.9787893318608405


In [58]:
predictionGBT.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[5.87544651192448...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.18066451254394...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.37142576293109...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.48588251316339...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.56218701331825...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.67664376355055...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.79110051378284...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.90555726401514...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.98186176417000...|    0|[1.54350200272498...|[0.95635347857270...|       0.0|
|[6.981861764170

RANDOM FOREST CLASSIFIER

In [59]:
random_forest = RandomForestClassifier(labelCol="label", featuresCol="features")
model = random_forest.fit(train_data)

#Get predictions for Gradient Boost model
predictionRDF = model.transform(test_data)
recall = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "weightedRecall"})
accuracy = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictionRDF, {multi_evaluator.metricName: "weightedPrecision"})
auc = evaluator.evaluate(predictionRDF)
f1_Score = (2*precision*recall)/(precision+recall)

print("AUC-ROC: ", auc)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-Score: ",f1_Score)

AUC-ROC:  0.9991847133757962
Accuracy:  0.9822695035460993
Precision:  0.9823084716701738
Recall:  0.9822695035460993
F-Score:  0.982288987221663


In [60]:
predictionRDF.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[5.87544651192448...|    0|[15.3100979819300...|[0.76550489909650...|       0.0|
|[6.18066451254394...|    0|[16.0778509641800...|[0.80389254820900...|       0.0|
|[6.37142576293109...|    0|[15.3036901983788...|[0.76518450991894...|       0.0|
|[6.48588251316339...|    0|[17.5961591525357...|[0.87980795762678...|       0.0|
|[6.56218701331825...|    0|[13.5211827553860...|[0.67605913776930...|       0.0|
|[6.67664376355055...|    0|[16.0792779048193...|[0.80396389524096...|       0.0|
|[6.79110051378284...|    0|[14.0553585465342...|[0.70276792732671...|       0.0|
|[6.90555726401514...|    0|[15.3392661016714...|[0.76696330508357...|       0.0|
|[6.98186176417000...|    0|[16.8219983867345...|[0.84109991933672...|       0.0|
|[6.981861764170