In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import *
from pyspark.ml.stat import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [2]:
# I.1.1
# Read the small-body CSV with a header row and inferred column types,
# then strip surrounding whitespace from the `full_name` column.
path = "/FileStore/tables/nasa_jpl_small_body_database_mar25_2018.csv"
dataSet = (
    spark.read.format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(path)
)
dataSet = dataSet.withColumn("full_name", trim(dataSet.full_name))

In [3]:
# I.1.2
# Print the column names and the types Spark inferred while loading the CSV.
dataSet.printSchema()

In [4]:
# I.1.3
# Report the dataset's dimensions: number of rows and number of columns.
row_total = dataSet.count()
column_total = len(dataSet.columns)
print("Number of rows: ", row_total)
print("Number of Columns: ", column_total)

In [5]:
# I.2.1
# Show the first 100 rows as a table.
preview_rows = dataSet.limit(100)
display(preview_rows)

In [6]:
# I.2.2
# Show the summary statistics produced by DataFrame.describe().
summary_stats = dataSet.describe()
display(summary_stats)

In [7]:
# I.3.1
# The 50 largest small bodies among those with a diameter above 100.
# The previous chain sorted twice (once before and once after the filter);
# only the final ordering matters, so filter first and sort once on the
# already-reduced set.
dataSet_diameterDesc = (
    dataSet
    .dropna(subset=["diameter"])
    .filter("diameter>100")
    .orderBy(desc("diameter"))
    .limit(50)
)

In [8]:
# I.3.2
# Presentation view of the largest bodies: pick the six columns of interest
# and give them human-readable headers.
# NOTE(review): `q` is shown as "Mean Distance From Sun"; elsewhere in this
# notebook the semi-major axis is column `a` - confirm `q` is the intended column.
dataSet_diameterDesc_present = dataSet_diameterDesc.select(
    col("full_name").alias("Name"),
    col("diameter").alias("Diameter"),
    col("q").alias("Mean Distance From Sun (in AU)"),
    col("first_obs").alias("Date Discovered"),
    col("producer").alias("Discoverer"),
    col("class").alias("Class"),
)
display(dataSet_diameterDesc_present)

In [9]:
# I.4.1 (plotting using the Databricks histogram)
# Show only the semi-major axis column `a`; the histogram is drawn from this output.
display(dataSet.select('a'))

The following command counts the small bodies in our solar system whose semi-major axis is greater than that of the planet Saturn. The result is 3441, which accounts for about 0.0045 percent of the total number of small bodies in the dataset. This means that the bulk of the small
bodies in our solar system are located (in terms of semi-major axis) between the Sun and Saturn. Therefore, to determine where most asteroids are located,
we can produce a histogram that covers only the region between the Sun and Saturn.

In [11]:
# I.4.2
# Count the small bodies whose semi-major axis exceeds Saturn's.
# The count runs on `dataSet`: `train` is only created in a later cell (I.4.4)
# and holds just the `label`/`features` columns, so it has no `a` column and
# is undefined here on a fresh top-to-bottom run.
SATURN_SEMI_MAJOR_AXIS_AU = 9.5388
dataSet.filter(col("a") > SATURN_SEMI_MAJOR_AXIS_AU).count()

The following block of code produces a histogram showing the number of small bodies between the orbits of the different planets of the solar system (excluding Uranus and Neptune). It is evident that most of the small bodies in our solar system lie between Mars and Jupiter, and more specifically between Mars and the asteroid Ceres. The semi-major axis values of the planets were taken from https://www.windows2universe.org/our_solar_system/planets_orbits_table.html and are presented below:

Mercury 0.3871, Venus 0.7233, Earth 1.000, Mars 1.5273, Jupiter 5.2028, Saturn 9.5388, Uranus 19.1914, Neptune 30.0611, Ceres 2.76596

Ceres has a different semi-major axis value in our dataset than in the one taken from the website.

In [13]:
# I.4.2 (continue...)
# Load the same CSV file again, this time into pandas through the /dbfs path,
# so the next cell can plot it with matplotlib.
trainPandas = pd.read_csv("/dbfs/FileStore/tables/nasa_jpl_small_body_database_mar25_2018.csv")

In [14]:
# I.4.2 (continue...)
# Histogram of semi-major axis values, binned by the orbits of the planets
# (and Ceres) from Mercury out to Saturn.
# The bin edges and the tick labels are defined once, side by side, so the
# two lists cannot drift apart.
orbit_names = ["Mercury", "Venus", "Earth", "Mars", "Ceres", "Jupiter", "Saturn"]
orbit_axes = [0.3871, 0.7233, 1.000, 1.5273, 2.76596, 5.2028, 9.5388]

# Keep only rows with a finite semi-major axis (drops NaN and +/-inf).
trainPandasNotNan = trainPandas[np.isfinite(trainPandas['a'])]

f, ax = plt.subplots()
ax.hist(trainPandasNotNan.a, bins=orbit_axes, orientation='horizontal')

# Label each bin edge with the body whose orbit it represents, using the
# Axes object directly instead of the pyplot state machine.
ax.set_yticks(orbit_axes)
ax.set_yticklabels(orbit_names)
ax.set_xlabel("Number of small bodies")
ax.set_ylabel("Semi-major axis")
ax.set_title("Small bodies between planetary orbits")

display(f)

In [15]:
# I.4.3
dataSet_axisOrbit = dataSet.dropna(subset=["a", "per_y"]).select("a", "per_y")
dataSet_axisOrbit_pd = dataSet_axisOrbit.withColumnRenamed("a", "Semi-Major Axis").withColumnRenamed("per_y", "Orbit Period").toPandas()
scatterPlot = dataSet_axisOrbit_pd.plot.scatter(x='Semi-Major Axis', y='Orbit Period')
display()

In [16]:
# I.4.4
# Fit a linear regression with the orbit period (`per_y`) as the label and the
# semi-major axis (`a`) as the single feature, then report the fit.
vecAssembler = VectorAssembler(inputCols=["a"], outputCol="features")
assembled_dataSet = (
    vecAssembler
    .transform(dataSet_axisOrbit)
    .select(col("per_y").alias("label"), "features")
)
train = assembled_dataSet

lr = LinearRegression(maxIter=10, regParam=0.7, elasticNetParam=0.8)
lrModel = lr.fit(train)

# Fitted parameters.
print("Coefficients: %s" % (str(lrModel.coefficients),))
print("Intercept: %s" % (str(lrModel.intercept),))

# Diagnostics computed on the training data itself.
trainingSummary = lrModel.summary
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (str(trainingSummary.objectiveHistory),))
trainingSummary.residuals.show()
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))
trainingSummary.predictions.show()

In [17]:
# I.4.4 (continue...)
# Overlay the fitted regression line on the observed data.
#
# The prediction is slope * semi-major axis + intercept, read from the fitted
# model. The previous version multiplied the *Orbit Period* column (the
# target) by a slope copied from an earlier run and overwrote that column in
# place, so the plotted values were not model predictions and every re-run
# of the cell compounded the change.
slope = lrModel.coefficients[0]
intercept = lrModel.intercept

ax = dataSet_axisOrbit_pd.plot.scatter(x='Semi-Major Axis', y='Orbit Period', color='DarkBlue', label='Actual')

# Sort the x values so the prediction is drawn as one clean line.
axis_values = dataSet_axisOrbit_pd['Semi-Major Axis'].sort_values()
ax.plot(axis_values, slope * axis_values + intercept, color='Green', label='Prediction')
ax.legend()

display(ax.figure)

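We observe that a linear model, though accurate enough, is not the best approximation. A non-linear model, probably a power law with an exponent between 1 and 2, would fit better; this is consistent with Kepler's third law, $T \propto a^{3/2}$, which relates the orbit period $T$ to the semi-major axis $a$.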

In [19]:
# I.5
# Case-insensitive search of both name columns for the keywords
# "tesla", "spacex" and "roadster".
roadster_condition = (
    "lower(full_name) like '%tesla%'"
    " OR lower(full_name) like '%spacex%'"
    " OR lower(full_name) like '%roadster%'"
    " OR lower(name) like '%tesla%'"
    " OR lower(name) like '%spacex%'"
    " or lower(name) like '%roadster%'"
)
display(dataSet.filter(roadster_condition))


Δείτε εικόνες στο αρχείο.