 ## Predicting Crew Size Using Cruise Ship Dataset

In [2]:
import os

import findspark

# findspark must run before the pyspark imports below so that pyspark is
# importable. Prefer the SPARK_HOME environment variable so the notebook is
# portable; fall back to the original author's install path when it is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/guipleite/spark-3.0.2-bin-hadoop3.2'))

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer


# Reuse a running session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName('lr_exe').getOrCreate()


In [3]:
# Load the cruise-ship data: the first row holds the column names and the
# column types are inferred from the values.
df = spark.read.csv(
    path="cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Check the inferred column types: two string columns (Ship_name, Cruise_line),
# an integer Age, and doubles for the remaining measurements including `crew`.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
# Preview the first five rows to sanity-check the parsed values.
df.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [6]:

# Map each Cruise_line string to a numeric index so it can be placed in the
# feature vector.
# NOTE(review): the index is an arbitrary code for a nominal category; feeding
# it to a linear model as a single number implies an ordering between cruise
# lines. Consider one-hot encoding it instead — confirm with the author.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_line_idx")
indexer_model = indexer.fit(df)
df_idx = indexer_model.transform(df)
df_idx.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_idx|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+

In [7]:
# Input columns that are packed, in this order, into the single `features`
# vector column.
feature_cols = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "Cruise_line_idx",
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only the assembled feature vector and the regression target.
output = assembler.transform(df_idx)
final_data = output.select("features", "crew")
final_data.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
+--------------------+----+
only showing top 5 rows



In [8]:
# 70/30 train/test split. The seed is fixed so the split, and every metric
# computed from it below, is the same on each Restart & Run All.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

In [9]:
# Linear regression predicting `crew` from the assembled `features` vector,
# fitted on the training split only.
lr = LinearRegression(featuresCol='features', labelCol='crew')
lrModel = lr.fit(train_data)

In [10]:
# Score the fitted model on the held-out split and look at the first few
# per-row residuals.
test_results = lrModel.evaluate(test_data)
residuals_df = test_results.residuals
residuals_df.show(5)

+-------------------+
|          residuals|
+-------------------+
|-0.4299484964211757|
|-1.4140419170200182|
| 0.3367788154539788|
|-1.2861990511302164|
|0.36866718372494134|
+-------------------+
only showing top 5 rows



In [11]:
# Drop the label to mimic scoring ships whose crew size is unknown; the model
# appends a `prediction` column next to `features`.
unlabeled_data = test_data.select("features")
predictions = lrModel.transform(unlabeled_data)

predictions.show(n=5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[4.0,220.0,54.0,1...|21.429948496421176|
|[5.0,86.0,21.04,9...| 9.414041917020018|
|[5.0,115.0,35.74,...| 11.86322118454602|
|[6.0,90.0,20.0,9....|10.286199051130216|
|[6.0,93.0,23.94,9...|10.721332816275059|
+--------------------+------------------+
only showing top 5 rows



In [12]:
# Report the test-set error metrics from the evaluation summary.
rmse = test_results.rootMeanSquaredError
mse = test_results.meanSquaredError
print("RMSE: {}".format(rmse))
print("MSE: {}".format(mse))

RMSE: 0.6607394627030658
MSE: 0.4365766375731361


In [13]:
# Summary statistics of the training split, used to put the error metrics in
# context. Only `crew` appears in the output; the vector-typed `features`
# column is not summarised.
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              101|
|   mean|7.626435643564363|
| stddev|3.481110910085576|
|    min|             0.59|
|    max|             19.1|
+-------+-----------------+



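Considering that the mean of `crew` in the training data is 7.63 with a standard deviation of 3.48, a test RMSE of 0.66 (MSE of 0.44) indicates that the model is fairly good at predicting the crew needed for a new ship: its typical error is well below the natural spread of the target.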