### Developing a model to predict the number of crew members required for each ship based on the given parameters for Hyundai

In [1]:
#importing the SparkSession
from pyspark.sql import SparkSession

#import the essential packages
from pyspark.ml import linalg
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

#import the ml tools
from pyspark.ml.regression import LinearRegression

#importing the necessary functions
from pyspark.sql.functions import format_number
from pyspark.sql.types import FloatType, DoubleType

#importing the Stringindexer to address the categorical value
from pyspark.ml.feature import StringIndexer

In [2]:
#starting a spark session
spark = SparkSession.builder.appName('cruise_line').getOrCreate()

In [3]:
#loading the data set
all_data = spark.read.csv('10.cruise_ship_info.csv', inferSchema=True, header=True)

#printing the schema
all_data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [4]:
#explore the data
all_data.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)]

In [5]:
all_data.describe().columns

['summary',
 'Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [6]:
#continuing to explore the data
all_data.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|      NaN|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

### It seems that there are no missing data, ship_name column is a random string, 
### and Cruise_line is categorical value which will impact the value of the crew members

In [7]:
#create an indexer
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_catg')

#index the column
indexed = indexer.fit(all_data).transform(all_data)

#printing the columns
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_catg']

In [8]:
#create a vector assebler session of the input columns
assembler = VectorAssembler(inputCols=[  'Age','Tonnage','passengers','length','cabins','passenger_density','Cruise_line_catg'],
                           outputCol='features')

#create the input vector
output = assembler.transform(indexed)

#explore the output dataframe
output.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_line_catg=16.0, features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))]

#### The input columns are saved in a vector under the features columns.

### Now it is time to perform the regression analysis

In [9]:
#save the input and labels into a new dataframe
final_data =  output.select('features', 'crew')

#split the data into train and test
train_data, test_data = final_data.randomSplit([0.8, 0.2])

In [10]:
#start a linear regression columns
lin_reg_session = LinearRegression(featuresCol='features', labelCol='crew', predictionCol='predictions')

#train the data
lin_reg_model = lin_reg_session.fit(train_data)

In [11]:
#evaluate the model with the test data
results = lin_reg_model.evaluate(test_data)

In [12]:
#explore the metrics
results.predictions.show()

+--------------------+-----+------------------+
|            features| crew|       predictions|
+--------------------+-----+------------------+
|[5.0,133.5,39.59,...|13.13| 13.16904375955811|
|[6.0,30.276999999...| 3.55|  4.33651621607432|
|[6.0,93.0,23.94,9...|11.09|10.613974940561915|
|[7.0,89.6,25.5,9....| 9.87|11.128540371144043|
|[9.0,90.09,25.01,...| 8.69|  9.33256273040741|
|[10.0,68.0,10.8,7...| 6.36| 6.556612323264983|
|[10.0,138.0,31.14...|11.85| 13.15378586551004|
|[11.0,86.0,21.24,...|  9.3| 9.582374684150892|
|[11.0,91.0,20.32,...| 9.99|  9.25516358347648|
|[11.0,91.62700000...|  9.0| 9.260009595710075|
|[12.0,2.329,0.94,...|  0.6|0.7681420222643544|
|[12.0,50.0,7.0,7....| 4.45| 4.482468064145061|
|[12.0,58.6,15.66,...|  7.0| 7.455818840325527|
|[12.0,91.0,20.32,...| 9.99| 9.246344985709559|
|[13.0,25.0,3.82,5...| 2.95| 2.927691832451995|
|[13.0,63.0,14.4,7...| 5.31|   6.7696080101903|
|[13.0,91.0,20.32,...| 9.99| 9.237526387942639|
|[13.0,138.0,31.14...|11.76| 13.12733007

In [13]:
#R square
results.r2

0.9481053725461429

In [14]:
#Mean Absolute Error
results.meanAbsoluteError

0.628647170815592

In [15]:
#Mean Squared Error
results.meanSquaredError

0.6259238991404144

In [16]:
#Root mean Squared Error
results.rootMeanSquaredError

0.7911535243809601

In [17]:
#explore the results
results.predictions.describe().show()

+-------+-----------------+------------------+
|summary|             crew|       predictions|
+-------+-----------------+------------------+
|  count|               33|                33|
|   mean|7.406969696969694| 7.810412356196876|
| stddev|3.526805818343708| 3.590703615310596|
|    min|              0.6|0.7681420222643544|
|    max|            13.13| 13.16904375955811|
+-------+-----------------+------------------+



## deploy the model

In [18]:
#create an unlabeled dataset

#create a temporary SQL database
unlabeled_data = test_data.createOrReplaceTempView('test_data')
unlabeled_data = spark.sql("SELECT (features) FROM test_data")
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[5.0,133.5,39.59,...|
|[6.0,30.276999999...|
|[6.0,93.0,23.94,9...|
|[7.0,89.6,25.5,9....|
|[9.0,90.09,25.01,...|
|[10.0,68.0,10.8,7...|
|[10.0,138.0,31.14...|
|[11.0,86.0,21.24,...|
|[11.0,91.0,20.32,...|
|[11.0,91.62700000...|
|[12.0,2.329,0.94,...|
|[12.0,50.0,7.0,7....|
|[12.0,58.6,15.66,...|
|[12.0,91.0,20.32,...|
|[13.0,25.0,3.82,5...|
|[13.0,63.0,14.4,7...|
|[13.0,91.0,20.32,...|
|[13.0,138.0,31.14...|
|[14.0,83.0,17.5,9...|
|[14.0,138.0,31.14...|
+--------------------+
only showing top 20 rows



In [19]:
#deploy the model on the unlabeled data set
unlabeled_data_results = lin_reg_model.transform(unlabeled_data)

In [20]:
#showing the predictions
unlabeled_data_results.show()

+--------------------+------------------+
|            features|       predictions|
+--------------------+------------------+
|[5.0,133.5,39.59,...| 13.16904375955811|
|[6.0,30.276999999...|  4.33651621607432|
|[6.0,93.0,23.94,9...|10.613974940561915|
|[7.0,89.6,25.5,9....|11.128540371144043|
|[9.0,90.09,25.01,...|  9.33256273040741|
|[10.0,68.0,10.8,7...| 6.556612323264983|
|[10.0,138.0,31.14...| 13.15378586551004|
|[11.0,86.0,21.24,...| 9.582374684150892|
|[11.0,91.0,20.32,...|  9.25516358347648|
|[11.0,91.62700000...| 9.260009595710075|
|[12.0,2.329,0.94,...|0.7681420222643544|
|[12.0,50.0,7.0,7....| 4.482468064145061|
|[12.0,58.6,15.66,...| 7.455818840325527|
|[12.0,91.0,20.32,...| 9.246344985709559|
|[13.0,25.0,3.82,5...| 2.927691832451995|
|[13.0,63.0,14.4,7...|   6.7696080101903|
|[13.0,91.0,20.32,...| 9.237526387942639|
|[13.0,138.0,31.14...| 13.12733007220928|
|[14.0,83.0,17.5,9...| 9.171017310048297|
|[14.0,138.0,31.14...|13.118511474442359|
+--------------------+------------