In [0]:
#You have been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships.#
#They need accurate estimates of how many crew members a ship will requuire#
#They are currently selling ships to customers and want to create a model and use it to predict how many crew members the ship will need#

# We need to create a regression model that will help predict how many crew members will be needed to future ships#
# In other words: Use the features that you think will be useful to predict the value in the crew column#

#The client mentioned that they have found particular cruise lines will differ in acceptable crew counts, so its most likely an import feature to include#

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Ship_Crew').getOrCreate()
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler,StringIndexer
df = spark.read.csv('dbfs:/FileStore/shared_uploads/gkantirisrafael@gmail.com/cruise_ship_info.csv',header=True,inferSchema=True)
df.show(1)
indexer = StringIndexer(inputCol='Cruise_line',outputCol='Cruise_Category')
indexed = indexer.fit(df).transform(df)

+---------+-----------+---+------------------+----------+------+------+-----------------+----+
|Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 1 row



In [0]:
#We convert all those columns into 'features' column
assembler = VectorAssembler(inputCols=['Age','Tonnage','passengers','length','cabins','passenger_density','Cruise_Category'],outputCol='features')

In [0]:
output = assembler.transform(indexed)

In [0]:
#We create the final data table with the CREW column(that we want to predict) and  FEATURES(That contains all the columns that we will get in consideration for the final crew prediction#
final_data = output.select(['features','crew'])

In [0]:
#We split our data to train and test randomly
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [0]:
#We create a linear regression model here
lr = LinearRegression(labelCol='crew')

In [0]:
lr_model = lr.fit(train_data)

In [0]:
#We evaluate our test data
test_results = lr_model.evaluate(test_data)
test_results.r2

Out[73]: 0.9484687441222431

In [0]:
# ΘΕΛΟΥΜΕ ΝΑ ΔΟΥΜΕ ΑΝ ΥΠΑΡΧΕΙ ΜΕΓΑΛΗ ΣΥΣΧΕΤΙΣΗ (CORR=1=ΤΕΛΕΙΑ ΣΥΣΧΕΤΙΣΗ), ΑΝΑΜΕΣΑ ΣΤΗ ΣΤΗΛΗ CREW ΚΑΙ ΣΕ ΚΑΠΟΙΑ ΑΛΛΗ

In [0]:
from pyspark.sql.functions import corr
df.select(corr('crew','Tonnage')).show()

+-------------------+
|corr(crew, Tonnage)|
+-------------------+
|  0.927568811544939|
+-------------------+



In [0]:
#We predict the crew members
unlabeled_data = test_data.select('features')

In [0]:
predictions = lr_model.transform(unlabeled_data)

In [0]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[6.0,110.23899999...|10.912347143141853|
|[6.0,158.0,43.7,1...|13.841182448685974|
|[7.0,89.6,25.5,9....|11.071184874114655|
|[7.0,158.0,43.7,1...|  13.7861658356047|
|[9.0,88.5,21.24,9...| 9.525180466970063|
|[9.0,110.0,29.74,...| 12.08268219773898|
|[9.0,113.0,26.74,...|11.357404827887178|
|[10.0,46.0,7.0,6....|2.7516109427993536|
|[10.0,81.76899999...| 8.775343928910596|
|[10.0,86.0,21.14,...| 9.688403486350076|
|[10.0,91.62700000...| 9.201254778124538|
|[11.0,86.0,21.24,...| 9.494392506486463|
|[11.0,90.09,25.01...| 8.794419769693897|
|[11.0,108.977,26....|11.106537871334124|
|[11.0,138.0,31.14...|13.050854840994193|
|[12.0,42.0,14.8,7...| 6.678775695523073|
|[12.0,50.0,7.0,7....| 4.393760150395462|
|[12.0,108.865,27....|10.852690082242855|
|[13.0,61.0,13.8,7...| 6.458178778190409|
|[13.0,101.509,27....|11.136650515266366|
+--------------------+------------