In [1]:
import findspark

In [2]:
# Point findspark at the Spark installation. SPARK_HOME is honoured when it is
# set, so the notebook is not tied to one user's machine; otherwise fall back
# to the original local install path.
import os

findspark.init(os.environ.get('SPARK_HOME', '/home/adeola/spark-2.4.2-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook (or reuse one that already
# exists), registered under the application name 'cruise'.
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

In [7]:
# Load the cruise-ship CSV: the first row supplies the column names and Spark
# infers each column's type from the values.
data = spark.read.csv(
    'cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Print the column names and the types Spark inferred while reading the CSV.
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [9]:
# List the column names of the loaded DataFrame.
data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [10]:
# Preview only the numeric columns (everything except the ship name and the
# cruise line).
numeric_columns = [
    'Age', 'Tonnage', 'passengers', 'length',
    'cabins', 'passenger_density', 'crew',
]
data.select(numeric_columns).show()

+---+------------------+----------+------+------+-----------------+----+
|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+---+------------------+----------+------+------+-----------------+----+
|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
| 23|            70.367|     20.56|  8.55| 10.22|            34.23| 9.2|
| 19|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|  6|110.23899999999999|      37.0|  9.51| 14.87|            29.79|11.5|
| 10|             110.0|     29.74|  9.51| 14.87|  

In [14]:
# Number of rows in the loaded dataset.
data.count()

158

In [15]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression

In [49]:
# Cruise_line is a string column, so it must be turned into a categorical
# (numeric) variable before modelling. First check how many ships each cruise
# line has.
data.groupby('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [90]:
# Encode the Cruise_line strings as numeric indices in a new 'cruise_cat'
# column. With the default settings the most frequent label gets index 0.0.
# NOTE(review): 'cruise_cat' is later passed to the linear regression as a
# plain number, which imposes an arbitrary ordering on the cruise lines;
# consider one-hot encoding it instead.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
indexer_model = indexer.fit(data)
indexed = indexer_model.transform(data)
indexed.head(5)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, cruise_cat=1.0),
 Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1, cruise_cat=1.0),
 Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0, cruise_cat=1.0)]

In [52]:
# Confirm that the indexed DataFrame now carries the extra 'cruise_cat' column.
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat']

In [53]:
# Columns used as model inputs. 'crew' is left out because it is the value
# being predicted; 'Ship_name' and the raw 'Cruise_line' string are not used.
feature_columns = [
    'Age', 'Tonnage', 'passengers', 'length',
    'cabins', 'passenger_density', 'cruise_cat',
]
# Pack the input columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [54]:
# Add the assembled 'features' vector column to the indexed DataFrame.
df = assembler.transform(indexed)

In [55]:
# Preview the first rows, including the new 'features' column.
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0| 

In [56]:
# List the columns after assembling; 'features' should now be present.
df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat',
 'features']

In [59]:
# Keep only what the regression needs: the feature vector and the label.
new_data = df.select('features', 'crew')

In [60]:
# Preview the modelling DataFrame (feature vector plus the 'crew' label).
new_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [61]:
# Summary statistics for the full dataset before it is split. The 'features'
# vector column is not summarised, so only 'crew' appears in the output.
new_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [62]:
# Split the rows roughly 70% / 30% into training and test sets. The fixed seed
# makes the split, and every metric computed from it, repeatable when the
# notebook is re-run on the same data.
train_data,test_data = new_data.randomSplit([0.7,0.3], seed=42)

In [63]:
# Summary statistics of 'crew' in the training split.
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               102|
|   mean| 8.189019607843147|
| stddev|3.4914343586554577|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [64]:
# Summary statistics of 'crew' in the test split, for comparison with the
# training split.
test_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               56|
|   mean|            7.075|
| stddev|3.440444474456488|
|    min|             0.59|
|    max|             13.6|
+-------+-----------------+



In [65]:
# Configure the linear regression. Only the label column ('crew') differs from
# the defaults; featuresCol and predictionCol are spelled out with their
# default values so the column wiring is visible.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
    predictionCol='prediction',
)

In [66]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [67]:
# Fitted coefficients: one per entry of the assembled 'features' vector, in the
# order the columns were given to the VectorAssembler.
lr_model.coefficients

DenseVector([-0.0236, 0.0136, -0.1733, 0.4098, 0.8815, -0.0138, 0.0651])

In [68]:
# Fitted intercept term of the regression.
lr_model.intercept

-0.49098585512431586

In [69]:
#let us test the model on the test data

In [70]:
# Evaluate the fitted model on the held-out test split. The returned summary
# exposes the metrics and residuals read in the following cells.
test_results = lr_model.evaluate(test_data)

In [71]:
# R-squared of the model on the test split.
test_results.r2

0.9465998579059174

In [78]:
# Report the test-set error metrics, one per line.
for template, value in (
    ("RMSE: {}", test_results.rootMeanSquaredError),
    ("MSE: {}", test_results.meanSquaredError),
    ("R2: {}", test_results.r2),
):
    print(template.format(value))

RMSE: 0.7879036106922972
MSE: 0.620792099741959
R2: 0.9465998579059174


In [79]:
from pyspark.sql.functions import corr

In [84]:
# Correlation between crew size and passenger count, computed on the full
# dataset (not just the test split).
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [87]:
# Correlation between crew size and number of cabins, computed on the full
# dataset (not just the test split).
data.select(corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



In [72]:
# Per-row residuals on the test split (actual crew minus predicted crew).
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|  -1.388762714539503|
| -1.6324759286819486|
| -1.0424135165926192|
| -1.4999773515787709|
|  0.3878262097041709|
|-0.44211562080604416|
| -0.2417122344742424|
|   0.322129327983129|
|  -0.620269612138383|
|   0.355721786536618|
| -0.7222344218877428|
|  1.6912653568983593|
|-0.41996122273937964|
| -0.3530966838500875|
|0.013533630201241209|
| -0.5843334579814279|
|-0.28353668689207545|
| -0.2369715107591972|
|  0.8767927640471438|
|-0.35994978378696985|
+--------------------+
only showing top 20 rows



In [73]:
# Predict crew size from the features alone: drop the 'crew' label from the
# test split so that only the feature vectors remain.
unlabel_test = test_data.select('features')

In [74]:
# Apply the fitted model; this adds a 'prediction' column next to 'features'.
prediction = lr_model.transform(unlabel_test)

In [75]:
# Preview the predicted crew sizes.
prediction.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,86.0,21.04,9...| 9.388762714539503|
|[5.0,160.0,36.34,...|15.232475928681948|
|[6.0,30.276999999...| 4.592413516592619|
|[6.0,90.0,20.0,9....| 10.49997735157877|
|[6.0,110.23899999...|11.112173790295829|
|[6.0,112.0,38.0,9...|11.342115620806045|
|[7.0,158.0,43.7,1...|13.841712234474242|
|[8.0,77.499,19.5,...| 8.677870672016871|
|[8.0,110.0,29.74,...|12.220269612138383|
|[9.0,81.0,21.44,9...| 9.644278213463382|
|[9.0,105.0,27.2,8...|11.402234421887742|
|[10.0,46.0,7.0,6....|2.7787346431016404|
|[10.0,58.825,15.6...|  7.41996122273938|
|[10.0,68.0,10.8,7...| 6.713096683850088|
|[10.0,77.0,20.16,...| 8.986466369798759|
|[10.0,86.0,21.14,...| 9.784333457981427|
|[10.0,90.09,25.01...| 8.863536686892076|
|[10.0,91.62700000...| 9.236971510759197|
|[11.0,90.0,22.4,9...|10.123207235952856|
|[11.0,90.09,25.01...|  8.83994978378697|
+--------------------+------------

In [76]:
# Show the test rows with their actual 'crew' values, for comparison with the
# predictions.
test_data.show()

+--------------------+-----+
|            features| crew|
+--------------------+-----+
|[5.0,86.0,21.04,9...|  8.0|
|[5.0,160.0,36.34,...| 13.6|
|[6.0,30.276999999...| 3.55|
|[6.0,90.0,20.0,9....|  9.0|
|[6.0,110.23899999...| 11.5|
|[6.0,112.0,38.0,9...| 10.9|
|[7.0,158.0,43.7,1...| 13.6|
|[8.0,77.499,19.5,...|  9.0|
|[8.0,110.0,29.74,...| 11.6|
|[9.0,81.0,21.44,9...| 10.0|
|[9.0,105.0,27.2,8...|10.68|
|[10.0,46.0,7.0,6....| 4.47|
|[10.0,58.825,15.6...|  7.0|
|[10.0,68.0,10.8,7...| 6.36|
|[10.0,77.0,20.16,...|  9.0|
|[10.0,86.0,21.14,...|  9.2|
|[10.0,90.09,25.01...| 8.58|
|[10.0,91.62700000...|  9.0|
|[11.0,90.0,22.4,9...| 11.0|
|[11.0,90.09,25.01...| 8.48|
+--------------------+-----+
only showing top 20 rows

