In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("LR_project").getOrCreate()

In [7]:
df = spark.read.csv("cruise_ship_info.csv",header = True,inferSchema=True)

In [23]:
df.select("Cruise_line").distinct().count()

20

In [15]:
from pyspark.ml.feature import StringIndexer

In [16]:
indexer = StringIndexer(inputCol="Cruise_line",outputCol="Indexed_Cruise_Line")

In [17]:
indexerModel=indexer.fit(df)

In [18]:
indexed_df = indexerModel.transform(df)
indexed_df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Indexed_Cruise_Line|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.5

In [24]:
from pyspark.ml.regression import LinearRegression

In [25]:
from pyspark.ml.linalg import Vectors

In [26]:
from pyspark.ml.feature import VectorAssembler

In [27]:
indexed_df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Indexed_Cruise_Line']

In [33]:
assembler = VectorAssembler(
     inputCols= ['Indexed_Cruise_Line',
     'Tonnage',
     'passengers',
     'length',
     'cabins'],outputCol = "features"
)

In [34]:
output = assembler.transform(indexed_df)

In [35]:
output.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Indexed_Cruise_Line=16.0, features=DenseVector([16.0, 30.277, 6.94, 5.94, 3.55]))]

In [36]:
output.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Indexed_Cruise_Line',
 'features']

In [38]:
final_data= output.select("features","crew")

In [39]:
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,30.27699999...|3.55|
|[16.0,30.27699999...|3.55|
|[1.0,47.262,14.86...| 6.7|
|[1.0,110.0,29.74,...|19.1|
|[1.0,101.353,26.4...|10.0|
|[1.0,70.367,20.52...| 9.2|
|[1.0,70.367,20.52...| 9.2|
|[1.0,70.367,20.56...| 9.2|
|[1.0,70.367,20.52...| 9.2|
|[1.0,110.23899999...|11.5|
|[1.0,110.0,29.74,...|11.6|
|[1.0,46.052,14.52...| 6.6|
|[1.0,70.367,20.52...| 9.2|
|[1.0,70.367,20.52...| 9.2|
|[1.0,86.0,21.24,9...| 9.3|
|[1.0,110.0,29.74,...|11.6|
|[1.0,88.5,21.24,9...|10.3|
|[1.0,70.367,20.52...| 9.2|
|[1.0,88.5,21.24,9...| 9.3|
|[1.0,70.367,20.52...| 9.2|
+--------------------+----+
only showing top 20 rows



In [40]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [41]:
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              111|
|   mean|7.620540540540547|
| stddev|3.229370565700729|
|    min|             0.59|
|    max|             13.6|
+-------+-----------------+



In [43]:
test_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               47|
|   mean|8.204255319148936|
| stddev|4.087790805866346|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [47]:
lr = LinearRegression(featuresCol= "features", labelCol = "crew")

In [48]:
lr_model = lr.fit(train_data)

In [49]:
test_results = lr_model.evaluate(test_data)

In [61]:
test_results.rootMeanSquaredError

1.4762266957806005

In [51]:
test_results.rootMeanSquaredError

1.4762266957806005

In [52]:
test_results.r2

0.8667495112295971

In [53]:
final_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [65]:
from pyspark.sql.functions import corr

In [66]:
indexed_df.select(corr("crew","passengers")).show()## check correlation between columns in order to justify the high r2(root squared) results

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [62]:
predictions = lr_model.transform(test_data)

In [63]:
predictions.show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[0.0,70.0,20.76,8...|  7.2| 7.520883189594224|
|[0.0,73.941,27.44...| 8.22|  8.88864313892479|
|[0.0,74.137,19.5,...|  7.6| 8.460351771474961|
|[0.0,90.09,25.01,...| 8.69| 9.131015312835714|
|[0.0,160.0,36.34,...| 13.6|14.795839037488456|
|[0.0,220.0,54.0,1...| 21.0| 20.44034980695921|
|[1.0,46.052,14.52...|  6.6| 6.060103704218314|
|[1.0,47.262,14.86...|  6.7| 6.143792108203828|
|[1.0,70.367,20.52...|  9.2| 8.416129558813857|
|[1.0,70.367,20.56...|  9.2| 8.426241097362926|
|[1.0,88.5,21.24,9...|10.29|  9.30783454282178|
|[1.0,88.5,21.24,9...| 10.3| 9.351822006479438|
|[1.0,101.509,27.5...| 10.0|10.438967703607958|
|[1.0,110.0,29.74,...| 11.6|11.776783652132988|
|[1.0,110.0,29.74,...| 19.1| 11.78817962933812|
|[1.0,110.23899999...| 11.5|10.950289626005656|
|[2.0,69.845,15.9,...| 6.96| 7.119227068037443|
|[2.0,77.499,19.5,...|  9.0| 8.358962350