In [1]:
*Cruise Ship Crew Size Prediction*

In [2]:
"""Linear Regression Model to Predict Number of Employees Required to man a newly manufactured cruise ship, based on cruise ship data"""

In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName('cruise_regression')
    .getOrCreate()
)

In [5]:
#Load data
# Query through the SparkSession created above instead of the legacy
# `sqlContext` global, which this notebook never defines itself.
df = spark.sql("SELECT * FROM cruise")

In [6]:
#Getting familiar with the data
# Print the column names and data types of the loaded table.
df.printSchema()

In [7]:
# Show every field of the first row, one value per line.
first_row = df.head(1)[0]
for value in first_row:
    print(value)

In [8]:
# Print the first five rows, each followed by two blank lines.
for row in df.head(5):
    print(row, end='\n\n\n')

In [9]:
%sql
-- Preview the first ten rows of the cruise table.
SELECT * FROM cruise LIMIT 10

Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0
Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2
Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2
Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2
Freedom,Carnival,6,110.239,37.0,9.51,14.87,29.79,11.5


In [10]:
# Number of null entries per column, listed in the same order as df.columns.
[
    df.filter(df[name].isNull()).count()
    for name in df.columns
]

In [11]:
# Number of ships per cruise line, smallest fleets first.
(
    df.groupBy(df['Cruise_line'])
    .count()
    .orderBy('count')
    .show()
)

In [12]:
from pyspark.ml.feature import StringIndexer

In [13]:
#ML Preprocessing step 1: String index the cruise line categories (can also explore one hot encoding)
# Each distinct Cruise_line label is mapped to a numeric index stored in 'cruise_category'.
indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_category')
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
indexed.head(1)

In [14]:
# List the columns after indexing to confirm 'cruise_category' was added.
indexed.columns

In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [16]:
#ML Preprocessing step 2: Assemble features into single column
# Predictors: the ship attribute columns plus the indexed cruise line.
feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_category',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Adds the 'features' vector column to the indexed data.
output = assembler.transform(indexed)

In [17]:
# List the columns after assembly to confirm 'features' was added.
output.columns

In [18]:
#ML Preprocessing step 3: Select features & response variable
# Keep only the assembled feature vector and the label column used by the model.
final_data = output.select(['features', 'crew'])
# Preview the exact frame that is modelled, rather than repeating the select,
# so the displayed data cannot drift from what is trained on.
final_data.show()

In [19]:
from pyspark.sql.functions import corr

In [20]:
#ML Preprocessing step 4: Check for high correlations between response variable & predictors for prediction power analysis
# Correlation is only meaningful for numeric predictors, so the string columns
# are skipped, as is 'crew' itself (its correlation with itself is trivially 1).
numeric_predictors = [
    name for name, dtype in df.dtypes
    if dtype != 'string' and name != 'crew'
]
# A single select computes all coefficients in one pass and labels each by its column name.
df.select([corr('crew', name).alias(name) for name in numeric_predictors]).show() #passengers, cabins are highly correlated

In [21]:
#Partition data
# Seeded 70/30 split into training and test sets.
train_data, test_data = final_data.randomSplit(weights=[0.7, 0.3], seed=1234)
# Summary statistics of the training partition.
train_summary = train_data.describe()
train_summary.show()

In [22]:
from pyspark.ml.regression import LinearRegression

In [23]:
#Initiate linear regression model
# 'features' is the estimator's default features column; it is spelled out here
# to match the VectorAssembler output column.
model = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [24]:
#Train model
# Fit the regression on the training partition only.
trained_model = model.fit(train_data)

In [25]:
#Evaluate model
# Score the fitted model on the held-out test partition; `results` holds the summary metrics.
results = trained_model.evaluate(test_data)

In [26]:
#Model performance
# Root mean squared error on the test partition (same units as 'crew').
results.rootMeanSquaredError

In [27]:
# R-squared (coefficient of determination) on the test partition.
results.r2

In [28]:
# Per-row residuals on the test partition.
results.residuals.show()

In [29]:
# Mean squared error on the test partition.
results.meanSquaredError

In [30]:
# Mean absolute error on the test partition.
results.meanAbsoluteError