In [12]:
import pandas as pd
import pathlib
import statsmodels.api as sm
from sklearn.linear_model import Lasso

In [13]:
path = pathlib.Path().cwd().parents[1] / 'CSVs' / 'CruiseData.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
1,Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
2,Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
3,Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
4,Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0


In [14]:
y = df['crew']

X = df.drop(['crew', 'Ship_name'], axis=1)

X = sm.add_constant(pd.get_dummies(X, dtype=int))

model = Lasso(alpha=0.7).fit(X, y)

In [17]:
X_train = X.iloc[:, model.coef_ != 0]

sm_model = sm.OLS(y, sm.add_constant(X_train)).fit()
sm_model.summary()

0,1,2,3
Dep. Variable:,crew,R-squared:,0.913
Model:,OLS,Adj. R-squared:,0.911
Method:,Least Squares,F-statistic:,537.0
Date:,"Thu, 07 Dec 2023",Prob (F-statistic):,2.6e-81
Time:,13:21:41,Log-Likelihood:,-229.1
No. Observations:,158,AIC:,466.2
Df Residuals:,154,BIC:,478.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0512,0.602,0.085,0.932,-1.138,1.240
Tonnage,0.0120,0.009,1.282,0.202,-0.006,0.030
cabins,0.6630,0.080,8.245,0.000,0.504,0.822
passenger_density,0.0259,0.013,1.970,0.051,-7.83e-05,0.052

0,1,2,3
Omnibus:,119.832,Durbin-Watson:,1.752
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1407.645
Skew:,2.626,Prob(JB):,2.16e-306
Kurtosis:,16.647,Cond. No.,645.0


In [16]:
X_test = sm.add_constant(pd.get_dummies(df.drop(['crew', 'Ship_name'], axis=1), dtype=int))

full_model = sm.OLS(y, X_test).fit()
full_model.summary()

0,1,2,3
Dep. Variable:,crew,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.937
Method:,Least Squares,F-statistic:,95.15
Date:,"Thu, 07 Dec 2023",Prob (F-statistic):,3.47e-72
Time:,13:21:13,Log-Likelihood:,-189.09
No. Observations:,158,AIC:,430.2
Df Residuals:,132,BIC:,509.8
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.6608,1.131,-1.469,0.144,-3.898,0.576
Age,0.0073,0.016,0.462,0.645,-0.024,0.038
Tonnage,0.0146,0.012,1.177,0.241,-0.010,0.039
passengers,-0.0979,0.050,-1.951,0.053,-0.197,0.001
length,0.4851,0.122,3.969,0.000,0.243,0.727
cabins,0.7249,0.092,7.842,0.000,0.542,0.908
passenger_density,0.0007,0.021,0.035,0.972,-0.041,0.042
Cruise_line_Azamara,-0.0808,0.641,-0.126,0.900,-1.350,1.188
Cruise_line_Carnival,0.0270,0.274,0.098,0.922,-0.515,0.569

0,1,2,3
Omnibus:,174.008,Durbin-Watson:,2.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6004.955
Skew:,4.012,Prob(JB):,0.0
Kurtosis:,32.117,Cond. No.,8240000000000000.0
