In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import io
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Fish.csv')
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [3]:
corr = df.corr()
corr

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
Weight,1.0,0.915712,0.918618,0.923044,0.724345,0.886507
Length1,0.915712,1.0,0.999517,0.992031,0.625378,0.86705
Length2,0.918618,0.999517,1.0,0.994103,0.640441,0.873547
Length3,0.923044,0.992031,0.994103,1.0,0.703409,0.87852
Height,0.724345,0.625378,0.640441,0.703409,1.0,0.792881
Width,0.886507,0.86705,0.873547,0.87852,0.792881,1.0


In [4]:
X = df.drop(['Weight','Species'], axis=1)
y = df['Weight']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((127, 5), (32, 5), (127,), (32,))

In [6]:
#linear regression

In [7]:
from sklearn import linear_model
lasso = linear_model.Lasso()
lasso.fit(X_train,y_train)
predictions =lasso.predict(X_test)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score
print('R^2 score: %.3f' % r2_score(y_test, predictions))


R^2 score: 0.865


In [9]:
print(lasso.coef_, lasso.intercept_)

[ 55.20097913  -1.14190956 -30.22373696  30.33152738  22.74097888] -450.12481827907584


In [10]:
import statsmodels.api as sm
X = sm.add_constant(X_train)
lasso_reg=sm.OLS(y_train, X).fit()
print(lasso_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                 Weight   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.882
Method:                 Least Squares   F-statistic:                     189.1
Date:                Fri, 25 Feb 2022   Prob (F-statistic):           2.03e-55
Time:                        20:50:25   Log-Likelihood:                -779.00
No. Observations:                 127   AIC:                             1570.
Df Residuals:                     121   BIC:                             1587.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -446.1174     31.027    -14.378      0.0

In [11]:
lasso_reg.bse

const      31.027019
Length1    40.822442
Length2    42.552569
Length3    17.439377
Height      8.908654
Width      21.365560
dtype: float64

In [12]:
print('R2 score:', lasso_reg.rsquared)
print(lasso_reg.pvalues)

R2 score: 0.8865221536639928
const      5.982512e-28
Length1    5.123003e-02
Length2    5.751781e-01
Length3    7.311111e-02
Height     4.379380e-04
Width      2.628781e-01
dtype: float64


In [13]:
print(round(lasso_reg.pvalues))

const      0.0
Length1    0.0
Length2    1.0
Length3    0.0
Height     0.0
Width      0.0
dtype: float64


In [14]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, predictions, squared=False)

In [15]:
rmse

160.0283399337106

In [16]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [17]:
pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
])

In [18]:
search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=3
                      )

In [19]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5] END .............model__alpha=0.1;, score=-22194.455 total time=   0.0s
[CV 2/5] END .............model__alpha=0.1;, score=-16740.193 total time=   0.0s
[CV 3/5] END .............model__alpha=0.1;, score=-10121.901 total time=   0.0s
[CV 4/5] END .............model__alpha=0.1;, score=-15075.578 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.1;, score=-9906.452 total time=   0.0s
[CV 1/5] END .............model__alpha=0.2;, score=-21716.679 total time=   0.0s
[CV 2/5] END .............model__alpha=0.2;, score=-16726.577 total time=   0.0s
[CV 3/5] END .............model__alpha=0.2;, score=-10194.739 total time=   0.0s
[CV 4/5] END .............model__alpha=0.2;, score=-15566.018 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.2;, score=-9808.683 total time=   0.0s
[CV 1/5] END model__alpha=0.30000000000000004;, score=-21318.020 total time=   0.0s
[CV 2/5] END model__alpha=0.3000000000000000

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2,
       5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5,
       6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8,
       7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9. , 9.1,
       9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9])},
             scoring='neg_mean_squared_error', verbose=3)

In [20]:
search.best_params_

{'model__alpha': 0.9}

In [21]:
coefficients = search.best_estimator_.named_steps['model'].coef_

In [22]:
importance = np.abs(coefficients)

In [23]:
importance

array([188.14620339,   0.        ,   0.        ,  64.69718806,
        79.32128096])