In [62]:
import pandas as pd
import seaborn as sns

In [63]:
# Load the "penguins" example dataset through seaborn into a DataFrame.
p_df = sns.load_dataset("penguins")
# Preview the first rows to check the column names and spot missing values.
p_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [64]:
# Distinct values of the `island` column.
p_df.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [65]:
# Distinct values of the `species` column.
p_df.species.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [66]:
# Clean the data in one pass: first remove every row that contains a missing
# value, then remove rows that are exact duplicates of an earlier row.
p_df = p_df.dropna().drop_duplicates()

### X-y split

In [67]:
# Predictors, in this order: flipper length first, bill depth second.
X = p_df[['flipper_length_mm', 'bill_depth_mm']]
# Target: body mass, kept as a one-column DataFrame (double brackets) rather than a Series.
y = p_df[['body_mass_g']]

In [68]:
# Display the target frame (body_mass_g) to check it after cleaning.
y

Unnamed: 0,body_mass_g
0,3750.0
1,3800.0
2,3250.0
4,3450.0
5,3650.0
...,...
338,4925.0
340,4850.0
341,5750.0
342,5200.0


### Train-test split

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# (0.25 is also what train_test_split uses when neither test_size nor
# train_size is given.) The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [71]:
# Display the held-out targets; the index still carries the original row labels from p_df.
y_test

Unnamed: 0,body_mass_g
30,3250.0
317,4875.0
79,4000.0
201,3675.0
63,4050.0
...,...
247,5650.0
122,3450.0
146,4250.0
182,3200.0


### Model training

In [72]:
from sklearn.linear_model import LinearRegression

In [73]:
# Linear regression estimator created with its default settings.
lm = LinearRegression()
# Fit on the training split only; the value returned by fit() is kept as `model`
# and used for every prediction below.
model = lm.fit(X_train,y_train)

In [74]:
model.coef_ 
# fitted slope for each column of X_train, in column order:
# Beta1 = slope of flipper_length_mm, Beta2 = slope of bill_depth_mm

array([[52.20343571, 13.10815533]])

In [75]:
model.intercept_ # intercept attribute, i.e. Beta0

# fitted equation (coefficients rounded):
# body_mass_g = -6509.21 + 52.2*flipper_length_mm + 13.1*bill_depth_mm

array([-6509.21617362])

### Predict outputs

In [76]:
# Pick a single row from the test set to demonstrate a one-off prediction.
# random_state pins the draw so that Restart & Run All always selects the same
# penguin (an unseeded .sample() returns a different row on every run).
random_penguin = X_test.sample(n=1, random_state=42)
random_penguin

Unnamed: 0,flipper_length_mm,bill_depth_mm
260,208.0,13.7


In [77]:
# Predicted body mass in grams for the sampled penguin (one row in, one prediction out).
model.predict(random_penguin) # predicted body mass in grams

array([[4528.68018142]])

### Create predictions for test set

In [78]:
# Predict body mass for every row of the test set and wrap the resulting array
# in a DataFrame (fresh 0..n-1 index, single column named 0) so it can be
# placed side by side with y_test.
y_pred = pd.DataFrame(model.predict(X_test))
y_pred

Unnamed: 0,0
0,3001.901576
1,5271.325621
2,3920.819556
3,4053.835184
4,3752.411909
...,...
79,4911.144833
80,2901.427151
81,3653.248300
82,3470.421682


In [79]:
# y_test still carries the original row labels while y_pred is indexed 0..n-1.
# pd.concat(axis=1) aligns rows on the index, so the old labels are dropped here
# to line up each actual value with its prediction by position.
y_test = y_test.reset_index(drop=True) # need to make sure we reset the index before concat
y_test

Unnamed: 0,body_mass_g
0,3250.0
1,4875.0
2,4000.0
3,3675.0
4,4050.0
...,...
79,5650.0
80,3450.0
81,4250.0
82,3200.0


In [83]:
# Put actual and predicted body mass side by side (rows aligned on the shared
# 0..n-1 index) and give both columns descriptive names.
residuals_df = (
    pd.concat([y_test, y_pred], axis=1)
    .rename(columns={'body_mass_g': 'y_test', 0: 'y_pred'})
)
residuals_df

Unnamed: 0,y_test,y_pred
0,3250.0,3001.901576
1,4875.0,5271.325621
2,4000.0,3920.819556
3,3675.0,4053.835184
4,4050.0,3752.411909
...,...,...
79,5650.0,4911.144833
80,3450.0,2901.427151
81,4250.0,3653.248300
82,3200.0,3470.421682


In [84]:
# Residual = actual - predicted: positive when the model under-predicts,
# negative when it over-predicts.
residuals_df['residual'] = residuals_df['y_test'].sub(residuals_df['y_pred'])

In [85]:
# Display actual values, predictions and residuals together.
residuals_df

Unnamed: 0,y_test,y_pred,residual
0,3250.0,3001.901576,248.098424
1,4875.0,5271.325621,-396.325621
2,4000.0,3920.819556,79.180444
3,3675.0,4053.835184,-378.835184
4,4050.0,3752.411909,297.588091
...,...,...,...
79,5650.0,4911.144833,738.855167
80,3450.0,2901.427151,548.572849
81,4250.0,3653.248300,596.751700
82,3200.0,3470.421682,-270.421682


### Calculating metrics

In [88]:
# The mean of the residuals is the Mean Error (ME), i.e. the average signed error.
mean_error = residuals_df.residual.mean()
mean_error # close to zero because positive and negative residuals cancel each other out; it does not show how large a typical error is

0.42567127691797524

In [89]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae

In [90]:
# Squaring removes the sign, so errors no longer cancel out; the result is in squared grams.
mse(y_test,y_pred) # mean squared error

135012.9173072701

In [91]:
# Average size of the error ignoring its sign, in grams like body_mass_g.
mae(y_test,y_pred) # mean absolute error

295.239293718512

In [92]:
import numpy as np
np.sqrt(mse(y_test,y_pred))  # root mean squared error (RMSE), back in grams like body_mass_g
# the same value can also be obtained from the mse function by passing squared=False

367.4410392257105

In [95]:
# RMSE through scikit-learn's dedicated function. The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# calling mse(..., squared=False) raises a TypeError on current releases.
try:
    from sklearn.metrics import root_mean_squared_error  # scikit-learn >= 1.4
except ImportError:
    # Older scikit-learn: emulate the function with the legacy flag.
    def root_mean_squared_error(y_true, y_pred):
        """Return the root mean squared error using the pre-1.4 API."""
        return mse(y_true, y_pred, squared=False)

rmse = root_mean_squared_error(y_test, y_pred)
rmse # same value as np.sqrt(mse(y_test, y_pred)) above

367.4410392257105

In [None]:
# Scale features only AFTER the train-test split.
# If you scale before splitting, the scaler's statistics are computed from rows that end up in the test set,
# i.e. from information the model should not have seen yet (data leakage).
# Fit the scaler on the train set, then reuse that same fitted scaler on the test set:
# 1. train set: fit and transform -> scaler.fit_transform(X_train)
# 2. test set: transform only, with the same scaler -> scaler.transform(X_test)