In [1]:
import pandas as pd

import sklearn.metrics as mt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection  import train_test_split as tts
from sklearn.feature_selection import SequentialFeatureSelector as SFS

<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

In [2]:
# Load the red-wine quality dataset, print its (rows, columns) shape and
# preview the first rows as the cell's rich output.
data = pd.read_csv('data/05_winequality-red.csv')
print(data.shape)
data.head()

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Baseline: ordinary least squares on every predictor column, evaluated on a
# held-out 20% split (fixed random_state so the split is repeatable).
X = data.drop(columns='quality')
y = data['quality']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

print(model.intercept_, model.coef_)
print('R2   :', mt.r2_score(y_test, predictions))

34.99871062872153 [ 4.12835075e-02 -1.14952802e+00 -1.77927063e-01  2.78700036e-02
 -1.87340739e+00  2.68362616e-03 -2.77748370e-03 -3.15166657e+01
 -2.54486051e-01  9.24040106e-01  2.67797417e-01]
R2   : 0.3283887639580201


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

**Stepwise Regression - Forward**

In [4]:
# Forward stepwise selection of predictors for wine quality.
X = data.drop(columns='quality')
y = data['quality']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

model = LinearRegression()

# The selector is fitted on the training split only, so the test rows take no
# part in choosing the features.
forward = SFS(
    model,
    direction='forward',
    n_features_to_select='auto',
    scoring="neg_mean_squared_error",
).fit(X_train, y_train)

print(forward.get_feature_names_out())

['volatile acidity' 'chlorides' 'total sulfur dioxide' 'sulphates'
 'alcohol']


In [5]:
# Refit a plain linear model using only the columns kept by forward selection.
# The split uses the same test_size and random_state as the selection step.
fX = data.loc[:, forward.get_feature_names_out()]
fX_train, fX_test, y_train, y_test = tts(fX, y, test_size=0.2, random_state=0)

f_model = LinearRegression().fit(fX_train, y_train)
predictions = f_model.predict(fX_test)

print(f_model.intercept_, f_model.coef_)
print('R2   :', mt.r2_score(y_test, predictions))

3.010526323885073 [-1.21046416 -1.7655295  -0.00217522  0.89491841  0.28268935]
R2   : 0.31563789039192613


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

**Stepwise Regression - Backward**

In [6]:
# Backward stepwise selection: same setup as the forward variant, but the
# selector is asked to work in the 'backward' direction.
X = data.drop(columns='quality')
y = data['quality']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

model = LinearRegression()

# Fitted on the training split only; fit() returns the selector itself.
backward = SFS(
    model,
    direction='backward',
    n_features_to_select='auto',
    scoring="neg_mean_squared_error",
).fit(X_train, y_train)

print(backward.get_feature_names_out())

['fixed acidity' 'volatile acidity' 'chlorides' 'total sulfur dioxide'
 'sulphates' 'alcohol']


In [7]:
# Refit a plain linear model on the columns kept by backward selection and
# report its coefficients and test-set R2.
bX = data.loc[:, backward.get_feature_names_out()]
bX_train, bX_test, y_train, y_test = tts(bX, y, test_size=0.2, random_state=0)

b_model = LinearRegression().fit(bX_train, y_train)
predictions = b_model.predict(bX_test)

print(b_model.intercept_, b_model.coef_)
print('R2   :', mt.r2_score(y_test, predictions))

2.737345463033121 [ 0.02398203 -1.15103249 -1.79686449 -0.00200784  0.86717873  0.28795711]
R2   : 0.3184365464236124


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

**Weighted Regression**

In [8]:
# Rebind `data` to the advertising dataset (it previously held the wine data)
# and preview the first three rows.
data = pd.read_csv('data/02_advertising.csv')
data.head(3)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3


In [9]:
# Derive a per-row weight by cutting Sales into 4 bins labelled 1-4.
# Integer labels are used (rather than names such as 'Low' ... 'Very High')
# because this column is later passed to LinearRegression.fit as sample_weight.
labels = [1, 2, 3, 4]
data['Weight'] = pd.cut(data['Sales'], bins=4, labels=labels)
data.head(3)

Unnamed: 0,TV,Radio,Newspaper,Sales,Weight
0,230.1,37.8,69.2,22.1,4
1,44.5,39.3,45.1,10.4,2
2,17.2,45.9,69.3,9.3,2


In [10]:
# Compare ordinary least squares with weighted least squares on the
# advertising data, using the binned-Sales 'Weight' column as sample weights.
y = data['Sales']
X = data.drop(columns=['Sales', 'Weight'])
# pd.cut produced a categorical column; cast it so sample_weight is an
# explicit numeric vector rather than a Categorical.
weights = data['Weight'].astype(float)

l_model = LinearRegression()
w_model = LinearRegression()

l_model.fit(X, y)
w_model.fit(X, y, sample_weight=weights)

# One row per term (intercept first), built as a single frame so the index is
# a unique 0..n-1 range instead of repeating label 0 for two different rows.
pd.DataFrame({
    'predictor': ['Intercept', *X.columns],
    'sales_lm': [l_model.intercept_, *l_model.coef_],
    'sales_wm': [w_model.intercept_, *w_model.coef_],
})

Unnamed: 0,predictor,sales_lm,sales_wm
0,Intercept,2.938889,2.659389
0,TV,0.045765,0.04513
1,Radio,0.18853,0.210075
2,Newspaper,-0.001037,-0.001113


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

**Polynomial Regression**

In [11]:
# Rebind `data` to the cylinder dataset and keep an independent copy in
# `data_backup`; later cells reset `data` from that copy after they run.
data = pd.read_csv('data/08_cylinder.csv')
data_backup = data.copy()
data.head(3)

Unnamed: 0,Radius,Height,Volume
0,7.33,13.23,2253.51
1,3.65,15.73,673.19
2,7.85,14.04,2738.02


In [12]:
# Cubic polynomial regression on the cylinder data: expand the predictors
# into all terms up to degree 3, then fit OLS on the expanded matrix.
X = data.drop(columns='Volume')
y = data['Volume']

poly = PolynomialFeatures(degree=3, interaction_only=False)
pX = poly.fit_transform(X)

pX_train, pX_test, y_train, y_test = tts(pX, y, test_size=0.2, random_state=0)

p_model = LinearRegression().fit(pX_train, y_train)
predictions = p_model.predict(pX_test)

print(p_model.intercept_, p_model.coef_)
print('R2   :', mt.r2_score(y_test, predictions))

# Reset `data` from the copy taken when the CSV was loaded.
data = data_backup.copy()

9.264841831396325 [ 0.          4.05382356 -3.43106304 -0.41223203 -0.23497005  0.45272833
  0.02142417  3.13876301  0.01104062 -0.01554473]
R2   : 0.999951860639409


In [13]:
# Polynomial regression restricted to the single best term: expand to degree 3,
# let forward selection pick one column, then fit a no-intercept model on it.
y = data['Volume']
X = data.drop(columns='Volume')

poly = PolynomialFeatures(degree=3, interaction_only=False)
# Wrapped in a DataFrame with default column labels, so every column's label
# equals its position (0..n-1).
pX = pd.DataFrame(poly.fit_transform(X))

pX_train, pX_test, y_train, y_test = tts(pX, y, test_size=0.2, random_state=0)

p_model = LinearRegression(fit_intercept=False)
forward = SFS(p_model, n_features_to_select=1)
forward = forward.fit(pX_train, y_train)

# Take the chosen column's position from the selector's support mask. Parsing
# the last character of a generated feature name only works while the index is
# a single digit and depends on the name format.
feature = int(forward.get_support(indices=True)[0])
pX_train = pX_train[[feature]]
pX_test = pX_test[[feature]]

p_model.fit(pX_train, y_train)
predictions = p_model.predict(pX_test)

print(p_model.intercept_, p_model.coef_)
print('R2   :', mt.r2_score(y_test, predictions))

# Reset `data` from the copy taken when the CSV was loaded.
data = data_backup.copy()

0.0 [3.15646971]
R2   : 0.9999230369099843


In [14]:
# Same workflow expressed as one Pipeline: polynomial expansion -> selection
# of a single term -> no-intercept linear fit.
X = data.drop(columns='Volume')
y = data['Volume']
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

# NOTE(review): the same estimator instance is handed to the selector and used
# as the final step; presumably the selector works on its own clone - confirm
# against the scikit-learn documentation.
estimator = LinearRegression(fit_intercept=False)
pipeline = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures(degree=3)),
    (
        'feature_select',
        SFS(estimator, direction='forward', n_features_to_select=1),
    ),
    ('model', estimator),
])

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# The final step exposes the fitted intercept and coefficient.
model = pipeline.named_steps['model']

print(model.intercept_, model.coef_)
print('R2   :', mt.r2_score(y_test, predictions))

0.0 [3.15646971]
R2   : 0.9999230369099843


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>