In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Toy dataset: twelve rows, four consecutive readings for each beach.
df = pd.DataFrame(
    {
        "beach": list("AAAABBBBCCCC"),
        "wave": [
            1.5, 1.4, 1.5, 1.6,  # beach A
            2.3, 2.3, 2.6, 2.5,  # beach B
            3.6, 3.4, 3.5, 3.4,  # beach C
        ],
    }
)
# 50/50 split; the fixed random_state makes the partition repeatable.
train, test = train_test_split(df, random_state=320, test_size=0.5)
train

Unnamed: 0,beach,wave
4,B,2.3
0,A,1.5
1,A,1.4
5,B,2.3
10,C,3.5
2,A,1.5


In [5]:
# One-hot encode the beach label, then regress the wave value on the
# resulting indicator columns.
model = Pipeline(
    steps=[
        ("oh", OneHotEncoder()),
        ("lr", LinearRegression()),
    ]
)
model.fit(train[["beach"]], train["wave"])
# Last expression of the cell: the score on the held-out half is displayed.
model.score(test[["beach"]], test["wave"])

0.9390127692014484

# Can we build the above model from scratch?

In [56]:
class Pipeline:
    """Chain of transformers followed by one final estimator.

    ``steps`` is an ordered list of ``(name, object)`` pairs. Every step
    except the last must provide ``fit(X)`` and ``transform(X)``; the last
    step must provide ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, steps):
        """Store the ordered ``(name, step)`` pairs; nothing is fitted here."""
        self.steps = steps

    def fit(self, X, y):
        """Fit every transformer in order, then fit the final estimator.

        Each transformer is fitted on the output of the previous one, and
        the final estimator is fitted on the fully transformed ``X``
        together with ``y``.

        Returns:
            The pipeline itself, so calls can be chained, e.g.
            ``Pipeline(steps).fit(X, y).predict(X)``.
        """
        for _name, transformer in self.steps[:-1]:
            transformer.fit(X)
            X = transformer.transform(X)
        self.steps[-1][1].fit(X, y)
        return self

    def predict(self, X):
        """Run ``X`` through the fitted transformers and predict with the last step."""
        for _name, transformer in self.steps[:-1]:
            X = transformer.transform(X)
        return self.steps[-1][1].predict(X)

    def score(self, X, y):
        """Return ``r2_score(y, predictions)`` for the pipeline's predictions on ``X``."""
        predictions = self.predict(X)
        return r2_score(y, predictions)
    
class OneHotEncoder:
    """One-hot encoder for a single-column DataFrame.

    After ``fit``, ``self.columns`` holds the sorted distinct categories;
    ``transform`` emits one 0/1 indicator column per category.
    """

    def fit(self, X, y=None):
        """Learn the categories present in the only column of ``X``.

        Missing values (``None``/``NaN``) are not treated as a category:
        they cannot be ordered against the other values, so including them
        would make the sort fail. Rows holding them are encoded as all
        zeros by ``transform``.

        Args:
            X: DataFrame with exactly one column.
            y: Ignored; accepted so the encoder can also be called with a
                ``fit(X, y)`` signature.

        Returns:
            The encoder itself.
        """
        assert len(X.columns) == 1
        observed = X.iloc[:, 0].dropna()
        self.columns = sorted(set(observed))
        return self

    def transform(self, X):
        """Encode the first column of ``X`` as one indicator column per category.

        The result keeps the row index of ``X`` (even when no categories
        were learned) and has the columns in the order of ``self.columns``.
        Values not seen during ``fit`` produce an all-zero row.
        """
        values = X.iloc[:, 0]
        # Build the whole frame at once on X's index; passing plain arrays
        # avoids any index re-alignment of the individual columns.
        newX = pd.DataFrame(
            {c: (values == c).astype(int).to_numpy() for c in self.columns},
            index=X.index,
        )
        # Trace of the encoded frame, shown in the cell output.
        print(newX)
        return newX
    
class LinearRegression:
    """Least-squares linear regression without an intercept term.

    The fitted coefficient vector is stored in ``self.c``; predictions are
    ``X @ self.c``.
    """

    def fit(self, X, y):
        """Find the coefficients ``c`` that minimise ``||X @ c - y||``.

        The normal equations ``(X^T X) c = X^T y`` are solved directly. When
        ``X^T X`` is singular (duplicate, collinear or all-zero columns)
        there is no unique solution and the direct solve raises; in that
        case the minimum-norm least-squares solution is used instead.

        Returns:
            The model itself, so calls can be chained.
        """
        # Trace of the input shapes, shown in the cell output.
        print(X.shape, y.shape)
        try:
            self.c = np.linalg.solve(X.T @ X, X.T @ y)
        except np.linalg.LinAlgError:
            # Singular Gram matrix: fall back to the minimum-norm solution.
            self.c = np.linalg.lstsq(np.asarray(X), np.asarray(y), rcond=None)[0]
        return self

    def predict(self, X):
        """Return the predictions ``X @ self.c`` using the fitted coefficients."""
        print("predict")
        return X @ self.c

# Same two-step pipeline, now assembled from the hand-written classes
# defined above in this cell (they shadow the sklearn imports).
model = Pipeline(
    steps=[
        ("oh", OneHotEncoder()),
        ("lr", LinearRegression()),
    ]
)
model.fit(train[["beach"]], train["wave"])
# Last expression of the cell: the score on the held-out half is displayed.
model.score(test[["beach"]], test["wave"])

    A  B  C
4   0  1  0
0   1  0  0
1   1  0  0
5   0  1  0
10  0  0  1
2   1  0  0
(6, 3) (6,)
    A  B  C
6   0  1  0
7   0  1  0
11  0  0  1
9   0  0  1
8   0  0  1
3   1  0  0
predict


0.9390127692014484