## Exercise 1 and 2
Does linear regression fit the data in `data1.csv`?

1.1

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor

In [2]:
# Load the dataset; the plots below read the columns x1, x2 and y from it.
df = pd.read_csv("data1.csv")

# Interactive 3-D scatter of the raw points, coloured by the target value.
fig = px.scatter_3d(data_frame=df, x="x1", y="x2", z="y", color="y")
fig.show()

In [3]:
# Fit an ordinary least-squares plane y = a*x1 + b*x2 + c on the whole dataset.
X, y = df[['x1', 'x2']], df['y']
model = LinearRegression().fit(X, y)
(a, b), c = model.coef_, model.intercept_

# Evaluate the fitted plane on a 30x30 grid spanning the observed x1/x2 ranges.
x1_range, x2_range = (
    np.linspace(df[col].min(), df[col].max(), 30) for col in ('x1', 'x2')
)
x1_surf, x2_surf = np.meshgrid(x1_range, x2_range)
y_surf = a * x1_surf + b * x2_surf + c

# Raw observations as markers.
points_trace = go.Scatter3d(
    x=df['x1'],
    y=df['x2'],
    z=df['y'],
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.7),
    name='Dane',
)

# Semi-transparent regression plane drawn over the same axes.
plane_trace = go.Surface(x=x1_surf, y=x2_surf, z=y_surf, opacity=0.5)

fig = go.Figure(data=[points_trace, plane_trace])
fig.update_layout(
    title="Visualization of data and simple regression model",
    width=900,
    height=700,
    scene=dict(xaxis_title='x1', yaxis_title='x2', zaxis_title='y'),
)

fig.show()

It looks like linear regression is a good fit for the data in `data1.csv`.

1.2

In [4]:
def preprocess_data(
        X_train: pd.DataFrame,
        y_train: pd.Series,
        n_sigma: float = 3.0,
        degree: int = 2,
) -> tuple[np.ndarray, pd.Series, StandardScaler, PolynomialFeatures]:
    """Clean and expand the training data, returning the fitted transformers.

    Steps, in order:
      1. drop rows whose target lies outside ``mean ± n_sigma * std``,
      2. standardize the features with a freshly fitted ``StandardScaler``,
      3. add polynomial terms up to ``degree`` (no bias column).

    Args:
        X_train: Training features.
        y_train: Training target, sharing its index with ``X_train``.
        n_sigma: Half-width of the accepted target band, in standard
            deviations. Defaults to 3.
        degree: Degree passed to ``PolynomialFeatures``. Defaults to 2.

    Returns:
        A tuple ``(X_poly, y_train, scaler, poly)``: the transformed feature
        array, the (possibly filtered) target, and the fitted scaler and
        polynomial transformer, so the same transformation can be applied to
        other data.
    """
    # 1. deleting outliers (like y=805 in our data1.csv)
    mean_y = y_train.mean()
    std_y = y_train.std()
    # A constant target gives std == 0 and a single sample gives std == NaN;
    # in both cases the strict comparisons below would reject every row and
    # leave nothing to fit on, so filtering is skipped.
    if np.isfinite(std_y) and std_y > 0:
        mask = (y_train > mean_y - n_sigma * std_y) & (y_train < mean_y + n_sigma * std_y)
        X_train = X_train[mask]
        y_train = y_train[mask]

    # 2. standardization
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    # 3. adding polynomial features to X
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X_scaled)

    return X_poly, y_train, scaler, poly

## Exercise 2

In [5]:
def preprocess_test_data(X_test: pd.DataFrame, scaler: StandardScaler, poly: PolynomialFeatures):
    """Apply already-fitted transformers to held-out features.

    The scaler is applied first and the polynomial expansion second; neither
    object is re-fitted here.

    Args:
        X_test: Features to transform.
        scaler: Fitted scaler exposing ``transform``.
        poly: Fitted polynomial expander exposing ``transform``.

    Returns:
        The result of ``poly.transform`` applied to the scaled features.
    """
    return poly.transform(scaler.transform(X_test))

#### 2.1
For building one model, we can use an 80/20 random split.
If we're building more than one, we have 3 options:
1. 60/20/20 - train, val, test
2. Cross-validation
3. Or just train both on the same 80/20 split and compare on the test set

In [6]:
# Features and target taken from the full dataset.
X, y = df[['x1', 'x2']], df['y']

# 80/20 random split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### 2.2, 2.3, 2.4

In [7]:
# raw models: trained on the unprocessed training split

# fit() returns the estimator itself, so construction and fitting are chained.
model_raw = LinearRegression().fit(X_train, y_train)
baseline_raw = DummyRegressor(strategy='mean').fit(X_train, y_train)

# Predictions on the untouched test split.
y_pred_raw = model_raw.predict(X_test)
y_pred_raw_base = baseline_raw.predict(X_test)

In [8]:
# preprocessed models: trained on outlier-filtered, scaled, polynomial features

# The transformers are fitted on the training split only and then reused for
# the test split.
X_train_proc, y_train_proc, scaler, poly = preprocess_data(X_train, y_train)
X_test_proc = preprocess_test_data(X_test, scaler, poly)

# fit() returns the estimator itself, so construction and fitting are chained.
model_proc = LinearRegression().fit(X_train_proc, y_train_proc)
baseline_proc = DummyRegressor(strategy='mean').fit(X_train_proc, y_train_proc)

# Predictions on the transformed test split.
y_pred_proc = model_proc.predict(X_test_proc)
y_pred_proc_base = baseline_proc.predict(X_test_proc)

In [9]:
# evaluating

def evaluate(y_true, y_pred):
    """Score predictions against the true targets.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        A dict with the mean squared error under ``"MSE"`` and the
        coefficient of determination under ``"R^2"``.
    """
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"MSE": mse, "R^2": r2}

# Test-set predictions keyed by the label shown in the comparison table;
# insertion order determines the row order.
predictions_by_model = {
    "Baseline/raw": y_pred_raw_base,
    "Baseline/proc": y_pred_proc_base,
    "LinearRegression/raw": y_pred_raw,
    "LinearRegression/proc": y_pred_proc,
}

# One row per model, all scored against the same y_test.
results = pd.DataFrame([
    {"Model": label, **evaluate(y_test, predicted)}
    for label, predicted in predictions_by_model.items()
])

results

Unnamed: 0,Model,MSE,R^2
0,Baseline/raw,70092.425271,-0.039327
1,Baseline/proc,71561.278594,-0.061107
2,LinearRegression/raw,1043.043946,0.984534
3,LinearRegression/proc,0.895085,0.999987
