Part 1

In [22]:
# ====== 0) Mount Google Drive (required when running in Colab) ======
# Makes the Drive contents reachable under /content/drive so that the CSV
# files below can be read and written like ordinary local files.
from google.colab import drive
drive.mount('/content/drive')

# ====== 1) Set BASE_DIR to your Drive folder ======
# NOTE(review): `os` is not used by any of the visible cells.
import os
from pathlib import Path

# In your screenshot the file is located at: MyDrive / "linear regression AS" / day.csv
# Every input and output file of this notebook lives in this one folder.
BASE_DIR = Path("/content/drive/MyDrive/linear regression AS")


# Raw input dataset read by Step 1.
day_csv = BASE_DIR / "day.csv"

# ====== 2) Step 1: check missing, drop unwanted columns, shuffle ======
import pandas as pd

# Read the raw table from the Drive folder.
data = pd.read_csv(day_csv)

# Per-column NaN counts; only their grand total is reported.
missing = data.isna().sum()
total_missing = int(missing.sum())
print("Number of missing values:", total_missing)

# Columns excluded from modelling. Names that are not present in the file are
# skipped silently instead of raising.
unwanted_columns = ["instant", "dteday", "yr", "workingday", "casual", "registered"]
data = data.drop(columns=unwanted_columns, errors="ignore")

# Shuffle every row with a fixed seed (reproducible order), then renumber the
# index from 0 so the old row labels are discarded.
data = data.sample(frac=1, random_state=38)
data = data.reset_index(drop=True)

# Persist the cleaned, shuffled table; the later steps re-read it from disk.
step1_path = BASE_DIR / "Step1_Checking&Dropping_unwanted_columns&Shuffling.csv"
data.to_csv(step1_path, index=False)

# ====== 3) Step 2: one-hot for selected columns, keep cnt as last column ======
# Start again from the file written by Step 1.
data = pd.read_csv(step1_path)

# Categorical columns to expand into 0/1 indicator columns. With
# drop_first=True the first level of each column gets no indicator of its own
# (it is the case where all of that column's indicators are 0).
columns_need_splitting = ["season", "mnth", "weekday", "weathersit"]
one_hot_options = {
    "columns": columns_need_splitting,
    "drop_first": True,
    "dtype": int,
}
data_with_One_hot = pd.get_dummies(data, **one_hot_options)

# Put the target "cnt" at the very end so that "all columns but the last" are
# the features when the file is loaded later.
cols = [name for name in data_with_One_hot.columns if name != "cnt"]
cols.append("cnt")
data_with_One_hot = data_with_One_hot.loc[:, cols]

step2_path = BASE_DIR / "Step2_One_hot_state_splitting.csv"
data_with_One_hot.to_csv(step2_path, index=False)

# ====== 4) Step 3: 80/20 split, save 4 csv files ======
# Reload both variants. The one-hot file was derived from the Step 1 file, so
# one cut index is applied to both tables.
data_with = pd.read_csv(step2_path)
data_without = pd.read_csv(step1_path)

# First 80% of the rows (rounded down) -> training, the remainder -> testing.
n = data_with.shape[0]
split = int(0.8 * n)

with_train, with_test = (
    part.reset_index(drop=True)
    for part in (data_with.iloc[:split], data_with.iloc[split:])
)
without_train, without_test = (
    part.reset_index(drop=True)
    for part in (data_without.iloc[:split], data_without.iloc[split:])
)

# Output locations; the later cells read the data sets back from these files.
with_train_path = BASE_DIR / "With_One_hot_training_set.csv"
with_test_path = BASE_DIR / "With_One_hot_testing_set.csv"
without_train_path = BASE_DIR / "Without_One_hot_training_set.csv"
without_test_path = BASE_DIR / "Without_One_hot_testing_set.csv"

# Write the four tables without the index column.
for split_frame, split_path in (
    (with_train, with_train_path),
    (with_test, with_test_path),
    (without_train, without_train_path),
    (without_test, without_test_path),
):
    split_frame.to_csv(split_path, index=False)

# ====== 5) Load training set -> X, y ======
import numpy as np

# Active choice: the training table WITHOUT one-hot columns.
# To inspect the one-hot table instead, assign with_train_path here.
#train_path = with_train_path
train_path = without_train_path

data_train = pd.read_csv(train_path)

# The target is the last column; every column before it is a feature.
feature_frame = data_train.iloc[:, :-1]
target_series = data_train.iloc[:, -1]
X = feature_frame.to_numpy(dtype=float)
y = target_series.to_numpy(dtype=float)

print("X shape:", X.shape)
print("y shape:", y.shape)

#Part two starts from here

# The feature matrix has been loaded into 'X'.
# The targets (the 'cnt' column) have been loaded into 'y'.

########### For with one hot training set (0-based column indices):
# X[i][0]  = holiday
# X[i][1]  = temp
# X[i][2]  = atemp
# X[i][3]  = hum
# X[i][4]  = windspeed
# All X[i][5-7] are 0 = season_1
# X[i][5]  = season_2
# X[i][6]  = season_3
# X[i][7]  = season_4
# All X[i][8-18] are 0 = mnth_1
# X[i][8]  = mnth_2
# X[i][9]  = mnth_3
# X[i][10] = mnth_4
# X[i][11] = mnth_5
# X[i][12] = mnth_6
# X[i][13] = mnth_7
# X[i][14] = mnth_8
# X[i][15] = mnth_9
# X[i][16] = mnth_10
# X[i][17] = mnth_11
# X[i][18] = mnth_12
# All X[i][19-24] are 0 = weekday_0
# X[i][19] = weekday_1
# X[i][20] = weekday_2
# X[i][21] = weekday_3
# X[i][22] = weekday_4
# X[i][23] = weekday_5
# X[i][24] = weekday_6
# All X[i][25,26] are 0 = weathersit_1
# X[i][25] = weathersit_2
# X[i][26] = weathersit_3
# y[i] = cnt

########### For without one hot training set (0-based column indices; X has 9 columns):
# X[i][0] = season
# X[i][1] = mnth
# X[i][2] = holiday
# X[i][3] = weekday
# X[i][4] = weathersit
# X[i][5] = temp
# X[i][6] = atemp
# X[i][7] = hum
# X[i][8] = windspeed
# y[i]    = cnt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of missing values: 0
X shape: (584, 9)
y shape: (584,)


Part 2

In [23]:
import numpy as np

class LinearRegression:
    """Least-squares linear regression solved with np.linalg.lstsq.

    After fit(), `w` holds the learned weights. When add_bias is True the
    intercept is the LAST entry of `w`, because the column of ones is appended
    after the feature columns.
    """

    def __init__(self, add_bias=True):
        """
        add_bias: when True, a column of ones is appended to the inputs so the
                  model also learns an intercept term
        """
        self.add_bias = add_bias
        self.w = None  # set by fit(); None means "not fitted yet"

    def fit(self, x, y):
        """
        Estimate the weights from training data.

        x: (N,) or (N, D) features
        y: (N,) targets (flattened to 1-D)
        return: self, so calls can be chained
        """
        features = np.asarray(x, dtype=float)
        targets = np.asarray(y, dtype=float).reshape(-1)

        # A flat vector is treated as N samples of a single feature.
        if features.ndim == 1:
            features = features[:, np.newaxis]

        n_samples = features.shape[0]

        if self.add_bias:
            bias_column = np.ones(n_samples)
            features = np.column_stack([features, bias_column])

        # lstsq solves the least-squares problem without forming an explicit
        # inverse; only the solution vector (first return value) is kept.
        solution = np.linalg.lstsq(features, targets, rcond=None)
        self.w = solution[0]
        return self

    def predict(self, x):
        """
        Apply the fitted weights to new inputs.

        x: (N,) or (N, D) features
        return: (N,) predictions
        raises: ValueError if fit() has not been called yet
        """
        weights = self.w
        if weights is None:
            raise ValueError("Model not fitted. Call fit() first.")

        features = np.asarray(x, dtype=float)

        # Same input convention as fit(): 1-D means one feature per sample.
        if features.ndim == 1:
            features = features[:, np.newaxis]

        if self.add_bias:
            bias_column = np.ones(features.shape[0])
            features = np.column_stack([features, bias_column])

        return features @ weights

    @staticmethod
    def mse(y_true, y_pred):
        """Mean squared error between two flattened arrays, as a Python float."""
        actual = np.asarray(y_true, dtype=float).reshape(-1)
        predicted = np.asarray(y_pred, dtype=float).reshape(-1)
        residual = actual - predicted
        return float((residual ** 2).mean())

# ========== Load your train/test sets made by Part1 ==========
# Choose ONE of the two pipelines by leaving exactly one pair of paths
# uncommented. Train and test must come from the same pipeline so that their
# feature columns match.

# (A) With one-hot (currently disabled)
#train_csv = BASE_DIR / "With_One_hot_training_set.csv"
#test_csv  = BASE_DIR / "With_One_hot_testing_set.csv"

# (B) Without one-hot (currently active)
train_csv = BASE_DIR / "Without_One_hot_training_set.csv"
test_csv  = BASE_DIR / "Without_One_hot_testing_set.csv"

train_df = pd.read_csv(train_csv)
test_df  = pd.read_csv(test_csv)

# Part 1 stored the target "cnt" as the last column: everything before it is
# a feature, the last column is the label.
X_train = train_df.iloc[:, :-1].to_numpy(dtype=float)
y_train = train_df.iloc[:, -1].to_numpy(dtype=float)

X_test  = test_df.iloc[:, :-1].to_numpy(dtype=float)
y_test  = test_df.iloc[:, -1].to_numpy(dtype=float)

# ========== Train + Evaluate ==========
model = LinearRegression(add_bias=True)
model.fit(X_train, y_train)

pred_train = model.predict(X_train)
pred_test  = model.predict(X_test)

train_mse = model.mse(y_train, pred_train)
test_mse  = model.mse(y_test, pred_test)

print("Train MSE:", train_mse)
print("Test  MSE:", test_mse)

# Optional: show first few predictions
print("\nFirst 5 predictions vs true:")
# Slicing plus zip stops at the end of the data, so a test set with fewer than
# 5 rows prints what is available instead of raising IndexError.
for pred_value, true_value in zip(pred_test[:5], y_test[:5]):
    print(f"pred={pred_value:.3f}, true={true_value:.3f}")


Train MSE: 1783980.9127857122
Test  MSE: 1754347.2362024519

First 5 predictions vs true:
pred=5538.161, true=7665.000
pred=3123.086, true=3624.000
pred=5144.847, true=4748.000
pred=4959.198, true=4186.000
pred=6295.611, true=4661.000


Non-linear part


In [26]:
import pandas as pd
# =========================================================
# FULL (simple prints, no fancy formatting, no plots)
# Methods:
#   0) Baseline
#   1) Polynomial (squares only)
#   2) Interaction (pairwise only)
#   3) Gaussian bases (D=10)
#   4) Sigmoid bases (D=5)
# =========================================================

# ============ Load train/test ============
# This cell works on the data sets WITHOUT one-hot encoding written by Part 1.
train_df = pd.read_csv(without_train_path)
test_df  = pd.read_csv(without_test_path)

target_col = "cnt"
# Continuous features: the only columns the non-linear expansions below touch.
cont_cols = ["temp", "atemp", "hum", "windspeed"]

# Every remaining column (neither continuous nor the target) is passed through
# unchanged to the basis-function models.
excluded_names = set(cont_cols) | {target_col}
other_cols = [name for name in train_df.columns if name not in excluded_names]
X_train_other = train_df.loc[:, other_cols].to_numpy(dtype=float)
X_test_other  = test_df.loc[:, other_cols].to_numpy(dtype=float)

# Targets shared by all five models of this cell.
y_train = train_df.loc[:, target_col].to_numpy(dtype=float)
y_test  = test_df.loc[:, target_col].to_numpy(dtype=float)


# ============ 0) Baseline ============
# Plain linear model on every column except the target; the reference point
# for the feature expansions that follow.
X_train_base = train_df.drop(target_col, axis=1).to_numpy(dtype=float)
X_test_base  = test_df.drop(target_col, axis=1).to_numpy(dtype=float)

base_model = LinearRegression(add_bias=True)
base_model.fit(X_train_base, y_train)

pred_train_base = base_model.predict(X_train_base)
pred_test_base  = base_model.predict(X_test_base)

base_train_mse = base_model.mse(y_train, pred_train_base)
base_test_mse  = base_model.mse(y_test, pred_test_base)

print("===== Baseline =====")
print("train mse:", base_train_mse)
print("test  mse:", base_test_mse)


# ============ 1) Polynomial (squares only) ============
# Work on copies so the original train/test frames stay untouched.
train_poly = train_df.copy()
test_poly  = test_df.copy()

# Add a "<name>_sq" column for each continuous feature; names missing from the
# training frame are skipped.
for name in cont_cols:
    if name not in train_poly.columns:
        continue
    squared_name = name + "_sq"
    train_poly[squared_name] = train_poly[name] ** 2
    test_poly[squared_name]  = test_poly[name] ** 2

X_train_poly = train_poly.drop(target_col, axis=1).to_numpy(dtype=float)
X_test_poly  = test_poly.drop(target_col, axis=1).to_numpy(dtype=float)

poly_model = LinearRegression(add_bias=True)
poly_model.fit(X_train_poly, y_train)

pred_train_poly = poly_model.predict(X_train_poly)
pred_test_poly  = poly_model.predict(X_test_poly)

poly_train_mse = poly_model.mse(y_train, pred_train_poly)
poly_test_mse  = poly_model.mse(y_test, pred_test_poly)

print("\n===== Polynomial (squares only) =====")
print("train mse:", poly_train_mse)
print("test  mse:", poly_test_mse)


# ============ 2) Interaction (pairwise only) ============
# Work on copies so the original train/test frames stay untouched.
train_inter = train_df.copy()
test_inter  = test_df.copy()

# Every unordered pair of distinct continuous features, in list order.
feature_pairs = [
    (cont_cols[first], cont_cols[second])
    for first in range(len(cont_cols))
    for second in range(first + 1, len(cont_cols))
]

# Add one product column "<a>_x_<b>" per pair; a pair is skipped when either
# member is missing from the training frame.
for left, right in feature_pairs:
    if left not in train_inter.columns or right not in train_inter.columns:
        continue
    product_name = left + "_x_" + right
    train_inter[product_name] = train_inter[left] * train_inter[right]
    test_inter[product_name]  = test_inter[left] * test_inter[right]

X_train_inter = train_inter.drop(target_col, axis=1).to_numpy(dtype=float)
X_test_inter  = test_inter.drop(target_col, axis=1).to_numpy(dtype=float)

inter_model = LinearRegression(add_bias=True)
inter_model.fit(X_train_inter, y_train)

pred_train_inter = inter_model.predict(X_train_inter)
pred_test_inter  = inter_model.predict(X_test_inter)

inter_train_mse = inter_model.mse(y_train, pred_train_inter)
inter_test_mse  = inter_model.mse(y_test, pred_test_inter)

print("\n===== Interaction (pairwise only) =====")
print("train mse:", inter_train_mse)
print("test  mse:", inter_test_mse)


# ============ 3) Gaussian bases (D=10) ============
D_g = 10  # number of Gaussian bumps per continuous feature


def gaussian(x, mu, sigma):
    """Unnormalised Gaussian bump exp(-((x - mu) / sigma)^2), element-wise."""
    return np.exp(-((x - mu) / sigma) ** 2)


phi_train_list = []
phi_test_list  = []

for col in cont_cols:
    xtr = train_df[col].to_numpy(dtype=float)
    xte = test_df[col].to_numpy(dtype=float)

    # Centres are spread evenly over the TRAINING range only, and the same
    # centres are reused for the test set.
    mu = np.linspace(xtr.min(), xtr.max(), D_g)

    # Width = spacing between neighbouring centres; fall back to a small
    # positive width when the feature is (almost) constant.
    sigma = (mu[-1] - mu[0]) / max(D_g - 1, 1)
    sigma = 1e-2 if sigma <= 1e-8 else sigma

    centres_row = mu[None, :]
    phi_train_list.append(gaussian(xtr[:, None], centres_row, sigma))  # (N_train, D_g)
    phi_test_list.append(gaussian(xte[:, None], centres_row, sigma))   # (N_test, D_g)

# Design matrix = untouched non-continuous columns followed by all Gaussian
# features; the raw continuous columns themselves are not included.
gauss_train_block = np.hstack(phi_train_list)
gauss_test_block  = np.hstack(phi_test_list)
Phi_train_g = np.hstack([X_train_other, gauss_train_block])
Phi_test_g  = np.hstack([X_test_other,  gauss_test_block])

g_model = LinearRegression(add_bias=True)
g_model.fit(Phi_train_g, y_train)

pred_train_g = g_model.predict(Phi_train_g)
pred_test_g  = g_model.predict(Phi_test_g)

g_train_mse = g_model.mse(y_train, pred_train_g)
g_test_mse  = g_model.mse(y_test, pred_test_g)

print("\n===== Gaussian bases (D=10) =====")
print("train mse:", g_train_mse)
print("test  mse:", g_test_mse)


# ============ 4) Sigmoid bases (D=5) ============
D_s = 5  # number of sigmoid steps per continuous feature


def sigmoid(x, mu, s):
    """Logistic function of (x - mu) / s, element-wise."""
    return 1 / (1 + np.exp(-(x - mu) / s))


phi_train_list = []
phi_test_list  = []

for col in cont_cols:
    xtr = train_df[col].to_numpy(dtype=float)
    xte = test_df[col].to_numpy(dtype=float)

    # Step locations are spread evenly over the TRAINING range only, and the
    # same locations are reused for the test set.
    mu = np.linspace(xtr.min(), xtr.max(), D_s)

    # Slope scale = spacing between neighbouring locations; fall back to a
    # small positive value when the feature is (almost) constant.
    s = (mu[-1] - mu[0]) / max(D_s - 1, 1)
    s = 1e-2 if s <= 1e-8 else s

    locations_row = mu[None, :]
    phi_train_list.append(sigmoid(xtr[:, None], locations_row, s))  # (N_train, D_s)
    phi_test_list.append(sigmoid(xte[:, None], locations_row, s))   # (N_test, D_s)

# Design matrix = untouched non-continuous columns followed by all sigmoid
# features; the raw continuous columns themselves are not included.
sigmoid_train_block = np.hstack(phi_train_list)
sigmoid_test_block  = np.hstack(phi_test_list)
Phi_train_s = np.hstack([X_train_other, sigmoid_train_block])
Phi_test_s  = np.hstack([X_test_other,  sigmoid_test_block])

s_model = LinearRegression(add_bias=True)
s_model.fit(Phi_train_s, y_train)

pred_train_s = s_model.predict(Phi_train_s)
pred_test_s  = s_model.predict(Phi_test_s)

s_train_mse = s_model.mse(y_train, pred_train_s)
s_test_mse  = s_model.mse(y_test, pred_test_s)

print("\n===== Sigmoid bases (D=5) =====")
print("train mse:", s_train_mse)
print("test  mse:", s_test_mse)


# ============ Simple summary ============
# One line per model: label, training MSE, test MSE (same order as above).
print("\n===== Summary =====")
summary_rows = (
    ("Baseline train/test:", base_train_mse, base_test_mse),
    ("Poly     train/test:", poly_train_mse, poly_test_mse),
    ("Inter    train/test:", inter_train_mse, inter_test_mse),
    ("Gaussian train/test:", g_train_mse, g_test_mse),
    ("Sigmoid  train/test:", s_train_mse, s_test_mse),
)
for row_label, row_train_mse, row_test_mse in summary_rows:
    print(row_label, row_train_mse, row_test_mse)


===== Baseline =====
train mse: 1783980.9127857122
test  mse: 1754347.2362024519

===== Polynomial (squares only) =====
train mse: 1391124.2545031824
test  mse: 1474820.648210448

===== Interaction (pairwise only) =====
train mse: 1484071.3749942277
test  mse: 1613529.0864299294

===== Gaussian bases (D=10) =====
train mse: 1215247.7282587355
test  mse: 1493471.9879626255

===== Sigmoid bases (D=5) =====
train mse: 1277469.9881892847
test  mse: 1426506.5941072472

===== Summary =====
Baseline train/test: 1783980.9127857122 1754347.2362024519
Poly     train/test: 1391124.2545031824 1474820.648210448
Inter    train/test: 1484071.3749942277 1613529.0864299294
Gaussian train/test: 1215247.7282587355 1493471.9879626255
Sigmoid  train/test: 1277469.9881892847 1426506.5941072472
