<a href="https://colab.research.google.com/github/crojasac/490AI/blob/main/train_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Seed the notebook's shared random generator once, so that every simulated
# draw below is identical after "Restart & Run All".
SEED = 490
rng = np.random.default_rng(SEED)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two array-likes as a float.

    Both inputs are converted with ``np.asarray`` and subtracted element-wise
    (NumPy broadcasting rules apply), so they are expected to have matching
    shapes.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return float(np.sqrt(mean_squared_error))

def plot_scatter_with_line(df, xcol, ycol, model=None, title="", color="C0"):
    """Scatter ``df[ycol]`` against ``df[xcol]`` and optionally overlay a fitted line.

    Parameters
    ----------
    df : DataFrame holding the columns ``xcol`` and ``ycol``.
    xcol, ycol : column names; they are also used as the axis labels.
    model : optional fitted model exposing ``intercept_`` and ``coef_``. When
        given, the line ``intercept_ + coef_[0] * x`` is drawn across the
        observed x-range.
    title : figure title.
    color : colour of the scatter points.

    A new figure is created and shown on every call; nothing is returned.
    """
    x_values = df[xcol].values
    y_values = df[ycol].values

    plt.figure()
    plt.scatter(x_values, y_values, alpha=0.7, label="Data", color=color)

    if model is not None:
        intercept = float(model.intercept_)
        slope = float(model.coef_[0])
        # Evaluate the line on an even grid spanning the observed x-range.
        x_line = np.linspace(x_values.min(), x_values.max(), 100)
        y_line = intercept + slope * x_line
        plt.plot(x_line, y_line, linewidth=2, label="Fitted line")

    plt.xlabel(xcol)
    plt.ylabel(ycol)
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()




In [None]:
# Simulate daily beer demand for two seasons of 180 days each.
n_winter, n_summer = 180, 180

# Prices are drawn independently of demand (exogenous here) with almost the
# same distribution in both seasons. The order of the rng calls in this cell
# is kept fixed, because it determines the simulated values.
price_winter = rng.normal(loc=10.0, scale=1.2, size=n_winter)
price_summer = rng.normal(loc=10.1, scale=1.2, size=n_summer)

# Linear demand: Quantity = alpha_season + beta * Price + noise.
beta = -0.8                       # price slope shared by both seasons
alpha_winter = 28.0               # winter intercept
alpha_summer = alpha_winter + 3   # summer intercept, shifted up by 3
sigma = 1.0                       # noise std; smaller noise -> clearer relationship, higher R^2

noise_winter = rng.normal(0, sigma, n_winter)
noise_summer = rng.normal(0, sigma, n_summer)
q_winter = alpha_winter + beta * price_winter + noise_winter
q_summer = alpha_summer + beta * price_summer + noise_summer

# One frame per season, then stacked (winter rows first) with a fresh 0..n-1 index.
winter_df = pd.DataFrame({"Season": "Winter", "Price": price_winter, "Quantity": q_winter})
summer_df = pd.DataFrame({"Season": "Summer", "Price": price_summer, "Quantity": q_summer})
df = pd.concat([winter_df, summer_df], ignore_index=True)

# Cell output: first rows plus the number of rows per season.
df.head(), df["Season"].value_counts()


In [None]:
# Scatter both seasons on one set of axes, one colour per season
# (winter is drawn first, then summer).
season_colors = {"Winter": "C0", "Summer": "C1"}

plt.figure()
for season_label, point_color in season_colors.items():
    season_rows = df[df["Season"] == season_label]
    plt.scatter(
        season_rows["Price"],
        season_rows["Quantity"],
        alpha=0.6,
        label=season_label,
        color=point_color,
    )

plt.xlabel("Price")
plt.ylabel("Quantity")
plt.title("Beer demand with seasonal intercept shift")
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()



In [None]:
# Train on winter rows only; the summer rows are held out as the
# out-of-distribution (OOD) test set.
is_winter = df["Season"] == "Winter"
is_summer = df["Season"] == "Summer"
train_w = df[is_winter].copy()
test_s = df[is_summer].copy()

# The double brackets keep Price as a single-column 2-D feature matrix.
Xw = train_w[["Price"]].values
yw = train_w["Quantity"].values

lin_w = LinearRegression()
lin_w.fit(Xw, yw)

# In-sample R^2 of the winter-only model.
r2_train_w = lin_w.score(Xw, yw)
print("Winter-only TRAIN R^2:", round(r2_train_w, 3))

plot_scatter_with_line(
    train_w,
    "Price",
    "Quantity",
    model=lin_w,
    title="Winter fit: looks great in-sample",
    color="C0",
)


In [None]:
# Evaluate the winter-only model on the held-out summer data (out-of-distribution).
Xs = test_s[["Price"]].values
ys = test_s["Quantity"].values

# In-sample (winter) error, for comparison with the summer numbers below.
yhat_train_w = lin_w.predict(Xw)
rmse_train_w = rmse(yw, yhat_train_w)
print("Winter-only TRAIN RMSE:", round(rmse_train_w, 3))

# Summer metrics under the winter model. Each metric is computed and printed
# exactly once (the R^2 used to be scored and printed twice in this cell).
yhat_s = lin_w.predict(Xs)
r2_test_s = lin_w.score(Xs, ys)
rmse_test_s = rmse(ys, yhat_s)
print("Apply winter model to SUMMER TEST R^2:", round(r2_test_s, 3))
print("Apply winter model to SUMMER TEST RMSE:", round(rmse_test_s, 3))

# Visualize: summer scatter with the winter-fitted line overlaid.
plot_scatter_with_line(test_s, "Price", "Quantity", model=lin_w,
                       title="Summer test: winter line underpredicts (domain shift)", color="C1")


In [26]:
# Mixed design: pool both seasons and split 50/50, stratifying on the season
# label so both halves keep the same winter/summer proportions.
X = df[["Price"]].values
y = df["Quantity"].values
season = df["Season"].values  # labels passed to `stratify`

split = train_test_split(X, y, test_size=0.5, random_state=490, stratify=season)
X_train, X_test, y_train, y_test = split

lin_mix = LinearRegression()
lin_mix.fit(X_train, y_train)

# Report R^2 on the training half first, then on the test half.
r2_by_split = {
    "Mixed TRAIN R^2:": lin_mix.score(X_train, y_train),
    "Mixed TEST  R^2:": lin_mix.score(X_test, y_test),
}
for split_label, r2_value in r2_by_split.items():
    print(split_label, round(r2_value, 3))


Mixed TRAIN R^2: 0.188
Mixed TEST  R^2: 0.149


In [None]:
# Recover the season label of every row in the mixed TEST set. The earlier
# split did not return row positions, so the same split (same test size, seed
# and stratification labels) is repeated here on the positions 0..len(df)-1.
idx = np.arange(len(df))
idx_train, idx_test = train_test_split(
    idx, test_size=0.5, random_state=490, stratify=df["Season"].values
)

# Guard against the two splits drifting apart (e.g. if the parameters of only
# one of them are edited): the recovered positions must reproduce the test
# arrays exactly, otherwise the season labels below would be silently wrong.
assert np.array_equal(X[idx_test], X_test), "idx_test is not aligned with X_test"
assert np.array_equal(y[idx_test], y_test), "idx_test is not aligned with y_test"

test_mix = pd.DataFrame({
    "Price": X_test.ravel(),
    "Quantity": y_test,
    # idx_test holds row *positions*, so index positionally (.iloc) instead of
    # by label (.loc); this stays correct even if df's index is not 0..n-1.
    "Season": df["Season"].iloc[idx_test].values,
})

# Global test metrics
yhat_mix_test = lin_mix.predict(X_test)
r2_mix_test = lin_mix.score(X_test, y_test)
rmse_mix_test = rmse(y_test, yhat_mix_test)
print("Mixed TEST R^2 :", round(r2_mix_test, 3))
print("Mixed TEST RMSE:", round(rmse_mix_test, 3))

# Optional: per-season metrics on the mixed TEST set (nice for discussion)
for season_name in ["Winter", "Summer"]:
    mask = (test_mix["Season"] == season_name).values
    # Only report a season when more than one of its rows is in the test set.
    if mask.sum() > 1:
        r2_season = lin_mix.score(X_test[mask], y_test[mask])
        rmse_season = rmse(y_test[mask], yhat_mix_test[mask])
        print(f"  {season_name} TEST: R^2={r2_season:.3f} | RMSE={rmse_season:.3f}")

# Visualization: test points colored by season, with the single line fitted
# on the mixed training half.
plt.figure()
for season_name, color in [("Winter", "C0"), ("Summer", "C1")]:
    sub = test_mix[test_mix["Season"] == season_name]
    plt.scatter(sub["Price"], sub["Quantity"], alpha=0.6,
                label=f"Test {season_name}", color=color)

a = float(lin_mix.intercept_)
b = float(lin_mix.coef_[0])
xgrid = np.linspace(test_mix["Price"].min(), test_mix["Price"].max(), 100)
plt.plot(xgrid, a + b * xgrid, linewidth=2, label="Fitted line (mixed training)")

plt.xlabel("Price")
plt.ylabel("Quantity")
plt.title("Mixed test split: fit is more honest and improves metrics")
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()
