<p>
50.007 MACHINE LEARNING<br />
2024 SUMMER<br />
<b>COGITO</b><br />
</p>

# Setup

In [1]:
from base import *
from grid_search import cv_grid_search
from random_search import cv_random_search
from simulated_annealing import SimulatedAnnealing, cv_simulated_annealing

# Write INFO-level (and higher) records to the file "log", each prefixed with a minute-resolution timestamp.
logging.basicConfig(
	filename="log",
	level=logging.INFO,
	format="%(asctime)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M",
)

# Task 1

In [None]:
def σ(z):
	"""Logistic sigmoid 1 / (1 + e^(-z)); np.exp applies it elementwise when z is an array."""
	denominator = 1 + np.exp(-z)
	return 1 / denominator
# def ce_loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

# Return dw and db, for some X, y, ŷ, w, R, and λ.
def gradients_logreg(X, y, ŷ, w, R=None, λ=0):
	"""Gradients of the logistic-regression loss, averaged over the m rows of X.

	Parameters:
		X: 2-D feature matrix, one row per example.
		y: true labels for the rows of X.
		ŷ: predicted probabilities for the rows of X.
		w: current weights; only read by the regularisation term.
		R: "L2" adds λ * w / m to dw, "L1" adds λ * sign(w) / m; any other value adds nothing.
		λ: regularisation strength.

	Returns:
		(dw, db). The bias gradient is never regularised.
	"""
	m, _ = X.shape
	y, ŷ = np.asarray(y), np.asarray(ŷ)
	# A 1-D y against a column-vector ŷ (or the reverse) would broadcast ŷ - y into an
	# (m, m) matrix and silently corrupt both gradients, so align y to ŷ whenever the
	# two hold the same number of entries.
	if y.shape != ŷ.shape and y.size == ŷ.size:
		y = y.reshape(ŷ.shape)
	error = ŷ - y
	dw = 1/m * np.dot(X.T, error)
	db = 1/m * np.sum(error)
	if R == "L2":
		dw += λ * w / m
	elif R == "L1":
		dw += λ * np.sign(w) / m
	return dw, db

# Return (w, b) from gradient descent on X_train and y_train, for some τ, α, G, β, R, and λ.
def train_logreg(X_train, y_train, τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	"""Fit logistic-regression parameters by gradient descent.

	Rows are consumed in their stored order; no shuffling is performed.

	Parameters:
		X_train: 2-D feature matrix of shape (m, n).
		y_train: labels, sliced row-wise in step with X_train.
		τ: number of passes (epochs) over the training rows.
		α: learning rate applied to every update.
		G: "full-batch" (one update per epoch on all rows), "mini-batch"
			(one update per β consecutive rows) or "stochastic" (one update per row).
		β: mini-batch size; only read when G == "mini-batch".
		R, λ: regularisation kind and strength, forwarded to gradients_logreg.

	Returns:
		(w, b): w starts as an (n, 1) zero array and b starts as 0.

	Raises:
		ValueError: if G is not one of the three supported modes.
	"""
	m, n = X_train.shape
	# The three modes are the same sweep over consecutive row slices; only the slice length differs.
	if G == "full-batch":
		# One slice covering every row; max() keeps range() valid for an empty training set.
		step = max(m, 1)
	elif G == "mini-batch":
		step = β
	elif G == "stochastic":
		step = 1
	else:
		# An unknown mode used to skip every update and hand back the all-zero initial weights.
		raise ValueError(f'G must be "full-batch", "mini-batch" or "stochastic", got {G!r}')
	w, b = np.zeros((n, 1)), 0
	for epoch in range(τ):
		for i in range(0, m, step):
			X_batch, y_batch = X_train[i:i+step], y_train[i:i+step]
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
			w, b = w - α*dw, b - α*db
	return w, b

# Return array of predictions, where each prediction is 1 if corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_logreg(wb_tuple, X):
	"""Predict hard 0/1 labels for the rows of X.

	Parameters:
		wb_tuple: (w, b) pair, e.g. as returned by train_logreg.
		X: feature matrix, one row per example.

	Returns:
		1-D integer array with one entry per row of X: 1 where σ(X·w + b) > 0.5, else 0.
	"""
	w, b = wb_tuple
	ŷ = np.asarray(σ(np.dot(X, w) + b))
	# Vectorised threshold instead of a per-row Python loop. reshape(len(ŷ)) flattens the
	# (m, 1) column to (m,) and still fails loudly if ŷ carries more than one value per row.
	# The dtype stays integer even when X has no rows.
	return (ŷ > 0.5).astype(int).reshape(len(ŷ))

# Train model, make predictions, and save predictions to CSV file.
def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	"""Train on the global X_train / y_train, label the global X_test, and write a CSV.

	The hyperparameters are forwarded to train_logreg and also spelled out in the
	output file name under ./predictions/logreg/. The CSV has an "id" column taken
	from the global S_test and a "label" column with the predictions. The elapsed
	wall-clock time is both logged and printed.
	"""
	started = time.time()
	weights, bias = train_logreg(np.array(X_train), np.array(y_train), τ, α, G, β, R, λ)
	labels = predict_logreg((weights, bias), np.array(X_test))
	out_dir = "./predictions/logreg/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, f"τ={τ},α={α},G={G},β={β},R={R},λ={λ}.csv")
	submission = pd.DataFrame({"id": S_test["id"], "label": labels})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

In [None]:
# generate_predictions_logreg(τ=1, α=0.08, G="mini-batch", β=128, R=None, λ=0)
# generate_predictions_logreg(τ=1000, α=0.08, G="stochastic", R="L2", λ=0.5)

# Task 2

In [None]:
def apply_pca(x):
	"""Standardise the global X_train / X_test and project both onto principal components.

	The scaler and the PCA are fitted on X_train only and then applied to X_test.

	Parameters:
		x: if 0 <= x <= 1, a cumulative explained-variance threshold (so x = 1 means
			"all of the variance", not "one component"); otherwise it is passed to
			PCA as the number of components.

	Returns:
		(X_train_pca, X_test_pca, c, v): the projected matrices, the number of
		components kept, and the variance figure. v is x itself in threshold mode
		and the summed explained-variance ratio of the fitted components otherwise.
	"""
	scaler = StandardScaler()
	X_train_scaled = scaler.fit_transform(X_train)
	X_test_scaled = scaler.transform(X_test)
	by_threshold = 0 <= x <= 1
	if by_threshold:
		# x is variance threshold.
		pca = PCA(n_components=None)
		pca.fit(X_train_scaled)
		cumulative = np.cumsum(pca.explained_variance_ratio_)
		# First position whose cumulative ratio reaches x. If rounding keeps the total
		# just below x, keep every component (np.argmax on an all-False mask would
		# have reported a single component instead).
		c = min(int(np.searchsorted(cumulative, x)) + 1, len(cumulative))
		v = x
	else:
		# x is number of components.
		pca = PCA(n_components=x)
		pca.fit(X_train_scaled)
		c = x
		v = sum(pca.explained_variance_ratio_)
	# Transform train and test datasets.
	X_train_pca = pca.transform(X_train_scaled)
	X_test_pca = pca.transform(X_test_scaled)
	if by_threshold:
		# The threshold PCA was fitted with every component, so keep only the leading c
		# columns; otherwise the threshold would have no effect on the returned data.
		X_train_pca, X_test_pca = X_train_pca[:, :c], X_test_pca[:, :c]
	return X_train_pca, X_test_pca, c, v

def train_SKLknn(X_train, y_train, k=5, W="uniform", p=2, m="minkowski"):
	"""Build a KNeighborsClassifier, fit it on (X_train, y_train) and return it.

	k, W, p and m are passed as n_neighbors, weights, p and metric respectively;
	n_jobs is always -1.
	"""
	settings = {
		"n_neighbors": k,
		"weights": W,
		"p": p,
		"metric": m,
		"n_jobs": -1,
	}
	classifier = KNeighborsClassifier(**settings)
	classifier.fit(X_train, y_train)
	return classifier

def predict_SKLknn(model, X):
	"""Return whatever model.predict(X) produces, unchanged."""
	predictions = model.predict(X)
	return predictions

# Train model, make model predictions, and save model predictions to CSV file.
def generate_predictions_pcaknn(x):
	"""Run PCA followed by a 2-nearest-neighbour classifier and write the test labels to CSV.

	x is forwarded to apply_pca. The classifier is trained on the projected training
	data with the global y_train, and the output file under ./predictions/pcaknn/ is
	named after the component count and variance figure that apply_pca returns. The
	CSV has an "id" column taken from the global S_test and a "label" column. The
	elapsed wall-clock time is both logged and printed.
	"""
	started = time.time()
	train_pca, test_pca, c, v = apply_pca(x)
	knn = train_SKLknn(train_pca, y_train, k=2)
	labels = predict_SKLknn(knn, test_pca)
	out_dir = "./predictions/pcaknn/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, f"pcaknn(ρ={c},ν={v:.2f}).csv")
	submission = pd.DataFrame({"id": S_test["id"], "label": labels})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

In [None]:
# generate_predictions_pcaknn(100)
# generate_predictions_pcaknn(500)
# generate_predictions_pcaknn(1000)
# generate_predictions_pcaknn(2000)
# generate_predictions_pcaknn(5000)

# Task 3

## SKLknn

In [None]:
# Default:
# generate_predictions(
# 	"SKLknn",
# 	n_neighbors=5,
# 	weights="uniform",
# 	p=2,
# 	metric="minkowski"
# )

# cv_grid_search("SKLknn", grid={
# 	"n_neighbors": [2, 3, 4],
# 	"weights": ["uniform", "distance"],
# 	"leaf_size": [20, 30, 40]
# }, k=5)

## SKLrf

In [None]:
# Default:
# generate_predictions(
# 	"SKLrf",
# 	n_estimators=100,
# 	criterion="gini",
# 	max_depth=None,
# 	min_samples_split=2,
# 	min_samples_leaf=1,
# 	max_leaf_nodes=None,
# 	random_state=None
# )

## SKLsvm

In [None]:
# Default:
# generate_predictions(
# 	"SKLsvm",
# 	C=1.0,
# 	kernel="rbf",
# 	degree=3,
# 	gamma="scale",
# 	max_iter=-1,
# 	random_state=None
# )


In [None]:
# cv_grid_search("SKLsvm", grid={
# 	"C": [0.8, 0.9, 1.0],
# 	"kernel": ["linear", "poly", "rbf"],
# 	"gamma": ["scale"]
# }, k=5)

## SKLlogreg

In [None]:
# generate_predictions(
# 	"SKLlogreg",
# 	C=5.1,
# 	max_iter=300
# )

# 5-fold grid search for SKLlogreg: sweep C and max_iter at a single fixed tol.
cv_grid_search(
	"SKLlogreg",
	grid=dict(
		tol=[0.00005],
		C=[0.8, 1.6, 2.4, 3.2, 4.0, 4.8],
		max_iter=[800, 1600, 2400, 3200, 4000],
	),
	k=5,
)

## SKLgb

In [None]:
# Default:
# generate_predictions(
# 	"SKLgb",
# 	learning_rate=0.1,
# 	n_estimators=100,
# 	subsample=1.0,
# 	min_samples_split=2,
# 	min_samples_leaf=1,
# 	max_depth=3,
# 	random_state=None,
# 	max_leaf_nodes=None
# )

## SKLhgb

In [None]:
# Default:
# generate_predictions(
# 	"SKLhgb",
# 	learning_rate=0.1,
# 	max_iter=100,
# 	max_leaf_nodes=31,
# 	max_depth=None,
# 	min_samples_leaf=20,
# 	l2_regularization=0.0,
# 	random_state=None
# )

# cv_grid_search("SKLhgb", grid={
# 	"learning_rate": [0.08],
# 	"max_depth": [80]
# 	"max_iter": [500],
# 	"max_leaf_nodes": [70, 75, 80],
# 	"min_samples_leaf": [15, 18, 20]
# }, k=5)

## XGBgb

In [None]:
# Default:
# generate_predictions(
# 	"XGBgb",
# 	n_estimators=100,
# 	max_depth=6,
# 	learning_rate=0.3,
# 	booster="gbtree",
# 	subsample=1.0,
# 	reg_alpha=0,
# 	reg_lambda=1,
# 	random_state=None
# )

# 5-fold grid search for XGBgb: sweep n_estimators, max_depth and reg_lambda; the other settings are fixed.
cv_grid_search(
	"XGBgb",
	grid=dict(
		n_estimators=[600, 800, 1000],
		max_depth=[8, 12, 16],
		learning_rate=[0.1],
		subsample=[0.8],
		reg_alpha=[0.1],
		reg_lambda=[2, 2.4],
	),
	k=5,
)

## CBgb

In [None]:
# Expected default:
# generate_predictions(
# 	"CBgb",
# 	learning_rate=0.03,
# 	subsample=0.8,
# 	max_depth=6,
# 	n_estimators=1000,
# 	reg_lambda=3.0,
# 	random_state=None
# )

# generate_predictions(
# 	"CBgb",
# 	learning_rate=0.1,
# 	subsample=0.8,
# 	max_depth=12,
# 	n_estimators=800,
# 	reg_lambda=3.6,
# 	random_state=None
# )

# 5-fold grid search for CBgb: only max_depth varies (12 vs 14).
cv_grid_search(
	"CBgb",
	grid=dict(
		learning_rate=[0.1],
		max_depth=[12, 14],
		n_estimators=[800],
		reg_lambda=[3.6],
	),
	k=5,
)

# cv_random_search("CBgb", grid={
# 	"learning_rate": uniform(0.001, 0.1),
# 	"max_depth": randint(1, 2),
# 	"n_estimators": randint(1, 2),
# }, k=2, n_iter=1)

# cv_simulated_annealing(model_name="CBgb", grid={
# 	"learning_rate": (0.06, 0.12),
# 	"max_depth": [12],
# 	"reg_lambda": (0.1, 1)
# })

## SKLstack

In [3]:
# Generate predictions with the SKLstack model, passing verbose=2.
generate_predictions("SKLstack", verbose=2)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


0:	learn: 0.6663046	total: 911ms	remaining: 911ms
1:	learn: 0.6526261	total: 1.8s	remaining: 0us
0:	learn: 0.6679540	total: 835ms	remaining: 835ms
1:	learn: 0.6512620	total: 1.62s	remaining: 0us
0:	learn: 0.6690208	total: 847ms	remaining: 847ms
1:	learn: 0.6545719	total: 1.64s	remaining: 0us
0:	learn: 0.6705788	total: 850ms	remaining: 850ms
1:	learn: 0.6533094	total: 1.61s	remaining: 0us
0:	learn: 0.6717868	total: 809ms	remaining: 809ms
1:	learn: 0.6552555	total: 1.55s	remaining: 0us
0:	learn: 0.6682308	total: 792ms	remaining: 792ms
1:	learn: 0.6524243	total: 1.59s	remaining: 0us
0:	learn: 0.6663046	total: 1.03s	remaining: 1.03s
1:	learn: 0.6526261	total: 1.99s	remaining: 0us
0:	learn: 0.6679540	total: 814ms	remaining: 814ms
1:	learn: 0.6512620	total: 1.57s	remaining: 0us
0:	learn: 0.6690208	total: 789ms	remaining: 789ms
1:	learn: 0.6545719	total: 1.51s	remaining: 0us
0:	learn: 0.6705788	total: 827ms	remaining: 827ms
1:	learn: 0.6533094	total: 1.64s	remaining: 0us
0:	learn: 0.6717868	t

In [2]:
# cv_grid_search("SKLstack", grid={
# 	"penalty": ["l2"],
# 	"C": [1]
# }, k=5)

# 2-fold search over a single SKLstack candidate (l2 penalty, C = 1.0).
cv_grid_search(
	"SKLstack",
	grid=dict(penalty=["l2"], C=[1.0]),
	k=2,
)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
0:	learn: 0.6720152	total: 585ms	remaining: 585ms
1:	learn: 0.6550579	total: 1.07s	remaining: 0us
0:	learn: 0.6741770	total: 442ms	remaining: 442ms
1:	learn: 0.6575064	total: 842ms	remaining: 0us
0:	learn: 0.6729465	total: 450ms	remaining: 450ms
1:	learn: 0.6532547	total: 855ms	remaining: 0us
0:	learn: 0.6689039	total: 416ms	remaining: 416ms
1:	learn: 0.6556181	total: 789ms	remaining: 0us
0:	learn: 0.6697402	total: 402ms	remaining: 402ms
1:	learn: 0.6538554	total: 791ms	remaining: 0us
0:	learn: 0.6684058	total: 395ms	remaining: 395ms
1:	learn: 0.6519758	total: 769ms	remaining: 0us
[CV 1/2] END final_estimator__C=1.0, final_estimator__penalty=l2;, score=0.544 total time= 2.4min
0:	learn: 0.6753126	total: 520ms	remaining: 520ms
1:	learn: 0.6582344	total: 992ms	remaining: 0us
0:	learn: 0.6728565	total: 442ms	remaining: 442ms
1:	learn: 0.6578969	total: 870ms	remaining: 0us
0:	learn: 0.6734101	total: 509ms	remaining: 509ms
1:	learn:

(StackingClassifier(estimators=[('SKLhgb', HistGradientBoostingClassifier()),
                                ('XGBgb',
                                 XGBClassifier(base_score=None, booster=None,
                                               callbacks=None,
                                               colsample_bylevel=None,
                                               colsample_bynode=None,
                                               colsample_bytree=None,
                                               device=None,
                                               early_stopping_rounds=None,
                                               enable_categorical=False,
                                               eval_metric=None,
                                               feature_types=None, gamma=None,
                                               grow_policy=None,
                                               importance_ty...
                                               