<p>
50.007 MACHINE LEARNING<br />
2024 SUMMER<br />
<b>COGITO</b><br />
</p>

https://sutd-1007485.notion.site/50-007-Project-Report
https://docs.google.com/spreadsheets/d/1xIcica8zDbkq8prRumnc9A9_lcsVXxXDGon1YXelMTQ/edit?usp=sharing

# Setup

In [3]:
from base import *
from grid_search import cv_grid_search
from random_search import cv_random_search
from simulated_annealing import SimulatedAnnealing, cv_simulated_annealing

# Send log records of level INFO and above to the file "log", each line prefixed with a
# timestamp truncated to the minute (YYYY-MM-DD HH:MM).
logging.basicConfig(filename="log", level=logging.INFO, format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M")

# Task 1

In [None]:
def σ(z):
	"""Return the logistic sigmoid 1 / (1 + exp(-z)) of z, element-wise.

	z may be a scalar or an array-like. The result has the same shape as z
	(a NumPy scalar for scalar input).

	The naive formula evaluates exp(-z), which overflows for large negative z.
	Here only exp(-|z|) is evaluated, which always lies in (0, 1], and the
	algebraically equivalent form is chosen per element according to the sign of z.
	"""
	z = np.asarray(z)
	decay = np.exp(-np.abs(z))
	# z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
	out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
	# Indexing with an empty tuple unwraps a 0-d array to a scalar and leaves n-d arrays as they are.
	return out[()]
# def ce_loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

def gradients_logreg(X, y, ŷ, w, R=None, λ=0):
	"""Return (dw, db), the gradients of the logistic-regression loss on one batch.

	X: (m, n) array of features.
	y: true labels for the m rows.
	ŷ: predicted probabilities for the m rows.
	w: current weights; only used by the regularisation term.
	R: None for no regularisation, "L2" or "L1" to add the matching penalty gradient.
	λ: regularisation strength; the penalty gradient is scaled by λ / m.

	If y and ŷ hold the same number of elements but have different shapes
	(for example y is (m,) and ŷ is (m, 1)), y is reshaped to match ŷ.
	Without this, ŷ - y would broadcast to an (m, m) matrix and produce
	gradients of the wrong shape instead of raising an error.
	"""
	m, _ = X.shape
	ŷ, y = np.asarray(ŷ), np.asarray(y)
	if y.shape != ŷ.shape and y.size == ŷ.size:
		y = y.reshape(ŷ.shape)
	residual = ŷ - y
	dw = 1/m * np.dot(X.T, residual)
	db = 1/m * np.sum(residual)
	if R == "L2":
		dw += λ * w / m
	elif R == "L1":
		dw += λ * np.sign(w) / m
	return dw, db

def train_logreg(X_train, y_train, τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	"""Fit logistic regression by gradient descent and return (w, b).

	X_train: (m, n) array of features.
	y_train: m labels, either 1-D or a column; coerced to an (m, 1) column.
	τ: number of epochs (full passes over the data).
	α: learning rate.
	G: "full-batch" (one update per epoch), "mini-batch" (one update per β
	   consecutive rows) or "stochastic" (one update per row).
	β: mini-batch size; only used when G == "mini-batch".
	R, λ: regularisation type and strength, forwarded to gradients_logreg.

	Returns w as an (n, 1) array and b as a scalar, both starting from zero.
	Rows are visited in their stored order; no shuffling is performed.

	Raises ValueError if G is not one of the three supported variants
	(previously an unknown G silently returned the untrained zero weights).
	"""
	m, n = X_train.shape
	# The three variants differ only in how many rows feed each update.
	if G == "full-batch":
		step = max(m, 1)  # range() rejects a step of 0 when there are no rows
	elif G == "mini-batch":
		step = β
	elif G == "stochastic":
		step = 1
	else:
		raise ValueError(f'Unknown gradient descent variant G={G!r}; expected "full-batch", "mini-batch" or "stochastic".')
	# Keep labels as a column so that ŷ - y is element-wise rather than an (m, m) broadcast.
	y_train = np.asarray(y_train).reshape(-1, 1)
	w, b = np.zeros((n, 1)), 0
	for _ in range(τ):
		for i in range(0, m, step):
			X_batch, y_batch = X_train[i:i+step], y_train[i:i+step]
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
			w, b = w - α*dw, b - α*db
	return w, b

def predict_logreg(wb_tuple, X):
	"""Return a 1-D integer array of class predictions for the rows of X.

	wb_tuple: the (w, b) pair, e.g. as returned by train_logreg.
	X: array of features, one row per sample.

	A sample is labelled 1 when its predicted probability σ(X·w + b) is
	strictly greater than 0.5, and 0 otherwise. The threshold is applied to
	the whole array at once rather than row by row in a Python loop.
	"""
	w, b = wb_tuple
	ŷ = σ(np.dot(X, w) + b)
	# Flatten the (m, 1) column of probabilities so the result is one label per sample.
	return (np.reshape(ŷ, -1) > 0.5).astype(int)

def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	"""Train logistic regression, predict on the test set and write the labels to CSV.

	All arguments are forwarded to train_logreg. The model is trained on the
	notebook-level X_train / y_train, predictions are made for X_test, and a
	two-column CSV ("id" from S_test, "label") is written to
	./predictions/logreg/predictions.csv. The elapsed wall-clock time is both
	logged and printed.
	"""
	started = time.time()
	weights_and_bias = train_logreg(np.array(X_train), np.array(y_train), τ, α, G, β, R, λ)
	labels = predict_logreg(weights_and_bias, np.array(X_test))
	output_dir = "./predictions/logreg/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, "predictions.csv")
	submission = pd.DataFrame({"id": S_test["id"], "label": labels})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Build the message once so the log entry and the console output cannot drift apart.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

In [None]:
# generate_predictions_logreg(τ=1, α=0.08, G="mini-batch", β=128, R=None, λ=0)
# generate_predictions_logreg(τ=1000, α=0.08, G="stochastic", R="L2", λ=0.5)

## Task 2
- F1 score for `c=100`: 0.55415.
- F1 score for `c=500`: 0.54648.
- F1 score for `c=1000`: 0.55766.
- F1 score for `c=2000`: 0.54434.

In [None]:
def apply_pca(x):
	"""Standardise the notebook-level X_train / X_test and project them with PCA.

	x selects the dimensionality:
	- 0 <= x <= 1: x is a cumulative explained-variance threshold; the smallest
	  number of leading components whose cumulative ratio reaches x is kept.
	- otherwise: x is the number of components to keep.

	Both the scaler and the PCA are fitted on the training data only and then
	applied to the test data.

	Returns (X_train_pca, X_test_pca, c, v), where c is the number of
	components kept and v is the requested threshold (threshold mode) or the
	total explained-variance ratio of the kept components (count mode).
	"""
	scaler = StandardScaler()
	X_train_scaled = scaler.fit_transform(X_train)
	X_test_scaled = scaler.transform(X_test)
	if 0 <= x <= 1:
		# x is variance threshold.
		pca = PCA(n_components=None)
		pca.fit(X_train_scaled)
		cumulative = np.cumsum(pca.explained_variance_ratio_)
		reached = cumulative >= x
		# argmax of an all-False mask is 0, which would wrongly give c = 1; if rounding
		# keeps the cumulative sum just below x, fall back to keeping every component.
		c = int(np.argmax(reached)) + 1 if reached.any() else len(cumulative)
		v = x
		keep = c
	else:
		# x is number of components.
		pca = PCA(n_components=x)
		pca.fit(X_train_scaled)
		c = x
		v = sum(pca.explained_variance_ratio_)
		keep = None  # the PCA already outputs exactly x components
	# Transform train and test datasets. In threshold mode the PCA was fitted with all
	# components, so only the first c columns are kept to actually reduce the dimension.
	X_train_pca = pca.transform(X_train_scaled)[:, :keep]
	X_test_pca = pca.transform(X_test_scaled)[:, :keep]
	return X_train_pca, X_test_pca, c, v

def train_SKLknn(X_train, y_train, k=5):
	"""Return a KNeighborsClassifier with n_neighbors=k, fitted on (X_train, y_train)."""
	knn = KNeighborsClassifier(n_neighbors=k)
	knn.fit(X_train, y_train)
	return knn

def predict_SKLknn(model, X):
	"""Return whatever model.predict(X) returns for the given fitted model."""
	predictions = model.predict(X)
	return predictions

def generate_predictions_pcaknn(x, k=2):
	"""Run PCA + k-nearest-neighbours and write the test-set predictions to CSV.

	x: forwarded to apply_pca (a variance threshold in [0, 1] or a component count).
	k: number of neighbours for the classifier; defaults to 2, the value that
	   was previously hard-coded.

	Writes a two-column CSV ("id" from the notebook-level S_test, "label") to
	./predictions/pcaknn/predictions.csv, then logs and prints the elapsed
	wall-clock time.
	"""
	start_time = time.time()
	# apply_pca also returns the component count and variance, which are not needed here.
	X_train_pca, X_test_pca, _, _ = apply_pca(x)
	model = train_SKLknn(X_train_pca, y_train, k=k)
	predictions = predict_SKLknn(model, X_test_pca)
	output_dir = "./predictions/pcaknn/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, "predictions.csv")
	pd.DataFrame({"id": S_test["id"], "label": predictions}).to_csv(file_name, index=False)
	elapsed = time.time() - start_time
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

In [None]:
# generate_predictions_pcaknn(100)
# generate_predictions_pcaknn(500)
# generate_predictions_pcaknn(1000)
# generate_predictions_pcaknn(2000)
# generate_predictions_pcaknn(5000)

## Task 3
- We implemented the models listed below, as well as several others that are no longer in this notebook: Scikit-Learn Gradient Boosting Classifier (SKLgb), Scikit-Learn K-Neighbors Classifier (SKLknn), Scikit-Learn Support Vector Classifier (SKLsvm), Scikit-Learn Random Forest Classifier (SKLrf), and Scikit-Learn Stochastic Gradient Descent Classifier (SKLsgd).
- Specific model parameters tried and performance metrics are viewable via the links above.

## SKLet

In [None]:
# generate_predictions(
# 	"SKLet",
# 	n_estimators=100,
# 	criterion="gini",
# 	max_depth=None,
# 	min_samples_split=2,
# 	min_samples_leaf=1,
# 	min_weight_fraction_leaf=0.0,
# 	max_features="sqrt",
# 	max_leaf_nodes=None,
# 	min_impurity_decrease=0.0,
# 	bootstrap=False,
# 	random_state=None,
# 	class_weight=None,
# 	ccp_alpha=0.0,
# 	max_samples=None
# )

## SKLlsvm

In [None]:
# generate_predictions(
# 	"SKLlsvm",
# 	penalty="l2",
# 	loss="squared_hinge",
# 	tol=0.0001,
# 	dual="auto",
# 	C=1.0,
# 	random_state=None,
# 	max_iter=1000
# )

## SKLmnb

In [None]:
# generate_predictions(
# 	"SKLmnb",
# 	alpha=1.0,
# 	fit_prior=True
# )

## SKLlogreg

In [None]:
# generate_predictions(
# 	"SKLlogreg",
# 	C=1.0,
# 	max_iter=100
# )

## SKLhgb

In [None]:
# generate_predictions(
# 	"SKLhgb",
# 	learning_rate=0.1,
# 	max_iter=100,
# 	max_leaf_nodes=31,
# 	max_depth=None,
# 	min_samples_leaf=20,
# 	l2_regularization=0.0,
# 	random_state=None
# )

## XGBgb

In [None]:
# generate_predictions(
# 	"XGBgb",
# 	n_estimators=100,
# 	max_depth=6,
# 	learning_rate=0.3,
# 	booster="gbtree",
# 	subsample=1.0,
# 	reg_alpha=0,
# 	reg_lambda=1,
# 	random_state=None
# )

## CBgb

In [None]:
# generate_predictions(
# 	"CBgb",
# )

## SKLstack
After cross-validation for tuning hyperparameters, we stacked various combinations of our best performing models together (with a generic LogisticRegression as the final/meta model each time):
1. SKLlogreg with `(C=0.64, class_weight="balanced")`
2. SKLlsvm with `(penalty="l2", loss="hinge", dual=True, tol=1e-7, C=0.3425, class_weight="balanced", max_iter=5000)`
3. CBgb with `(iterations=900, learning_rate=0.1, depth=6, rsm=0.8, auto_class_weights="Balanced", random_strength=1.2, bootstrap_type="MVS", subsample=0.8)`
4. SKLet with `(n_estimators=150, max_depth=600, min_samples_split=160, min_impurity_decrease=0.00001, bootstrap=True, class_weight="balanced_subsample", ccp_alpha=0.00001, max_samples=0.9)`
5. SKLmnb with `(alpha=1.38, fit_prior=False)`

Each stacked combination generates its own predictions file. To form the final predictions, the label for each test sample is picked uniformly at random from the labels that the different stacks predicted for that sample.

In [2]:
# generate_predictions(
# 	"SKLstack"
# )

In [None]:
# np.random.seed(int.from_bytes(os.urandom(8), "big"))
# input_dir = "predictions/SKLstack"
# input_paths = glob.glob(f"{input_dir}/*.csv")
# dfs = [pd.read_csv(file) for file in input_paths]
# final_predictions = []
# for i in range(len(dfs[0])):
# 	predictions = [df.loc[i, "label"] for df in dfs]
# 	final_predictions.append(np.random.choice(predictions))
# pd.DataFrame({"id": dfs[0]["id"], "label": final_predictions}).to_csv("final_predictions.csv", index=False)