# Cogito

## Definitions
- `m` is # of data points.
- `n` is # of features.
- `X` is `m`-by-`n` matrix representing features.
- `y` is `m`-by-1 vector representing labels.
- `w` is `n`-by-1 vector representing weights.
- `b` is scalar representing bias.
- `dw` is loss gradient w.r.t. `w`.
- `db` is loss gradient w.r.t. `b`.

### logreg(τ,α,G,β,R,λ)
1. `τ` is # of epochs: non-negative integer.
2. `α` is learning rate: float in range ${[0,1]}$.
3. `G` is gradient descent type: `"full-batch"`, `"mini-batch"`, xor `"stochastic"`.
4. `β` is batch size: non-negative integer xor `None`.
5. `R` is regularisation type: `"L2"`, `"L1"`, xor `None`.
6. `λ` is regularisation degree: real in range ${[0,\infin)}$.

### pcaknn(ρ,ν)
1. `ρ` is # of principal components: integer in range ${[0,n]}$.
2. `ν` is variance ratio: float in range ${[0,1]}$.

### svm(λ,K,p,γ,κ,τ,seed)
1. `λ` is margin regularisation degree.
2. `K` is kernel type: `"linear"`, `"poly"`, `"rbf"`, `"sigmoid"`, xor `"precomputed"`.
3. `p` is polynomial kernel degree for polynomial kernel type: non-negative integer.
4. `γ` is kernel coefficient for RBF, polynomial, xor sigmoid kernel type: `"scale"`, `"auto"`, xor float in range ${[0,1]}$.
5. `κ` is independent term of kernel function with effect for polynomial xor sigmoid kernel type.
6. `τ` is maximum # of iterations: non-negative integer xor `-1`.
7. `seed` is pseudorandom random state: integer xor `None`.

#### rforest(η,C,Δ,ψ,ℓ,θ,δ,B,seed)
1. `η` is # of estimators: non-negative integer.
2. `C` is criterion to measure split quality: `"entropy"`, `"log_loss"`, xor `"gini"`.
3. `Δ` is maximum tree depth: non-negative integer xor `None`.
4. `ψ` is minimum # of samples needed to split internal node: non-negative integer.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer.
6. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.
7. `δ` is minimum impurity decrease: float in range ${[0,1]}$.
8. `B` is whether bootstrapping is used: `False` xor `True`.
9. `seed` is pseudorandom random state: integer xor `None`.

### gbdtree(α,η,ss,ψ,ℓ,Δ,δ,seed,θ)
1. `α` is learning rate: float in range ${[0,1]}$.
2. `η` is # of estimators: non-negative integer.
3. `ss` is fraction of samples used to fit individual base learners: float in range ${[0,1]}$.
4. `ψ` is minimum # of samples needed to split internal node: non-negative integer.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer.
6. `Δ` is maximum tree depth: non-negative integer xor `None`.
7. `δ` is minimum impurity decrease: float in range ${[0,1]}$.
8. `seed` is pseudorandom random state: integer xor `None`.
9. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.

#### hgbdtree(α,τ,θ,Δ,l,seed)
1. `α` is learning rate: float in range ${[0,1]}$.
2. `τ` is maximum # of iterations: non-negative integer.
3. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.
4. `Δ` is maximum tree depth: non-negative integer xor `None`.
5. `l` is minimum # of samples needed at leaf node: non-negative integer xor `None`.
6. `seed` is pseudorandom random state: integer xor `None`.


## Setup
- numpy, pandas, and sklearn (scikit-learn) must be installed and available.

In [9]:
# Import os for operating system functions.
import os
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time

# Import KFold and f1_score for cross-validation.
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# Import PCA and KNeighborsClassifier for pcaknn.
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
# Import RandomForestClassifier for rforest.
from sklearn.ensemble import RandomForestClassifier
# Import SVC for svm.
from sklearn.svm import SVC
# Import GradientBoostingClassifier for gbdtree.
from sklearn.ensemble import GradientBoostingClassifier
# Import HistGradientBoostingClassifier for hgbdtree.
from sklearn.ensemble import HistGradientBoostingClassifier
# Import xgboost for xgbdtree.
import xgboost as xgb

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.preprocessing import StandardScaler

# Initialise file logging configuration.
logging.basicConfig(
	filename="main.log",
	level=logging.INFO,
	format="%(asctime)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S"
)

# Load train and test datasets.
S_train = pd.read_csv("./data/train.csv")
S_train_tfidf = pd.read_csv("./data/train_tfidf_features.csv")
S_test = pd.read_csv("./data/test.csv")
S_test_tfidf = pd.read_csv("./data/test_tfidf_features.csv")

# Extract train features, train labels, and test features.
X_train = S_train_tfidf.iloc[:, 2:].values
y_train = S_train["label"].values.reshape(-1, 1)
X_test = S_test_tfidf.iloc[:, 1:].values

# Return σ(z) for some z.
def σ(z): return 1 / (1 + np.exp(-z))

# Return cross-entropy loss for some y and ŷ.
def loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

# Return PCA-transformed train and test featuresets, for either some ν or ρ.
def apply_pca(mode_value, mode):
	# Instantiate PCA and StandardScaler objects.
	pca = PCA(n_components=mode_value)
	scaler = StandardScaler()
	# PCA-transform train and test featuresets.
	X_train_pca = pca.fit_transform(scaler.fit_transform(X_train))
	X_test_pca = pca.transform(scaler.transform(X_test))
	# Return PCA-transformed featuresets and # of components.
	return X_train_pca, X_test_pca, pca.n_components_

def crossvalidate(train_fn, predict_fn, grid, X, y, k=2):
	kf = KFold(n_splits=k, shuffle=True, random_state=7)
	best_score = -np.inf
	best_hyperparameters = None
	for hyperparameters in grid:
		scores = []
		for train_index, val_index in kf.split(X):
			X_train, X_val = X[train_index], X[val_index]
			y_train, y_val = y[train_index], y[val_index]
			print(hyperparameters)
			model = train_fn(X_train, y_train, **hyperparameters)
			y_pred = predict_fn(model, X_val)
			scores.append(f1_score(y_val, y_pred, average='macro'))
		mean_score = np.mean(scores)
		if mean_score > best_score:
			best_score = mean_score
			best_hyperparameters = hyperparameters
	return best_hyperparameters, best_score

def crossvalidate_and_generate_predictions(model_name, grid, k=5):
	train_fn = globals().get(f"train_{model_name.lower()}")
	predict_fn = globals().get(f"predict_{model_name.lower()}")
	generate_predictions_fn = globals().get(f"generate_predictions_{model_name.lower()}")

	if not train_fn or not predict_fn or not generate_predictions_fn:
		raise ValueError(f"Functions for '{model_name}' not found. Ensure that 'train_{model_name.lower()}', 'predict_{model_name.lower()}', and 'generate_predictions_{model_name.lower()}' exist.")

	best_hyperparameters, best_score = crossvalidate(train_fn, predict_fn, grid, X_train, y_train, k=k)

	print(f"Best hyperparameters for {model_name}:", best_hyperparameters)
	print(f"Best cross-validation f1 score for {model_name}:", best_score)

	generate_predictions_fn(**best_hyperparameters)

## Task 1
### Logistic Regression Model (`logreg`)

In [2]:
# Return dw and db, for some X, y, ŷ, w, λ, and R.
def gradients_logreg(X, y, ŷ, w, R=None, λ=0):
	m, _ = X.shape
	dw = 1/m * np.dot(X.T, (ŷ - y))
	db = 1/m * np.sum(ŷ - y)
	if R == "L2":
		dw += λ * w / m
	elif R == "L1":
		dw += λ * np.sign(w) / m
	return dw, db

# Return w and b from gradient descent on X and y, for some τ, α, G, β, R, and λ.
def train_logreg(X_train, y_train, τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	m, n = X_train.shape
	w, b = np.zeros((n, 1)), 0
	for epoch in range(τ):
		if G == "full-batch":
			X_batch, y_batch = X_train, y_train
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
			w, b = w - α*dw, b - α*db
		elif G == "mini-batch":
			for i in range(0, m, β):
				X_batch, y_batch = X_train[i:i+β], y_train[i:i+β]
				ŷ = σ(np.dot(X_batch, w) + b)
				dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
				w, b = w - α*dw, b - α*db
		elif G == "stochastic":
			for i in range(m):
				X_batch, y_batch = X_train[i:i+1], y_train[i:i+1]
				ŷ = σ(np.dot(X_batch, w) + b)
				dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
				w, b = w - α*dw, b - α*db
	return w, b

# Return array of predictions, where each prediction is 1 if corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_logreg(wb_tuple, X):
	w, b = wb_tuple
	ŷ = σ(np.dot(X, w) + b)
	return np.array([1 if p > 0.5 else 0 for p in ŷ])

# Train model, make model predictions, and save model predictions to CSV file, for some β, τ, α, λ, L, and G.
def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	start_time = time.time()
	w, b = train_logreg(np.array(X_train), np.array(y_train), τ, α, G, β, R, λ)
	predictions = predict_logreg((w, b), np.array(X_test))
	os.makedirs("./predictions/logreg/", exist_ok=True)
	file_name = os.path.join("./predictions/logreg/", f"τ={τ},α={α},G={G},β={β},R={R},λ={λ}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

crossvalidate_and_generate_predictions(
	model_name="logreg",
	grid=[
		{"τ": 1, "α": 0.0825, "G": "mini-batch", "β": 128, "R": None, "λ": 0},
		{"τ": 2, "α": 0.085, "G": "full-batch", "β": 64, "R": "L2", "λ": 0.1},
	],
	k=5
)

Best hyperparameters for logreg: {'τ': 1, 'α': 0.0825, 'G': 'mini-batch', 'β': 128, 'R': None, 'λ': 0}
Best cross-validation f1 score for logreg: 0.38224160599504164
Predictions file ./predictions/logreg/τ=1,α=0.0825,G=mini-batch,β=128,R=None,λ=0.csv generated in 1.00s.


In [3]:
# def train_logreg(τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
# 	m, n = X_train.shape
# 	w, b = np.zeros((n, 1)), 0
# 	for epoch in range(τ):
# 		if G == "full-batch":
# 			X_batch, y_batch = X_train, y_train
# 			ŷ = σ(np.dot(X_batch, w) + b)
# 			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
# 			w, b = w - α*dw, b - α*db
# 		elif G == "mini-batch":
# 			for i in range(0, m, β):
# 				X_batch, y_batch = X_train[i:i+β], y_train[i:i+β]
# 				ŷ = σ(np.dot(X_batch, w) + b)
# 				dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
# 				w, b = w - α*dw, b - α*db
# 		elif G == "stochastic":
# 			for i in range(m):
# 				X_batch, y_batch = X_train[i:i+1], y_train[i:i+1]
# 				ŷ = σ(np.dot(X_batch, w) + b)
# 				dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
# 				w, b = w - α*dw, b - α*db
# 	return w, b

# def predict_logreg(w, b):
# 	# Compute ŷ = σ(w ⋅ X + b).
# 	ŷ = σ(np.dot(X_test, w) + b)
# 	return [1 if p > 0.5 else 0 for p in ŷ]

# def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
# 	start_time = time.time()
# 	w, b = train_logreg(τ, α, G, β, R, λ)
# 	os.makedirs("./predictions/logreg/", exist_ok=True)
# 	file_name = os.path.join("./predictions/logreg/", f"τ={τ},α={α},G={G},β={β},R={R},λ={λ}.csv")
# 	pd.DataFrame({
# 		"id": S_test["id"],
# 		"label": predict_logreg(w, b)
# 	}).to_csv(file_name, index=False)
# 	end_time = time.time()
# 	# Log and print success message.
# 	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
# 	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

## Task 2
### PCA-KNN Model (`pcaknn`)

In [4]:
# Train model, make model predictions, and save model predictions to CSV file, for some ν or ρ.
def generate_predictions_pcaknn(mode_value, mode="ρ"):
	# Validate mode and mode value.
	if mode not in ["ν", "ρ"] or (mode == "ν" and not (0 < mode_value <= 1)) or (mode == "ρ" and not (mode_value > 1 and isinstance(mode_value, int))): raise ValueError("Mode must either be ν with value between 0 and 1, or ρ with mode_value as an integer greater than 1.")

	start_time = time.time()
	X_train_pca, X_test_pca, n_components = apply_pca(mode_value, mode)
	knn = KNeighborsClassifier(n_neighbors=2)
	knn.fit(X_train_pca, y_train)
	predictions = knn.predict(X_test_pca)
	os.makedirs("./predictions/pcaknn/", exist_ok=True)
	if mode == "ν":
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ν={mode_value},ρ={n_components}).csv")
	else:
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ρ={n_components}).csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()

	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_pcaknn(mode_value=0.95, mode="ν")
# generate_predictions_pcaknn(mode_value=1000, mode="ρ")

## Task 3: Other Models

### Random Forest Model (`rforest`)

In [5]:
# Train RandomForest classifier, make predictions, and save predictions to CSV file.
def generate_predictions_rforest(η=100, C="gini", Δ=None, ψ=2, ℓ=1, θ=None, δ=0.0, B=True, seed=None):
	start_time = time.time()
	rf = RandomForestClassifier(
		n_estimators=η,
		criterion=C,
		max_depth=Δ,
		min_samples_split=ψ,
		min_samples_leaf=ℓ,
		max_leaf_nodes=θ,
		min_impurity_decrease=δ,
		bootstrap=B,
		n_jobs=-1,
		random_state=seed
	)
	rf.fit(X_train, y_train.ravel())
	predictions = rf.predict(X_test)
	os.makedirs("./predictions/rforest/", exist_ok=True)
	file_name = os.path.join("./predictions/rforest/", f"η={η},C={C},Δ={Δ},ψ={ψ},ℓ={ℓ},θ={θ},δ={δ},B={B},seed={seed}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_rforest(η=100, C="gini", Δ=None, ψ=2, ℓ=1, θ=None, δ=0.0, B=True, seed=None)

### SVM Model (`svm`)

In [6]:
# Train SVM classifier, make predictions, and save predictions to CSV file.
def generate_predictions_svm(λ=1.0, K="rbf", p=3, γ="scale", κ=0.0, τ=-1, seed=None):
	start_time = time.time()
	svm_clf = SVC(
		C=λ,
		kernel=K,
		degree=p,
		gamma=γ,
		coef0=κ,
		max_iter=τ,
		random_state=seed
	)
	svm_clf.fit(X_train, y_train.ravel())
	predictions = svm_clf.predict(X_test)
	os.makedirs("./predictions/svm/", exist_ok=True)
	file_name = os.path.join("./predictions/svm/", f"λ={λ},K={K},p={p},γ={γ},κ={κ},τ={τ},seed={seed}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_svm(λ=1.0, K="rbf", p=3, γ="scale", κ=0.0, τ=-1, seed=None)

### Gradient Boosted Decision Tree Model (`gbdtree`)

In [7]:
def generate_predictions_gbdtree(α=0.1, η=100, ss=1.0, ψ=2, ℓ=1, Δ=3, δ=0.0, seed=None, θ=None):
	start_time = time.time()
	gbc = GradientBoostingClassifier(
		learning_rate=α,
		n_estimators=η,
		subsample=ss,
		min_samples_split=ψ,
		min_samples_leaf=ℓ,
		max_depth=Δ,
		min_impurity_decrease=δ,
		random_state=seed,
		max_leaf_nodes=θ
	)
	gbc.fit(X_train, y_train)
	predictions = gbc.predict(X_test)
	os.makedirs("./predictions/gbdtree/", exist_ok=True)
	file_name = os.path.join("./predictions/gbdtree/", f"α={α},η={η},ss={ss},ψ={ψ},ℓ={ℓ},Δ={Δ},δ={δ},seed={seed},θ={θ}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

### Histogram Gradient Boosted Decision Tree Model (`hgbdtree`)

In [11]:
def train_hgbdtree(X_train, y_train, α=0.1, τ=100, θ=31, Δ=None, l=20, seed=None):
	hgbc = HistGradientBoostingClassifier(
		learning_rate=α,
		max_iter=τ,
		max_leaf_nodes=θ,
		max_depth=Δ,
		min_samples_leaf=l,
		random_state=seed
	)
	hgbc.fit(X_train, y_train)
	return hgbc

def predict_hgbdtree(model, X):
	return model.predict(X)

def generate_predictions_hgbdtree(α=0.1, τ=100, θ=31, Δ=None, l=20, seed=None):
	start_time = time.time()
	hgbc = train_hgbdtree(np.array(X_train), np.array(y_train), α, τ, θ, Δ, l, seed)
	predictions = predict_hgbdtree(hgbc, np.array(X_test))
	os.makedirs("./predictions/hgbdtree/", exist_ok=True)
	file_name = os.path.join("./predictions/hgbdtree/", f"α={α},τ={τ},θ={θ},Δ={Δ},l={l},seed={seed}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

crossvalidate_and_generate_predictions(
	model_name="hgbdtree",
	grid=[
		{"α": 0.1, "τ": 1, "θ": 31, "Δ": None, "l": 20, "seed": 42},
		{"α": 0.1, "τ": 2, "θ": 31, "Δ": None, "l": 20, "seed": 42}
	],
	k=5
)

{'α': 0.1, 'τ': 1, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 1, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 1, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 1, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 1, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 2, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 2, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 2, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 2, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


{'α': 0.1, 'τ': 2, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}


  y = column_or_1d(y, warn=True)


Best hyperparameters for hgbdtree: {'α': 0.1, 'τ': 2, 'θ': 31, 'Δ': None, 'l': 20, 'seed': 42}
Best cross-validation f1 score for hgbdtree: 0.3840559402758433


  y = column_or_1d(y, warn=True)


Predictions file ./predictions/hgbdtree/α=0.1,τ=2,θ=31,Δ=None,l=20,seed=42.csv generated in 3.86s.


### K-Nearest Neighbours Model (`knn`)

In [None]:
def generate_predictions_knn(k=5, W="uniform", algo="auto", ℓ=30, p=2, 𝓂="minkowski"):
	start_time = time.time()
	knn = KNeighborsClassifier(
		n_neighbors=k,
		weights=W,
		algorithm=algo,
		leaf_size=ℓ,
		p=p,
		metric=𝓂,
		n_jobs=-1
	)
	knn.fit(X_train, y_train)
	predictions = knn.predict(X_test)
	os.makedirs("./predictions/knn/", exist_ok=True)
	file_name = os.path.join("./predictions/knn/", f"k={k},W={W},algo={algo},ℓ={ℓ},p={p},𝓂={𝓂}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

### XGBoost Model (`xgboost`)

In [None]:
def generate_predictions_xgbdtree(η=100, Δ=3, θ=0, G="depthwise", α=0.1, O="binary:logistic", B="gbtree", γ=0, ss=1, λ=0, Λ=1, seed=None):
	start_time = time.time()
	xgb_model = xgb.XGBClassifier(
		n_estimators=η,
		max_depth=Δ,
		max_leaves=θ if θ > 0 else None,
		grow_policy=G,
		learning_rate=α,
		objective=O,
		booster=B,
		gamma=γ,
		subsample=ss,
		reg_alpha=λ,
		reg_lambda=Λ,
		random_state=seed,
		n_jobs=-1
	)
	xgb_model.fit(X_train, y_train.ravel())
	predictions = xgb_model.predict(X_test)
	os.makedirs("./predictions/xgboost/", exist_ok=True)
	file_name = os.path.join("./predictions/xgboost/", f"η={η},Δ={Δ},θ={θ},G={G},α={α},O={O},B={B},γ={γ},ss={ss},λ={λ},Λ={Λ},seed={seed}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

### Voting Area

In [None]:
def generate_voted_predictions(models_params, scores):
	prediction_files = []
	for model, params in models_params:
		if model == "hgbdtree":
			file_name = os.path.join("./predictions/hgbdtree/", f"α={params["α"]},τ={params["τ"]},θ={params["θ"]},Δ={params["Δ"]},ℓ={params["ℓ"]},seed={params["seed"]}.csv")
			if not os.path.exists(file_name):
				file_name = generate_predictions_hgbdtree(**params)
		elif model == "logreg":
			file_name = os.path.join("./predictions/logreg/", f"τ={params["τ"]},α={params["α"]},G={params["G"]},β={params["β"]},R={params["R"]},λ={params["λ"]}.csv")
			if not os.path.exists(file_name):
				file_name = generate_predictions_logreg(**params)
		prediction_files.append(file_name)
	predictions_list = []
	ids = None
	for file in prediction_files:
		df = pd.read_csv(file)
		if ids is None: ids = df["id"].values
		predictions_list.append(df["label"].values)

	if not all(len(pred) == len(ids) for pred in predictions_list): raise ValueError("Mismatch in number of predictions across files")

	predictions_array = np.array(predictions_list)
	probabilities = np.array(scores) / np.sum(scores)
	final_predictions = []
	for i in range(len(ids)):
		model_predictions = predictions_array[:, i]
		weighted_votes = np.zeros(2)
		for j in range(len(predictions_list)):
			weighted_votes[int(model_predictions[j])] += probabilities[j]
		final_label = np.argmax(weighted_votes)
		final_predictions.append(final_label)
	os.makedirs("./predictions/", exist_ok=True)
	file_name = os.path.join("./predictions/", "probabilistic_voting.csv")
	submission = pd.DataFrame({
		"id": ids,
		"label": final_predictions
	})
	submission.to_csv(file_name, index=False)
	print(f"Probabilistic voting predictions saved to {file_name}")

models_params = [
	("logreg", {"τ": 1000, "α": 0.085, "G": "mini-batch", "β": 128, "R": None, "λ": 0}),
	("hgbdtree", {"α": 0.1, "τ": 100, "θ": None, "Δ": None, "ℓ": 20, "seed": 7}),
	("hgbdtree", {"α": 0.1, "τ": 1000, "θ": 31, "Δ": None, "ℓ": 20, "seed": 8})
]
scores = [0.70475, 0.71533, 0.71628]

generate_voted_predictions(models_params, scores)