# Machine Learning Project

## Setup

In [14]:
from base import *

# Imports for the cross-validation helpers (grid search, randomized search, simulated annealing).
from grid_search import cv_grid_search
from randomsearch import cv_randomized_search
from simulated_annealing import SimulatedAnnealing, cv_simulated_annealing

# Log to main.log with an explicit UTF-8 encoding: the prediction file names
# written to the log contain non-ASCII characters (τ, α, β, λ, ρ, ν), which the
# platform-default encoding is not guaranteed to represent.
# delay=True postpones opening the file until the first record is emitted, so
# re-running this cell (where basicConfig is a no-op because the root logger is
# already configured) does not leave an extra open file handle behind.
logging.basicConfig(
	handlers=[logging.FileHandler("main.log", encoding="utf-8", delay=True)],
	level=logging.INFO,
	format="%(asctime)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S"
)

def train_model(model_type, X_train, y_train, **kwargs):
	"""Instantiate and fit the model registered under `model_type`.

	The (model class, default hyperparameters) pair is looked up in
	MODEL_TYPE_TO_CLASS_TO_HYPERPARAMETER_MAP. Keyword arguments are merged
	over the defaults, so a key given in `kwargs` wins.

	Returns the fitted model instance.
	Raises KeyError (from the lookup) if `model_type` is not registered.
	"""
	model_class, default_params = MODEL_TYPE_TO_CLASS_TO_HYPERPARAMETER_MAP[model_type]
	# Copy the defaults first so the registry entry itself is never mutated.
	hyperparameters = dict(default_params)
	hyperparameters.update(kwargs)
	fitted = model_class(**hyperparameters)
	fitted.fit(X_train, y_train)
	return fitted

def predict_model(model, X):
	"""Return whatever `model.predict` produces for the samples in `X`."""
	predictions = model.predict(X)
	return predictions

def generate_predictions(model_type, **kwargs):
	"""Train `model_type` on the training data and save its test predictions as CSV.

	Reads the notebook-level X_train, y_train, X_test and S_test. The model is
	built by train_model(model_type, ..., **kwargs), so `kwargs` are the
	hyperparameter overrides. The CSV (columns "id" and "label") is written to
	./predictions/<model_type>/ and named after the overrides joined as
	"key=value" pairs with "_"; with no overrides it is named "default.csv".
	The elapsed time is both logged and printed. Returns None.
	"""
	start_time = time.time()
	# A single-column 2-D target makes scikit-learn emit its column-vector
	# warning (see the `column_or_1d(y, warn=True)` output in this notebook);
	# flatten exactly that case and leave any other shape untouched.
	# NOTE(review): assumes every registered model accepts a 1-D target — confirm.
	y = np.array(y_train)
	if y.ndim == 2 and y.shape[1] == 1:
		y = y.ravel()
	model = train_model(model_type, np.array(X_train), y, **kwargs)
	predictions = predict_model(model, np.array(X_test))
	output_dir = f"./predictions/{model_type}/"
	os.makedirs(output_dir, exist_ok=True)
	# With no overrides the joined name is empty, which would yield a hidden
	# ".csv" file; fall back to an explicit name instead.
	stem = "_".join(f"{k}={v}" for k, v in kwargs.items()) or "default"
	file_name = os.path.join(output_dir, stem + ".csv")
	pd.DataFrame({"id": S_test["id"], "label": predictions}).to_csv(file_name, index=False)
	elapsed = time.time() - start_time
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

ImportError: cannot import name 'cv_randomized_search' from 'randomsearch' (c:\Users\Du Bowei\Desktop\Downloads\cogito-main-kfold\randomsearch.py)

## Task 1

### logreg Model

In [None]:
def σ(z):
	"""Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

	Accepts a scalar or array-like and returns a scalar or array of the same
	shape. The value is computed from exp(-|z|), which never exceeds 1, so
	large-magnitude inputs cannot overflow the exponential.
	"""
	z = np.asarray(z)
	e = np.exp(-np.abs(z))
	# z >= 0: 1 / (1 + e^-z).  z < 0: e^z / (1 + e^z), the same value rewritten
	# so that only a non-positive exponent is ever evaluated.
	# The trailing [()] turns a 0-d result back into a scalar; arrays pass through.
	return np.where(z >= 0, 1 / (1 + e), e / (1 + e))[()]
def bce_loss(y, ŷ, ε=1e-15):
	"""Return the mean binary cross-entropy of probabilities `ŷ` against labels `y`.

	The sum is divided by y.shape[0]. `ŷ` is clipped to [ε, 1 - ε] before the
	logarithms are taken, so predictions that are exactly 0 or 1 contribute a
	large finite loss instead of inf/nan.
	"""
	ŷ = np.clip(ŷ, ε, 1 - ε)
	return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

# Return dw and db, for some X, y, ŷ, w, R, and λ.
def gradients_logreg(X, y, ŷ, w, R=None, λ=0):
	"""Return (dw, db), the batch-averaged gradients for the weights and bias.

	X is the (m, n) batch, y the labels, ŷ the predicted probabilities and w
	the current weights. R selects the penalty added to dw: "L2" adds
	λ * w / m, "L1" adds λ * sign(w) / m, and any other value adds nothing.

	Raises ValueError if y cannot be reshaped to the shape of ŷ.
	"""
	m, _ = X.shape
	# Align y with ŷ before subtracting: a 1-D y against a column ŷ would
	# otherwise broadcast the difference to an (m, m) matrix and silently
	# produce gradients of the wrong shape.
	residual = ŷ - np.asarray(y).reshape(np.shape(ŷ))
	dw = 1/m * np.dot(X.T, residual)
	db = 1/m * np.sum(residual)
	if R == "L2":
		dw += λ * w / m
	elif R == "L1":
		dw += λ * np.sign(w) / m
	return dw, db

# Return (w, b) from gradient descent on X_train and y_train, for some τ, α, G, β, R, and λ.
def train_logreg(X_train, y_train, τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	"""Fit logistic-regression parameters by gradient descent and return (w, b).

	X_train is the (m, n) feature matrix and y_train holds the m labels.
	τ is the number of passes over the data and α the step size. G picks the
	update scheme: "full-batch" (one update per pass), "mini-batch" (one update
	per consecutive slice of β rows) or "stochastic" (one update per row).
	R and λ are forwarded to gradients_logreg as the penalty type and strength.

	w is returned with shape (n, 1); both w and b start at zero.

	Raises ValueError for an unknown G, for β < 1 in mini-batch mode, or when
	y_train does not contain exactly m labels.
	"""
	m, n = X_train.shape
	# All three schemes are the same loop with a different slice length.
	batch_sizes = {"full-batch": m, "mini-batch": β, "stochastic": 1}
	if G not in batch_sizes:
		# Previously an unrecognised G skipped every update and returned zeros.
		raise ValueError(f"Unknown gradient descent type G={G!r}; expected one of {list(batch_sizes)}.")
	if G == "mini-batch" and β < 1:
		raise ValueError(f"Mini-batch size β must be at least 1, got {β!r}.")
	# max(..., 1) keeps range() valid when the training set is empty (m == 0).
	step = max(batch_sizes[G], 1)
	# Work with a column of labels so that ŷ - y stays (batch, 1) and never
	# broadcasts against a 1-D label vector.
	y_train = np.asarray(y_train).reshape(m, 1)
	w, b = np.zeros((n, 1)), 0
	for _ in range(τ):
		for i in range(0, m, step):
			X_batch, y_batch = X_train[i:i+step], y_train[i:i+step]
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, R, λ)
			w, b = w - α*dw, b - α*db
	return w, b

# Return array of predictions, where each prediction is 1 if corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_logreg(wb_tuple, X):
	"""Return a 1-D integer array of class predictions for the rows of X.

	`wb_tuple` is the (w, b) pair; a row is labelled 1 when σ(X·w + b) is
	strictly greater than 0.5 and 0 otherwise.
	"""
	w, b = wb_tuple
	ŷ = σ(np.dot(X, w) + b)
	# Threshold the whole array at once instead of looping over it in Python,
	# then flatten the column of probabilities into one label per row.
	return (np.asarray(ŷ) > 0.5).astype(int).reshape(-1)

# Train model, make predictions, and save predictions to CSV file.
def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, R=None, λ=0):
	"""Fit the hand-written logistic regression and save its test predictions as CSV.

	Reads the notebook-level X_train, y_train, X_test and S_test. All
	parameters are forwarded to train_logreg and are also encoded in the name
	of the CSV written under ./predictions/logreg/ (columns "id" and "label").
	The elapsed time is both logged and printed. Returns None.
	"""
	start_time = time.time()
	weights, bias = train_logreg(np.array(X_train), np.array(y_train), τ, α, G, β, R, λ)
	predictions = predict_logreg((weights, bias), np.array(X_test))
	output_dir = "./predictions/logreg/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, f"τ={τ},α={α},G={G},β={β},R={R},λ={λ}.csv")
	submission = pd.DataFrame({"id": S_test["id"], "label": predictions})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - start_time
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

## Task 2

In [None]:
def apply_pca(x):
	"""Standardize the notebook-level X_train/X_test and project them with PCA.

	The scaler and the PCA are fitted on X_train only and then applied to both
	datasets. `x` selects how many components are kept:

	- 0 <= x <= 1: x is a variance threshold. The smallest number of leading
	  components whose cumulative explained-variance ratio reaches x is kept.
	  Note that x == 1 therefore means "all of the variance", not "one component".
	- otherwise: x is passed to PCA as the number of components.

	Returns (X_train_pca, X_test_pca, c, v), where c is the number of
	components kept and v is x itself in threshold mode or the summed
	explained-variance ratio of the fitted components otherwise.
	"""
	scaler = StandardScaler()
	X_train_scaled = scaler.fit_transform(X_train)
	X_test_scaled = scaler.transform(X_test)
	if 0 <= x <= 1:
		# x is variance threshold.
		pca = PCA(n_components=None)
		pca.fit(X_train_scaled)
		cumulative = np.cumsum(pca.explained_variance_ratio_)
		# Index of the first cumulative ratio >= x, plus one. Rounding can leave
		# the last cumulative value slightly below the threshold, so cap at the
		# number of fitted components rather than collapsing to one component.
		c = min(int(np.searchsorted(cumulative, x)) + 1, len(cumulative))
		v = x
		# The PCA was fitted with every component; keep only the first c.
		keep = c
	else:
		# x is number of components.
		pca = PCA(n_components=x)
		pca.fit(X_train_scaled)
		c = x
		v = sum(pca.explained_variance_ratio_)
		# The fitted PCA already has exactly the requested components.
		keep = None
	# Transform train and test datasets.
	X_train_pca = pca.transform(X_train_scaled)[:, :keep]
	X_test_pca = pca.transform(X_test_scaled)[:, :keep]
	return X_train_pca, X_test_pca, c, v

def train_SKLknn(X_train, y_train, k=5, W="uniform", p=2, m="minkowski"):
	"""Fit and return a KNeighborsClassifier.

	k, W, p and m are passed as the classifier's n_neighbors, weights, p and
	metric arguments; n_jobs is always -1.
	"""
	settings = {
		"n_neighbors": k,
		"weights": W,
		"p": p,
		"metric": m,
		"n_jobs": -1,
	}
	classifier = KNeighborsClassifier(**settings)
	classifier.fit(X_train, y_train)
	return classifier

def predict_SKLknn(model, X):
	"""Return the predictions of the fitted `model` for the samples in `X`."""
	predictions = model.predict(X)
	return predictions

# Train model, make model predictions, and save model predictions to CSV file.
def generate_predictions_pcaknn(x):
	"""Run PCA followed by a 2-nearest-neighbour classifier and save test predictions as CSV.

	`x` is forwarded to apply_pca (variance threshold in [0, 1], otherwise a
	component count). The CSV (columns "id" and "label") is written under
	./predictions/pcaknn/ and its name records the component count and the
	variance value returned by apply_pca. The elapsed time is both logged and
	printed. Returns None.
	"""
	start_time = time.time()
	X_train_pca, X_test_pca, c, v = apply_pca(x)
	# Hand the classifier a flat label vector; a single-column target makes
	# scikit-learn emit its column-vector conversion warning on fit.
	model = train_SKLknn(X_train_pca, np.ravel(y_train), k=2)
	predictions = predict_SKLknn(model, X_test_pca)
	output_dir = "./predictions/pcaknn/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, f"pcaknn(ρ={c},ν={v:.2f}).csv")
	pd.DataFrame({"id": S_test["id"], "label": predictions}).to_csv(file_name, index=False)
	elapsed = time.time() - start_time
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

### pcaknn Area

In [None]:
# One pcaknn predictions file per PCA component count (values > 1 are counts).
for component_count in (100, 500, 1000, 2000, 5000):
	generate_predictions_pcaknn(component_count)

## Task 3

### SKLknn Area

In [None]:
# Generate test-set predictions for model type "SKLknn"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="SKLknn",
	n_neighbors=7, weights="distance",
	p=1, metric="minkowski",
)

### SKLrf Area

In [None]:
# Generate test-set predictions for model type "SKLrf"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="SKLrf",
	n_estimators=150, criterion="entropy",
	max_depth=10, min_samples_split=5,
	min_samples_leaf=2, max_leaf_nodes=50,
	random_state=None,
)

### SKLsvm Area

In [None]:
# Generate test-set predictions for model type "SKLsvm"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="SKLsvm",
	C=0.5, kernel="linear",
	degree=3, gamma="auto",
	max_iter=1000, random_state=None,
)

### SKLgb Area

In [None]:
# Generate test-set predictions for model type "SKLgb"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="SKLgb",
	learning_rate=0.05, n_estimators=200,
	subsample=0.8, min_samples_split=3,
	min_samples_leaf=2, max_depth=5,
	random_state=None, max_leaf_nodes=50,
)

### SKLhgb Area

In [6]:
# Generate test-set predictions for model type "SKLhgb"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="SKLhgb",
	learning_rate=0.1, max_iter=230,
	max_leaf_nodes=20, max_depth=11,
	min_samples_leaf=14, random_state=7,
)

  y = column_or_1d(y, warn=True)


Predictions file ./predictions/SKLhgb/learning_rate=0.1_max_iter=230_max_leaf_nodes=20_max_depth=11_min_samples_leaf=14_random_state=7.csv generated in 41.35s.


### XGBgb Area

In [None]:
# Generate test-set predictions for model type "XGBgb"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="XGBgb",
	n_estimators=150, max_depth=4,
	max_leaves=15, learning_rate=0.05,
	objective="binary:logistic", booster="gbtree",
	subsample=0.9, reg_alpha=0.1,
	reg_lambda=1.2, random_state=None,
)

### CBgb Area

In [2]:
# Generate test-set predictions for model type "CBgb"; the keyword
# arguments override that model type's default hyperparameters.
generate_predictions(
	model_type="CBgb",
	learning_rate=0.05, subsample=0.9,
	max_depth=6, n_estimators=150,
	reg_lambda=0.5, random_state=42,
)

0:	learn: 0.6878966	total: 259ms	remaining: 38.6s
1:	learn: 0.6828537	total: 332ms	remaining: 24.6s
2:	learn: 0.6778515	total: 398ms	remaining: 19.5s
3:	learn: 0.6735781	total: 461ms	remaining: 16.8s
4:	learn: 0.6696646	total: 529ms	remaining: 15.3s
5:	learn: 0.6662035	total: 603ms	remaining: 14.5s
6:	learn: 0.6631690	total: 655ms	remaining: 13.4s
7:	learn: 0.6597114	total: 703ms	remaining: 12.5s
8:	learn: 0.6567821	total: 759ms	remaining: 11.9s
9:	learn: 0.6541460	total: 821ms	remaining: 11.5s
10:	learn: 0.6516551	total: 881ms	remaining: 11.1s
11:	learn: 0.6495775	total: 939ms	remaining: 10.8s
12:	learn: 0.6476608	total: 998ms	remaining: 10.5s
13:	learn: 0.6456923	total: 1.05s	remaining: 10.2s
14:	learn: 0.6434364	total: 1.11s	remaining: 10s
15:	learn: 0.6423033	total: 1.17s	remaining: 9.77s
16:	learn: 0.6402954	total: 1.22s	remaining: 9.54s
17:	learn: 0.6388210	total: 1.27s	remaining: 9.35s
18:	learn: 0.6374374	total: 1.32s	remaining: 9.14s
19:	learn: 0.6354907	total: 1.38s	remaining

### Cross-Validation Area (Grid Search)

In [None]:
# Cross-validated grid search for "SKLknn" over a single candidate per hyperparameter.
grid = dict(
	n_neighbors=[2],
	weights=["uniform"],
	p=[1],
)
cv_grid_search("SKLknn", grid, 5)

In [5]:
# Cross-validated grid search for "SKLhgb": every combination of the listed
# candidate values is a point in the grid.
grid = dict(
	learning_rate=[0.08, 0.1],
	max_iter=[230],
	max_leaf_nodes=[16, 17, 18, 19, 20],
	max_depth=[11],
	min_samples_leaf=[13, 14, 15, 16],
	random_state=[7],
)
cv_grid_search("SKLhgb", grid, 5)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best model: HistGradientBoostingClassifier(max_depth=11, max_iter=230, max_leaf_nodes=20,
                               min_samples_leaf=14, random_state=7)
Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 11, 'max_iter': 230, 'max_leaf_nodes': 20, 'min_samples_leaf': 14, 'random_state': 7}
Mean cross-validated f1 for best model: 0.5540533201495075
Model 1:
Hyperparameters: {'learning_rate': 0.08, 'max_depth': 11, 'max_iter': 230, 'max_leaf_nodes': 16, 'min_samples_leaf': 13, 'random_state': 7}
Mean cross-validated f1: 0.5440906945314383

Model 2:
Hyperparameters: {'learning_rate': 0.08, 'max_depth': 11, 'max_iter': 230, 'max_leaf_nodes': 16, 'min_samples_leaf': 14, 'random_state': 7}
Mean cross-validated f1: 0.5457913100231488

Model 3:
Hyperparameters: {'learning_rate': 0.08, 'max_depth': 11, 'max_iter': 230, 'max_leaf_nodes': 16, 'min_samples_leaf': 15, 'random_state': 7}
Mean cross-validated f1: 0.5406898916664398

Model 4:
Hyperparameters: {'learning_rate': 0.08, 'max_de

### Cross-Validation Area (Randomized Search)

In [None]:
# Candidate values sampled by the randomized search (n_iter=50 draws, k=5).
param_distributions = {
	'n_estimators': [100, 200, 300, 400, 500],
	# 'auto' was dropped from the candidates: scikit-learn removed it from the
	# forest estimators in version 1.3, where it was an alias of 'sqrt' for
	# classifiers, so sampling it makes the fit fail on current releases.
	'max_features': ['sqrt', 'log2'],
	'max_depth': [10, 20, 30, 40, 50, None],
	'min_samples_split': [2, 5, 10],
	'min_samples_leaf': [1, 2, 4],
	'bootstrap': [True, False]
}
# NOTE(review): the other cells use keys such as "SKLrf" for model types —
# confirm that 'random_forest' is a name cv_randomized_search recognises.
cv_randomized_search('random_forest', param_distributions, k=5, n_iter=50)

### Cross-Validation Area (Simulated Annealing)

In [None]:
# Search space for simulated annealing on "CBgb".
# NOTE(review): tuples look like (low, high) ranges and lists like discrete
# choices — confirm against cv_simulated_annealing.
grid = dict(
	learning_rate=(0.06, 0.12),
	max_depth=[12],
	reg_lambda=(0.1, 1),
)
cv_simulated_annealing(model_name="CBgb", grid=grid)