# Cogito

## Definitions
- `m` is # of data points.
- `n` is # of features.
- `X` is `m`-by-`n` matrix representing features.
- `y` is `m`-by-1 vector representing labels.
- `w` is `n`-by-1 vector representing weights.
- `b` is scalar representing bias.
- `dw` is loss gradient w.r.t. `w`.
- `db` is loss gradient w.r.t. `b`.

### logreg(τ,α,G,β,L,λ)
1. `τ` is # of epochs: non-negative integer.
2. `α` is learning rate: float in range ${[0,1]}$.
3. `G` is gradient descent type: `"full-batch"`, `"mini-batch"`, xor `"stochastic"`.
4. `β` is batch size (used only when `G` is `"mini-batch"`): positive integer xor `None`.
5. `L` is regularisation type: `"L2"`, `"L1"`, xor `None`.
6. `λ` is regularisation degree: real in range ${[0,\infty)}$.

### pcaknn(ρ,ν)
1. `ρ` is # of principal components: integer in range ${[2,n]}$.
2. `ν` is variance ratio: float in range ${(0,1]}$.

### svm(λ,K,°,γ,κ,τ,seed)
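1. `λ` is regularisation parameter (passed to scikit-learn as `C`): positive float; regularisation strength is inversely proportional to `λ`.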
2. `K` is kernel type: `"linear"`, `"poly"`, `"rbf"`, `"sigmoid"`, xor `"precomputed"`.
3. `°` is polynomial kernel degree for polynomial kernel type: non-negative integer.
4. `γ` is kernel coefficient for RBF, polynomial, xor sigmoid kernel type: `"scale"`, `"auto"`, xor float in range ${[0,1]}$.
5. `κ` is independent term of kernel function with effect for polynomial xor sigmoid kernel type.
6. `τ` is maximum # of iterations: non-negative integer xor `-1`.
7. `seed` is pseudorandom number generator seed: integer xor `None`.

### gbdtree(α,η,γ,ψ,ℓ,Δ,δ,seed,θ)
1. `α` is learning rate: float in range ${[0,1]}$.
2. `η` is # of estimators: non-negative integer.
3. `γ` is fraction of samples used to fit individual base learners: float in range ${[0,1]}$.
4. `ψ` is minimum # of samples needed to split internal node: non-negative integer.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer.
6. `Δ` is maximum tree depth: non-negative integer xor `None`.
7. `δ` is minimum impurity decrease: float in range ${[0,1]}$.
8. `seed` is pseudorandom number generator seed: integer xor `None`.
9. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.

#### hgbdtree(α,τ,θ,Δ,ℓ,seed)
1. `α` is learning rate: float in range ${[0,1]}$.
2. `τ` is maximum # of iterations: non-negative integer.
3. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.
4. `Δ` is maximum tree depth: non-negative integer xor `None`.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer xor `None`.
6. `seed` is pseudorandom number generator seed: integer xor `None`.

### rforest(η,C,Δ,ψ,ℓ,θ,δ,B,seed)
1. `η` is # of estimators: non-negative integer.
2. `C` is criterion to measure split quality: `"entropy"`, `"log_loss"`, xor `"gini"`.
3. `Δ` is maximum tree depth: non-negative integer xor `None`.
4. `ψ` is minimum # of samples needed to split internal node: non-negative integer.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer.
6. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.
7. `δ` is minimum impurity decrease: float in range ${[0,1]}$.
8. `B` is whether bootstrapping is used: `False` xor `True`.
9. `seed` is pseudorandom number generator seed: integer xor `None`.


## Setup
- numpy, pandas, and sklearn (scikit-learn) must be installed and available.

In [None]:
# Import os for operating system functions.
import os
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time

from sklearn.preprocessing import StandardScaler
# Import PCA and KNeighborsClassifier for pcaknn.
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Import RandomForestClassifier for rforest.
from sklearn.ensemble import RandomForestClassifier

# Import SVC for svm model.
from sklearn.svm import SVC

# Import GradientBoostingClassifier for gbdtree.
from sklearn.ensemble import GradientBoostingClassifier
# Import HistGradientBoostingClassifier for hgbdtree.
from sklearn.ensemble import HistGradientBoostingClassifier

# Import StackingClassifier, LogisticRegression, BaseEstimator, and ClassifierMixin for stack.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

# Initialise file logging configuration: records at INFO level and above are
# written to main.log, each formatted as "<timestamp> - <message>".
logging.basicConfig(
	filename="main.log",  # Log file path, relative to the working directory.
	level=logging.INFO,
	format="%(asctime)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S"  # Timestamp layout, e.g. 2024-01-31 13:45:00.
)

# Load train and test datasets.
# The raw CSVs supply the "label" (train) and "id" (test) columns used in this file;
# the *_tfidf_features CSVs supply the feature columns.
S_train = pd.read_csv("train.csv")
S_train_tfidf = pd.read_csv("train_tfidf_features.csv")
S_test = pd.read_csv("test.csv")
S_test_tfidf = pd.read_csv("test_tfidf_features.csv")

# Extract train features, train labels, and test features.
# Train features skip the first two columns of the TF-IDF file, test features skip only the first.
# NOTE(review): presumably the skipped columns are id + label (train) and id (test) — confirm against the CSV headers.
X_train = S_train_tfidf.iloc[:, 2:].values
# Labels are reshaped into an m-by-1 column vector.
y_train = S_train["label"].values.reshape(-1, 1)
X_test = S_test_tfidf.iloc[:, 1:].values

# Define generic functions.
# Return σ(z) for some z.
def σ(z):
	"""Return the logistic sigmoid 1 / (1 + exp(-z)) of z, element-wise.

	z may be a scalar or an array-like; the result has the same shape
	(a scalar input yields a scalar). The computation is arranged so that
	exp is never evaluated on a positive argument, which avoids the overflow
	that 1 / (1 + exp(-z)) hits for large negative z.
	"""
	z = np.asarray(z, dtype=float)
	# e = exp(-|z|) lies in (0, 1], so it can underflow to 0 but never overflow.
	e = np.exp(-np.abs(z))
	# For z >= 0 the sigmoid is 1 / (1 + e^-z); for z < 0 it is e^z / (1 + e^z).
	# Indexing with () turns a 0-d result back into a scalar and leaves arrays as they are.
	return np.where(z >= 0, 1 / (1 + e), e / (1 + e))[()]

# Return cross-entropy loss for some y and ŷ.
# def loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

## Task 1: Logistic Regression Model (`logreg`)

In [None]:
# Return dw and db, for some X, y, ŷ, w, λ, and L.
def gradients_logreg(X, y, ŷ, w, L=None, λ=0):
	"""Return (dw, db), the loss gradients w.r.t. weights and bias.

	X is the feature matrix, y the labels, ŷ the predicted probabilities and
	w the current weights. Both gradients are averaged over the rows of X.
	When L is "L2" the term λ * w / m is added to dw; when L is "L1" the term
	λ * sign(w) / m is added instead; any other L adds nothing. The bias
	gradient is never regularised.
	"""
	n_rows = X.shape[0]
	residual = ŷ - y
	averaging = 1/n_rows
	grad_w = averaging * X.T.dot(residual)
	grad_b = averaging * np.sum(residual)
	if L in ("L2", "L1"):
		# L2 penalises along w itself, L1 along the sign of each weight.
		direction = w if L == "L2" else np.sign(w)
		grad_w += λ * direction / n_rows
	return grad_w, grad_b

# Return w and b from gradient descent on X and y, for some β, τ, α, λ, L, and G.
def train_logreg(τ=1000, α=0.1, G="mini-batch", β=128, L=None, λ=0):
	"""Return (w, b) fitted to X_train and y_train by gradient descent.

	Parameters:
	- τ: # of epochs (full passes over the training rows).
	- α: learning rate applied to every update.
	- G: gradient descent type: "full-batch", "mini-batch", or "stochastic".
	- β: batch size; read only when G is "mini-batch".
	- L, λ: regularisation type and degree, forwarded to gradients_logreg.

	Weights start as an n-by-1 zero vector and the bias as 0. Batches are taken
	in dataset order, without shuffling.

	Raises ValueError if G is not one of the three supported types, or if G is
	"mini-batch" and β is None or smaller than 1.
	"""
	m, n = X_train.shape
	# All three descent types are the same loop with a different batch size.
	if G == "full-batch":
		# max() keeps the range step valid when the training set is empty.
		batch_size = max(m, 1)
	elif G == "stochastic":
		batch_size = 1
	elif G == "mini-batch":
		if β is None or β < 1:
			raise ValueError("β must be a positive integer when G is \"mini-batch\".")
		batch_size = β
	else:
		raise ValueError("G must be \"full-batch\", \"mini-batch\", or \"stochastic\".")
	w, b = np.zeros((n, 1)), 0
	for _ in range(τ):
		for i in range(0, m, batch_size):
			X_batch, y_batch = X_train[i:i+batch_size], y_train[i:i+batch_size]
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, L, λ)
			w, b = w - α*dw, b - α*db
	return w, b

# Return array of predictions, where each prediction is 1 if corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_logreg(w, b):
	"""Return a list of 0/1 predictions for the rows of X_test.

	w is the weight vector and b the bias. A row is predicted as 1 when its
	probability σ(X_test ⋅ w + b) is strictly greater than 0.5, and as 0 otherwise.
	"""
	# Compute ŷ = σ(X ⋅ w + b).
	ŷ = σ(np.dot(X_test, w) + b)
	# Threshold the whole array at once, then flatten to a plain list of ints.
	return (np.asarray(ŷ) > 0.5).astype(int).ravel().tolist()

# Train model, make model predictions, and save model predictions to CSV file, for some β, τ, α, λ, L, and G.
def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, L=None, λ=0):
	"""Train logreg, predict on the test set, and save the predictions as CSV.

	All parameters are forwarded to train_logreg. The CSV is written under
	./predictions/logreg/ (created if missing), is named after the
	hyperparameters, and has an "id" column taken from S_test and a "label"
	column of predictions. The elapsed time is logged and printed.
	"""
	began = time.time()
	weights, bias = train_logreg(τ, α, G, β, L, λ)
	out_dir = "./predictions/logreg/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, f"G={G},β={β},L={L},λ={λ},τ={τ},α={α}.csv")
	table = pd.DataFrame({
		"id": S_test["id"],
		"label": predict_logreg(weights, bias)
	})
	table.to_csv(file_name, index=False)
	elapsed = time.time() - began
	# Log and print the same success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_logreg(10, 0.1, "stochastic")

## Task 2: PCA-KNN Model (`pcaknn`)

In [None]:
# Return PCA-transformed train and test featuresets, for either some ν or ρ.
def apply_pca(mode_value, mode):
	"""Return PCA-transformed train and test featuresets and the # of components kept.

	Parameters:
	- mode_value: the variance ratio (mode "ν") or the # of principal components (mode "ρ").
	- mode: "ν" to pick components by explained-variance ratio, "ρ" to fix their number.

	Both featuresets are standardised with a scaler fitted on X_train only, and
	projected with a PCA fitted on the standardised X_train only.

	Returns (X_train_pca, X_test_pca, n_components).

	Raises ValueError if mode is neither "ν" nor "ρ", or if mode_value is not
	valid for the mode (including values that cannot be compared to a number).
	"""
	# Validate mode and mode value.
	try:
		valid = (
			(mode == "ν" and 0 < mode_value <= 1)
			or (mode == "ρ" and isinstance(mode_value, int) and mode_value > 1)
		)
	except TypeError:
		# Non-numeric mode values are reported as invalid, like any other bad value.
		valid = False
	if not valid:
		raise ValueError("Mode must either be ν with value between 0 and 1, or ρ with mode_value as an integer greater than 1.")
	if mode == "ν":
		# scikit-learn only treats a float strictly between 0 and 1 as a variance ratio;
		# an int 1 would mean "one component" and a float 1.0 is rejected.
		# Keeping all of the variance means keeping every component, i.e. n_components=None.
		n_components = None if mode_value == 1 else float(mode_value)
	else:
		n_components = mode_value
	# Instantiate PCA and StandardScaler objects.
	pca = PCA(n_components=n_components)
	scaler = StandardScaler()
	# PCA-transform train and test featuresets.
	X_train_pca = pca.fit_transform(scaler.fit_transform(X_train))
	X_test_pca = pca.transform(scaler.transform(X_test))
	# Return PCA-transformed featuresets and # of components.
	return X_train_pca, X_test_pca, pca.n_components_

# Return KNN predictions for PCA-transformed train and test datasets.
def train_and_predict_knn(X_train_pca, y_train, X_test_pca, n_neighbors=2):
	"""Return KNN predictions for X_test_pca after fitting on X_train_pca and y_train.

	Parameters:
	- X_train_pca, X_test_pca: PCA-transformed train and test featuresets.
	- y_train: train labels, either 1-D or an m-by-1 column vector.
	- n_neighbors: # of neighbours that vote on each prediction (defaults to 2).
	"""
	labels = np.asarray(y_train)
	# A single-column label matrix is flattened so that scikit-learn does not
	# have to convert it (and warn about it) itself.
	if labels.ndim == 2 and labels.shape[1] == 1:
		labels = labels.ravel()
	# Instantiate KNeighborsClassifier object.
	knn = KNeighborsClassifier(n_neighbors=n_neighbors)
	knn.fit(X_train_pca, labels)
	return knn.predict(X_test_pca)

# Train model, make model predictions, and save model predictions to CSV file, for some ν or ρ.
def generate_predictions_pcaknn(mode_value, mode="ρ"):
	"""Run PCA then KNN, and save the test predictions as CSV.

	mode_value is the variance ratio when mode is "ν", or the # of principal
	components when mode is "ρ". The CSV is written under ./predictions/pcaknn/
	(created if missing) with an "id" column taken from S_test and a "label"
	column of predictions. The elapsed time is logged and printed.

	Raises ValueError if mode is neither "ν" nor "ρ", or if mode_value is
	outside the range accepted for the mode.
	"""
	# Validate mode and mode value.
	if mode == "ν":
		valid = 0 < mode_value <= 1
	elif mode == "ρ":
		valid = mode_value > 1 and isinstance(mode_value, int)
	else:
		valid = False
	if not valid:
		raise ValueError("Mode must either be ν with value between 0 and 1, or ρ with mode_value as an integer greater than 1.")

	began = time.time()
	X_train_pca, X_test_pca, n_components = apply_pca(mode_value, mode)
	out_dir = "./predictions/pcaknn/"
	os.makedirs(out_dir, exist_ok=True)
	# The ν file name also records how many components the ratio resolved to.
	stem = f"pcaknn(ν={mode_value},ρ={n_components}).csv" if mode == "ν" else f"pcaknn(ρ={n_components}).csv"
	file_name = os.path.join(out_dir, stem)
	table = pd.DataFrame({
		"id": S_test["id"],
		"label": train_and_predict_knn(X_train_pca, y_train, X_test_pca)
	})
	table.to_csv(file_name, index=False)
	elapsed = time.time() - began

	# Log and print the same success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_pcaknn(mode_value=5000, mode="ρ")

## Task 3: Other Models

### Random Forest Model (`rforest`)

In [None]:
# Train RandomForest classifier, make predictions, and save predictions to CSV file.
def generate_predictions_rforest(η=100, C="gini", Δ=None, ψ=2, ℓ=1, θ=None, δ=0.0, B=True, seed=None):
	"""Train a RandomForestClassifier, predict on the test set, and save the predictions as CSV.

	Parameters (with the RandomForestClassifier argument each one is passed as):
	- η: n_estimators.
	- C: criterion.
	- Δ: max_depth.
	- ψ: min_samples_split.
	- ℓ: min_samples_leaf.
	- θ: max_leaf_nodes.
	- δ: min_impurity_decrease.
	- B: bootstrap.
	- seed: random_state.

	The CSV is written under ./predictions/rforest/ (created if missing) with an
	"id" column taken from S_test and a "label" column of predictions. The file
	name lists every hyperparameter, so runs with different settings do not
	overwrite each other. The elapsed time is logged and printed.
	"""
	start_time = time.time()
	rf = RandomForestClassifier(
		n_estimators=η,
		criterion=C,
		max_depth=Δ,
		min_samples_split=ψ,
		min_samples_leaf=ℓ,
		max_leaf_nodes=θ,
		min_impurity_decrease=δ,
		bootstrap=B,
		random_state=seed
	)
	# Labels are stored as a column vector; the classifier is given them flattened.
	rf.fit(X_train, y_train.ravel())
	predictions = rf.predict(X_test)
	os.makedirs("./predictions/rforest/", exist_ok=True)
	# Only names that exist in this function may appear in the f-string.
	file_name = os.path.join("./predictions/rforest/", f"η={η},C={C},Δ={Δ},ψ={ψ},ℓ={ℓ},θ={θ},δ={δ},B={B},seed={seed}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_rforest(η=100, C="gini", Δ=None, ψ=2, ℓ=1, θ=None, δ=0.0, B=True, seed=None)

### SVM Model (`svm`)

In [None]:
# Train SVM classifier, make predictions, and save predictions to CSV file.
def generate_predictions_svm(λ=1.0, K="rbf", deg=3, γ="scale", κ=0.0, τ=-1, seed=None):
	"""Train an SVC, predict on the test set, and save the predictions as CSV.

	Parameters (with the SVC argument each one is passed as):
	- λ: C.
	- K: kernel.
	- deg: degree. This is the parameter written as ° in the Definitions
	  section; ° is not a valid Python identifier, so it is spelled deg here.
	- γ: gamma.
	- κ: coef0.
	- τ: max_iter.
	- seed: random_state.

	The CSV is written under ./predictions/svm/ (created if missing) with an
	"id" column taken from S_test and a "label" column of predictions. The
	elapsed time is logged and printed.
	"""
	start_time = time.time()
	svm_clf = SVC(
		C=λ,
		kernel=K,
		degree=deg,
		gamma=γ,
		coef0=κ,
		max_iter=τ,
		random_state=seed
	)
	# Labels are stored as a column vector; the classifier is given them flattened.
	svm_clf.fit(X_train, y_train.ravel())
	predictions = svm_clf.predict(X_test)
	os.makedirs("./predictions/svm/", exist_ok=True)
	# The ° symbol is kept in the file name, where it is ordinary text.
	file_name = os.path.join("./predictions/svm/", f"λ={λ},K={K},°={deg},γ={γ},κ={κ},τ={τ},seed={seed}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_svm(λ=1.0, K="rbf", deg=3, γ="scale", κ=0.0, τ=-1, seed=None)

### Gradient Boosted Decision Tree Model (`gbdtree`)

In [None]:
def generate_predictions_gbdtree(α=0.1, η=100, γ=1.0, ψ=2, ℓ=1, Δ=3, δ=0.0, seed=None, θ=None):
	"""Train a GradientBoostingClassifier, predict on the test set, and save the predictions as CSV.

	Parameters (with the GradientBoostingClassifier argument each one is passed as):
	- α: learning_rate.
	- η: n_estimators.
	- γ: subsample.
	- ψ: min_samples_split.
	- ℓ: min_samples_leaf.
	- Δ: max_depth.
	- δ: min_impurity_decrease.
	- seed: random_state.
	- θ: max_leaf_nodes.

	The CSV is written under ./predictions/gbdtree/ (created if missing) with an
	"id" column taken from S_test and a "label" column of predictions. The
	elapsed time is logged and printed.
	"""
	start_time = time.time()
	gbc = GradientBoostingClassifier(
		learning_rate=α,
		n_estimators=η,
		subsample=γ,
		min_samples_split=ψ,
		min_samples_leaf=ℓ,
		max_depth=Δ,
		min_impurity_decrease=δ,
		random_state=seed,
		max_leaf_nodes=θ
	)
	# Labels are stored as a column vector; the classifier is given them
	# flattened, as the rforest and svm models do.
	gbc.fit(X_train, y_train.ravel())
	predictions = gbc.predict(X_test)
	os.makedirs("./predictions/gbdtree/", exist_ok=True)
	file_name = os.path.join("./predictions/gbdtree/", f"α={α},η={η},γ={γ},ψ={ψ},ℓ={ℓ},Δ={Δ},δ={δ},seed={seed},θ={θ}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# Run gbdtree; every argument is passed explicitly and equals the corresponding default in the function signature.
generate_predictions_gbdtree(α=0.1, η=100, γ=1.0, ψ=2, ℓ=1, Δ=3, δ=0.0, seed=None, θ=None)

### Histogram Gradient Boosted Decision Tree Model (`hgbdtree`)

In [None]:
def generate_predictions_hgbdtree(α=0.1, τ=100, θ=31, Δ=None, ℓ=20, seed=None):
	"""Train a HistGradientBoostingClassifier, predict on the test set, and save the predictions as CSV.

	Parameters (with the HistGradientBoostingClassifier argument each one is passed as):
	- α: learning_rate.
	- τ: max_iter.
	- θ: max_leaf_nodes.
	- Δ: max_depth.
	- ℓ: min_samples_leaf.
	- seed: random_state.

	The CSV is written under ./predictions/hgbdtree/ (created if missing) with an
	"id" column taken from S_test and a "label" column of predictions. The
	elapsed time is logged and printed.
	"""
	start_time = time.time()
	hgbc = HistGradientBoostingClassifier(
		learning_rate=α,
		max_iter=τ,
		max_leaf_nodes=θ,
		max_depth=Δ,
		min_samples_leaf=ℓ,
		random_state=seed
	)
	# Labels are stored as a column vector; the classifier is given them
	# flattened, as the rforest and svm models do.
	hgbc.fit(X_train, y_train.ravel())
	predictions = hgbc.predict(X_test)
	os.makedirs("./predictions/hgbdtree/", exist_ok=True)
	file_name = os.path.join("./predictions/hgbdtree/", f"α={α},τ={τ},θ={θ},Δ={Δ},ℓ={ℓ},seed={seed}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_hgbdtree(α=0.1, τ=100, θ=31, Δ=None, ℓ=20, seed=None)

### Bagging Model

In [None]:
# Define wrapper classes for original (i.e. non-imported) models.

