# Cogito
αβτρν𝒹𝓈ℓηλε

## Introduction
### Definitions
- Let `X` be an `m`-by-`n` matrix of input data, and `y` be an `m`-by-1 vector of output data, where `m` is the # of data points, and `n` is the # of features, so that `X` represents the features and `y` represents the labels.
- Let `w` be an `n`-by-1 vector of weights, and `b` be a scalar bias.
- Let `dw` be the loss gradient w.r.t. `w`, and `db` be the loss gradient w.r.t. `b`, where for L2-regularised logistic regression (cross-entropy loss) we have ${dw=\frac{1}{m}\sum_{i=1}^{m}(\hat{y}^{(i)} - y^{(i)})x^{(i)}+\frac{\lambda w}{m}}$ and ${db = \frac{1}{m}\sum_{i=1}^{m}(\hat{y}^{(i)} - y^{(i)})}$.
- Let `β` be the batch size.
- Let `τ` be the # of epochs.
- Let `α` be the learning rate.
- Let `ρ` be the # of principal components (where `n` is the maximum possible).
- Let `ν` be the variance ratio.
- Let `𝒹` be the maximum depth.
- Let `𝓈` be the minimum number of samples to split an internal node.
- Let `ℓ` be the minimum number of samples at a leaf node.
- Let `η` be the number of estimators.
- Let `λ` be either the L2 regularisation parameter (in the context of logistic regression), or the C parameter (in the context of SVM).
- Let `ε` be a small positive constant.

### Naming Conventions
1. Dataset is treated as a single word.
2. Featureset refers to the features of a dataset.
3. Labelset refers to the labels of a dataset.

## Setup
- numpy, pandas, and sklearn (scikit-learn) must be installed and available.

In [None]:
# Import os for operating system functions.
import os
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time

# from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
# Import PCA and KNeighborsClassifier for PCA-KNN model.
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
# Import DecisionTreeClassifier for dtree model.
from sklearn.tree import DecisionTreeClassifier
# Import RandomForestClassifier for rforest model.
from sklearn.ensemble import RandomForestClassifier
# Import SVC for SVM model.
from sklearn.svm import SVC
# Import StackingClassifier, LogisticRegression, BaseEstimator, and ClassifierMixin for stack model.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

# Initialise file logging configuration: records of level INFO and above are written to main.log,
# each prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(filename="main.log", level=logging.INFO, format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

# Load train and test datasets.
# The *_tfidf_features files supply the featuresets; train.csv supplies the "label" column read below.
S_train = pd.read_csv("train.csv")
S_train_tfidf = pd.read_csv("train_tfidf_features.csv")
S_test = pd.read_csv("test.csv")
S_test_tfidf = pd.read_csv("test_tfidf_features.csv")

# Extract train features, train labels, and test features.
# NOTE(review): the train featureset skips the first 2 columns whereas the test featureset skips only the first 1 —
# presumably train_tfidf_features.csv carries one extra leading non-feature column; confirm against the files.
X_train = S_train_tfidf.iloc[:, 2:].values
# Labels are reshaped into an m-by-1 column vector.
y_train = S_train["label"].values.reshape(-1, 1)
X_test = S_test_tfidf.iloc[:, 1:].values

# Define generic functions.
# Return the logistic sigmoid σ(z) = 1 / (1 + e^(-z)) for some z (scalar or numpy array, applied element-wise).
def σ(z):
	denominator = 1 + np.exp(-z)
	return 1 / denominator

# Return cross-entropy loss for some y and ŷ.
# def loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

## Task 1: Mini-Batch Gradient Descent Logistic Regression Model (mgdlogreg)

In [None]:
# Return dw and db (the loss gradients w.r.t. w and b), for some X, y, and ŷ.
def gradients_mgdlogreg(X, y, ŷ):
	# m is the # of data points in X; the # of features is not needed here.
	m, _n = X.shape
	# Prediction error of every data point.
	residual = ŷ - y
	scale = 1/m
	dw = scale * np.dot(X.T, residual)
	db = scale * np.sum(residual)
	return dw, db

# Return w and b from mini-batch gradient descent on the global X_train and y_train, for batch size β, epochs τ, and learning rate α.
def train_mgdlogreg(β, τ, α):
	m, n = X_train.shape
	# Start from the n-by-1 zero vector for w and from 0 for b.
	w = np.zeros((n, 1))
	b = 0
	# Sweep over the whole train dataset τ times.
	for _ in range(τ):
		# Walk through the dataset in consecutive slices of β rows (the last slice may be shorter).
		for start in range(0, m, β):
			stop = start + β
			X_batch = X_train[start:stop]
			y_batch = y_train[start:stop]
			# Forward pass: ŷ = σ(X_batch⋅w + b).
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_mgdlogreg(X_batch, y_batch, ŷ)
			# Step against the gradient, scaled by the learning rate.
			w = w - α*dw
			b = b - α*db
	return w, b

# Return list of predictions for the global X_test, where each prediction is 1 if the corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_mgdlogreg(w, b):
	# Compute ŷ = σ(X⋅w + b).
	z = np.dot(X_test, w) + b
	ŷ = σ(z)
	# Threshold every entry of ŷ at 0.5.
	return [1 if probability > 0.5 else 0 for probability in ŷ]

# Train the mgdlogreg model for some β, τ, and α, predict labels for the test dataset, and save them to a CSV file
# under ./predictions/ whose name records the hyperparameters. The elapsed time is logged and printed.
def generate_predictions_mgdlogreg(β, τ, α):
	started = time.time()
	w, b = train_mgdlogreg(β, τ, α)
	output_dir = "./predictions/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, f"mgdlogreg(β={β},τ={τ},α={α}).csv")
	# One row per test data point: its id and its predicted label.
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predict_mgdlogreg(w, b)
	})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# Example usage:
# generate_predictions_mgdlogreg(128, 1200, 0.084)

## Task 2: PCA-KNN Model (pcaknn)
### Usage Examples
1. `generate_predictions_pcaknn(0.95, "ν")`
2. `generate_predictions_pcaknn(2000, "ρ")`

In [None]:
# Return PCA-transformed train and test featuresets and the # of components kept, for either some ν or ρ.
# - mode "ν": mode_value is the variance ratio to retain, strictly between 0 and 1.
# - mode "ρ": mode_value is the # of principal components, an integer greater than 1.
# Raises ValueError for any other mode or mode value.
def apply_pca(mode_value, mode):
	# Validate mode and mode value.
	# ν = 1 is rejected: PCA would read the integer 1 as "keep 1 component" rather than "keep all variance",
	# and does not accept the float 1.0 as a variance ratio.
	if mode == "ν":
		is_valid = 0 < mode_value < 1
	elif mode == "ρ":
		# Check the type first so that non-numeric values are reported as ValueError below.
		is_valid = isinstance(mode_value, int) and mode_value > 1
	else:
		is_valid = False
	if not is_valid:
		raise ValueError("Mode must either be ν with value strictly between 0 and 1, or ρ with mode_value as an integer greater than 1.")
	# Instantiate PCA and StandardScaler objects.
	pca = PCA(n_components=mode_value)
	scaler = StandardScaler()
	# Fit scaler and PCA on the train featureset only, then apply the same transformations to the test featureset.
	X_train_pca = pca.fit_transform(scaler.fit_transform(X_train))
	X_test_pca = pca.transform(scaler.transform(X_test))
	# Return PCA-transformed featuresets and # of components.
	return X_train_pca, X_test_pca, pca.n_components_

# Return KNN predictions for PCA-transformed train and test datasets.
# - X_train_pca, X_test_pca: PCA-transformed train and test featuresets.
# - y_train: train labels, either 1-D or as an m-by-1 column vector.
# - n_neighbors: # of neighbours consulted per prediction (defaults to 2, the value previously hard-coded).
def train_and_predict_knn(X_train_pca, y_train, X_test_pca, n_neighbors=2):
	# Instantiate KNeighborsClassifier object.
	knn = KNeighborsClassifier(n_neighbors=n_neighbors)
	# Flatten labels to 1-D: fitting on a column vector makes sklearn emit a DataConversionWarning.
	knn.fit(X_train_pca, np.ravel(y_train))
	return knn.predict(X_test_pca)

# Train the PCA-KNN model for some ν or ρ, predict labels for the test dataset, and save them to a CSV file
# under ./predictions/pcaknn/ whose name records ν (if given) and the # of components actually kept.
# - mode "ν": mode_value is the variance ratio to retain, strictly between 0 and 1.
# - mode "ρ": mode_value is the # of principal components, an integer greater than 1.
# Raises ValueError for any other mode or mode value. The elapsed time is logged and printed.
def generate_predictions_pcaknn(mode_value, mode):
	# Validate mode and mode value before any work is done.
	# ν = 1 is rejected: PCA would read the integer 1 as "keep 1 component" rather than "keep all variance",
	# and does not accept the float 1.0 as a variance ratio.
	if mode == "ν":
		is_valid = 0 < mode_value < 1
	elif mode == "ρ":
		# Check the type first so that non-numeric values are reported as ValueError below.
		is_valid = isinstance(mode_value, int) and mode_value > 1
	else:
		is_valid = False
	if not is_valid:
		raise ValueError("Mode must either be ν with value strictly between 0 and 1, or ρ with mode_value as an integer greater than 1.")

	start_time = time.time()
	X_train_pca, X_test_pca, n_components = apply_pca(mode_value, mode)
	os.makedirs("./predictions/pcaknn/", exist_ok=True)
	if mode == "ν":
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ν={mode_value},ρ={n_components}).csv")
	else:
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ρ={n_components}).csv")
	# One row per test data point: its id and its predicted label.
	pd.DataFrame({
		"id": S_test["id"],
		"label": train_and_predict_knn(X_train_pca, y_train, X_test_pca)
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_pcaknn(5000, "ρ")

## Task 3: Other Models
### Decision Tree Model (dtree)

In [None]:
# Train decision tree classifier for some 𝒹, 𝓈, and ℓ, predict labels for the test dataset, and save them to a CSV file
# under ./predictions/dtree/ whose name records the hyperparameters. The elapsed time is logged and printed.
# - random_state: seed handed to the classifier so that repeated runs build the same tree
#   (defaults to 42, matching the other seeded models in this file).
def generate_predictions_dtree(𝒹, 𝓈, ℓ, random_state=42):
	start_time = time.time()
	model = DecisionTreeClassifier(max_depth=𝒹, min_samples_split=𝓈, min_samples_leaf=ℓ, random_state=random_state)
	# Labels are flattened to 1-D, as done for the other sklearn models in this file.
	model.fit(X_train, y_train.ravel())
	predictions = model.predict(X_test)
	os.makedirs("./predictions/dtree/", exist_ok=True)
	file_name = os.path.join("./predictions/dtree/", f"𝒹={𝒹},𝓈={𝓈},ℓ={ℓ}.csv")
	# One row per test data point: its id and its predicted label.
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_dtree(𝒹=50, 𝓈=10, ℓ=2)

### Random Forest Model (rforest)

In [None]:
# Train random forest classifier for some 𝒹, 𝓈, ℓ, and η (seeded with 42), predict labels for the test dataset, and
# save them to a CSV file under ./predictions/rforest/ whose name records the hyperparameters.
# The elapsed time is logged and printed.
def train_and_predict_rforest(𝒹, 𝓈, ℓ, η):
	started = time.time()
	forest = RandomForestClassifier(
		max_depth=𝒹,
		min_samples_split=𝓈,
		min_samples_leaf=ℓ,
		n_estimators=η,
		random_state=42
	)
	# Labels are flattened to 1-D before fitting.
	forest.fit(X_train, y_train.ravel())
	predicted_labels = forest.predict(X_test)
	output_dir = "./predictions/rforest/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, f"𝒹={𝒹},𝓈={𝓈},ℓ={ℓ},η={η}.csv")
	# One row per test data point: its id and its predicted label.
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predicted_labels
	})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# train_and_predict_rforest(𝒹=10000, 𝓈=10, ℓ=2, η=100)

### L2 Adagrad Mini-Batch Gradient Descent Logistic Regression Model (l2amgdlogreg)

In [None]:
# Return dw and db (the loss gradients w.r.t. w and b), for some X, y, ŷ, w, and λ.
# The L2 regularisation term (λ/m)⋅w is added to dw only; db is not regularised.
def gradients_l2amgdlogreg(X, y, ŷ, w, λ):
	# m is the # of data points in X; the # of features is not needed here.
	m, _n = X.shape
	# Prediction error of every data point.
	residual = ŷ - y
	scale = 1/m
	data_term = scale * np.dot(X.T, residual)
	penalty_term = (λ/m) * w
	dw = data_term + penalty_term
	db = scale * np.sum(residual)
	return dw, db

# Return w and b from L2 Adagrad mini-batch gradient descent on the global X_train and y_train, for batch size β,
# epochs τ, initial learning rate α, regularisation λ, and small positive constant ε (keeps the Adagrad division finite).
def train_l2amgdlogreg(β, τ, α, λ=0, ε=1e-8):
	m, n = X_train.shape
	# Start from the n-by-1 zero vector for w and from 0 for b.
	w = np.zeros((n, 1))
	b = 0
	# Running sums of squared gradients, used by Adagrad to shrink the step of frequently-updated parameters.
	sdw = np.zeros((n, 1))
	sdb = 0
	# Sweep over the whole train dataset τ times.
	for _ in range(τ):
		# Walk through the dataset in consecutive slices of β rows (the last slice may be shorter).
		for start in range(0, m, β):
			stop = start + β
			X_batch = X_train[start:stop]
			y_batch = y_train[start:stop]
			# Forward pass: ŷ = σ(X_batch⋅w + b).
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_l2amgdlogreg(X_batch, y_batch, ŷ, w, λ)
			# Accumulate squared gradients.
			sdw += dw ** 2
			sdb += db ** 2
			# Per-parameter step sizes: α divided by the root of the accumulated squared gradients.
			w_step = α / (np.sqrt(sdw) + ε)
			b_step = α / (np.sqrt(sdb) + ε)
			w -= w_step * dw
			b -= b_step * db
	return w, b

# Return list of predictions for the global X_test, where each prediction is 1 if the corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_l2amgdlogreg(w, b):
	# Compute ŷ = σ(X⋅w + b).
	z = np.dot(X_test, w) + b
	ŷ = σ(z)
	# Threshold every entry of ŷ at 0.5.
	return [1 if probability > 0.5 else 0 for probability in ŷ]

# Train the l2amgdlogreg model for some β, τ, α, and λ, predict labels for the test dataset, and save them to a CSV file
# under ./predictions/l2amgdlogreg/ whose name records the hyperparameters. The elapsed time is logged and printed.
def generate_predictions_l2amgdlogreg(β, τ, α, λ=0):
	started = time.time()
	w, b = train_l2amgdlogreg(β, τ, α, λ)
	output_dir = "./predictions/l2amgdlogreg/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, f"β={β},τ={τ},α={α},λ={λ}.csv")
	# One row per test data point: its id and its predicted label.
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predict_l2amgdlogreg(w, b)
	})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_l2amgdlogreg(β=128, τ=2000, α=0.0825, λ=0.01)

### SVM Model

In [None]:
# Train SVM classifier with kernel κ and C parameter λ (seeded with 42), predict labels for the test dataset, and
# save them to a CSV file under ./predictions/svm/ whose name records the hyperparameters.
# The elapsed time is logged and printed.
def generate_predictions_svm(κ, λ):
	started = time.time()
	classifier = SVC(kernel=κ, C=λ, random_state=42)
	# Labels are flattened to 1-D before fitting.
	classifier.fit(X_train, y_train.ravel())
	predicted_labels = classifier.predict(X_test)
	output_dir = "./predictions/svm/"
	os.makedirs(output_dir, exist_ok=True)
	file_name = os.path.join(output_dir, f"κ={κ},λ={λ}.csv")
	# One row per test data point: its id and its predicted label.
	pd.DataFrame({
		"id": S_test["id"],
		"label": predicted_labels
	}).to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_svm("linear", 1.0)

### Stacking Ensemble Model (stack)

In [None]:
# Define wrapper classes for original (i.e. non-imported) models.
# CustomL2AMGDLogReg exposes the l2amgdlogreg model through the scikit-learn estimator interface, so that it can be
# cloned, refitted, and cross-validated (e.g. as a base model of StackingClassifier).
# - β, τ, α, and λ are the batch size, # of epochs, initial learning rate, and L2 regularisation parameter.
# - After fit, w is the n-by-1 weight vector, b is the scalar bias, and classes_ holds the 2 distinct labels seen.
# The mixin is listed before BaseEstimator, which is the base-class order scikit-learn expects.
class CustomL2AMGDLogReg(ClassifierMixin, BaseEstimator):
	def __init__(self, β=128, τ=2000, α=0.0825, λ=0.01):
		self.β = β
		self.τ = τ
		self.α = α
		self.λ = λ
		self.w = None
		self.b = None

	# Train on the X and y that are passed in (not on the global train dataset), so that cross-validation folds
	# are honoured. Raises ValueError unless y contains exactly 2 distinct labels. Returns self.
	def fit(self, X, y):
		X = np.asarray(X, dtype=float)
		# Encode the 2 labels as 0 and 1, and remember the original labels for predict.
		classes, encoded = np.unique(np.asarray(y).ravel(), return_inverse=True)
		if classes.size != 2:
			raise ValueError(f"CustomL2AMGDLogReg supports exactly 2 classes, but y contains {classes.size}.")
		y = encoded.reshape(-1, 1).astype(float)
		m, n = X.shape
		# Initialise w to 0 vector and b to 0, along with the gradient accumulation variables for Adagrad.
		w, b = np.zeros((n, 1)), 0.0
		sdw, sdb = np.zeros((n, 1)), 0.0
		# Small positive constant that keeps the Adagrad division finite (same value as train_l2amgdlogreg's default).
		ε = 1e-8
		# For each epoch:
		for _ in range(self.τ):
			# For every batch within epoch:
			for i in range(0, m, self.β):
				X_batch, y_batch = X[i:i+self.β], y[i:i+self.β]
				ŷ = σ(np.dot(X_batch, w) + b)
				dw, db = gradients_l2amgdlogreg(X_batch, y_batch, ŷ, w, self.λ)
				sdw += dw ** 2
				sdb += db ** 2
				w -= (self.α / (np.sqrt(sdw) + ε)) * dw
				b -= (self.α / (np.sqrt(sdb) + ε)) * db
		self.classes_ = classes
		self.w, self.b = w, b
		return self

	# Return a 1-D numpy array with one predicted label per row of X: the second entry of classes_ where
	# σ(X⋅w + b) > 0.5, and the first entry otherwise. Raises RuntimeError if called before fit.
	def predict(self, X):
		if self.w is None:
			raise RuntimeError("CustomL2AMGDLogReg must be fitted before calling predict.")
		ŷ = σ(np.dot(np.asarray(X, dtype=float), self.w) + self.b)
		return self.classes_[(ŷ.ravel() > 0.5).astype(int)]

In [None]:
# Return the base models of the stacking ensemble as a list of (name, estimator) pairs, built from hyperparameters.
# - hyperparameters: dict keyed by model name ("l2amgdlogreg", "knn", "dtree", "rforest", "svm"), each mapping to
#   the dict of settings read below. Raises KeyError if an expected key is missing.
# - fit: if True (the default), every model is fitted on the global X_train and y_train before being returned;
#   pass False to get unfitted estimators, e.g. for a meta-estimator that fits them itself.
def train_models(hyperparameters, fit=True):
	l2amgdlogreg_params = hyperparameters["l2amgdlogreg"]
	knn_params = hyperparameters["knn"]
	dtree_params = hyperparameters["dtree"]
	rforest_params = hyperparameters["rforest"]
	svm_params = hyperparameters["svm"]

	models = [
		("l2amgdlogreg", CustomL2AMGDLogReg(
			β=l2amgdlogreg_params["β"],
			τ=l2amgdlogreg_params["τ"],
			α=l2amgdlogreg_params["α"],
			λ=l2amgdlogreg_params["λ"]
		)),
		("knn", KNeighborsClassifier(
			n_neighbors=knn_params["n_neighbors"]
		)),
		("dtree", DecisionTreeClassifier(
			random_state=dtree_params["random_state"],
			max_depth=dtree_params["𝒹"],
			min_samples_split=dtree_params["𝓈"],
			min_samples_leaf=dtree_params["ℓ"]
		)),
		("rforest", RandomForestClassifier(
			random_state=rforest_params["random_state"],
			n_estimators=rforest_params["η"]
		)),
		("svm", SVC(
			kernel=svm_params["kernel"],
			C=svm_params["λ"],
			probability=svm_params["probability"],
			random_state=svm_params["random_state"]
		)),
	]
	if fit:
		# Labels are flattened to 1-D before fitting.
		labels = y_train.ravel()
		for _, model in models:
			model.fit(X_train, labels)
	return models

# Train the stacking ensemble (base models from hyperparameters, logistic regression as final estimator, 5-fold
# cross-validation), predict labels for the test dataset, and save them to ./predictions/stack/predictions.csv.
# The elapsed time is logged and printed.
def generate_predictions_stack(hyperparameters):
	start_time = time.time()
	# StackingClassifier clones and refits every base model itself, so fitting them beforehand would only
	# train each model an extra time; request unfitted estimators.
	models = train_models(hyperparameters, fit=False)
	stacking_clf = StackingClassifier(
		estimators=models,
		final_estimator=LogisticRegression(),
		cv=5
	)
	stacking_clf.fit(X_train, y_train.ravel())
	predictions = stacking_clf.predict(X_test)
	os.makedirs("./predictions/stack/", exist_ok=True)
	file_name = os.path.join("./predictions/stack/", "predictions.csv")
	# One row per test data point: its id and its predicted label.
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# Hyperparameters of the stacking ensemble's base models, keyed by model name.
# Each inner dict holds the settings that train_models reads for that model.
hyperparameters = {
	"l2amgdlogreg": {"β": 128, "τ": 2000, "α": 0.0825, "λ": 0.01},
	"knn": {"n_neighbors": 5},
	"dtree": {"random_state": 42, "𝒹": 10, "𝓈": 2, "ℓ": 1},
	"rforest": {"random_state": 42, "η": 100},
	"svm": {"kernel": "linear", "λ": 1.0, "probability": True, "random_state": 42}
}

# Train the stacking ensemble and write its predictions file.
# Unlike the commented-out examples of the other models, this call runs whenever the cell is executed.
generate_predictions_stack(hyperparameters)