# Cogito

## Definitions
- `m`: # of data points.
- `n`: # of features.
- `X`: `m`-by-`n` matrix representing features.
- `y`: `m`-by-1 vector representing labels.
- `w`: `n`-by-1 vector representing weights.
- `b`: Scalar representing bias.
- `dw`: Loss gradient w.r.t. `w`.
- `db`: Loss gradient w.r.t. `b`.

### Hyperparameters
#### `logreg`
1. `τ`: # of epochs.
2. `α`: Learning rate.
3. `G`: Gradient descent method.
4. `β`: Batch size.
5. `L`: Regularisation type.
6. `λ`: Regularisation degree.

#### `svm`
1. `K`: Kernel type.
2. `λ`: Regularisation parameter (`C` in scikit-learn's `SVC`).

#### `pcaknn`
1. `ρ`: # of principal components (out of `n`).
2. `ν`: Explained-variance ratio to retain (used as an alternative to `ρ`).

#### `rforest`
1. `𝒹`: Maximum depth.
2. `ℓ`: Minimum # of samples at leaf node.
3. `η`: # of estimators.
4. `𝓈`: Minimum # of samples to split internal node.
5. `seed`: Random seed.

#### `gbdtree`
1. `α`: Learning rate.
2. `𝒹`: Maximum depth.
3. `ℓ`: Minimum # of samples at leaf node.
4. `η`: # of estimators.
5. `𝓈`: Minimum # of samples to split internal node.
6. `seed`: Random seed.

## Setup
- numpy, pandas, and sklearn (scikit-learn) must be installed and available.

In [9]:
# Import os for operating system functions.
import os
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time

from sklearn.preprocessing import StandardScaler
# Import PCA and KNeighborsClassifier for pcaknn.
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Import RandomForestClassifier for rforest.
from sklearn.ensemble import RandomForestClassifier

# Import SVC for svm model.
from sklearn.svm import SVC

# Import GradientBoostingClassifier for gb.
from sklearn.ensemble import GradientBoostingClassifier

# Import StackingClassifier, LogisticRegression, BaseEstimator, and ClassifierMixin for stack.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

# Initialise file logging configuration.
# Messages at INFO level and above are appended to main.log, each prefixed
# with a "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(
    filename="main.log",
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Load train and test datasets.
# The CSV files are read from the current working directory.
S_train = pd.read_csv("train.csv")
S_train_tfidf = pd.read_csv("train_tfidf_features.csv")
S_test = pd.read_csv("test.csv")
S_test_tfidf = pd.read_csv("test_tfidf_features.csv")

# Extract train features, train labels, and test features.
# Train features skip the first two columns of the TF-IDF file, test features
# skip only the first one.
# NOTE(review): presumably the skipped columns are id (and label) — confirm
# against the CSV headers so X_train and X_test have the same n.
X_train = S_train_tfidf.iloc[:, 2:].values
# Labels are reshaped to an m-by-1 column vector.
y_train = S_train["label"].values.reshape(-1, 1)
X_test = S_test_tfidf.iloc[:, 1:].values

# Define generic functions.
# Logistic sigmoid: map z (scalar or numpy array) to 1 / (1 + e^(-z)), element-wise.
def σ(z):
	denominator = 1 + np.exp(-z)
	return 1 / denominator

# Return cross-entropy loss for some y and ŷ.
# def loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

## Task 1: Logistic Regression Model (`logreg`)

In [10]:
# Compute the loss gradients (dw, db) for one batch.
# X: batch features, y: batch labels, ŷ: batch predictions, w: current weights.
# L selects the penalty added to dw ("L2": λ·w/m, "L1": λ·sign(w)/m, anything
# else: none), where m is the number of rows in X. b is never penalised.
def gradients_logreg(X, y, ŷ, w, L=None, λ=0):
	batch_rows = X.shape[0]
	residual = ŷ - y
	dw = 1/batch_rows * np.dot(X.T, residual)
	db = 1/batch_rows * np.sum(residual)
	# No recognised penalty: return the plain gradients.
	if L not in ("L2", "L1"):
		return dw, db
	# Direction of the penalty: w itself for L2, its sign for L1.
	direction = w if L == "L2" else np.sign(w)
	dw += λ * direction / batch_rows
	return dw, db

# Return w and b from gradient descent on X_train and y_train, for some τ, α, G, β, L, and λ.
# τ: number of epochs. α: learning rate.
# G: "full-batch" (one update per epoch on all rows), "mini-batch" (one update
#    per β consecutive rows) or "stochastic" (one update per row).
# β: batch size, only used when G == "mini-batch"; must be a positive integer.
# L, λ: regularisation type and degree, forwarded to gradients_logreg.
# Raises ValueError for an unknown G, or for β < 1 with G == "mini-batch".
def train_logreg(τ, α, G="mini-batch", β=0, L=None, λ=0):
	m, n = X_train.shape
	# All three methods are the same loop with a different batch size.
	if G == "full-batch":
		# max(..., 1) keeps range() valid when the train set is empty.
		batch_size = max(m, 1)
	elif G == "mini-batch":
		# range() rejects a zero step with an unhelpful message, and a negative
		# step would silently skip training; fail early with a clear message.
		if β < 1:
			raise ValueError(f"β must be a positive integer for mini-batch gradient descent, got {β}.")
		batch_size = β
	elif G == "stochastic":
		batch_size = 1
	else:
		# An unknown method used to return the untrained zero weights.
		raise ValueError(f"G must be 'full-batch', 'mini-batch' or 'stochastic', got {G!r}.")
	w, b = np.zeros((n, 1)), 0
	for _ in range(τ):
		for i in range(0, m, batch_size):
			X_batch, y_batch = X_train[i:i+batch_size], y_train[i:i+batch_size]
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, L, λ)
			w, b = w - α*dw, b - α*db
	return w, b

# Classify every row of X_test with the given weights w and bias b.
# Returns a list with one entry per row: 1 when the predicted probability
# exceeds 0.5, and 0 otherwise.
def predict_logreg(w, b):
	# Probabilities ŷ = σ(X ⋅ w + b).
	logits = np.dot(X_test, w) + b
	probabilities = σ(logits)
	labels = []
	for probability in probabilities:
		labels.append(1 if probability > 0.5 else 0)
	return labels

# Train logreg with the given hyperparameters, predict the test set, and write
# the predictions to ./predictions/logreg/, in a CSV named after the hyperparameters.
# The elapsed time is logged and printed.
def generate_predictions_logreg(τ, α, G="mini-batch", β=1, L=None, λ=0):
	started = time.time()
	w, b = train_logreg(τ, α, G, β, L, λ)
	out_dir = "./predictions/logreg/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, f"G={G},β={β},L={L},λ={λ},τ={τ},α={α}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predict_logreg(w, b)
	})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Same success message goes to the log file and to stdout.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_logreg(10, 0.1, "stochastic")

## Task 2: PCA-KNN Model (`pcaknn`)

In [11]:
# Return PCA-transformed train and test featuresets, for either some ν or ρ.
# mode == "ν": mode_value is the explained-variance ratio to keep, 0 < ν < 1.
# mode == "ρ": mode_value is the number of components, an int greater than 1.
# Returns (X_train_pca, X_test_pca, number of components actually kept).
# Raises ValueError when mode or mode_value is invalid.
def apply_pca(mode_value, mode):
	# Validate mode and mode value.
	if mode == "ν":
		# ν must be strictly below 1: PCA reads an int 1 as "one component"
		# and rejects a float 1.0, so neither means "keep all the variance".
		valid = 0 < mode_value < 1
	elif mode == "ρ":
		# Type check first so a non-numeric value is reported as ValueError
		# rather than failing inside the comparison.
		valid = isinstance(mode_value, int) and mode_value > 1
	else:
		valid = False
	if not valid:
		raise ValueError("Mode must either be ν with value between 0 and 1 (exclusive), or ρ with mode_value as an integer greater than 1.")
	# Instantiate PCA and StandardScaler objects.
	pca = PCA(n_components=mode_value)
	scaler = StandardScaler()
	# PCA-transform train and test featuresets.
	# Scaler and PCA are fitted on the train set only, then applied to the test set.
	X_train_pca = pca.fit_transform(scaler.fit_transform(X_train))
	X_test_pca = pca.transform(scaler.transform(X_test))
	# Return PCA-transformed featuresets and # of components.
	return X_train_pca, X_test_pca, pca.n_components_

# Return KNN predictions for PCA-transformed train and test datasets.
# X_train_pca, X_test_pca: transformed featuresets. y_train: train labels.
# n_neighbors: number of neighbours that vote (defaults to 2, the value that
# was previously hard-coded).
def train_and_predict_knn(X_train_pca, y_train, X_test_pca, n_neighbors=2):
	# Instantiate KNeighborsClassifier object.
	knn = KNeighborsClassifier(n_neighbors=n_neighbors)
	# Flatten the labels: fitting on an m-by-1 column vector makes scikit-learn
	# emit a DataConversionWarning.
	knn.fit(X_train_pca, np.ravel(y_train))
	return knn.predict(X_test_pca)

# Train model, make model predictions, and save model predictions to CSV file, for some ν or ρ.
# mode == "ν": mode_value is the explained-variance ratio to keep, 0 < ν < 1.
# mode == "ρ": mode_value is the number of components, an int greater than 1.
# The CSV is written to ./predictions/pcaknn/ and the elapsed time is logged and printed.
# Raises ValueError when mode or mode_value is invalid.
def generate_predictions_pcaknn(mode_value, mode):
	# Validate mode and mode value before any expensive work starts.
	if mode == "ν":
		# ν must be strictly below 1: PCA reads an int 1 as "one component"
		# and rejects a float 1.0, so neither means "keep all the variance".
		valid = 0 < mode_value < 1
	elif mode == "ρ":
		# Type check first so a non-numeric value is reported as ValueError.
		valid = isinstance(mode_value, int) and mode_value > 1
	else:
		valid = False
	if not valid:
		raise ValueError("Mode must either be ν with value between 0 and 1 (exclusive), or ρ with mode_value as an integer greater than 1.")
	start_time = time.time()
	X_train_pca, X_test_pca, n_components = apply_pca(mode_value, mode)
	os.makedirs("./predictions/pcaknn/", exist_ok=True)
	if mode == "ν":
		# Record both the requested ratio and the resulting component count.
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ν={mode_value},ρ={n_components}).csv")
	else:
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ρ={n_components}).csv")
	pd.DataFrame({
		"id": S_test["id"],
		# Labels are passed 1-D, as the rforest and svm models do.
		"label": train_and_predict_knn(X_train_pca, y_train.ravel(), X_test_pca)
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_pcaknn(5000, "ρ")

## Task 3: Other Models

### Random Forest Model (`rforest`)

In [12]:
# Fit a random forest on the train set, predict the test set, and write the
# predictions to ./predictions/rforest/, in a CSV named after 𝒹, 𝓈, ℓ and η.
# 𝒹: maximum depth. η: number of estimators. ℓ: minimum samples at a leaf.
# 𝓈: minimum samples to split a node. seed: random seed.
def generate_predictions_rforest(𝒹, η, ℓ, 𝓈, seed):
	started = time.time()
	forest_options = {
		"max_depth": 𝒹,
		"min_samples_split": 𝓈,
		"min_samples_leaf": ℓ,
		"n_estimators": η,
		"random_state": seed,
	}
	forest = RandomForestClassifier(**forest_options)
	forest.fit(X_train, y_train.ravel())
	labels = forest.predict(X_test)
	out_dir = "./predictions/rforest/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, f"𝒹={𝒹},𝓈={𝓈},ℓ={ℓ},η={η}.csv")
	submission = pd.DataFrame({"id": S_test["id"], "label": labels})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Same success message goes to the log file and to stdout.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_rforest(𝒹=10000, η=100, ℓ=2, 𝓈=10, seed=42)

### SVM Model (`svm`)

In [13]:
# Fit an SVM on the train set, predict the test set, and write the predictions
# to ./predictions/svm/, in a CSV named after K and λ.
# K: kernel type. λ: value passed to SVC as C. The random state is fixed at 42.
def generate_predictions_svm(K, λ):
	started = time.time()
	classifier = SVC(kernel=K, C=λ, random_state=42)
	classifier.fit(X_train, y_train.ravel())
	labels = classifier.predict(X_test)
	out_dir = "./predictions/svm/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, f"K={K},λ={λ}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": labels
	}).to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Same success message goes to the log file and to stdout.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_svm("linear", 1.0)

### Gradient Boosted Decision Tree Model (`gbdtree`)

In [14]:
# Train GradientBoosting classifier, make predictions, and save predictions to CSV file.
# α: learning rate. 𝒹: maximum depth. ℓ: minimum samples at a leaf.
# η: number of estimators. 𝓈: minimum samples to split a node. seed: random seed.
# The CSV is written to ./predictions/gbdtree/ and named after all six values.
def generate_predictions_gbdtree(α=0.1, 𝒹=None, ℓ=1, η=100, 𝓈=2, seed=42):
    start_time = time.time()
    gbc = GradientBoostingClassifier(
        learning_rate=α,
        max_depth=𝒹,
        min_samples_leaf=ℓ,
        n_estimators=η,
        min_samples_split=𝓈,
        random_state=seed
    )
    # Fit on 1-D labels, as the rforest and svm models do; the m-by-1 column
    # vector makes scikit-learn emit a DataConversionWarning.
    gbc.fit(X_train, y_train.ravel())
    predictions = gbc.predict(X_test)
    os.makedirs("./predictions/gbdtree/", exist_ok=True)
    file_name = os.path.join("./predictions/gbdtree/", f"α={α},𝒹={𝒹},ℓ={ℓ},η={η},𝓈={𝓈},seed={seed}.csv")
    submission = pd.DataFrame({
        "id": S_test["id"],
        "label": predictions
    })
    submission.to_csv(file_name, index=False)
    end_time = time.time()
    # Log and print success message.
    logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
    print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_gbdtree(α=0.1, 𝒹=None, ℓ=1, η=100, 𝓈=2, seed=42)

### Histogram Gradient Boosted Decision Tree Model (`hgbdtree`)

In [19]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Train HistGradientBoosting classifier, make predictions, and save predictions to CSV file.
# α: learning rate. 𝒹: maximum depth. ℓ: passed as max_leaf_nodes (note: not a
# minimum leaf size here). η: passed as max_iter. seed: random seed.
# The CSV is written to ./predictions/hgbdtree/ and named after all five values.
def generate_predictions_hgbdtree(α=0.1, 𝒹=None, ℓ=31, η=100, seed=42):
    start_time = time.time()
    hgbc = HistGradientBoostingClassifier(
        learning_rate=α,
        max_depth=𝒹,
        max_leaf_nodes=ℓ,
        max_iter=η,
        random_state=seed
    )
    # Fit on 1-D labels, as the rforest and svm models do; the m-by-1 column
    # vector triggers the "y = column_or_1d(y, warn=True)" warning.
    hgbc.fit(X_train, y_train.ravel())
    predictions = hgbc.predict(X_test)
    os.makedirs("./predictions/hgbdtree/", exist_ok=True)
    file_name = os.path.join("./predictions/hgbdtree/", f"α={α},𝒹={𝒹},ℓ={ℓ},η={η},seed={seed}.csv")
    submission = pd.DataFrame({
        "id": S_test["id"],
        "label": predictions
    })
    submission.to_csv(file_name, index=False)
    end_time = time.time()
    # Log and print success message.
    logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
    print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_hgbdtree(α=0.12, 𝒹=None, ℓ=None, η=600, seed=8)

  y = column_or_1d(y, warn=True)


Predictions file ./predictions/hgbdtree/α=0.12,𝒹=None,ℓ=None,η=600,seed=8.csv generated in 110.69s.


### Stacking Ensemble Model (`stack`) [WIP]

In [16]:
# Define wrapper classes for original (i.e. non-imported) models.
# scikit-learn compatible wrapper around L2-regularised mini-batch gradient
# descent logistic regression.
# β: batch size. τ: number of epochs. α: learning rate. λ: regularisation degree.
# fit and predict work on the data they are given, so the estimator can be
# cloned and cross-validated (e.g. inside StackingClassifier).
# NOTE(review): the update rule mirrors this file's mini-batch "L2" logreg
# training; confirm that this is what "l2amgdlogreg" is meant to be.
# NOTE(review): labels are assumed to be 0/1 — confirm against the dataset.
class CustomL2AMGDLogReg(BaseEstimator, ClassifierMixin):
	def __init__(self, β=128, τ=2000, α=0.0825, λ=0.01):
		self.β = β
		self.τ = τ
		self.α = α
		self.λ = λ
		# Learned parameters; they stay None until fit is called.
		self.w = None
		self.b = None

	# Learn w and b from X (m-by-n features) and y (m labels); returns self.
	# Raises ValueError when β is smaller than 1.
	def fit(self, X, y):
		if self.β < 1:
			raise ValueError(f"β must be a positive integer, got {self.β}.")
		# Class labels seen during fit, exposed under the scikit-learn name.
		self.classes_ = np.unique(y)
		X = np.asarray(X, dtype=float)
		# Column vector so that ŷ - y is m-by-1 rather than broadcast to m-by-m.
		y = np.asarray(y, dtype=float).reshape(-1, 1)
		m, n = X.shape
		w, b = np.zeros((n, 1)), 0.0
		for _ in range(self.τ):
			for i in range(0, m, self.β):
				X_batch, y_batch = X[i:i+self.β], y[i:i+self.β]
				m_batch = X_batch.shape[0]
				ŷ = 1 / (1 + np.exp(-(np.dot(X_batch, w) + b)))
				# L2-penalised gradient w.r.t. w; the bias is not penalised.
				dw = 1/m_batch * np.dot(X_batch.T, (ŷ - y_batch)) + self.λ * w / m_batch
				db = 1/m_batch * np.sum(ŷ - y_batch)
				w, b = w - self.α*dw, b - self.α*db
		self.w, self.b = w, b
		return self

	# Return a 1-D integer array with one prediction per row of X: 1 when the
	# predicted probability exceeds 0.5, and 0 otherwise.
	# Raises ValueError when called before fit.
	def predict(self, X):
		if self.w is None or self.b is None:
			raise ValueError("CustomL2AMGDLogReg must be fitted before calling predict.")
		ŷ = 1 / (1 + np.exp(-(np.dot(np.asarray(X, dtype=float), self.w) + self.b)))
		return (ŷ > 0.5).astype(int).ravel()

# Build and fit the base models of the stacking ensemble.
# hyperparameters: dict keyed by model name ("l2amgdlogreg", "knn", "dtree",
# "rforest", "svm"), each mapping to that model's settings.
# Returns a list of (name, fitted estimator) pairs in that order.
# Raises KeyError when a required model or setting is missing.
def train_models(hyperparameters):
	# Imported here because the file's import cell does not bring it in.
	from sklearn.tree import DecisionTreeClassifier

	# Build every estimator before fitting any of them, so that a missing
	# hyperparameter fails immediately instead of after minutes of training.
	models = [
		("l2amgdlogreg", CustomL2AMGDLogReg(
			β=hyperparameters["l2amgdlogreg"]["β"],
			τ=hyperparameters["l2amgdlogreg"]["τ"],
			α=hyperparameters["l2amgdlogreg"]["α"],
			λ=hyperparameters["l2amgdlogreg"]["λ"]
		)),
		("knn", KNeighborsClassifier(
			n_neighbors=hyperparameters["knn"]["n_neighbors"]
		)),
		("dtree", DecisionTreeClassifier(
			random_state=hyperparameters["dtree"]["random_state"],
			max_depth=hyperparameters["dtree"]["𝒹"],
			min_samples_split=hyperparameters["dtree"]["𝓈"],
			min_samples_leaf=hyperparameters["dtree"]["ℓ"]
		)),
		("rforest", RandomForestClassifier(
			random_state=hyperparameters["rforest"]["random_state"],
			n_estimators=hyperparameters["rforest"]["η"]
		)),
		("svm", SVC(
			kernel=hyperparameters["svm"]["κ"],
			C=hyperparameters["svm"]["λ"],
			probability=hyperparameters["svm"]["probability"],
			random_state=hyperparameters["svm"]["random_state"]
		)),
	]
	# Every model is fitted on the full train set with 1-D labels.
	labels = y_train.ravel()
	for _, model in models:
		model.fit(X_train, labels)
	return models

# Train the base models, stack them under a logistic-regression final
# estimator with 5-fold cross-validation, predict the test set, and write the
# predictions to ./predictions/stack/predictions.csv.
# hyperparameters: per-model settings, forwarded to train_models.
def generate_predictions_stack(hyperparameters):
	started = time.time()
	base_models = train_models(hyperparameters)
	ensemble = StackingClassifier(
		estimators=base_models,
		final_estimator=LogisticRegression(),
		cv=5
	)
	ensemble.fit(X_train, y_train.ravel())
	labels = ensemble.predict(X_test)
	out_dir = "./predictions/stack/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, "predictions.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": labels
	}).to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Same success message goes to the log file and to stdout.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)


# Per-model settings for the stacking ensemble, keyed by model name.
# Each inner dict holds the keys that train_models reads for that model.
hyperparameters = {
	# β: batch size, τ: # of epochs, α: learning rate, λ: regularisation degree.
	"l2amgdlogreg": {"β": 128, "τ": 2000, "α": 0.0825, "λ": 0.01},
	"knn": {"n_neighbors": 5},
	# 𝒹: maximum depth, 𝓈: minimum # of samples to split, ℓ: minimum # of samples at leaf.
	"dtree": {"random_state": 42, "𝒹": 10, "𝓈": 2, "ℓ": 1},
	# η: # of estimators.
	"rforest": {"random_state": 42, "η": 100},
	# κ: kernel type, λ: value passed to SVC as C.
	"svm": {"κ": "linear", "λ": 1.0, "probability": True, "random_state": 42}
}

# generate_predictions_stack(hyperparameters)