# Cogito

## Definitions
- `m` is # of data points.
- `n` is # of features.
- `X` is `m`-by-`n` matrix representing features.
- `y` is `m`-by-1 vector representing labels.
- `w` is `n`-by-1 vector representing weights.
- `b` is scalar representing bias.
- `dw` is loss gradient w.r.t. `w`.
- `db` is loss gradient w.r.t. `b`.

### logreg(τ,α,G,β,L,λ)
1. `τ` is # of epochs: non-negative integer.
2. `α` is learning rate: float in range ${[0,1]}$.
3. `G` is gradient descent type: `"full-batch"`, `"mini-batch"`, xor `"stochastic"`.
4. `β` is batch size: non-negative integer xor `None`.
5. `L` is regularisation type: `"L2"`, `"L1"`, xor `None`.
6. `λ` is regularisation degree: real in range ${[0,\infty)}$.

### pcaknn(ρ,ν)
1. `ρ` is # of principal components: integer in range ${[0,n]}$.
2. `ν` is variance ratio: float in range ${[0,1]}$.

### svm(λ,K,°,γ,κ,τ,seed)
1. `λ` is margin regularisation degree.
2. `K` is kernel type: `"linear"`, `"poly"`, `"rbf"`, `"sigmoid"`, xor `"precomputed"`.
3. `°` is polynomial kernel degree for polynomial kernel type: non-negative integer.
4. `γ` is kernel coefficient for RBF, polynomial, xor sigmoid kernel type: `"scale"`, `"auto"`, xor float in range ${[0,1]}$.
5. `κ` is independent term of kernel function with effect for polynomial xor sigmoid kernel type.
6. `τ` is maximum # of iterations: non-negative integer xor `-1`.
7. `seed` is pseudorandom random state: integer xor `None`.

### gbdtree(α,η,γ,ψ,ℓ,Δ,δ,seed,θ)
1. `α` is learning rate: float in range ${[0,1]}$.
2. `η` is # of estimators: non-negative integer.
3. `γ` is fraction of samples used to fit individual base learners: float in range ${[0,1]}$.
4. `ψ` is minimum # of samples needed to split internal node: non-negative integer.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer.
6. `Δ` is maximum tree depth: non-negative integer xor `None`.
7. `δ` is minimum impurity decrease: float in range ${[0,1]}$.
8. `seed` is pseudorandom random state: integer xor `None`.
9. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.

#### hgbdtree(α,τ,θ,Δ,ℓ,seed)
1. `α` is learning rate: float in range ${[0,1]}$.
2. `τ` is maximum # of iterations: non-negative integer.
3. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.
4. `Δ` is maximum tree depth: non-negative integer xor `None`.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer xor `None`.
6. `seed` is pseudorandom random state: integer xor `None`.

#### rforest(η,C,Δ,ψ,ℓ,θ,δ,B,seed)
1. `η` is # of estimators: non-negative integer.
2. `C` is criterion to measure split quality: `"entropy"`, `"log_loss"`, xor `"gini"`.
3. `Δ` is maximum tree depth: non-negative integer xor `None`.
4. `ψ` is minimum # of samples needed to split internal node: non-negative integer.
5. `ℓ` is minimum # of samples needed at leaf node: non-negative integer.
6. `θ` is maximum # of leaf nodes: non-negative integer xor `None`.
7. `δ` is minimum impurity decrease: float in range ${[0,1]}$.
8. `B` is whether bootstrapping is used: `False` xor `True`.
9. `seed` is pseudorandom random state: integer xor `None`.


## Setup
- numpy, pandas, and sklearn (scikit-learn) must be installed and available.

In [None]:
# Import os for operating system functions.
import os
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time

from sklearn.preprocessing import StandardScaler
# Import PCA and KNeighborsClassifier for pcaknn.
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Import RandomForestClassifier for rforest.
from sklearn.ensemble import RandomForestClassifier

# Import SVC for svm model.
from sklearn.svm import SVC

# Import GradientBoostingClassifier for gbdtree.
from sklearn.ensemble import GradientBoostingClassifier
# Import HistGradientBoostingClassifier for hgbdtree.
from sklearn.ensemble import HistGradientBoostingClassifier

# Import StackingClassifier, LogisticRegression, BaseEstimator, and ClassifierMixin for stack.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

# Configure root logger: append INFO-and-above records to main.log,
# each line prefixed with second-resolution local timestamp.
logging.basicConfig(
	datefmt="%Y-%m-%d %H:%M:%S",
	format="%(asctime)s - %(message)s",
	level=logging.INFO,
	filename="main.log",
)

# Read raw train/test tables and their TF-IDF feature tables from working directory.
S_train = pd.read_csv("train.csv")
S_train_tfidf = pd.read_csv("train_tfidf_features.csv")
S_test = pd.read_csv("test.csv")
S_test_tfidf = pd.read_csv("test_tfidf_features.csv")

# Build model inputs as numpy arrays.
# NOTE(review): train features drop first two columns whereas test features drop only first one —
# presumably train TF-IDF file carries one extra non-feature column; confirm against CSV headers.
train_feature_frame = S_train_tfidf.iloc[:, 2:]
test_feature_frame = S_test_tfidf.iloc[:, 1:]
X_train = train_feature_frame.values
# Labels are kept as m-by-1 column vector.
y_train = S_train["label"].values.reshape(-1, 1)
X_test = test_feature_frame.values

# Define generic functions.
# Return σ(z) = 1 / (1 + exp(-z)) for some scalar or array z.
# Computed as exp(-log(1 + exp(-z))) through np.logaddexp, so that large negative z
# underflows cleanly to 0 instead of overflowing inside exp(-z).
def σ(z): return np.exp(-np.logaddexp(0, -z))

# Return cross-entropy loss for some y and ŷ.
# def loss(y, ŷ): return (-1/(y.shape[0])) * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

## Task 1: Logistic Regression Model (`logreg`)

In [None]:
# Return dw and db, for some X, y, ŷ, w, L, and λ.
# - X is batch of features, y is batch of labels, and ŷ is batch of predicted probabilities.
# - w is current weights; it is only read when regularisation is enabled.
# - L is regularisation type: "L2", "L1", xor None (any other value adds no penalty).
# - λ is regularisation degree; penalty gradient is scaled by 1/m like data term.
# - b is never regularised, so db has no penalty term.
def gradients_logreg(X, y, ŷ, w, L=None, λ=0):
	m, _ = X.shape
	# Residual between predictions and labels drives both gradients.
	residual = ŷ - y
	dw = 1/m * np.dot(X.T, residual)
	db = 1/m * np.sum(residual)
	if L == "L2":
		dw += λ * w / m
	elif L == "L1":
		dw += λ * np.sign(w) / m
	return dw, db

# Return w and b from gradient descent on X_train and y_train, for some τ, α, G, β, L, and λ.
# - τ is # of epochs (full passes over X_train).
# - α is learning rate.
# - G is gradient descent type: "full-batch", "mini-batch", xor "stochastic".
# - β is batch size; it is only read when G is "mini-batch".
# - L and λ are regularisation type and degree, forwarded to gradients_logreg.
# Raise ValueError for unknown G, or for "mini-batch" without positive β;
# previously unknown G silently returned all-zero weights.
def train_logreg(τ=1000, α=0.1, G="mini-batch", β=128, L=None, λ=0):
	m, n = X_train.shape
	# All three variants are same loop; they differ only in how many rows each update sees.
	if G == "full-batch":
		# max(..., 1) keeps range() step valid even for empty dataset.
		batch_size = max(m, 1)
	elif G == "stochastic":
		batch_size = 1
	elif G == "mini-batch":
		if β is None or β < 1:
			raise ValueError("β must be a positive integer when G is \"mini-batch\".")
		batch_size = β
	else:
		raise ValueError("G must be \"full-batch\", \"mini-batch\", or \"stochastic\".")
	w, b = np.zeros((n, 1)), 0
	for _ in range(τ):
		# Batches are taken in file order; rows are not shuffled between epochs.
		for i in range(0, m, batch_size):
			X_batch, y_batch = X_train[i:i+batch_size], y_train[i:i+batch_size]
			ŷ = σ(np.dot(X_batch, w) + b)
			dw, db = gradients_logreg(X_batch, y_batch, ŷ, w, L, λ)
			w, b = w - α*dw, b - α*db
	return w, b

# Return list of predictions for X_test, where each prediction is 1 if corresponding ŷ entry > 0.5, and 0 otherwise.
# - w is weights and b is bias, as returned by train_logreg.
def predict_logreg(w, b):
	# Compute ŷ = σ(X ⋅ w + b).
	ŷ = σ(np.dot(X_test, w) + b)
	# Threshold whole vector at once instead of looping over rows in Python;
	# ravel flattens m-by-1 column so that result is flat list of ints.
	return (np.ravel(ŷ) > 0.5).astype(int).tolist()

# Train model, make model predictions, and save model predictions to CSV file, for some τ, α, G, β, L, and λ.
# - All parameters are forwarded unchanged to train_logreg.
# - Output goes to ./predictions/logreg/, in file named after hyperparameters,
#   so that rerun with same hyperparameters overwrites earlier file.
def generate_predictions_logreg(τ=1000, α=0.1, G="mini-batch", β=128, L=None, λ=0):
	start_time = time.time()
	w, b = train_logreg(τ, α, G, β, L, λ)
	os.makedirs("./predictions/logreg/", exist_ok=True)
	file_name = os.path.join("./predictions/logreg/", f"G={G},β={β},L={L},λ={λ},τ={τ},α={α}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predict_logreg(w, b)
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message; duration covers training, prediction, and file write.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_logreg(10, 0.1, "stochastic")

## Task 2: PCA-KNN Model (`pcaknn`)

In [None]:
# Return PCA-transformed train and test featuresets and # of components kept, for either some ν or ρ.
# - mode is "ν" (mode_value is variance ratio in (0, 1]) xor "ρ" (mode_value is integer # of components > 1).
# - Raise ValueError for any other mode / mode_value combination.
# - Scaler and PCA are both fitted on X_train only and then applied to X_test.
def apply_pca(mode_value, mode):
	# Validate mode and mode value; type is checked before comparison for ρ.
	if mode == "ν":
		is_valid = 0 < mode_value <= 1
	elif mode == "ρ":
		is_valid = isinstance(mode_value, int) and mode_value > 1
	else:
		is_valid = False
	if not is_valid:
		raise ValueError("Mode must either be ν with value between 0 and 1, or ρ with mode_value as an integer greater than 1.")
	# PCA reads n_components as variance ratio only when strictly between 0 and 1;
	# ratio of exactly 1 would be taken as "1 component", so keep every component instead.
	n_components = None if (mode == "ν" and mode_value == 1) else mode_value
	# Instantiate PCA and StandardScaler objects.
	pca = PCA(n_components=n_components)
	scaler = StandardScaler()
	# PCA-transform train and test featuresets.
	X_train_pca = pca.fit_transform(scaler.fit_transform(X_train))
	X_test_pca = pca.transform(scaler.transform(X_test))
	# Return PCA-transformed featuresets and # of components.
	return X_train_pca, X_test_pca, pca.n_components_

# Return KNN predictions for PCA-transformed train and test datasets.
# - X_train_pca and X_test_pca are transformed featuresets, and y_train is train labels.
# - n_neighbors is # of neighbours that vote on each prediction; default 2 matches earlier hard-coded value.
def train_and_predict_knn(X_train_pca, y_train, X_test_pca, n_neighbors=2):
	# Instantiate KNeighborsClassifier object.
	knn = KNeighborsClassifier(n_neighbors=n_neighbors)
	# Labels may arrive as m-by-1 column; flatten to 1-D array, which is shape fit() expects for single-output labels.
	knn.fit(X_train_pca, np.ravel(y_train))
	return knn.predict(X_test_pca)

# Train model, make model predictions, and save model predictions to CSV file, for some ν or ρ.
# - mode is "ν" (mode_value is variance ratio) xor "ρ" (mode_value is # of principal components).
# - Mode and mode value are validated by apply_pca, which raises ValueError before any directory or file is created.
# - Output goes to ./predictions/pcaknn/, in file named after # of components actually kept.
def generate_predictions_pcaknn(mode_value, mode="ρ"):
	start_time = time.time()
	X_train_pca, X_test_pca, n_components = apply_pca(mode_value, mode)
	os.makedirs("./predictions/pcaknn/", exist_ok=True)
	if mode == "ν":
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ν={mode_value},ρ={n_components}).csv")
	else:
		file_name = os.path.join("./predictions/pcaknn/", f"pcaknn(ρ={n_components}).csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": train_and_predict_knn(X_train_pca, y_train, X_test_pca)
	}).to_csv(file_name, index=False)
	end_time = time.time()

	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_pcaknn(mode_value=5000, mode="ρ")

## Task 3: Other Models

### Random Forest Model (`rforest`)

In [None]:
# Train RandomForest classifier, make predictions, and save predictions to CSV file.
# - η, C, Δ, ψ, ℓ, θ, δ, B, and seed are passed to RandomForestClassifier as n_estimators, criterion,
#   max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, min_impurity_decrease, bootstrap, and random_state.
# - Output goes to ./predictions/rforest/, in file named after every hyperparameter.
def generate_predictions_rforest(η=100, C="gini", Δ=None, ψ=2, ℓ=1, θ=None, δ=0.0, B=True, seed=None):
	start_time = time.time()
	# File name previously interpolated two names that are not parameters of this function,
	# which raised NameError only after forest had been trained; it now uses actual hyperparameters.
	file_name = os.path.join("./predictions/rforest/", f"η={η},C={C},Δ={Δ},ψ={ψ},ℓ={ℓ},θ={θ},δ={δ},B={B},seed={seed}.csv")
	rf = RandomForestClassifier(
		n_estimators=η,
		criterion=C,
		max_depth=Δ,
		min_samples_split=ψ,
		min_samples_leaf=ℓ,
		max_leaf_nodes=θ,
		min_impurity_decrease=δ,
		bootstrap=B,
		random_state=seed
	)
	# Flatten m-by-1 label column to 1-D array for fit().
	rf.fit(X_train, y_train.ravel())
	predictions = rf.predict(X_test)
	os.makedirs("./predictions/rforest/", exist_ok=True)
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_rforest(η=100, C="gini", Δ=None, ψ=2, ℓ=1, θ=None, δ=0.0, B=True, seed=None)

### SVM Model (`svm`)

In [None]:
# Train SVM classifier, make predictions, and save predictions to CSV file.
# - λ, K, degree, γ, κ, τ, and seed are passed to SVC as C, kernel, degree, gamma, coef0, max_iter, and random_state.
# - degree is parameter listed as ° in Definitions; ° is symbol rather than letter,
#   so it cannot be Python identifier and function could not be defined under that name.
# - Output goes to ./predictions/svm/, in file named after every hyperparameter (degree keeps its ° label there).
def generate_predictions_svm(λ=1.0, K="rbf", degree=3, γ="scale", κ=0.0, τ=-1, seed=None):
	start_time = time.time()
	svm_clf = SVC(
		C=λ,
		kernel=K,
		degree=degree,
		gamma=γ,
		coef0=κ,
		max_iter=τ,
		random_state=seed
	)
	# Flatten m-by-1 label column to 1-D array for fit().
	svm_clf.fit(X_train, y_train.ravel())
	predictions = svm_clf.predict(X_test)
	os.makedirs("./predictions/svm/", exist_ok=True)
	file_name = os.path.join("./predictions/svm/", f"λ={λ},K={K},°={degree},γ={γ},κ={κ},τ={τ},seed={seed}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_svm(λ=1.0, K="rbf", degree=3, γ="scale", κ=0.0, τ=-1, seed=None)

### Gradient Boosted Decision Tree Model (`gbdtree`)

In [None]:
# Train GradientBoosting classifier, make predictions, and save predictions to CSV file.
# - α, η, γ, ψ, ℓ, Δ, δ, seed, and θ are passed to GradientBoostingClassifier as learning_rate, n_estimators,
#   subsample, min_samples_split, min_samples_leaf, max_depth, min_impurity_decrease, random_state, and max_leaf_nodes.
# - Output goes to ./predictions/gbdtree/, in file named after every hyperparameter.
def generate_predictions_gbdtree(α=0.1, η=100, γ=1.0, ψ=2, ℓ=1, Δ=3, δ=0.0, seed=None, θ=None):
	start_time = time.time()
	gbc = GradientBoostingClassifier(
		learning_rate=α,
		n_estimators=η,
		subsample=γ,
		min_samples_split=ψ,
		min_samples_leaf=ℓ,
		max_depth=Δ,
		min_impurity_decrease=δ,
		random_state=seed,
		max_leaf_nodes=θ
	)
	# Flatten m-by-1 label column to 1-D array for fit(), as rforest and svm generators already do.
	gbc.fit(X_train, y_train.ravel())
	predictions = gbc.predict(X_test)
	os.makedirs("./predictions/gbdtree/", exist_ok=True)
	file_name = os.path.join("./predictions/gbdtree/", f"α={α},η={η},γ={γ},ψ={ψ},ℓ={ℓ},Δ={Δ},δ={δ},seed={seed},θ={θ}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

generate_predictions_gbdtree(α=0.1, η=100, γ=1.0, ψ=2, ℓ=1, Δ=3, δ=0.0, seed=None, θ=None)

### Histogram Gradient Boosted Decision Tree Model (`hgbdtree`)

In [None]:
# Train HistGradientBoosting classifier, make predictions, and save predictions to CSV file.
# - α, τ, θ, Δ, ℓ, and seed are passed to HistGradientBoostingClassifier as learning_rate, max_iter,
#   max_leaf_nodes, max_depth, min_samples_leaf, and random_state.
# - Output goes to ./predictions/hgbdtree/, in file named after every hyperparameter.
def generate_predictions_hgbdtree(α=0.1, τ=100, θ=31, Δ=None, ℓ=20, seed=None):
	start_time = time.time()
	hgbc = HistGradientBoostingClassifier(
		learning_rate=α,
		max_iter=τ,
		max_leaf_nodes=θ,
		max_depth=Δ,
		min_samples_leaf=ℓ,
		random_state=seed
	)
	# Flatten m-by-1 label column to 1-D array for fit(), as rforest and svm generators already do.
	hgbc.fit(X_train, y_train.ravel())
	predictions = hgbc.predict(X_test)
	os.makedirs("./predictions/hgbdtree/", exist_ok=True)
	file_name = os.path.join("./predictions/hgbdtree/", f"α={α},τ={τ},θ={θ},Δ={Δ},ℓ={ℓ},seed={seed}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	message = f"Predictions file {file_name} generated in {end_time - start_time:.2f}s."
	logging.info(message)
	print(message)

# generate_predictions_hgbdtree(α=0.1, τ=100, θ=31, Δ=None, ℓ=20, seed=None)

### Bagging Model

In [None]:
# Define wrapper classes for original (i.e. non-imported) models.

