# Cogito
Œ±Œ≤œÑœÅŒΩùíπùìà‚ÑìŒ∑ŒªŒµ

## Introduction
### Definitions
- Let `X` be an `m`-by-`n` matrix of input data, and `y` be an `m`-by-1 vector of output data, where `m` is the # of data points, and `n` is the # of features, so that `X` represents the features and `y` represents the labels.
- Let `w` be an `n`-by-1 vector of weights, and `b` be a scalar bias.
- Let `dw` be the loss gradient w.r.t. `w`, and `db` be the loss gradient w.r.t. `b`, where for least squares we have ${dw=\frac{1}{m}\sum_{i=1}^{m}(\hat{y}^{(i)} - y^{(i)})x^{(i)}+\frac{\lambda w}{m}}$ and ${db = \frac{1}{m}\sum_{i=1}^{m}(\hat{y}^{(i)} - y^{(i)})}$.
- Let `Œ≤` be the batch size.
- Let `œÑ` be the # of epochs.
- Let `Œ±` be the learning rate.
- Let `œÅ` be the # of principal components (where `n` is the maximum possible).
- Let `ŒΩ` be the variance ratio.
- Let `ùíπ` be the maximum depth.
- Let `ùìà` be the minimum number of samples to split an internal node.
- Let `‚Ñì` be the minimum number of samples at a leaf node.
- Let `Œ∑` be the number of estimators.
- Let `Œª` be either the L2 regularisation parameter (in the context of logistic regression), or the C parameter (in the context of SVM).
- Let `Œµ` be a small positive constant.

### Naming Conventions
1. Dataset is treated as a single word.
2. Featureset refers to the features of a dataset.
3. Labelset refers to the labels of a dataset.

## Setup
- numpy, pandas, and sklearn (scikit-learn) must be installed and available.

In [17]:
# Import os for operating system functions.
import os
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time

from sklearn.preprocessing import StandardScaler
# Import PCA and KNeighborsClassifier for PCA-KNN model.
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
# Import DecisionTreeClassifier for dtree model.
from sklearn.tree import DecisionTreeClassifier
# Import RandomForestClassifier for rforest model.
from sklearn.ensemble import RandomForestClassifier
# Import SVC for SVM model.
from sklearn.svm import SVC
# from sklearn.ensemble import VotingClassifier
# Import LogisticRegression, BaseEstimator, and ClassifierMixin for stack model.
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin

# Route log records to main.log, each prefixed with a second-resolution timestamp.
logging.basicConfig(
	filename="main.log",
	level=logging.INFO,
	format="%(asctime)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S",
)

# Read the raw datasets and their TF-IDF feature tables from the working directory.
S_train, S_train_tfidf, S_test, S_test_tfidf = map(
	pd.read_csv,
	("train.csv", "train_tfidf_features.csv", "test.csv", "test_tfidf_features.csv"),
)

# Train featureset: every TF-IDF column from the third onwards.
# NOTE(review): presumably the first two columns are id and label — confirm against the CSV.
X_train = S_train_tfidf.iloc[:, 2:].values
# Train labelset, reshaped to an m-by-1 column vector.
y_train = S_train["label"].values.reshape(-1, 1)
# Test featureset: every TF-IDF column from the second onwards.
# NOTE(review): presumably only an id column precedes the features here — confirm.
X_test = S_test_tfidf.iloc[:, 1:].values

# Define generic functions.
def œÉ(z):
	"""Return the logistic sigmoid 1 / (1 + exp(-z)) of z.

	np.exp is applied elementwise, so z may be a scalar or a numpy array;
	an array result has the same shape as z.
	"""
	denominator = 1 + np.exp(-z)
	return 1 / denominator
# Return cross-entropy loss for some y and yÃÇ.
# def loss(y, yÃÇ): return (-1/(y.shape[0])) * np.sum(y * np.log(yÃÇ) + (1 - y) * np.log(1 - yÃÇ))

## Task 1: Mini-Batch Gradient Descent Logistic Regression Model (mgdlogreg)

In [18]:
def gradients_mgdlogreg(X, y, yÃÇ):
	"""Return the gradients (dw, db) of the loss for one batch.

	Parameters:
		X: 2-D batch of features, one row per data point.
		y: labels for the batch.
		yÃÇ: model outputs for the batch, same shape as y.

	Returns:
		dw: X transposed times the residual (yÃÇ - y), scaled by 1/m.
		db: sum of the residual, scaled by 1/m.
	where m is the number of rows of X.
	"""
	n_rows, _ = X.shape
	residual = yÃÇ - y
	dw = 1/n_rows * (X.T @ residual)
	db = 1/n_rows * np.sum(residual)
	return dw, db

# Return w and b from mini-batch gradient descent on X and y, for batch size Œ≤, epochs œÑ, and learning rate Œ±.
def train_mgdlogreg(Œ≤, œÑ, Œ±):
	m, n = X_train.shape
	# Initialise w to 0 vector and b to 0.
	w, b = np.zeros((n, 1)), 0
	# For each epoch:
	for epoch in range(œÑ):
		# For every batch within epoch:
		for i in range(0, m, Œ≤):
			X_batch, y_batch = X_train[i:i+Œ≤], y_train[i:i+Œ≤]
			yÃÇ = œÉ(np.dot(X_batch, w) + b)
			dw, db = gradients_mgdlogreg(X_batch, y_batch, yÃÇ)
			w, b = w - Œ±*dw, b - Œ±*db
	return w, b

def predict_mgdlogreg(w, b):
	"""Return a list of 0/1 predictions for the module-level X_test.

	Parameters:
		w: weight vector as returned by train_mgdlogreg.
		b: bias as returned by train_mgdlogreg.

	Returns:
		A flat list of Python ints: 1 where the predicted probability is
		strictly greater than 0.5, and 0 otherwise.
	"""
	# Predicted probabilities: sigmoid of the affine score of every test row.
	yÃÇ = œÉ(np.dot(X_test, w) + b)
	# Threshold all rows in one vectorised step instead of looping in Python
	# and relying on the truth value of one-element arrays.
	return np.where(yÃÇ > 0.5, 1, 0).ravel().tolist()

# Train model, make model predictions, and save model predictions to CSV file, for some Œ≤, œÑ, and Œ±.
def generate_predictions_mgdlogreg(Œ≤, œÑ, Œ±):
	start_time = time.time()
	w, b = train_mgdlogreg(Œ≤, œÑ, Œ±)
	os.makedirs("./predictions/", exist_ok=True)
	file_name = os.path.join("./predictions/", f"mgdlogreg(Œ≤={Œ≤},œÑ={œÑ},Œ±={Œ±}).csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predict_mgdlogreg(w, b)
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# Example usage:
# generate_predictions_mgdlogreg(128, 1200, 0.084)

## Task 2: PCA-KNN Model (pcaknn)
### Usage Examples
1. `generate_predictions_pcaknn(0.95, "ŒΩ")`
2. `generate_predictions_pcaknn(2000, "œÅ")`

In [19]:
def apply_pca(mode_value, mode):
	"""Standardise and PCA-transform the module-level train and test featuresets.

	Parameters:
		mode: "ŒΩ" to select components by variance ratio, or "œÅ" to select
			a fixed number of components.
		mode_value: for "ŒΩ", a ratio with 0 < mode_value <= 1; for "œÅ", an
			int greater than 1.

	Returns:
		(X_train_pca, X_test_pca, n_components): the transformed featuresets
		and the number of components the fitted PCA kept.

	Raises:
		ValueError: if mode or mode_value fails the validation above.
	"""
	# Validate mode and mode value.
	ratio_ok = mode == "ŒΩ" and 0 < mode_value <= 1
	count_ok = mode == "œÅ" and mode_value > 1 and isinstance(mode_value, int)
	if not (ratio_ok or count_ok):
		raise ValueError("Mode must either be ŒΩ with value between 0 and 1, or œÅ with mode_value as an integer greater than 1.")
	# A variance ratio of exactly 1 means "keep everything". Passing 1 straight
	# to PCA would instead request a single component (int) or be rejected
	# (float), so map it to None, which keeps all components.
	n_components = None if (mode == "ŒΩ" and mode_value == 1) else mode_value
	# Instantiate PCA and StandardScaler objects.
	pca = PCA(n_components=n_components)
	scaler = StandardScaler()
	# Fit scaler and PCA on the train featureset only, then reuse both on test.
	X_train_pca = pca.fit_transform(scaler.fit_transform(X_train))
	X_test_pca = pca.transform(scaler.transform(X_test))
	# Return PCA-transformed featuresets and # of components.
	return X_train_pca, X_test_pca, pca.n_components_

def train_and_predict_knn(X_train_pca, y_train, X_test_pca, n_neighbors=2):
	"""Fit a k-nearest-neighbours classifier and return its test predictions.

	Parameters:
		X_train_pca: train featureset (PCA-transformed by the caller).
		y_train: train labels; a column vector is accepted and flattened.
		X_test_pca: test featureset in the same feature space as X_train_pca.
		n_neighbors: number of neighbours to vote; defaults to 2, the value
			that used to be hard-coded.

	Returns:
		The array returned by KNeighborsClassifier.predict for X_test_pca.
	"""
	knn = KNeighborsClassifier(n_neighbors=n_neighbors)
	# Flatten the labels to 1-D, matching how the rforest and svm models
	# pass them (y_train.ravel()).
	knn.fit(X_train_pca, np.ravel(y_train))
	return knn.predict(X_test_pca)

def generate_predictions_pcaknn(mode_value, mode):
	"""Train the PCA-KNN model and write its test predictions to a CSV file.

	The file is written under ./predictions/pcaknn/ (created if missing).
	Its name records the number of components kept and, in "ŒΩ" mode, the
	requested variance ratio as well. The elapsed time is logged and printed.

	Parameters:
		mode: "ŒΩ" (variance ratio) or "œÅ" (number of components).
		mode_value: for "ŒΩ", a ratio with 0 < mode_value <= 1; for "œÅ", an
			int greater than 1.

	Raises:
		ValueError: if mode or mode_value fails the validation above.
	"""
	# Validate mode and mode value before any work is timed.
	ratio_ok = mode == "ŒΩ" and 0 < mode_value <= 1
	count_ok = mode == "œÅ" and mode_value > 1 and isinstance(mode_value, int)
	if not (ratio_ok or count_ok):
		raise ValueError("Mode must either be ŒΩ with value between 0 and 1, or œÅ with mode_value as an integer greater than 1.")

	started = time.time()
	X_train_pca, X_test_pca, n_components = apply_pca(mode_value, mode)
	out_dir = "./predictions/pcaknn/"
	os.makedirs(out_dir, exist_ok=True)
	stem = f"pcaknn(ŒΩ={mode_value},œÅ={n_components})" if mode == "ŒΩ" else f"pcaknn(œÅ={n_components})"
	file_name = os.path.join(out_dir, stem + ".csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": train_and_predict_knn(X_train_pca, y_train, X_test_pca)
	})
	submission.to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Log and print the same success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)
# generate_predictions_pcaknn(5000, "œÅ")

## Task 3: Other Models
### Decision Tree Model (dtree)

In [20]:
def generate_predictions_dtree(ùíπ, ùìà, ‚Ñì):
	start_time = time.time()
	model = DecisionTreeClassifier(max_depth=ùíπ, min_samples_split=ùìà, min_samples_leaf=‚Ñì)
	model.fit(X_train, y_train)
	predictions = model.predict(X_test)
	os.makedirs("./predictions/dtree/", exist_ok=True)
	file_name = os.path.join("./predictions/dtree/", f"ùíπ={ùíπ},ùìà={ùìà},‚Ñì={‚Ñì}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_dtree(ùíπ=50, ùìà=10, ‚Ñì=2)

### Random Forest Model (rforest)

In [21]:
# Train RandomForest classifier, make predictions, and save predictions to CSV file.
def train_and_predict_rforest(ùíπ, ùìà, ‚Ñì, Œ∑):
	start_time = time.time()
	rf = RandomForestClassifier(max_depth=ùíπ, min_samples_split=ùìà, min_samples_leaf=‚Ñì, n_estimators=Œ∑, random_state=42)
	rf.fit(X_train, y_train.ravel())
	predictions = rf.predict(X_test)
	os.makedirs("./predictions/rforest/", exist_ok=True)
	file_name = os.path.join("./predictions/rforest/", f"ùíπ={ùíπ},ùìà={ùìà},‚Ñì={‚Ñì},Œ∑={Œ∑}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# train_and_predict_rforest(ùíπ=10000, ùìà=10, ‚Ñì=2, Œ∑=100)

### L2 Adagrad Mini-Batch Gradient Descent Logistic Regression Model (l2amgdlogreg)

In [22]:
def gradients_l2amgdlogreg(X, y, yÃÇ, w, Œª):
	"""Return the L2-regularised gradients (dw, db) of the loss for one batch.

	Parameters:
		X: 2-D batch of features, one row per data point.
		y: labels for the batch.
		yÃÇ: model outputs for the batch, same shape as y.
		w: current weight vector; only used for the penalty term.
		Œª: L2 regularisation parameter.

	Returns:
		dw: X transposed times the residual (yÃÇ - y), scaled by 1/m, plus (Œª/m) * w.
		db: sum of the residual, scaled by 1/m (the bias is not regularised).
	where m is the number of rows of X.
	"""
	n_rows, _ = X.shape
	residual = yÃÇ - y
	data_term = 1/n_rows * (X.T @ residual)
	penalty_term = (Œª/n_rows) * w
	dw = data_term + penalty_term
	db = 1/n_rows * np.sum(residual)
	return dw, db

# Return w and b from L2 adagrad mini-batch gradient descent on X_train and y_train, for batch size Œ≤, epochs œÑ, initial learning rate Œ±, and regularisation Œª.
def train_l2amgdlogreg(Œ≤, œÑ, Œ±, Œª=0, Œµ=1e-8):
	m, n = X_train.shape
	# Initialise w to 0 vector and b to 0.
	w, b = np.zeros((n, 1)), 0
	# Initialise gradient accumulation variables for Adagrad.
	sdw, sdb = np.zeros((n, 1)), 0
	# For each epoch:
	for epoch in range(œÑ):
		# For every batch within epoch:
		for i in range(0, m, Œ≤):
			X_batch, y_batch = X_train[i:i+Œ≤], y_train[i:i+Œ≤]
			yÃÇ = œÉ(np.dot(X_batch, w) + b)
			dw, db = gradients_l2amgdlogreg(X_batch, y_batch, yÃÇ, w, Œª)
			sdw += dw ** 2
			sdb += db ** 2
			w -= (Œ± / (np.sqrt(sdw) + Œµ)) * dw
			b -= (Œ± / (np.sqrt(sdb) + Œµ)) * db
	return w, b

def predict_l2amgdlogreg(w, b):
	"""Return a list of 0/1 predictions for the module-level X_test.

	Parameters:
		w: weight vector as returned by train_l2amgdlogreg.
		b: bias as returned by train_l2amgdlogreg.

	Returns:
		A flat list of Python ints: 1 where the predicted probability is
		strictly greater than 0.5, and 0 otherwise.
	"""
	# Predicted probabilities: sigmoid of the affine score of every test row.
	yÃÇ = œÉ(np.dot(X_test, w) + b)
	# Threshold all rows in one vectorised step instead of looping in Python
	# and relying on the truth value of one-element arrays.
	return np.where(yÃÇ > 0.5, 1, 0).ravel().tolist()

# Train model, make model predictions, and save model predictions to CSV file, for some Œ≤, œÑ, Œ±, and Œª.
def generate_predictions_l2amgdlogreg(Œ≤, œÑ, Œ±, Œª=0):
	start_time = time.time()
	w, b = train_l2amgdlogreg(Œ≤, œÑ, Œ±, Œª)
	os.makedirs("./predictions/l2amgdlogreg/", exist_ok=True)
	file_name = os.path.join("./predictions/l2amgdlogreg/", f"Œ≤={Œ≤},œÑ={œÑ},Œ±={Œ±},Œª={Œª}.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predict_l2amgdlogreg(w, b)
	}).to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_l2amgdlogreg(Œ≤=128, œÑ=2000, Œ±=0.0825, Œª=0.01)

### SVM Model

In [23]:
# Train SVM classifier, make predictions, and save predictions to CSV file.
def generate_predictions_svm(Œ∫, Œª):
	start_time = time.time()
	svm_clf = SVC(kernel=Œ∫, C=Œª, random_state=42)
	svm_clf.fit(X_train, y_train.ravel())
	predictions = svm_clf.predict(X_test)
	os.makedirs("./predictions/svm/", exist_ok=True)
	file_name = os.path.join("./predictions/svm/", f"Œ∫={Œ∫},Œª={Œª}.csv")
	submission = pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	})
	submission.to_csv(file_name, index=False)
	end_time = time.time()
	# Log and print success message.
	logging.info(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")
	print(f"Predictions file {file_name} generated in {end_time - start_time:.2f}s.")

# generate_predictions_svm("linear", 1.0)

### Stacking Ensemble Model (stack)

In [24]:
# Define wrapper classes for original (i.e. non-imported) models.
class CustomL2AMGDLogReg(BaseEstimator, ClassifierMixin):
	def __init__(self, Œ≤=128, œÑ=2000, Œ±=0.0825, Œª=0.01):
		self.Œ≤ = Œ≤
		self.œÑ = œÑ
		self.Œ± = Œ±
		self.Œª = Œª
		self.w = None
		self.b = None

	def fit(self, X, y):
		self.w, self.b = train_l2amgdlogreg(self.Œ≤, self.œÑ, self.Œ±, self.Œª)
		return self

	def predict(self, X):
		return predict_l2amgdlogreg(self.w, self.b)

In [25]:
def train_models(hyperparameters):
	models = []

	l2amgdlogreg_clf = CustomL2AMGDLogReg(
		Œ≤=hyperparameters["l2amgdlogreg"]["Œ≤"],
		œÑ=hyperparameters["l2amgdlogreg"]["œÑ"],
		Œ±=hyperparameters["l2amgdlogreg"]["Œ±"],
		Œª=hyperparameters["l2amgdlogreg"]["Œª"]
	)
	l2amgdlogreg_clf.fit(X_train, y_train.ravel())
	models.append(("l2amgdlogreg", l2amgdlogreg_clf))

	knn_clf = KNeighborsClassifier(
		n_neighbors=hyperparameters["knn"]["n_neighbors"]
	)
	knn_clf.fit(X_train, y_train.ravel())
	models.append(("knn", knn_clf))

	dtree_clf = DecisionTreeClassifier(
		random_state=hyperparameters["dtree"]["random_state"],
		max_depth=hyperparameters["dtree"]["ùíπ"],
		min_samples_split=hyperparameters["dtree"]["ùìà"],
		min_samples_leaf=hyperparameters["dtree"]["‚Ñì"]
	)
	dtree_clf.fit(X_train, y_train.ravel())
	models.append(("dtree", dtree_clf))

	rforest_clf = RandomForestClassifier(
		random_state=hyperparameters["rforest"]["random_state"],
		n_estimators=hyperparameters["rforest"]["Œ∑"]
	)
	rforest_clf.fit(X_train, y_train.ravel())
	models.append(("rforest", rforest_clf))

	svm_clf = SVC(
		kernel=hyperparameters["svm"]["kernel"],
		C=hyperparameters["svm"]["Œª"],
		probability=hyperparameters["svm"]["probability"],
		random_state=hyperparameters["svm"]["random_state"]
	)
	svm_clf.fit(X_train, y_train.ravel())
	models.append(("svm", svm_clf))

	return models

from sklearn.ensemble import StackingClassifier

def generate_predictions_stack(hyperparameters):
	"""Train the stacking ensemble and write its test predictions to a CSV file.

	The base models come from train_models(hyperparameters) and are combined
	by a StackingClassifier with a LogisticRegression final estimator and
	cv=5. Predictions for X_test are written to ./predictions/stack/predictions.csv
	(directory created if missing). The elapsed time is logged and printed.

	Parameters:
		hyperparameters: per-model hyperparameter dict, passed through to
			train_models unchanged.
	"""
	started = time.time()
	base_models = train_models(hyperparameters)
	# NOTE(review): with an integer cv, StackingClassifier is documented to
	# clone and refit its base estimators, so the fits already done inside
	# train_models are presumably repeated here — confirm.
	stacker = StackingClassifier(
		estimators=base_models,
		final_estimator=LogisticRegression(),
		cv=5
	)
	stacker.fit(X_train, y_train.ravel())
	predictions = stacker.predict(X_test)
	out_dir = "./predictions/stack/"
	os.makedirs(out_dir, exist_ok=True)
	file_name = os.path.join(out_dir, "predictions.csv")
	pd.DataFrame({
		"id": S_test["id"],
		"label": predictions
	}).to_csv(file_name, index=False)
	elapsed = time.time() - started
	# Log and print the same success message.
	message = f"Predictions file {file_name} generated in {elapsed:.2f}s."
	logging.info(message)
	print(message)

# Hyperparameters for each base model of the stack, keyed by model name.
hyperparameters = {
	"l2amgdlogreg": {
		"Œ≤": 128,
		"œÑ": 2000,
		"Œ±": 0.0825,
		"Œª": 0.01,
	},
	"knn": {
		"n_neighbors": 5,
	},
	"dtree": {
		"random_state": 42,
		"ùíπ": 10,
		"ùìà": 2,
		"‚Ñì": 1,
	},
	"rforest": {
		"random_state": 42,
		"Œ∑": 100,
	},
	"svm": {
		"kernel": "linear",
		"Œª": 1.0,
		"probability": True,
		"random_state": 42,
	},
}

# Runs the whole stacking pipeline as soon as this cell executes.
generate_predictions_stack(hyperparameters)

KeyboardInterrupt: 