# Model α

## Introduction
### Definitions
- Let `X` be an `m`-by-`n` matrix of input data, and `y` be an `m`-by-1 vector of output data, where `m` is the number of data points and `n` is the number of features, so that `X` holds the features and `y` holds the labels.
- Let `w` be an `n`-by-1 vector of weights, and `b` be the scalar bias.
- Let `dw` be the loss gradient w.r.t. `w`, and `db` be the loss gradient w.r.t. `b`.
- Let `bs` be the batch size, `τ` be the # of epochs, and `α` be the learning rate.
- Let `C` be the # of components (where `n` is the maximum possible # of components).
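- Let `ν` be the variance threshold, i.e. the fraction of the total variance (between 0 and 1) that the retained PCA components must explain.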

### Unicode Symbols
1. α (alpha)
2. τ (tau)
3. ν (nu)

### Naming Conventions
1. "Dataset" is treated as a single word. Likewise, "featureset" refers to the features of a dataset, and "labelset" refers to the labels of a dataset.

### Setup

In [1]:
# Import numpy for mathematical computation.
import numpy as np
# Import pandas for data manipulation.
import pandas as pd
# Import datetime for local time retrieval.
from datetime import datetime
# Import logging for file logging.
import logging
# Import time for duration measurement.
import time
# Import PCA and StandardScaler for PCA.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Import KNeighborsClassifier for KNN training.
from sklearn.neighbors import KNeighborsClassifier

# Route INFO-level (and higher) log records to predictions.log, prefixing each
# record with a "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(
	filename="predictions.log",
	level=logging.INFO,
	format="%(asctime)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S",
)

# Load sample submission dataset.
# sample_submission = pd.read_csv("sample_submission.csv")

# Read the raw train/test tables and their TF-IDF feature tables from the
# current working directory.
S_train = pd.read_csv("train.csv")
S_train_tfidf = pd.read_csv("train_tfidf_features.csv")
S_test = pd.read_csv("test.csv")
S_test_tfidf = pd.read_csv("test_tfidf_features.csv")

# Convert the tables into the NumPy arrays consumed by the models below.
# Labels are reshaped into an m-by-1 column vector.
y_train = S_train["label"].values.reshape(-1, 1)
# Train features drop the first two columns and test features drop the first
# one (presumably id/label columns — confirm against the CSV headers).
X_train = S_train_tfidf.iloc[:, 2:].values
X_test = S_test_tfidf.iloc[:, 1:].values

## Task 1: Logistic Regression Model

In [2]:
# Return σ(z) for some z, computed in a numerically stable way.
def σ(z):
	"""Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

	The naive formula overflows in exp(-z) for large negative z. Here the
	exponential is only ever taken of a non-positive number, so it stays in
	(0, 1] and cannot overflow.

	Parameters:
		z: scalar or array-like of real numbers.

	Returns:
		NumPy array of floats with the same shape as z (0-d for a scalar).
	"""
	z = np.asarray(z, dtype=float)
	# e = exp(-|z|) is always within (0, 1].
	e = np.exp(-np.abs(z))
	# For z >= 0: 1 / (1 + exp(-z)). For z < 0: exp(z) / (1 + exp(z)).
	return np.where(z >= 0, 1 / (1 + e), e / (1 + e))

# Return cross-entropy loss for some y and ŷ.
def loss(y, ŷ, eps=1e-15):
	"""Return the mean binary cross-entropy between labels y and predictions ŷ.

	Parameters:
		y: m-by-1 array of labels (the row count m is read from y.shape).
		ŷ: array of predicted probabilities, broadcastable against y.
		eps: predictions are clipped to [eps, 1 - eps] before taking logs.

	Returns:
		Scalar loss -1/m * Σ(y·log(ŷ) + (1 - y)·log(1 - ŷ)).
	"""
	# Compute m.
	m, _ = y.shape
	# Without clipping, ŷ exactly 0 or 1 gives log(0) = -inf, and 0 * -inf
	# turns the whole sum into nan.
	ŷ = np.clip(ŷ, eps, 1 - eps)
	return -1/m * np.sum(y * np.log(ŷ) + (1 - y) * np.log(1 - ŷ))

# Return the loss gradients (dw, db) for features X, labels y, and predictions ŷ.
def gradients(X, y, ŷ):
	"""Return the gradients of the mean cross-entropy loss w.r.t. w and b.

	Parameters:
		X: m-by-n feature matrix.
		y: labels, aligned row-wise with X.
		ŷ: predictions, same shape as y.

	Returns:
		(dw, db), where dw = 1/m * Xᵀ(ŷ - y) and db = 1/m * Σ(ŷ - y).
	"""
	n_rows, _ = X.shape
	scale = 1/n_rows
	# Prediction error per sample; both gradients are averages built from it.
	residual = ŷ - y
	weight_grad = scale * np.dot(X.T, residual)
	bias_grad = scale * np.sum(residual)
	return weight_grad, bias_grad

# Fit logistic-regression parameters w and b with mini-batch gradient descent.
def train_logistic(X, y, bs, τ, α):
	"""Return (w, b) after τ epochs of mini-batch gradient descent on X and y.

	Batches are consecutive row slices of size bs taken in the given order
	(no shuffling); the last batch of an epoch may hold fewer than bs rows.

	Parameters:
		X: m-by-n feature matrix.
		y: labels, sliced row-wise alongside X.
		bs: batch size.
		τ: number of epochs.
		α: learning rate.

	Returns:
		(w, b): n-by-1 weight vector and scalar bias.
	"""
	n_samples, n_features = X.shape
	# Start from all-zero parameters.
	w = np.zeros((n_features, 1))
	b = 0
	for _ in range(τ):
		for start in range(0, n_samples, bs):
			rows = slice(start, start + bs)
			X_rows = X[rows]
			y_rows = y[rows]
			# Forward pass on the current batch, then one descent step.
			ŷ = σ(np.dot(X_rows, w) + b)
			dw, db = gradients(X_rows, y_rows, ŷ)
			w = w - α*dw
			b = b - α*db
	return w, b

# Return list of predictions, where each prediction is 1 if the corresponding ŷ entry > 0.5, and 0 otherwise.
def predict_logistic(X, w, b):
	"""Return hard 0/1 predictions of the logistic model for every row of X.

	Parameters:
		X: feature matrix with one row per sample.
		w: weight vector.
		b: scalar bias.

	Returns:
		List of Python ints, one per row of X: 1 where σ(X·w + b) > 0.5,
		otherwise 0.
	"""
	# Compute ŷ = σ(X⋅w + b).
	ŷ = σ(np.dot(X, w) + b)
	# Threshold the whole array at once rather than looping over rows in
	# Python; ravel() flattens the column vector so the result is a flat list.
	return (ŷ > 0.5).astype(int).ravel().tolist()

# Train model, make model predictions, and save model predictions to CSV file, for some bs, τ, and α.
def generate_predictions_logistic(X_train, y_train, X_test, bs, τ, α):
	"""Train the logistic model, predict on X_test, and write the predictions to CSV.

	The output file is named after the hyper-parameters and has two columns:
	"id" (read from the module-level S_test table) and "label". The time taken
	for training, prediction, and writing is logged through the logging module.

	Parameters:
		X_train: training feature matrix.
		y_train: training labels.
		X_test: test feature matrix; its rows must line up with S_test["id"].
		bs: batch size.
		τ: number of epochs.
		α: learning rate.
	"""
	# perf_counter() is a monotonic clock meant for measuring durations, so it
	# is unaffected by system clock adjustments.
	start_time = time.perf_counter()
	w, b = train_logistic(X_train, y_train, bs, τ, α)
	file_name = f"predictions_bs={bs}_τ={τ}_α={α}.csv"
	pd.DataFrame({
		"id": S_test["id"],
		"label": predict_logistic(X_test, w, b)
	}).to_csv(file_name, index=False)
	elapsed = time.perf_counter() - start_time

	# Record which file was produced and how long it took.
	logging.info(f"Predictions file {file_name} generated in {elapsed:.2f}s.")

# Run Task 1 with batch size 128, 50 epochs, and learning rate 0.1.
generate_predictions_logistic(X_train, y_train, X_test, bs=128, τ=50, α=0.1)


## Task 2: PCA Feature Dimensionality Reduction

In [3]:
# Return PCA-transformed train and test featuresets, for some ν.
def apply_pca(X_train, X_test, ν):
	"""Standardise the featuresets and project them with PCA.

	Both the scaler and the PCA are fitted on the training features only and
	then applied unchanged to the test features.

	Parameters:
		X_train: training feature matrix.
		X_test: test feature matrix.
		ν: value forwarded as PCA(n_components=ν).
			NOTE(review): a float in (0, 1) is presumably read by scikit-learn
			as the fraction of variance to retain — confirm against the PCA docs.

	Returns:
		(X_train_pca, X_test_pca, n_components): the two transformed
		featuresets and the fitted pca.n_components_.
	"""
	scaler = StandardScaler()
	pca = PCA(n_components=ν)

	# Training data: fit and apply the scaler, then fit and apply the PCA.
	train_scaled = scaler.fit_transform(X_train)
	train_reduced = pca.fit_transform(train_scaled)

	# Test data: reuse the already-fitted scaler and PCA.
	test_scaled = scaler.transform(X_test)
	test_reduced = pca.transform(test_scaled)

	return train_reduced, test_reduced, pca.n_components_

# Return KNN predictions for PCA-transformed train and test datasets.
def train_and_predict_knn(X_train_pca, y_train, X_test_pca, n_neighbors=2):
	"""Fit a k-nearest-neighbours classifier and predict labels for the test data.

	Parameters:
		X_train_pca: PCA-transformed training features.
		y_train: training labels; a column vector is flattened before fitting.
		X_test_pca: PCA-transformed test features.
		n_neighbors: number of neighbours used by the classifier (default 2).

	Returns:
		The array returned by KNeighborsClassifier.predict for X_test_pca.
	"""
	knn = KNeighborsClassifier(n_neighbors=n_neighbors)
	# scikit-learn expects 1-D labels and emits a DataConversionWarning for a
	# column vector, so flatten the labels before fitting.
	knn.fit(X_train_pca, np.ravel(y_train))
	# Predict labels for the PCA-transformed test data.
	return knn.predict(X_test_pca)

# Train model, make model predictions, and save model predictions to CSV file, for some ν.
def generate_predictions_pcaknn(X_train, y_train, X_test, ν):
	"""Apply PCA, classify with KNN, and write the test predictions to CSV.

	The output file is named after ν and the resulting number of PCA
	components, and has two columns: "id" (read from the module-level S_test
	table) and "label". The time taken for the whole pipeline is logged
	through the logging module.

	Parameters:
		X_train: training feature matrix.
		y_train: training labels.
		X_test: test feature matrix; its rows must line up with S_test["id"].
		ν: value forwarded to apply_pca.
	"""
	# perf_counter() is a monotonic clock meant for measuring durations, so it
	# is unaffected by system clock adjustments.
	start_time = time.perf_counter()
	X_train_pca, X_test_pca, n_components = apply_pca(X_train, X_test, ν)

	file_name = f"predictions_ν={ν}_components={n_components}.csv"
	pd.DataFrame({
		"id": S_test["id"],
		"label": train_and_predict_knn(X_train_pca, y_train, X_test_pca)
	}).to_csv(file_name, index=False)
	elapsed = time.perf_counter() - start_time

	# Record which file was produced and how long it took.
	logging.info(f"Predictions file {file_name} generated in {elapsed:.2f}s.")

# Run Task 2 with ν = 0.8.
generate_predictions_pcaknn(
	X_train,
	y_train,
	X_test,
	ν=0.8,
)

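# [Cell output] Truncated trace printed when the cell above was run
# (presumably the tail of a scikit-learn warning raised while fitting):
#   return self._fit(X, y)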
