# Assignment 3

In [1]:
from ISLP.models import (ModelSpec as MS)
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis)
import statsmodels.api as sm

In [2]:
def gen_data(n: int, p: int = 15, rng=None) -> pd.DataFrame:
    """
    Simulate a two-class data set of multivariate normal predictors.

    Class 1 is centred at 3 and class 2 at 2 in every coordinate. Both classes
    share one covariance matrix with 1.2 on the diagonal and 0.2 elsewhere.

    Parameters
    ----------
    n : int
        Total number of rows. Class 1 receives ``n // 2`` rows and class 2
        receives the remainder, so an odd ``n`` still yields exactly ``n`` rows.
    p : int, default 15
        Number of predictor columns.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global ``np.random`` state
        is used, so ``np.random.seed(...)`` controls reproducibility.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with predictor columns ``x1`` ... ``x<p>`` followed by the
        label column ``y`` (1 for the first block of rows, 2 for the second).
    """
    n = int(n)
    n1 = n // 2
    # Give the leftover row (odd n) to class 2 instead of dropping it.
    n2 = n - n1

    sampler = np.random if rng is None else rng

    # Identity plus a constant: variance 1.2, covariance 0.2 between predictors.
    cov_1 = np.eye(p) + 0.2

    x_class_1 = sampler.multivariate_normal(mean = np.repeat(3, p),
                                            cov = cov_1,
                                            size = n1)
    x_class_2 = sampler.multivariate_normal(mean = np.repeat(2, p),
                                            cov = cov_1,
                                            size = n2)

    x = np.concatenate([x_class_1, x_class_2])
    y = np.repeat([1, 2], [n1, n2])

    x_col_names = ["x%s" % x_enum for x_enum in range(1, p + 1)]
    df = pd.DataFrame(x,
                      columns = x_col_names)
    df['y'] = y

    return df


def lda_fit_score(training: pd.DataFrame,
                  testing: pd.DataFrame,
                  X_start_idx: int = 0,
                  X_end_idx: int = 3,
                  y_idx: int = -1):
    """
    Train an LDA classifier on ``training`` and report its score on ``testing``.

    Predictors are the columns at positions ``X_start_idx:X_end_idx`` and the
    response is the column at position ``y_idx``; the same positions are used
    for both frames. Returns whatever ``LinearDiscriminantAnalysis.score``
    yields for the test rows.
    """
    # One slice object keeps the train and test predictor selections in sync.
    predictor_cols = slice(X_start_idx, X_end_idx)

    train_X = training.iloc[:, predictor_cols]
    train_y = training.iloc[:, y_idx]
    test_X = testing.iloc[:, predictor_cols]
    test_y = testing.iloc[:, y_idx]

    classifier = LinearDiscriminantAnalysis()
    classifier.fit(X = train_X, y = train_y)

    return classifier.score(X = test_X, y = test_y)


def log_reg_accuracy(training: pd.DataFrame,
                     testing: pd.DataFrame,
                     X_start_idx: int = 0,
                     X_end_idx: int = 3,
                     y_label: str = 'y'):
    """
    Fit a logistic regression on ``training`` and return its test accuracy.

    The returned value is the plain fraction of rows in ``testing`` whose
    predicted label equals the true label (not a class-balanced accuracy).

    Parameters
    ----------
    training, testing : pandas.DataFrame
        Frames holding the predictor columns and the label column ``y_label``.
    X_start_idx, X_end_idx : int
        Positional column range ``X_start_idx:X_end_idx`` of ``training`` that
        selects the predictors; the same column names are used for ``testing``.
    y_label : str
        Name of the label column. Labels are expected to be 1 and 2; label 2
        is modelled as the "success" class.

    Returns
    -------
    float
        Share of test rows classified correctly at a 0.5 probability cut-off.
    """
    predictor_names = training.columns[X_start_idx:X_end_idx]

    # Fit the design spec on the training data only, then reuse that fitted
    # spec for the test rows so both design matrices share the same columns.
    logreg_design = MS(predictor_names)
    logreg_X = logreg_design.fit_transform(training)
    testing_set_X = logreg_design.transform(testing)

    logreg_y = training[y_label] == 2
    logreg_glm = sm.GLM(logreg_y,
                        logreg_X,
                        family = sm.families.Binomial())
    logreg_results = logreg_glm.fit()
    logreg_probs = logreg_results.predict(exog = testing_set_X)

    # Probability of class 2 above 0.5 -> predict 2, otherwise predict 1.
    logreg_labels = np.where(np.asarray(logreg_probs) > 0.5, 2, 1)
    logreg_accuracy = np.mean(logreg_labels == testing[y_label].to_numpy())

    return logreg_accuracy

### Ex. 1

We would expect the generative classifier, LDA, to work better for small $N$, because it relies on assumptions about the class-conditional distributions (Gaussian classes with a shared covariance matrix) that let it predict well even in the absence of a large $N$. Because it leans on these assumptions rather than on the data alone, it has higher bias but lower variance. On the other hand, once the training set is large, logistic regression would perform better, as it has enough data to estimate its coefficients with smaller standard errors. This in turn produces a model with lower bias but higher variance.

### Ex. 2

In [3]:
# Seed NumPy's global RNG (which gen_data draws from) so that a fresh
# Restart & Run All reproduces the same simulated accuracies.
SEED = 2024
np.random.seed(SEED)

small_set_model_lda_accuracies = []
large_set_model_lda_accuracies = []
small_set_model_logreg_accuracies = []
large_set_model_logreg_accuracies = []

# Number of Monte Carlo replicates averaged in the final table.
n_iter = 1000

for _ in range(n_iter):
    # Each replicate draws a small training set, a large training set and one
    # test set; all four fits are scored on that same test set.
    training_set_1 = gen_data(n = 50)
    training_set_2 = gen_data(n = 10000)
    testing_set = gen_data(n = 10000)

    small_set_lda_accuracy = lda_fit_score(training = training_set_1,
                                           testing = testing_set)
    small_set_model_lda_accuracies.append(small_set_lda_accuracy)

    large_set_lda_accuracy = lda_fit_score(training = training_set_2,
                                           testing = testing_set)
    large_set_model_lda_accuracies.append(large_set_lda_accuracy)

    small_logreg_accuracy = log_reg_accuracy(training = training_set_1,
                                             testing = testing_set)
    small_set_model_logreg_accuracies.append(small_logreg_accuracy)

    large_logreg_accuracy = log_reg_accuracy(training = training_set_2,
                                             testing = testing_set)
    large_set_model_logreg_accuracies.append(large_logreg_accuracy)

# Average each model/training-size combination over all replicates.
small_set_lda_mean_accuracy = np.mean(small_set_model_lda_accuracies)
large_set_lda_mean_accuracy = np.mean(large_set_model_lda_accuracies)
small_set_logreg_mean_accuracy = np.mean(small_set_model_logreg_accuracies)
large_set_logreg_mean_accuracy = np.mean(large_set_model_logreg_accuracies)

# Rows are models, columns are training-set sizes.
accuracy_data = [[small_set_lda_mean_accuracy, large_set_lda_mean_accuracy],
                 [small_set_logreg_mean_accuracy, large_set_logreg_mean_accuracy]]

accuracies = pd.DataFrame(data = accuracy_data,
                          columns = ['Small Training Set', 'Large Training Set'],
                          index = ['LDA', 'Log. Reg.'])

accuracies

Unnamed: 0,Small Training Set,Large Training Set
LDA,0.737192,0.753299
Log. Reg.,0.736378,0.753299


- For the small training set ($n=50$), we expect LDA to outperform logistic regression given that it classifies based on an assumed distribution. We see that this holds true by a very small margin, with LDA classifying with $73.7192\%$ accuracy as opposed to logistic regression's $73.6378\%$ accuracy.
- Similarly, with the large training set ($n=10000$) and two Gaussian-distributed classes that share a covariance matrix, LDA and logistic regression perform at essentially identical levels (both $75.3299\%$), with logistic regression's performance improving to meet LDA's (although both have improved slightly with the significantly larger training set).
- The relatively marginal improvements from increasing $n$, and the small differences between models, can be attributed to both classes being Gaussian with a common covariance matrix: LDA's inherent normality assumption is therefore valid, and logistic regression can easily determine the linear decision boundary between the two classes.