# Methods comparison
- SVRG (with/without outer loop)
- Katyusha (with/without outer loop)

In [39]:
from typing import Optional

import numpy as np
import pandas as pd

In [31]:
# Seed NumPy's global random generator once so that the random draws made
# later in this notebook (e.g. the shuffling for the split) are repeatable.
MANUAL_SEED = 42
np.random.seed(MANUAL_SEED)

## Data loading & preprocessing

Dataset reference can be found in README.md

In [32]:
# Read the raw dataset (path is relative to the notebook's working directory)
# and print its row count.
heart_df = pd.read_csv("../data/raw/heart_attack_dataset.csv")
print(f"{len(heart_df)=}")
# Last expression of the cell: preview of the first rows.
heart_df.head()

len(heart_df)=303


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [33]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a preprocessed copy of ``df``; the input frame is not modified.

    Two transformations are applied to the copy:

    * the ``output`` column is remapped with ``2 * label - 1``, which turns
      0/1 labels into -1/+1;
    * each of the columns ``age``, ``trtbps``, ``chol``, ``thalachh`` and
      ``oldpeak`` is divided by its own maximum value.

    All other columns are passed through unchanged.
    """
    scaled_columns = ("age", "trtbps", "chol", "thalachh", "oldpeak")

    result = df.copy()

    # Make labels be 1 or -1
    result["output"] = result["output"] * 2 - 1

    # Normalize columns: max-scaling, one column at a time
    for column in scaled_columns:
        column_max = result[column].max()
        result[column] = result[column] / column_max

    return result


# Apply the preprocessing to the raw frame; `heart_df` itself stays unchanged
# because `preprocess_dataset` works on a copy.
preprocessed_df = preprocess_dataset(heart_df)
# Last expression of the cell: preview of the first rows.
preprocessed_df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0.818182,1,3,0.725,0.413121,1,0,0.742574,0,0.370968,0,0,1,1
1,0.480519,1,2,0.65,0.443262,0,1,0.925743,0,0.564516,0,0,2,1
2,0.532468,0,1,0.65,0.361702,0,0,0.851485,0,0.225806,2,0,2,1
3,0.727273,1,1,0.6,0.41844,0,1,0.881188,0,0.129032,2,0,2,1
4,0.74026,0,0,0.6,0.62766,0,1,0.806931,1,0.096774,2,0,2,1


In [34]:
# Build the model inputs from the *preprocessed* frame, not the raw one:
# the logistic-regression derivation below requires labels in {-1, +1}, and
# the continuous features should be the max-scaled ones produced by
# `preprocess_dataset`.
values = preprocessed_df.drop(columns=["output"]).to_numpy()
targets = preprocessed_df["output"].to_numpy()
print(f"{values.shape=}")
print(f"{targets.shape=}")

values.shape=(303, 13)
targets.shape=(303,)


Train/Test split

In [35]:
# Fraction of all samples that is held out for testing.
test_ratio = 0.1

test_length = int(len(targets) * test_ratio)
train_length = len(targets) - test_length

# Shuffle the sample positions once, then cut the shuffled order in two:
# the first `train_length` positions form the train part, the rest the test part.
indices = np.random.permutation(len(targets))
train_idx = indices[:train_length]
test_idx = indices[train_length:]

train_values = values[train_idx]
test_values = values[test_idx]
train_targets = targets[train_idx]
test_targets = targets[test_idx]

In [38]:
# Sanity check: features and targets must have matching lengths in each part.
print(f"{len(train_values)=}")
print(f"{len(train_targets)=}")
print(f"{len(test_values)=}")
print(f"{len(test_targets)=}")

len(train_values)=273
len(train_targets)=273
len(test_values)=30
len(test_targets)=30


## Problems

In [6]:
class Problem:
    """Base interface for an optimisation problem.

    Both methods raise ``NotImplementedError`` here; concrete problems are
    expected to override them.
    """

    def get_value(self, *args, **kwargs):
        """Objective value hook; not implemented in the base class."""
        raise NotImplementedError()

    def get_gradient(self, *args, **kwargs):
        """Gradient hook; not implemented in the base class."""
        raise NotImplementedError()

### Binary Logistic Regression

#### Definition

The Binary Logistic Regression (BLR) problem can be defined as follows:
$$\begin{equation}
\min_{w \in \mathbb{R}^d} \frac{1}{n} \sum\limits_{i=1}^n \ell (g(w, x_i), y_i) + \frac{\lambda}{2} \| w \|^2_2,
\end{equation}
$$
where $\ell(z,y) = \ln (1 + e^{-yz})$ is the loss function, $g(w, x) = w^T x$ is the model, $w$ is the vector of model parameters, $\{x_i, y_i\}_{i=1}^n$ is the data sample of feature vectors $x_i$ and labels $y_i$, and $\lambda > 0$ is the regularization parameter.

**Important Assumption**: $y$ must take values $-1$ or $+1$.

### Insights

**Lemma 1**
Let $x \in \mathbb{R}^d$. Then $X=xx^T \succeq 0$.

**Proof**
Let $y \in \mathbb{R}^d$ be column vector. Then
$$
y^TXy = y^Txx^Ty = (x^Ty)^T(x^Ty) = \|x^Ty\|_2^2 \geq 0
$$
Therefore, $X=xx^T$ is positive semi-definite by definition

**Lemma 2**
Let $x \in \mathbb{R}^d$. Then $(x^Tx)I_n \succeq xx^T$.

**Proof**
Let us denote $X =xx^T$, $r = x^Tx = \|x\|_2^2$ and $X' = X-rI_n$.  
Now let us look at the eigenvalues of the matrix $X'$.
To find them we have to compute the determinant of $X'-\lambda I_n = X-r I_n - \lambda I_n = X-(r + \lambda)I_n$.  
So the eigenvalues of $X'$ are just the eigenvalues of $X$ minus $r$ (let us denote this fact as $eig(X')=eig(X)-r$).  
Also recall a property of symmetric matrices: a symmetric matrix is positive semi-definite if and only if all of its eigenvalues are nonnegative.

Therefore, we need to prove that the eigenvalues of $-X' = rI_n - X$ are nonnegative, which means that the eigenvalues of $X' = X - rI_n$ are non-positive.  

It is sufficient to show that the maximum eigenvalue $eig_{max}(X')$ is non-positive. We have shown that $eig(X')=eig(X)-r$,  
so $eig_{max}(X') = eig_{max}(X) - r$.  
Note that $Sum(eig(X))=Tr(X)= r$, and, by *Lemma 1*, $X \succeq 0$, so all eigenvalues of $X$ are nonnegative.  
Therefore, no eigenvalue of $X$ can exceed their sum, i.e. $eig_{max}(X) \leq r$, and $eig_{max}(X') = eig_{max}(X) - r \leq 0$.  

We have proven that $eig_{max}(X') \leq 0$, so $eig(X') \leq 0$ and $eig(-X') \geq 0$. Therefore, $-X' = (x^Tx)I_n - xx^T \succeq 0$.

Let us define the function $f$ as
$$ f = \frac{1}{n} \sum\limits_{i=1}^n \ell (g(w, x_i), y_i) + \frac{\lambda}{2} \| w \|^2_2$$

So the initial problem is minimizing $f$. Let us also use the following notations:
$$
e_i = e^{-y_iw^Tx_i} \\
h_i(w) = \ell (g(w, x_i), y_i) = ln(1+e^{-y_iw^Tx_i}) = ln(1+e_i) \\
r(w) = \frac{\lambda}{2} \| w \|^2_2 = \frac{\lambda}{2} w^Tw
$$

To compute $\nabla_w f$ and $\nabla_w^2 f$, we first need to find $\nabla h_i$, $\nabla^2 h_i$, $\nabla r$ and $\nabla^2 r$.
Note that $\nabla_w e_i = -y_ie_ix_i$.

Let us start with $h_i(w)$:
$$
\nabla h_i = \frac{-y_ie_ix_i}{1+e_i} \\
\nabla^2 h_i = \frac{1}{(1+e_i)^2} \left(-y_ix_i(-y_ie_ix_i)^T(1+e_i)-(-y_ie_ix_i)(-y_ie_ix_i)^T\right) = \\
= \frac{y_i^2e_ix_ix_i^T}{(1+e_i)^2} (1+e_i-e_i) =  \{ y_i^2 = 1 \text{ as } y_i = \pm 1 \} = \frac{e_ix_ix_i^T}{(1+e_i)^2}
$$

For $r(w)$,
$$
\nabla r = \lambda w\\
\nabla^2 r = \lambda I_n
$$

Therefore,
$$
\nabla f = \frac{1}{n} \sum\limits_{i=1}^n \nabla h_i(w) +  \nabla r(w) = \frac{1}{n} \sum\limits_{i=1}^n \frac{-y_ie_ix_i}{1+e_i} +  \lambda w\\
\nabla^2 f = \frac{1}{n} \sum\limits_{i=1}^n \nabla^2 h_i(w) + \nabla^2 r(w) =\frac{1}{n} \sum\limits_{i=1}^n \frac{e_ix_ix_i^T}{(1+e_i)^2} + \lambda I_n
$$

From Theorem 2.1.6 and Theorem 2.1.11 from Nesterov's book (check references) we know that $f$ is $\mu$-strongly convex and has $L$-Lipschitz gradient iff
$$
L  I_n \succeq \nabla^2 f \succeq \mu I_n
$$

Note that $e_i > 0$, so $\frac{1}{n} \sum\limits_{i=1}^n \frac{e_ix_ix_i^T}{(1+e_i)^2} + \lambda I_n \succeq \lambda I_n$  (using *Lemma 1*).
Therefore $\nabla^2 f \succeq \lambda I_n$ and $f$ is $\mu$-strongly convex with $\mu = \lambda$.

Let us prove that $f$ has $L$-Lipschitz gradient with $L = \lambda + \frac{1}{4n} \sum_{i=1}^n x_i^T x_i$. To do this we need to show that
$$
(\lambda + \frac{1}{4n} \sum_{i=1}^n x_i^T x_i)I_n \succeq \frac{1}{n} \sum\limits_{i=1}^n \frac{e_ix_ix_i^T}{(1+e_i)^2} + \lambda I_n \\
\Longleftrightarrow \\
(\frac{1}{4n} \sum_{i=1}^n x_i^T x_i)I_n \succeq \frac{1}{n} \sum\limits_{i=1}^n \frac{e_ix_ix_i^T}{(1+e_i)^2}
$$

Let us focus on some fixed $x_i$ and prove
$$(\frac{1}{4n} x_i^T x_i)I_n \succeq \frac{1}{n} \frac{e_ix_ix_i^T}{(1+e_i)^2}$$

By *Lemma 2*, $x_i^T x_i I_n \succeq  x_ix_i^T$, and by *Lemma 1*, $x_ix_i^T \succeq 0$. Hence it is enough to compare the scalar coefficients in front of $x_ix_i^T$ on both sides, i.e. to show
$$
\frac{1}{4n} \geq \frac{1}{n} \frac{e_i}{(1+e_i)^2} \\
\frac{1}{4} \geq  \frac{e_i}{(1+e_i)^2} \\
(1+e_i)^2 \geq 4e_i \\
(1-e_i)^2 \geq 0 \\
$$

The last statement is true for every value of $e_i$, so the inequality holds for each sample $x_i$. Summing these inequalities over $i = 1, \dots, n$ gives $L I_n \succeq \nabla^2 f$, so $f$ has $L$-Lipschitz gradient with $L = \lambda + \frac{1}{4n} \sum_{i=1}^n x_i^T x_i$.

**Results**
- $\nabla f = \frac{1}{n} \sum\limits_{i=1}^n \frac{-y_ie_ix_i}{1+e_i} +  \lambda w$
- The problem is $\mu$-strongly convex with $\mu = \lambda$
- The problem has $L$-Lipschitz gradient with $L = \lambda + \frac{1}{4n} \sum_{i=1}^n \| x_i\|^2_2$


### Code

In [12]:
class BinaryLogisticRegression(Problem):
    """L2-regularised binary logistic regression objective.

    The objective is::

        f(w) = mean_i ln(1 + exp(-y_i * w.x_i)) + lambda_term / 2 * ||w||^2

    Labels ``y_i`` are expected to be -1 or +1.

    ``get_value`` and ``get_gradient`` accept an optional ``indices``
    argument.  With ``indices=None`` the mean runs over the whole data set.
    Otherwise it runs over the selected mini-batch and is divided by the
    *batch* size, so a batch drawn with ``get_uniformly_sampled_indices`` is
    an unbiased estimate of the full-data quantity.  The regularisation term
    is always included in full.
    """

    def __init__(
        self,
        xs: np.ndarray,
        ys: np.ndarray,
        lambda_term: float,
        seed: int = MANUAL_SEED,
    ) -> None:
        """Store the data set and derive the shapes used by the solvers.

        Args:
            xs: Feature matrix with one row per sample.
            ys: Label vector with one entry per row of ``xs``.
            lambda_term: Coefficient of the L2 regularisation term.
            seed: Value passed to ``np.random.seed``.
        """
        # NOTE: this reseeds NumPy's *global* generator, which is the one
        # `get_uniformly_sampled_indices` draws from.
        np.random.seed(seed)

        self.xs = xs
        self.ys = ys
        self.lambda_term = lambda_term

        # (x_i, y_i) pairs, kept for per-sample iteration.
        self.data = list(zip(self.xs, self.ys))

        self.grad_shape = self.xs.shape[1]
        self.hes_shape = (self.xs.shape[1], self.xs.shape[1])
        self.sample_size = self.ys.shape[0]

        self.identity = np.identity(self.grad_shape)

    def _expand_dim(self, x: np.ndarray) -> np.ndarray:
        """Convert (n,) vector to (n,1)"""
        return np.expand_dims(x, axis=1)

    def _get_iterate_data(
        self, indices: Optional[list[int]] = None
    ) -> list[tuple[np.ndarray, float]]:
        """Return the ``(x, y)`` pairs at ``indices`` (all pairs if ``None``)."""
        # `is None` rather than `== None`: comparing a NumPy index array with
        # `==` is element-wise and cannot be used as an `if` condition.
        if indices is None:
            return self.data
        return [self.data[idx] for idx in indices]

    def _select(
        self, indices: Optional[list[int]] = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Return the feature rows and labels at ``indices`` as arrays."""
        if indices is None:
            return self.xs, self.ys
        # Explicit integer dtype so that an empty selection is still a valid index.
        idx = np.asarray(indices, dtype=np.intp)
        return self.xs[idx], self.ys[idx]

    def _custom_exponent(self, x: np.ndarray, y: float, w: np.ndarray) -> float:
        """Return ``exp(-y * w.x)`` for one sample (may overflow for large margins)."""
        return np.exp(-y * w.dot(x))

    def get_uniformly_sampled_indices(self, n: int = 1) -> list[int]:
        """Draw ``n`` sample indices uniformly at random, with replacement."""
        return [np.random.randint(0, len(self.data)) for _ in range(n)]

    def get_value(self, w: np.ndarray, indices: Optional[list[int]] = None) -> float:
        """Evaluate the objective at ``w``.

        Args:
            w: Parameter vector.
            indices: Optional mini-batch of sample indices; ``None`` selects
                the whole data set.

        Returns:
            Mean logistic loss over the selected samples plus the L2 term.
            For an empty selection only the L2 term is returned.
        """
        regulariser = w.dot(w) * self.lambda_term / 2

        xs, ys = self._select(indices)
        if ys.shape[0] == 0:
            return regulariser

        margins = ys * (xs @ w)
        # ln(1 + exp(-m)) == logaddexp(0, -m), which does not overflow.
        return np.logaddexp(0.0, -margins).mean() + regulariser

    def get_gradient(
        self, w: np.ndarray, indices: Optional[list[int]] = None
    ) -> np.ndarray:
        """Evaluate the gradient of the objective at ``w``.

        Args:
            w: Parameter vector.
            indices: Optional mini-batch of sample indices; ``None`` selects
                the whole data set.

        Returns:
            Mean per-sample loss gradient over the selected samples plus
            ``lambda_term * w``.  For an empty selection only the
            regularisation gradient is returned.
        """
        regulariser_grad = self.lambda_term * w

        xs, ys = self._select(indices)
        if ys.shape[0] == 0:
            return regulariser_grad

        margins = ys * (xs @ w)
        # With e = exp(-m): e / (1 + e) == 1 / (1 + exp(m)) == exp(-logaddexp(0, m)).
        # This form stays finite where the naive ratio would give inf / inf.
        coefficients = -ys * np.exp(-np.logaddexp(0.0, margins))

        return coefficients @ xs / ys.shape[0] + regulariser_grad

NameError: name 'Problem' is not defined