In [1]:
# hide
# skip
! [ -e /content ] && pip install -Uqq fastai #upgrade fastai on colab

In [2]:
# default_exp utils

In [3]:
# export
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from fastcore.all import *
from xcube.imports import *

In [4]:
# hide
from nbdev.showdoc import *

In [5]:
#hide
%load_ext autoreload
%autoreload 2

# Utils

> Utilities needed for little repetitive tasks

In [24]:
# export
def namestr(obj, namespace):
    "Returns every name in `namespace` that is bound to the very object `obj` (identity, not equality)"
    matches = []
    for key in namespace:
        # `is` on purpose: two equal-but-distinct objects must not be confused
        if namespace[key] is obj: matches.append(key)
    return matches

In [7]:
# Bind a string to the global name `a`, then check that `namestr` recovers that
# name when it searches the notebook's global namespace by object identity.
# NOTE(review): the expected list assumes no other global is bound to this same
# string object — confirm if the check ever reports extra names.
a = 'some_var'
test_eq(namestr(a, globals()), ['a'])

In [28]:
# export
def list_files(startpath):
    """Print the directory tree rooted at `startpath` (simulates the linux `tree` cmd).

    Adapted from
    https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python

    `startpath` may be a `str` or any `os.PathLike` (e.g. `pathlib.Path`), with or
    without a trailing separator. Every directory is printed as `name/` and every
    file by its name, indented four spaces per level below `startpath`.
    Directories and files are visited in sorted order so the output is stable.
    Nothing is returned; the tree is written to stdout.
    """
    # `str.replace` below the original implementation choked on Path objects
    startpath = os.fspath(startpath)
    for root, dirs, files in os.walk(startpath):
        # sorting in place also fixes the order in which `os.walk` descends
        dirs.sort()
        # depth = number of path components between `startpath` and `root`;
        # `relpath` only strips the leading prefix and ignores trailing separators
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath so that a trailing separator does not yield an empty basename
        print('{}{}/'.format(indent, os.path.basename(os.path.normpath(root))))
        subindent = ' ' * 4 * (level + 1)
        for f in sorted(files):
            print('{}{}'.format(subindent, f))

In [16]:
#export
def get_paths(path, prefix=None):
    """With `path` as basedir, make the `data` and `models` dirs and return the relevant pathlib objects inside them.

    Parameters
    ----------
    path : pathlib.Path
        Base directory. It is created (with parents) if it does not exist yet.
    prefix : str
        Dataset name used as the stem of every file name (e.g. "mimic3-9k").
        Required in practice: the `None` default is kept only for signature
        compatibility and is rejected with a `TypeError`.

    Returns
    -------
    dict
        Maps a descriptive key (`'path'`, `'path_data'`, `'path_model'`, `'data'`,
        `'dls_lm_path'`, ..., `'colab_path'`) to the corresponding path object.
        Only the two directories are created on disk; the file paths are not touched.
        Keys ending in `_r_path` are the `_r` variants of the same artefact.
    """
    # fail early, before any directory is created, with a message that names the argument
    if not isinstance(prefix, str):
        raise TypeError(f"`prefix` must be a str naming the dataset (e.g. 'mimic3-9k'), got {prefix!r}")

    path_data = path/'data'
    path_model = path/'models'

    # parents=True: a not-yet-existing `path` is created instead of raising
    path_model.mkdir(parents=True, exist_ok=True)
    path_data.mkdir(parents=True, exist_ok=True)

    # Explicit mapping (insertion-ordered) instead of reverse-looking-up variable
    # names in `locals()`, which depended on local-variable ordering.
    return {
        'path': path,
        'path_data': path_data,
        'path_model': path_model,
        'data': path_data/f"{prefix}.csv",
        'dls_lm_path': path_model/f"{prefix}_dls_lm.pkl",
        'dls_lm_r_path': path_model/f"{prefix}_dls_lm_r.pkl",
        'dls_lm_vocab_path': path_model/f"{prefix}_dls_lm_vocab.pkl",
        'dls_lm_vocab_r_path': path_model/f"{prefix}_dls_lm_vocab_r.pkl",
        'lm_path': path_model/f"{prefix}_lm.pth",
        'lm_r_path': path_model/f"{prefix}_lm_r.pth",
        'lm_finetuned_path': path_model/f"{prefix}_lm_finetuned.pth",
        'lm_finetuned_r_path': path_model/f"{prefix}_lm_finetuned_r.pth",
        # file stem is "dset_clas" (singular) on disk, key is "dsets_clas"
        'dsets_clas_path': path_model/f"{prefix}_dset_clas.pkl",
        'dsets_clas_r_path': path_model/f"{prefix}_dset_clas_r.pkl",
        'dls_clas_path': path_model/f"{prefix}_dls_clas.pkl",
        'dls_clas_r_path': path_model/f"{prefix}_dls_clas_r.pkl",
        'clas_path': path_model/f"{prefix}_clas.pth",
        'clas_r_path': path_model/f"{prefix}_clas_r.pth",
        'dls_colab_path': path_model/f"{prefix}_dls_colab.pkl",
        'colab_path': path_model/f"{prefix}_colab.pth",
    }

In [None]:
# Smoke test: build the path dictionary inside a throwaway directory, create every
# returned path on disk, and print the resulting tree.
with tempfile.TemporaryDirectory() as tempdirname:
    print(f"created temporary dir: {tempdirname}")
    _paths = get_paths(Path(tempdirname), "mimic3-9k")
    # the dict also holds the base/data/models directories, which exist by now;
    # for the remaining entries `touch` creates empty placeholder files
    for v in _paths.values(): v.touch()
    list_files(tempdirname)

In [90]:
# export
def plot_reduction(X, tSNE=True, n_comps=None, perplexity=30, figsize=(6,4), random_state=None):
    """
    PCA on `X`, optionally followed by tSNE, then a scatter plot of the first two
    components of the result.

    Parameters
    ----------
    X : array-like, one row per sample
        Data to decompose.
    tSNE : bool
        If True, a 2-component tSNE is run on (at most) the first 50 principal
        components and its embedding is what gets plotted and returned.
    n_comps : int or None
        Forwarded to `PCA(n_components=...)`. Two columns are plotted, so the
        reduced data needs at least two components when `tSNE` is False.
    perplexity : float
        Forwarded to `TSNE`.
    figsize : tuple
        Size of the matplotlib figure.
    random_state : int or None
        Forwarded to `TSNE`; pass an int to make the tSNE embedding reproducible.
        The default `None` keeps the previous (unseeded) behaviour.

    Returns
    -------
    (X_red, explained_variance_ratio)
        `X_red` is the tSNE embedding if `tSNE` else the PCA projection;
        the explained variance ratios always come from the PCA step.
    """
    reduction = "tSNE" if tSNE else "PCA"
    pca = PCA(n_components=n_comps, svd_solver="full")
    X_red = pca.fit_transform(X)
    if tSNE:
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
        # cap the tSNE input at the 50 leading principal components
        X_red = tsne.fit_transform(X_red[:, :50])
    # draw on the Axes we own rather than through pyplot's implicit "current axes"
    fig, ax = plt.subplots(figsize=figsize)
    ax.scatter(X_red[:, 0], X_red[:, 1], marker='x')
    ax.set_xlabel("1st component")
    ax.set_ylabel("2nd component")
    ax.set_title(f"{reduction} Decomposition")
    plt.show()
    return X_red, pca.explained_variance_ratio_

In [29]:
#hide
from nbdev.export import notebook2script; notebook2script()

Converted 00_utils.ipynb.
Converted 01_layers.ipynb.
Converted 02_text.models.core.ipynb.
Converted 03_text.learner.ipynb.
Converted 04_metrics.ipynb.
Converted index.ipynb.


#### Step 1: Data Preprocessing- Normalization and Scaling

In [242]:
# Convert `lbs_emb` (presumably the label embeddings — confirm) to a NumPy array
# and keep a separate copy; the copy is standardised with scikit-learn further
# down to cross-check the manual standardisation of `X`.
# NOTE(review): neither `lbs_emb` nor `to_np` is defined in the visible part of
# this notebook — confirm where they come from before a fresh-kernel run.
X = to_np(lbs_emb)
X_copy = X.copy()

Training set:
* $x^1, \cdots, x^m$ where $x^i \in \mathbb{R}^n$

In [243]:
# Per-column mean and standard deviation (reduction over axis 0, i.e. over the rows).
mu, std = X.mean(axis=0), X.std(axis=0)
# display the shapes: one statistic per column of X
mu.shape, std.shape

((400,), (400,))

In [244]:
# Standardise `X` by hand, standardise the copy with scikit-learn's StandardScaler,
# and check that both results agree up to an absolute tolerance of 1e-5.
# NOTE(review): `X` is rebound to its standardised version here, so re-running
# this cell on its own applies the old `mu`/`std` to already-standardised data.
# NOTE(review): a column with zero standard deviation would divide by zero — confirm
# the input has none.
X = (X-mu)/std
X_copy = StandardScaler().fit_transform(X_copy)
assert np.allclose(X, X_copy, atol=1e-5)

#### Step 2: Compute the Covariance Matrix:

$$
\Sigma = \frac{1}{m} \sum_{i=1}^{m} x^i {x^i}^{T}
$$

In [245]:
# m = number of rows (training examples x^i), n = number of columns (dimension of each x^i).
m, n = X.shape
m,n

(1271, 400)

In [246]:
# Covariance matrix of the standardised data, (1/m) * X^T X; the scalar 1/m is
# applied to X^T first and the product with X follows, giving an n-by-n matrix.
sigma = 1/m * X.T @ X
sigma.shape

(400, 400)

#### Step 3: Compute the Eigen Vectors of `Sigma` (using SVD):

In [247]:
# Singular value decomposition of the covariance matrix: the columns of `u` are
# used below as direction vectors and `s` holds the corresponding singular values.
u, s, vh = np.linalg.svd(sigma)
# sanity-check the shapes; the 400 is hard-coded to the column count of this dataset
test_eq((u.shape, s.shape, vh.shape), ((400, 400), (400,), (400, 400)))

#### Step 4: Take the first $k$ columns of `u` -> these are the direction vectors (or mathematically, the first $k$ eigen vectors of `sigma`)

In [248]:
# Number of components to keep (k in the text) and the matching first k columns of `u`.
n_comps = 300
u_red = u[:, :n_comps]

#### Step 5: Compute the projections, $z_i \in \mathbb{R}^k$

$z^i = u_{\textsf{red}}^Tx^i$

In [249]:
# Project every row of X onto the kept directions: row i of Z is z^i = u_red^T x^i.
Z = X @ u_red 

#### Step 6: Choosing the number of principal components:

* Total variation in the data: $\frac{1}{m} \sum_{i=1}^{m} ||x^i||^2$
* Squared projection error: $\frac{1}{m} \sum_{i=1}^{m} ||x^i - x^i_{\text{approx}}||^2$, where $x^i_{\text{approx}} = u_{\textsf{red}} z^i \in \mathbb{R}^n$ is the reconstruction of $x^i$ from its projection $z^i$
* Choose $k$ to be the smallest value such that $\frac{\frac{1}{m} \sum_{i=1}^{m} ||x^i - x^i_{\text{approx}}||^2}{\frac{1}{m} \sum_{i=1}^{m} ||x^i||^2} \leq 0.01 (\text{or } 0.05)$
* **Shortcut:** $\frac{\sum_{i=1}^k s_{ii}}{\sum_{i=1}^n s_{ii}} \geq 0.99 (\text{or } 0.95)$

In [250]:
# "Shortcut" from the text: share of the singular-value sum captured by the
# first `n_comps` components, i.e. the fraction of variance retained.
var_exp = np.sum(s[:n_comps])/np.sum(s)
var_exp

0.9851864

The first two principal components explain only ~14% of the variance (computed in the next cell), but let's cross our fingers and toss some matplotlib into the mix

In [253]:
# Same ratio as above, restricted to the first two components (the ones plotted below).
np.sum(s[:2])/np.sum(s)

0.14398894

In [None]:
# Scatter the data projected onto the first two principal directions.
# NOTE(review): both coordinates are negated, which mirrors the point cloud
# through the origin without changing its shape — presumably to match the sign
# convention of another PCA implementation; confirm.
fig, ax = plt.subplots(figsize=(10,10))
# plot on the Axes created above instead of pyplot's implicit current axes
ax.scatter(-Z[:, 0], -Z[:, 1])
# label the figure so it can be read on its own
ax.set_xlabel("1st component")
ax.set_ylabel("2nd component")
ax.set_title("PCA Decomposition")
plt.show()