In [None]:
#| hide
#| eval: false
! [ -e /content ] && pip install -Uqq xcube # install/upgrade xcube, only when /content exists (i.e. on colab)

In [None]:
#| default_exp utils

In [None]:
#| export
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from fastcore.all import *
from xcube.imports import *

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# utils

> Utilities for small, repetitive tasks

In [None]:
#| export
def namestr(obj, namespace=None):
    """Return the list of names in `namespace` that are bound to the object `obj`.

    Matching is done by identity (`is`), not equality, so every name that refers
    to the very same object is reported, in the iteration order of `namespace`.

    - `obj`: the object to look up.
    - `namespace`: a mapping of names to objects, e.g. `globals()` or `locals()`.
      When omitted (`None`), the global namespace of the caller is searched.

    Returns a (possibly empty) list of names.
    """
    if namespace is None:
        # Fall back to the caller's globals instead of failing on `None`.
        import inspect
        frame = inspect.currentframe()
        caller = frame.f_back if frame is not None else None
        namespace = caller.f_globals if caller is not None else {}
    return [name for name, value in namespace.items() if value is obj]

Here's an example of how `namestr` works:

In [None]:
# Bind a string to the global name `a`; `namestr` should then report exactly that
# name when searching this notebook's globals.
a = 'some_var'
test_eq(namestr(a, globals()), ['a'])

In [None]:
#| export
def list_files(startpath):
    """Print the directory tree rooted at `startpath` (simulates the linux `tree` cmd).

    Adapted from
    https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python

    `startpath` may be a `str` or any path-like object (e.g. `pathlib.Path`), with
    or without a trailing separator. Each directory is printed with a trailing `/`
    followed by the files it contains; entries are indented four spaces per level
    of depth below `startpath`. Entries appear in the order `os.walk` yields them.
    """
    startpath = os.fspath(startpath)
    for root, _dirs, files in os.walk(startpath):
        # Depth is derived from the path relative to the start directory, so a
        # trailing separator or a repeated path fragment cannot skew the indent.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator so the root's own name is printed
        print('{}{}/'.format(indent, os.path.basename(os.path.normpath(root))))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))

In [None]:
#| export
def make_paths(path, prefix=None):
    """
    With `path` as basedir, makes the `data`, `models` and `models/collab` dirs and
    returns a dictionary of relevant pathlib objects.

    - `path`: existing base directory, as a `pathlib.Path`.
    - `prefix`: string prepended to every file name (e.g. the dataset name). It is
      required; the `None` default is only kept for signature compatibility.

    Returns a dict mapping a descriptive key (e.g. `'dls_lm_path'`) to its path.
    Only the three directories are created on disk; the files are not.

    Raises `TypeError` if `prefix` is `None` (checked before anything is created).
    """
    if prefix is None:
        raise TypeError("make_paths() needs a string `prefix` to build the file names, got None")

    path_data = path/'data'
    path_model = path/'models'
    for folder in (path_model, path_data, path_model/'collab'):
        folder.mkdir(exist_ok=True)

    # The mapping is spelled out explicitly (rather than recovered by searching
    # `locals()` for each object) so the keys do not depend on local-variable order.
    return {
        'path': path,
        'path_data': path_data,
        'path_model': path_model,
        'data': path_data/f"{prefix}.csv",
        'dls_lm_path': path_model/f"{prefix}_dls_lm.pkl",
        'dls_lm_r_path': path_model/f"{prefix}_dls_lm_r.pkl",
        'dls_lm_vocab_path': path_model/f"{prefix}_dls_lm_vocab.pkl",
        'dls_lm_vocab_r_path': path_model/f"{prefix}_dls_lm_vocab_r.pkl",
        'lm_path': path_model/f"{prefix}_lm.pth",
        'lm_r_path': path_model/f"{prefix}_lm_r.pth",
        'lm_finetuned_path': path_model/f"{prefix}_lm_finetuned.pth",
        'lm_finetuned_r_path': path_model/f"{prefix}_lm_finetuned_r.pth",
        'dsets_clas_path': path_model/f"{prefix}_dset_clas.pkl",
        'dsets_clas_r_path': path_model/f"{prefix}_dset_clas_r.pkl",
        'dls_clas_path': path_model/f"{prefix}_dls_clas.pkl",
        'dls_clas_r_path': path_model/f"{prefix}_dls_clas_r.pkl",
        'clas_path': path_model/f"{prefix}_clas.pth",
        'clas_r_path': path_model/f"{prefix}_clas_r.pth",
        'collab_bootst_path': path_model/f"{prefix}_tok_lbl_info.pkl",
        'collab_data_path': path_data/f"{prefix}_tok_lbl.ft",
        'collab_tok_path': path_data/f"{prefix}_tok.ft",
        'collab_lbl_path': path_data/f"{prefix}_lbl.ft",
        'dls_collab_path': path_model/f"{prefix}_dls_collab.pkl",
        'dls_learn_rank_path': path_model/f"{prefix}_dls_learn_rank.pkl",
        'collab_path': path_model/'collab'/f"{prefix}_collab.pth",
    }

In [None]:
# Build the path layout inside a throwaway directory, create every returned path
# on disk, then print the resulting tree.
with tempfile.TemporaryDirectory() as tempdirname:
    print(f"created temporary dir: {tempdirname}")
    _paths = make_paths(Path(tempdirname), "mimic3-9k")
    # `make_paths` only creates the directories; `touch` creates the (empty) files
    for v in _paths.values(): v.touch()
    list_files(tempdirname)

created temporary dir: /tmp/tmphq0qgtsa
tmphq0qgtsa/
    data/
        mimic3-9k_tok.ft
        mimic3-9k_lbl.ft
        mimic3-9k.csv
        mimic3-9k_tok_lbl.ft
    models/
        mimic3-9k_dls_clas.pkl
        mimic3-9k_dls_lm.pkl
        mimic3-9k_lm_r.pth
        mimic3-9k_lm_finetuned_r.pth
        mimic3-9k_tok_lbl_info.pkl
        mimic3-9k_dls_lm_vocab_r.pkl
        mimic3-9k_dls_collab.pkl
        mimic3-9k_clas.pth
        mimic3-9k_dset_clas.pkl
        mimic3-9k_dls_lm_vocab.pkl
        mimic3-9k_dls_learn_rank.pkl
        mimic3-9k_dls_lm_r.pkl
        mimic3-9k_dset_clas_r.pkl
        mimic3-9k_lm.pth
        mimic3-9k_clas_r.pth
        mimic3-9k_lm_finetuned.pth
        mimic3-9k_dls_clas_r.pkl
        collab/
            mimic3-9k_collab.pth


In [None]:
#| export
def plot_hist(data, x_label=None, y_label=None, title="Histogram"):
    """Draw a histogram of `data` on the current matplotlib figure.

    - `x_label`, `y_label`: axis labels; an axis is left unlabelled when `None`.
    - `title`: title of the plot.

    Horizontal grid lines are drawn in blue and the top of the y-axis is set to
    the next multiple of 10 strictly above the tallest bin.
    """
    counts, _edges, _patches = plt.hist(data)
    plt.grid(axis='y', color='b')
    # (switch to a log scale here with plt.yscale('log') if the counts are very skewed)
    for set_label, text in ((plt.xlabel, x_label), (plt.ylabel, y_label)):
        if text is not None:
            set_label(text)
    tallest = counts.max()
    if tallest % 10:
        # round up to the next multiple of 10
        top = np.ceil(tallest / 10) * 10
    else:
        # already a multiple of 10: leave a full step of headroom
        top = tallest + 10
    plt.ylim(ymax=top)
    plt.title(title)

In [None]:
#| export
def plot_reduction(X, tSNE=True, n_comps=None, perplexity=30, figsize=(6,4), random_state=None):
    """
    Runs PCA on `X`, optionally followed by tSNE, and scatter-plots the first two
    components of the reduced data.

    - `X`: data handed to `PCA.fit_transform`.
    - `tSNE`: if true, a 2-component tSNE is run on (at most) the first 50
      principal components and its embedding is what gets plotted and returned;
      otherwise the PCA projection itself is plotted and returned.
    - `n_comps`: `n_components` for the PCA.
    - `perplexity`: `perplexity` for the tSNE (only used when `tSNE` is true).
    - `figsize`: size of the matplotlib figure.
    - `random_state`: seed forwarded to the tSNE so that the embedding can be
      reproduced; the default `None` leaves it unseeded.

    Returns the reduced data and the explained variance ratio of each PCA direction.
    The plot uses columns 0 and 1 of the reduced data, so without tSNE the PCA
    must keep at least two components.
    """
    reduction = "tSNE" if tSNE else "PCA"
    pca = PCA(n_components=n_comps, svd_solver="full")
    X_red = pca.fit_transform(X)
    if tSNE:
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
        # only the leading (up to) 50 principal components are fed to tSNE
        X_red = tsne.fit_transform(X_red[:, :50])
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1,1,1)
    # draw on the axes created above rather than through pyplot's implicit state
    ax.scatter(X_red[:, 0], X_red[:, 1], marker='x')
    ax.set_xlabel("1st component")
    ax.set_ylabel("2nd component")
    ax.set_title(f"{reduction} Decomposition")
    plt.show()
    return X_red, pca.explained_variance_ratio_

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()