In [1]:
import csv
import pandas as pd

with open("../data/msigdb_gobp.tsv", "r") as f:
    reader = csv.reader(f, delimiter="\t")
    R = []
    for r in reader:
        R += [r]
df = pd.DataFrame(R, columns=["key", "feature"])



In [2]:
import numpy as np

def pairs_to_dense(pairs):
    pairs = pd.DataFrame(pairs)
    pairs_columns = list(pairs.columns)
    rows = sorted(set(pairs[pairs_columns[0]]))
    cols = sorted(set(pairs[pairs_columns[1]]))
    X = np.zeros((len(rows), len(cols)), dtype=int)
    rows_idxs = dict((k,i) for i,k in enumerate(rows))
    cols_idxs = dict((k,i) for i,k in enumerate(cols))
    for r in pairs.values:
        X[rows_idxs[r[0]], cols_idxs[r[1]]] += 1
    df = pd.DataFrame(X, columns=cols, index=rows)
    return df

dd = pairs_to_dense(df)


In [3]:
from tqdm import tqdm

def filter_underrepresented_features(df, min_freq=5):
    if min_freq < 1:
        min_freq = int(df.shape[0]*min_freq)+1
    else:
        min_freq = int(min_freq)
    columns = list(df.columns)
    keep = []
    for c in tqdm(columns):
        if np.sum(df[c]) >= min_freq:
            keep += [c]
    return df[keep]


def filter_overrepresented_features(df, max_freq=0.9):
    if max_freq <= 1:
        max_freq = int(df.shape[0]*max_freq)
    else:
        max_freq = int(max_freq)
    columns = list(df.columns)
    keep = []
    for c in tqdm(columns):
        if np.sum(df[c]) <= max_freq:
            keep += [c]
    return df[keep]


from sklearn.feature_extraction.text import TfidfTransformer

class TfidfVectorizer(object):

    def __init__(self):
        self.model = TfidfTransformer()
        
    def fit(self, data):
        self.index = list(data.index)
        self.columns = list(data.columns)
        X = np.array(data)
        self.model.fit(X)

    def transform(self, X):
        X = self.model.transform(X).todense()
        return pd.DataFrame(X, index=self.index, columns=self.columns)

In [4]:
dd = filter_underrepresented_features(dd, 5)

100%|██████████| 7656/7656 [00:01<00:00, 5416.49it/s]


In [5]:
dd = filter_overrepresented_features(dd)

100%|██████████| 7579/7579 [00:00<00:00, 11713.04it/s]


In [6]:
mdl = TfidfVectorizer()

In [7]:
mdl.fit(dd)

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

data = mdl.transform(dd)

In [16]:
from umap import UMAP

reducer = UMAP()
reducer.fit(np.array(data).T)

UMAP(tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True})

In [20]:
%matplotlib

import matplotlib.pyplot as plt

Xc = reducer.transform(np.array(data).T)

Using matplotlib backend: TkAgg


In [22]:
from griddify import Cloud2Grid

cg = Cloud2Grid(max_side=10)
cg.fit(Xc)
Xg = cg.transform(Xc, as_integers=False)

In [23]:
from griddify.plots import arrows_plot

arrows_plot(Xc, Xg, capping_distance=0.5)

<AxesSubplot:>