# Code snippets

In [None]:
def np2pd_fit_transform(transformer, pd, prefix=None):
    '''Fit an sklearn-style object on a pandas dataframe and return a pandas object.

    When `transformer` has a `fit_transform` method its output is wrapped in a
    DataFrame. An axis whose length is unchanged keeps the labels of `pd`; an
    axis whose length changed is labelled `<prefix>1`, `<prefix>2`, ...
    When `transformer` has no `fit_transform`, `fit_predict` is called instead
    and its output is returned as a Series indexed like `pd`.

    Parameters:
        transformer: object exposing `fit_transform` or `fit_predict`.
        pd: the pandas dataframe handed to the transformer.
        prefix: string prepended to generated labels; None means no prefix.

    Returns:
        pandas.DataFrame (fit_transform path) or pandas.Series (fit_predict path).

    Raises:
        AssertionError: if neither the row count nor the column count of the
            transformed data matches `pd`.
    '''
    if not hasattr(transformer, 'fit_transform'):
        return pandas.Series(transformer.fit_predict(pd), index=pd.index)

    values = transformer.fit_transform(pd)
    keep_index, keep_cols = (values.shape[i] == pd.shape[i] for i in range(2))
    assert keep_index or keep_cols, \
        'transformed data matches neither the rows nor the columns of the input'

    # A missing prefix yields plain 1-based numbers as labels.
    prefix = '' if prefix is None else prefix

    def generated_labels(count):
        # Materialised as a list so pandas gets a sized, re-iterable sequence.
        return ['{}{}'.format(prefix, i) for i in range(1, count + 1)]

    return pandas.DataFrame(
        values,
        # Row labels are sized by the number of rows, column labels by the number of columns.
        index=pd.index if keep_index else generated_labels(values.shape[0]),
        columns=pd.columns if keep_cols else generated_labels(values.shape[1]))

Admixture-inspired matrix decomposition

In [None]:
def random_components(X, k):
    '''Draw a random (X.shape[0], k) matrix whose rows are scaled to sum to 1.

    Entries come from numpy.random.random, and each row is divided by its own total.
    '''
    n_rows = X.shape[0]
    raw = numpy.random.random(size=(n_rows, k))
    row_totals = raw.sum(axis=1, keepdims=True)
    return raw / row_totals
def model_error(X, components, profiles):
    '''Return the sum of squared residuals between X.values and dot(components, profiles).'''
    reconstruction = numpy.dot(components, profiles)
    residual = X.values - reconstruction
    return numpy.square(residual).sum()
def components_constraint(components):
    '''Return the sum over rows of (1 - row sum) squared; zero when every row sums to 1.'''
    row_totals = numpy.sum(components, axis=1)
    deviation = 1 - row_totals
    return numpy.sum(deviation * deviation)
def components_error(x, X, profiles, C):
    '''Objective for the components step: model error plus the row-sum penalty divided by C.

    `x` is a flat vector that is reshaped to (X.shape[0], profiles.shape[0])
    before evaluation. Because the penalty is divided by `C`, a smaller `C`
    weights the row-sum constraint more heavily.
    '''
    n_rows = X.shape[0]
    n_profiles = profiles.shape[0]
    membership = x.reshape((n_rows, n_profiles))
    fit = model_error(X, membership, profiles)
    penalty = components_constraint(membership) / C
    return fit + penalty
from scipy.optimize import minimize
def profile_error(x, X, components, C):
    '''Objective for the profiles step: the model error for a flat profile vector.

    `x` is reshaped to (components.shape[1], X.shape[1]) before evaluation.
    `C` is accepted but not used by this function.
    '''
    n_profiles = components.shape[1]
    n_features = X.shape[1]
    candidate = x.reshape((n_profiles, n_features))
    return model_error(X, components, candidate)
def block_relaxation(X, profiles0, n=10, components0=None):
    '''Alternately optimise components and profiles for `n` rounds.

    Each round first minimises `components_error` with the profiles fixed
    (every component bounded to [0, 1]), then minimises `profile_error` with
    the new components fixed (each profile entry bounded by the min/max of the
    matching column of X). Both steps use scipy's `minimize` with L-BFGS-B.

    Parameters:
        X: data to decompose; must provide `.values`, `.shape`, `.min()`,
            `.max()` and, when `profiles0` is an int, `.sample()`.
        profiles0: initial profile matrix, or an int, in which case
            `X.sample(profiles0).values` is used as the starting profiles.
        n: number of rounds.
        components0: initial components; drawn with `random_components` when None.

    Returns:
        (iterations, scores, optim_results) where `iterations` is a list of
        (profiles, components) pairs starting with the initial pair, `scores`
        holds the matching `model_error` values, and `optim_results` holds one
        (profile result, component result) pair of optimiser outputs per round.
    '''
    if isinstance(profiles0, int):
        profiles0 = X.sample(profiles0).values
    k = profiles0.shape[0]
    if components0 is None:
        components0 = random_components(X, k)

    history = [(profiles0, components0)]
    errors = [model_error(X, components0, profiles0)]
    solver_output = []

    for step in range(n):
        current_profiles, current_components = history[-1]

        # Components step: profiles are held fixed.
        print('Iter {}, optimizing components...'.format(step), end=' ', flush=True)
        component_bounds = [(0, 1)] * (k * X.shape[0])
        cmp_res = minimize(
            components_error,
            current_components.flatten(),
            args=(X, current_profiles, 0.01),
            method='L-BFGS-B',
            bounds=component_bounds,
            options={'maxiter': 100000, 'maxfun': 100000})
        print('{}'.format(cmp_res.fun))
        updated_components = cmp_res.x.reshape((X.shape[0], k))

        # Profiles step: uses the components that were just fitted.
        print('Iter {}, optimizing profiles...'.format(step), end=' ', flush=True)
        profile_bounds = list(zip(X.min(), X.max())) * k
        prf_res = minimize(
            profile_error,
            current_profiles.flatten(),
            args=(X, updated_components, 0.01),
            method='L-BFGS-B',
            bounds=profile_bounds,
            options={'maxiter': 100000, 'maxfun': 100000})
        print('{}'.format(prf_res.fun))
        updated_profiles = prf_res.x.reshape((k, X.shape[1]))

        history.append((updated_profiles, updated_components))
        errors.append(model_error(X, updated_components, updated_profiles))
        solver_output.append((prf_res, cmp_res))

    return history, errors, solver_output

# I/O

In [None]:
import gzip
import pickle
def load_pickle(path, n=None):
    '''Open `path` for binary reading, through gzip when the name ends in '.gz'.

    NOTE(review): the body of the read loop is missing from this file, so what
    is loaded and returned cannot be determined here. `n` is not used by the
    visible code -- presumably a cap on the number of objects read; TODO confirm.
    '''
    # The opener is chosen from the file extension: gzip.open for '.gz', plain open otherwise.
    with (gzip.open if path.endswith('.gz') else open)(path, 'rb') as f:
        # NOTE(review): the loop body is absent in the source; restore it before use.
        while True:
            

# Plots and graphics

In [None]:
def cat2colors(x):
    '''Map every value of `x` to a colour, one seaborn palette colour per distinct value.

    Colours are assigned in the order returned by `x.unique()`, and the result
    comes from `x.map` applied to that value-to-colour dictionary.
    '''
    categories = x.unique()
    palette = seaborn.color_palette(n_colors=len(categories))
    color_of = {category: color for category, color in zip(categories, palette)}
    return x.map(color_of)