# Function definitions
This notebook preprocesses SNP data stored in HDF5 files. The output is an HDF5 file containing stratified K-fold splits for use in an ML pipeline.

In [1]:
%matplotlib inline
import numpy as np
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import sys
import qgrid
from itertools import combinations
import torch
from torch.nn.functional import one_hot
import verstack
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold

cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm

In [2]:
def print_attrs(name, obj):
    """Print ``name`` followed by one indented ``key: value`` line per attribute.

    The signature matches the callback expected by ``visititems``: ``name`` is
    printed as-is and ``obj`` only needs an ``attrs`` mapping.
    """
    print(name)
    for attr_key, attr_val in obj.attrs.items():
        line = "    %s: %s" % (attr_key, attr_val)
        print(line)

def print_h5_structure(h5_path):
    """Open the HDF5 file at ``h5_path`` read-only and print its layout.

    Every object visited in the file is passed to ``print_attrs``, which
    prints its name and attributes.
    """
    with h5py.File(h5_path, "r") as h5_file:
        h5_file.visititems(print_attrs)
        
def rSubset(arr, r):
    """Return the set of all length-``r`` combinations of ``arr`` as tuples."""
    return {combo for combo in combinations(arr, r)}

def check_equal_ids(pheno_id_ls):
    """Return True when every id collection holds the same set of ids.

    Parameters
    ----------
    pheno_id_ls : sequence of iterables
        One collection of sample ids per phenotype. Order and duplicates
        inside a collection are ignored (each one is compared as a set).

    Returns
    -------
    bool
        True if all collections contain the same ids, or if fewer than two
        collections are given; False otherwise.
    """
    id_sets = [set(ids) for ids in pheno_id_ls]
    # Set equality is transitive, so comparing each set with the first one
    # covers all pairs -- including the last entry, which the previous
    # ``range(0, len(pheno_id_ls) - 1)`` pairing never looked at.
    return all(other == id_sets[0] for other in id_sets[1:])
    
def search_for_substring(ls ,sub):
    """
    Return the strings of ``ls`` that contain ``sub``, ignoring case.

    The matches keep their original spelling and order.
    """
    def _contains_sub(text):
        return sub.lower() in text.lower()

    return list(filter(_contains_sub, ls))

def get_phenotypes_by_threshold(y_ls, pheno_names_ls, thresh):
    """Select the phenotypes that have more than ``thresh`` non-NaN values.

    Returns the positions of the selected phenotypes in ``y_ls`` and the
    names found at those positions in ``pheno_names_ls``.
    """
    # Number of measured (non-NaN) values per phenotype.
    n_measured = np.array([np.count_nonzero(~np.isnan(values)) for values in y_ls])
    idx = np.flatnonzero(n_measured > thresh)
    names = np.take(np.array(pheno_names_ls), idx)
    return idx, names

def filter_by_MAF(X, maf_thresh=0.2):
    """Drop the columns of ``X`` whose allele frequency is <= ``maf_thresh``.

    The frequency of a column is its sum divided by twice the number of rows.
    NOTE(review): this presumably assumes genotypes coded as allele counts
    (0/1/2); it is the frequency of the counted allele, not min(p, 1 - p) --
    confirm that this is intended.

    Returns a new array; ``X`` itself is not modified.
    """
    n_samples = X.shape[0]
    maf = np.sum(X, axis=0) / (2 * n_samples)
    cols_to_drop = np.flatnonzero(maf <= maf_thresh)
    return np.delete(X, cols_to_drop, axis=1)

def select_genotypes_for_phenotype(X, X_ids, y_ls, pheno_idx):
    """Keep only the samples that have a value for one phenotype.

    ``y_ls[pheno_idx]`` is the phenotype vector; entries that are NaN mark
    samples without a measurement. Returns the ids, genotype rows and
    phenotype values of the remaining samples, in that order.
    """
    y_pheno = y_ls[pheno_idx]
    has_value = ~np.isnan(y_pheno)
    return X_ids[has_value], X[has_value], y_pheno[has_value]

def encode_nucleotides(nuc_arr):
    '''
    One-hot encode an array of nucleotide labels.

    input:
    nuc_arr: array of nucleotide strings, e.g. with the shape
    (n_samples, n_features)
    output:
    nuc_one_hot: torch tensor with the shape of nuc_arr plus one trailing
    axis whose length is the number of distinct values found in nuc_arr
    unique_nuc: the sorted distinct values; position k in this array is
    channel k of the encoding

    For input containing exactly A, C, G and T this gives
    A : [1,0,0,0]
    C : [0,1,0,0]
    G : [0,0,1,0]
    T : [0,0,0,1]
    '''
    unique_nuc, codes = np.unique(nuc_arr, return_inverse=True)
    # Bring the integer codes back to the layout of the input array.
    codes = np.reshape(codes, nuc_arr.shape)
    code_tensor = torch.from_numpy(codes)
    return one_hot(code_tensor), unique_nuc

def change_encoding(X):
    """Recode ``X`` element-wise: values below 1 become 1, all others -1.

    For 0/2 coded data this maps 0 -> 1 and 2 -> -1; for 0/1 coded data it
    maps 0 -> 1 and 1 -> -1.
    """
    below_one = np.less(X, 1)
    return np.where(below_one, 1, -1)

def concat_with_ids(X_ids, X, y):
    """Prepend the sample ids as a first column to ``X``, to ``y`` and to both.

    Returns three 2-D arrays:
    ``X_exp`` with the columns (id, X...),
    ``y_exp`` with the columns (id, y) and
    ``X_y`` with the columns (id, y, X...).
    """
    # Turn the two 1-D vectors into single-column matrices.
    id_col = np.expand_dims(X_ids, axis=1)
    y_col = np.expand_dims(y, axis=1)

    X_exp = np.concatenate([id_col, X], axis=1)
    y_exp = np.concatenate([id_col, y_col], axis=1)
    X_y = np.concatenate([id_col, y_col, X], axis=1)
    return X_exp, y_exp, X_y

def make_groups(y):
    '''
    Generates groups for stratification of continuous values, so we can use
    stratified KFold.

    The values are binned with the default ``np.histogram`` binning (10
    equal-width bins between min(y) and max(y)). Each value gets the index of
    its bin, 0 for the lowest bin up to 9 for the highest; the maximum of y
    belongs to the last bin, exactly as in ``np.histogram``.

    input:
    y: array-like of finite numeric values
    output:
    bin_edges: the histogram bin edges as returned by ``np.histogram``
    groups: integer array with the shape of y holding the bin index per value
    '''
    _, bin_edges = np.histogram(y)
    # Only the interior edges are used as thresholds: values below the first
    # interior edge land in group 0 and everything at or above the last
    # interior edge (including max(y)) lands in the last group. Unlike
    # relabelling values in place with sentinel numbers, this is independent
    # of the magnitude of y and leaves the returned edges untouched.
    groups = np.digitize(y, bin_edges[1:-1])
    return bin_edges, groups.astype(int)

def concat_X_y(X_id, X, y, y_groups):
    '''
    Builds one combined 2-D array for easier train/test splitting.

    The columns of the result are, in this order: sample id, y, y group and
    then all columns of X.
    '''
    leading_cols = [np.expand_dims(vec, axis=1) for vec in (X_id, y, y_groups)]
    return np.concatenate((*leading_cols, X), axis=1)

# Load the dataset

In [3]:
# Name of the target to preprocess; it selects the input file below and is
# reused for the name of the output file.
target = "FT10_b"

# Input HDF5 file for the chosen target.
h5_path = f"./data/atwell_{target}.hdf5"
# Print the name and attributes of every object stored in the file.
print_h5_structure(h5_path)

X
X_raw
identifiers
sample_ids
y


In [4]:
# Read all datasets into memory; ids, identifiers, X and X_raw are cast to
# plain Python-friendly dtypes right after loading, y is kept as stored.
with h5py.File(h5_path, "r") as f:
    X_ids = f['sample_ids'][:].astype("int")
    Ident = f['identifiers'][:].astype("str")
    X = f['X'][:].astype(int)
    X_raw = f['X_raw'][:].astype("str")
    y = f["y"][:]

# Encode X_raw as one-hot

In [6]:
# One-hot encode the raw nucleotide matrix; unique_nuc lists the distinct
# values in the order of the one-hot channels.
X_one_hot, unique_nuc = encode_nucleotides(X_raw)

# Change Encoding of X
We need to make sure that the encoding is correct for the machine learning algorithms:
- 0 --> 1
- 2 --> -1

In [7]:
# Recode the genotype matrix: values below 1 become 1, all other values -1.
X2 = change_encoding(X)

# Stratify data 

In [13]:
# Bin the continuous target so that it can be used for stratification.
bin_edges, y_groups = make_groups(y)

# Plain-array variants of the data with the sample ids as leading column.
X_id, y_id, X_y = concat_with_ids(X_ids, X2, y)
X_full = concat_X_y(X_ids, X2, y, y_groups)

# DataFrame variant indexed by sample id: column "y", column "y_grp" and then
# one column per feature of the recoded genotype matrix.
x_df = pd.DataFrame(X2, index=X_ids)
t = pd.DataFrame({"y": y, "y_grp": y_groups}, index=X_ids)
result = pd.concat([t, x_df], axis=1)

# Save Stratified KFold as hdf5

In [21]:
# Save the nested stratified K-fold splits (encoded and one-hot data) as HDF5.
#
# Layout of the output file:
#   outerfold_<k>/innerfold_full/trn  -> train+validation part of outer fold k
#   outerfold_<k>/innerfold_full/vld  -> held-out test part of outer fold k
#   outerfold_<k>/innerfold_<j>/trn   -> training part of inner fold j
#   outerfold_<k>/innerfold_<j>/vld   -> validation part of inner fold j
# Every leaf group holds the datasets "sid", "X", "X_onehot" and "y".
def _write_split(parent, name, frame, onehot):
    """Store one data split as the subgroup ``name`` of ``parent``.

    ``frame`` is a row selection of ``result``: its index holds the sample
    ids, column 0 the target, column 1 the stratification group and the
    remaining columns the encoded features. ``onehot`` is the matching row
    selection of the one-hot data; its axes 1 and 2 are swapped before it is
    written.
    """
    grp = parent.create_group(name)
    grp.create_dataset("sid", data=np.array(frame.index))
    # Skip the two leading columns (y, y_grp); only the features go into X.
    grp.create_dataset("X", data=np.array(frame.iloc[:, 2:]))
    grp.create_dataset("X_onehot", data=np.swapaxes(onehot, 1, 2))
    grp.create_dataset("y", data=np.array(frame.iloc[:, 0]))


outer_cv = StratifiedKFold(n_splits=4)
inner_cv = StratifiedKFold(n_splits=3)
outfile = f"atwell_{target}_strat.h5"
with h5py.File(outfile, 'w') as f:
    # Both levels are stratified on the binned target ("y_grp").
    outer_splits = outer_cv.split(X=result, y=result["y_grp"])
    for outersplit_idx, (trainval_index, test_index) in enumerate(outer_splits):
        print(f"Saving Outer: {outersplit_idx}")
        trainval = result.iloc[trainval_index]
        test = result.iloc[test_index]
        X_oh_trainval, X_oh_test = X_one_hot[trainval_index], X_one_hot[test_index]

        o = f.create_group(f"outerfold_{outersplit_idx}")

        # "innerfold_full" keeps the complete outer split: everything that is
        # available for training goes to "trn", the test part to "vld".
        full_fold = o.create_group("innerfold_full")
        _write_split(full_fold, "trn", trainval, X_oh_trainval)
        _write_split(full_fold, "vld", test, X_oh_test)

        # The inner indices are positions within trainval, so the one-hot data
        # has to be taken from X_oh_trainval rather than from X_one_hot.
        inner_splits = inner_cv.split(X=trainval, y=trainval["y_grp"])
        for innersplit_idx, (train_index, val_index) in enumerate(inner_splits):
            print(f"Inner: {innersplit_idx}")
            train = trainval.iloc[train_index]
            val = trainval.iloc[val_index]

            inner_fold = o.create_group(f"innerfold_{innersplit_idx}")
            _write_split(inner_fold, "trn", train, X_oh_trainval[train_index])
            _write_split(inner_fold, "vld", val, X_oh_trainval[val_index])

Outer: 0
Inner: 0
Inner: 1
Inner: 2
Outer: 1
Inner: 0
Inner: 1
Inner: 2
Outer: 2
Inner: 0
Inner: 1
Inner: 2
Outer: 3
Inner: 0
Inner: 1
Inner: 2
