In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.utils import resample
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from os import mkdir
import sys
import pickle

# Location of the serialized R dataset (.rds); both names stay bound to the same path.
dataset_path = "/home/colombelli/Documents/datasets/iqrSelectedGenesAllSamples.rds"
file_path = dataset_path

# Look up R's `readRDS` through rpy2 and use it to read the file.
read_RDS = robjects.r['readRDS']
df = read_RDS(file_path)

# Convert the loaded R object to its Python counterpart with the pandas
# conversion rules stacked on top of rpy2's default ones.
pandas_rules = robjects.default_converter + pandas2ri.converter
with localconverter(pandas_rules):
    df = robjects.conversion.rpy2py(df)

    
import random
import pandas as pd
import numpy as np

class StratifiedKFold:
    """Split a labelled DataFrame into ``k`` class-stratified folds.

    The samples of every class are shuffled and dealt evenly across the
    folds, so each fold keeps approximately the class proportions of the
    input.  With ``undersampling=True`` every class contributes only as many
    samples as the least frequent class has, which balances the classes
    inside the folds.

    Parameters
    ----------
    seed
        Value handed to ``random.seed``.  Note that this seeds the
        module-level ``random`` generator, which is the generator used for
        all shuffling and sampling done here.
    dataframe : pandas.DataFrame
        Data to split; its index labels are what the folds contain.
    class_column_name
        Label of the column in ``dataframe`` that holds the class of each row.
    k : int
        Number of folds (must be at least 1).
    undersampling : bool, default True
        If true, randomly undersample every class down to the size of the
        least frequent class.

    Attributes
    ----------
    classes
        Distinct values found in the class column.
    class_counts : dict
        Mapping ``class value -> number of rows``.
    minority_count : int
        Number of rows of the least frequent class.
    folds : list of list
        ``k`` plain Python lists holding index labels of ``dataframe``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, seed, dataframe, class_column_name, k, undersampling=True):
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))

        random.seed(seed)

        self.df = dataframe
        # Historical (misspelled) attribute name, kept so existing code that
        # reads it keeps working; the correctly spelled twin is added next to it.
        self.class_coloumn_name = class_column_name
        self.class_column_name = class_column_name
        self.k = k
        self.undersampling = undersampling

        class_column = self.df[self.class_column_name]
        self.classes = class_column.unique()
        self.class_counts = class_column.value_counts().to_dict()
        # Smallest *count*.  Taking min() over the dict itself would compare
        # the class labels (keys) and pick the count of the smallest label.
        self.minority_count = min(self.class_counts.values())

        self.folds = self.__get_folds()   # k lists of index labels, one list per fold
        self.__shuffle_each_fold()

    def __get_folds(self):
        """Build and return the ``k`` folds as lists of index labels."""
        final_folds = [[] for _ in range(self.k)]
        for df_class in self.classes:
            class_mask = self.df[self.class_column_name] == df_class
            class_indexes = self.df.loc[class_mask].index.to_list()
            amount_per_fold = self.class_counts[df_class] // self.k

            random.shuffle(class_indexes)
            current_class_folds = [[] for _ in range(self.k)]

            # Equal share for every fold first ...
            for class_fold in current_class_folds:
                self.__get_random_samples(class_fold, class_indexes, amount_per_fold)

            # ... then the leftovers (fewer than k) go to the emptiest folds.
            self.__distribute_remaining_samples(amount_per_fold, current_class_folds,
                                                final_folds, class_indexes)
            if self.undersampling:
                self.__random_undersample(final_folds, current_class_folds)
            else:
                self.__append_in_final_folds(final_folds, current_class_folds)

        return final_folds

    def __get_random_samples(self, class_fold, samples, amount):
        """Move ``amount`` items from the end of ``samples`` into ``class_fold``.

        ``samples`` is expected to be shuffled already; both lists are
        modified in place.
        """
        for _ in range(amount):
            class_fold.append(samples.pop())

    def __distribute_remaining_samples(self, current_amount, current_folds, final_folds, class_indexes):
        """Hand the leftover ``class_indexes`` out one by one.

        Each leftover sample goes to the fold that is currently the smallest,
        where the size of a fold is what it already holds in ``final_folds``
        plus what the current class has given it so far.  ``class_indexes``
        is emptied and ``current_folds`` is modified in place.
        """
        len_folds = np.array([len(x) + current_amount for x in final_folds])
        while class_indexes:
            fold_with_less_samples = len_folds.argmin()
            current_folds[fold_with_less_samples].append(class_indexes.pop())
            len_folds[fold_with_less_samples] += 1

    def __random_undersample(self, final_folds, current_class_folds):
        """Append a random subset of the current class to every final fold.

        The subset sizes add up to ``minority_count``; the part that does not
        divide evenly by ``k`` is given to the currently smallest folds.
        ``final_folds`` is updated in place.
        """
        base_per_fold = self.minority_count // self.k
        remaining_samples = self.minority_count - (self.k * base_per_fold)
        samples_per_fold = [base_per_fold for _ in range(self.k)]

        len_folds = np.array([len(x) + base_per_fold for x in final_folds])
        for _ in range(remaining_samples):
            fold_with_less_samples = len_folds.argmin()
            samples_per_fold[fold_with_less_samples] += 1
            len_folds[fold_with_less_samples] += 1

        for i, amount in enumerate(samples_per_fold):
            final_folds[i] = final_folds[i] + \
                            random.sample(current_class_folds[i], amount)

    def __append_in_final_folds(self, final_folds, current_class_folds):
        """Append every sample of the current class to its final fold (no undersampling)."""
        for i, samples in enumerate(current_class_folds):
            final_folds[i] = final_folds[i] + samples

    def __shuffle_each_fold(self):
        """Shuffle each fold in place so its samples are not grouped by class."""
        for fold in self.folds:
            random.shuffle(fold)

    def split(self):
        """Yield one ``(train_set, test_set)`` pair per fold.

        ``test_set`` is the fold itself and ``train_set`` is the
        concatenation of all the other folds; both are lists of index labels.
        """
        for i, fold in enumerate(self.folds):
            test_set = fold
            train_set = [item for j, sublist in enumerate(self.folds) if j != i for item in sublist]
            yield (train_set, test_set)

In [11]:
# Stratified splitter with seed 42 and 10 folds. Undersampling is left at its
# default (enabled); pass undersampling=False to keep every sample instead.
skfold = StratifiedKFold(seed=42, dataframe=df, class_column_name='class', k=10)

In [23]:
# Walk through every (train, test) split. `train` is overwritten on each pass,
# so once the loop ends it holds the training index labels of the LAST split
# only; the test labels (`tst`) are not used here.
for tr, tst in skfold.split():
    train = tr

In [25]:
# Rows of `df` selected by the index labels stored in `train`.
dff = df.loc[train]

In [26]:
# Extend the module search path with a local directory (presumably the checkout
# that provides the `geode` module imported further down - confirm).
import sys
# Insert at position 1; position 0 is the script path (or '' in a REPL).
sys.path.insert(1, "/home/colombelli/Documents/bioinformatics-ml/Experiments/python-experiments/geode/geode")

In [49]:
from geode import *
import numpy as np
from pprint import pprint

In [88]:
# Every column except the last one is taken as a gene name.
# NOTE(review): this assumes 'class' is the last column of `dff` - confirm.
genes = list(dff.columns[:-1])

# Class values cast to int and shifted up by one.
class_values = np.array(dff['class'])
labels = list(class_values.astype('int') + 1)

# Same columns as `genes`, transposed: one row per gene, one column per sample.
mat = dff.iloc[:, :-1].T.to_numpy()

In [89]:
# Run geode's `chdir` on the gene-by-sample matrix, the per-sample labels and
# the gene names. NOTE(review): `calculate_sig=0` presumably disables the
# significance estimation (which would make `nnull` irrelevant) - confirm
# against the geode documentation.
chdir_res = chdir(mat, labels, genes, calculate_sig=0, nnull=100)

In [116]:
# Collect the gene name (element 1 of every result entry) together with its
# 1-based position in `chdir_res`.
data = {'gene': [], 'rank': []}
for position, entry in enumerate(chdir_res, start=1):
    data['gene'].append(entry[1])
    data['rank'].append(position)

In [121]:
# Ranking table: one row per gene, indexed by gene name, with 'rank' as its only column.
pd.DataFrame({'rank': data['rank']}, index=pd.Index(data['gene']))

Unnamed: 0,rank
MIR4508,1
GSTT1,2
CHIT1,3
COL10A1,4
PENK,5
...,...
PROM2,14017
KLF5,14018
OSCP1,14019
NOTCH1,14020
