In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.utils import resample
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from os import mkdir
import sys
import pickle

# Location of the serialized R object (.rds) that holds the expression data.
dataset_path = "/home/colombelli/Documents/datasets/iqrSelectedGenesAllSamples.rds"
file_path = dataset_path

# Fetch R's `readRDS` through rpy2 and use it to load the object.
read_RDS = robjects.r['readRDS']
df = read_RDS(file_path)

# Convert the loaded R object into its Python counterpart using the
# pandas-aware converter.
with localconverter(robjects.default_converter + pandas2ri.converter):
    df = robjects.conversion.rpy2py(df)

    
import random
import pandas as pd
import numpy as np

class StratifiedKFold:
    """Partition the rows of a labelled DataFrame into ``k`` stratified folds.

    Every class found in ``class_column_name`` is spread over the ``k`` folds
    as evenly as possible. With ``undersampling=True`` each class contributes
    only as many samples as the smallest class has, so the resulting folds
    are class-balanced; otherwise every sample is kept.

    Parameters
    ----------
    seed : value accepted by ``random.seed``
        Seeds the module-level ``random`` generator used for all shuffling
        and sampling done here.
    dataframe : pandas.DataFrame
        Data to split. Folds hold labels taken from ``dataframe.index``.
    class_column_name : str
        Name of the column holding the class labels.
    k : int
        Number of folds.
    undersampling : bool, default True
        Whether to randomly undersample every class down to the size of the
        smallest class.

    Attributes
    ----------
    classes : array of the distinct class labels (order of first appearance).
    class_counts : dict mapping class label -> number of samples.
    minority_count : number of samples in the smallest class.
    folds : list of ``k`` lists of index labels, one list per fold.
    """

    def __init__(self, seed, dataframe, class_column_name, k, undersampling=True):
        # Seeds the global RNG: every shuffle/sample below goes through the
        # module-level `random` functions.
        random.seed(seed)

        self.df = dataframe
        # The attribute keeps its historical spelling ("coloumn") so existing
        # code that reads it keeps working.
        self.class_coloumn_name = class_column_name
        self.k = k
        self.undersampling = undersampling

        self.classes = self.df[self.class_coloumn_name].unique()
        self.class_counts = self.df[self.class_coloumn_name].value_counts().to_dict()
        # Smallest class *size*. Taking min() over the dict itself would pick
        # the smallest class *label* and return that label's count, which is
        # only the minority by coincidence.
        self.minority_count = min(self.class_counts.values())

        self.folds = self.__get_folds()   # a list of lists of index labels, one per fold
        self.__shuffle_each_fold()

    def __get_folds(self):
        """Build the ``k`` folds, processing one class at a time."""
        final_folds = [[] for _ in range(self.k)]
        for df_class in self.classes:

            class_indexes = self.df.loc[self.df[self.class_coloumn_name] == df_class].index.to_list()
            amount_per_fold = self.class_counts[df_class] // self.k

            random.shuffle(class_indexes)
            current_class_folds = [[] for _ in range(self.k)]

            # Give every fold the same base number of samples of this class...
            for class_fold in current_class_folds:
                self.__get_random_samples(class_fold, class_indexes, amount_per_fold)

            # ...then hand the leftovers (fewer than k) to the smallest folds.
            self.__distribute_remaining_samples(amount_per_fold, current_class_folds, final_folds, class_indexes)
            if self.undersampling:
                self.__random_undersample(final_folds, current_class_folds)
            else:
                self.__append_in_final_folds(final_folds, current_class_folds)

        return final_folds

    def __get_random_samples(self, class_fold, samples, amount):
        """Move ``amount`` items from the end of ``samples`` into ``class_fold``.

        ``samples`` is mutated (items are popped). It was shuffled by the
        caller, so taking from the end is a random draw.
        """
        for _ in range(amount):
            class_fold.append(samples.pop())
        return

    def __distribute_remaining_samples(self, current_amount, current_folds, final_folds, class_indexes):
        """Hand out the leftover samples of the current class, one at a time.

        Each leftover goes to the fold whose projected size (samples already
        in ``final_folds`` plus ``current_amount``) is currently smallest.
        ``class_indexes`` is emptied in the process.
        """
        len_folds = np.array([len(x)+current_amount for x in final_folds])
        while class_indexes:
            fold_with_less_samples = len_folds.argmin()
            current_folds[fold_with_less_samples].append(class_indexes.pop())
            len_folds[fold_with_less_samples] += 1

        return

    def __random_undersample(self, final_folds, current_class_folds):
        """Append a random subset of the current class to every final fold.

        Across all folds exactly ``self.minority_count`` samples of the class
        are kept: ``minority_count // k`` per fold, with the remainder going
        to the folds that are currently smallest.
        """
        base_per_fold = self.minority_count // self.k
        remaining_samples = self.minority_count - (self.k * base_per_fold)
        samples_per_fold = [base_per_fold for _ in range(self.k)]

        len_folds = np.array([len(x)+base_per_fold for x in final_folds])
        for _ in range(remaining_samples):
            fold_with_less_samples = len_folds.argmin()
            samples_per_fold[fold_with_less_samples] += 1
            len_folds[fold_with_less_samples] += 1

        for i, amount in enumerate(samples_per_fold):
            final_folds[i] = final_folds[i] + \
                            random.sample(current_class_folds[i], amount)
        return

    def __append_in_final_folds(self, final_folds, current_class_folds):
        """Append every sample of the current class to its final fold."""
        for i, samples in enumerate(current_class_folds):
            final_folds[i] = final_folds[i] + samples
        return

    def __shuffle_each_fold(self):
        """Shuffle each fold in place so classes are interleaved within it."""
        for fold in self.folds:
            random.shuffle(fold)
        return

    def split(self):
        """Yield one ``(train_set, test_set)`` pair per fold.

        ``test_set`` is the fold itself (the same list object stored in
        ``self.folds``); ``train_set`` is a new list concatenating all the
        other folds. Both contain index labels of ``self.df``.
        """
        for i, fold in enumerate(self.folds):

            test_set = fold
            train_set = [item for j,sublist in enumerate(self.folds) if j!=i for item in sublist]
            yield (train_set, test_set)

            
# Variant that keeps every sample instead of undersampling:
# skfold = StratifiedKFold(42, df, 'class', 10, undersampling=False)
skfold = StratifiedKFold(seed=42, dataframe=df, class_column_name='class', k=10)

# Walk through every split; once the loop ends, `train` and `test` hold the
# index labels of the last (train, test) pair produced.
for tr, tst in skfold.split():
    train, test = tr, tst

In [64]:
from collections import defaultdict 
from operator import itemgetter

def train_feature_value(X, y_true, feature, value):
    """Find the majority class among samples where ``feature`` equals ``value``.

    Parameters
    ----------
    X : sequence of samples
        Each sample must support ``sample[feature]`` (e.g. rows of a 2-D
        array, or a list of lists).
    y_true : sequence
        Class label of each sample, paired with ``X`` position by position.
    feature : index into a sample
        Which feature to look at.
    value :
        Only samples with ``sample[feature] == value`` are counted.

    Returns
    -------
    (most_frequent_class, error)
        ``most_frequent_class`` is the label seen most often among the
        matching samples (on a tie, the label encountered first wins).
        ``error`` is the number of matching samples with any other label.

    Raises
    ------
    IndexError
        If no sample has ``value`` for ``feature``.
    """
    # How often each class occurs among the samples carrying this value.
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1

    if not class_counts:
        # Same exception type the previous sorted(...)[0] lookup produced,
        # but with a message that says what went wrong.
        raise IndexError(
            "no sample has value %r for feature %r" % (value, feature))

    # max() returns the first maximal item, i.e. the first-seen class on ties.
    most_frequent_class, best_count = max(class_counts.items(), key=itemgetter(1))

    # Every matching sample outside the majority class is a misclassification.
    error = sum(class_counts.values()) - best_count

    return most_frequent_class, error

In [75]:
def train_oneR(X, y_true, feature):
    """Train a OneR rule on a single feature.

    For every distinct value the feature takes in ``X``, the rule predicts
    the class seen most often among the samples with that value (on a tie,
    the class encountered first wins).

    Parameters
    ----------
    X : 2-D array
        Must expose ``.shape`` as ``(n_samples, n_features)`` and support
        ``X[:, feature]``.
    y_true : sequence
        Class label of each row of ``X``.
    feature : int
        Column index; must satisfy ``0 <= feature < n_features``.

    Returns
    -------
    (predictors, total_error)
        ``predictors`` maps each feature value to its predicted class.
        ``total_error`` is the number of training samples the rule
        misclassifies.

    Notes
    -----
    Counts are gathered in a single pass over the column instead of
    rescanning all samples once per distinct value, which matters for
    continuous features where almost every value is unique.
    """
    # Check that variable is a valid number
    _, n_features = X.shape
    assert 0 <= feature < n_features

    # value -> {class -> count}, filled in one pass over the column.
    counts_by_value = {}
    for current_value, y in zip(X[:, feature], y_true):
        class_counts = counts_by_value.setdefault(current_value, {})
        class_counts[y] = class_counts.get(y, 0) + 1

    predictors = dict()
    total_error = 0
    for current_value, class_counts in counts_by_value.items():
        # max() over a dict returns the first maximal key in insertion
        # order, i.e. the first-seen class on ties.
        most_frequent_class = max(class_counts, key=class_counts.get)
        predictors[current_value] = most_frequent_class
        # Samples with this value that belong to any other class are errors.
        total_error += sum(class_counts.values()) - class_counts[most_frequent_class]

    return predictors, total_error

In [12]:
from sklearn.model_selection import train_test_split

In [65]:
# Feature matrices: every column except the last one, for the rows selected
# by the `train` / `test` index labels.
xtr = np.array(df.loc[train].iloc[:, :-1])
xts = np.array(df.loc[test].iloc[:, :-1])

In [66]:
# Target vectors: the last column only, flattened to 1-D, for the rows
# selected by the `train` / `test` index labels.
ytr = np.array(df.loc[train].iloc[:, -1:]).flatten()
yts = np.array(df.loc[test].iloc[:, -1:]).flatten()

In [67]:
# Fit a OneR rule on every feature column of the training matrix, keeping
# each feature's value -> class mapping and its total training error.
all_predictors, errors = {}, {}
for feature_index in range(xtr.shape[1]):
    predictors, total_error = train_oneR(xtr, ytr, feature_index)
    all_predictors[feature_index], errors[feature_index] = predictors, total_error

In [85]:
# Order the (feature, error) pairs by error and keep the lowest one; the sort
# is stable, so on a tie the feature inserted first is chosen.
best_feature, best_error = sorted(errors.items(), key=lambda item: item[1])[0]

# The model is the chosen feature plus its value -> class mapping.
model = dict(feature=best_feature, predictor=all_predictors[best_feature])

In [87]:
# Display the feature-value -> predicted-class mapping stored in the model.
model['predictor']

{0.0: 1.0,
 1.0678402156858908: 0.0,
 1.0525923859502853: 1.0,
 3.5428521896882494: 1.0,
 3.244706957821207: 0.0,
 5.377084337232653: 1.0,
 6.315503013366993: 1.0,
 7.630769878875693: 1.0,
 7.414877220003843: 1.0,
 8.375481302995023: 0.0,
 8.616251511557154: 0.0,
 9.249725319454638: 0.0,
 10.000407236730474: 0.0,
 11.107756740676738: 1.0,
 9.284647240004157: 0.0,
 9.419154837638663: 1.0,
 13.1846006750577: 0.0,
 10.141707391253329: 0.0,
 14.64454204228: 1.0,
 11.039481015054887: 0.0,
 13.345894066061467: 1.0,
 14.41725112806211: 1.0,
 14.44823141173327: 1.0,
 4.260436285847533: 1.0,
 3.5304923194724296: 0.0,
 3.0556449852651113: 1.0,
 5.589308087105628: 1.0,
 8.987007266611194: 1.0,
 6.493569069048817: 1.0,
 6.177332245765314: 0.0,
 7.574320196733396: 0.0,
 7.803995729898105: 0.0,
 8.393399551292987: 0.0,
 8.543062598135876: 1.0,
 2.051229093199773: 1.0,
 8.517113624111019: 0.0,
 8.636326907992405: 0.0,
 9.26389079713497: 0.0,
 9.361281490490974: 0.0,
 8.141878653040997: 0.0,
 0.429992

In [81]:
# Display the feature-value -> predicted-class mapping learned for feature index 1.
all_predictors[1]

{0.0: 1.0,
 1.0678402156858908: 0.0,
 1.0525923859502853: 1.0,
 3.5428521896882494: 1.0,
 3.244706957821207: 0.0,
 5.377084337232653: 1.0,
 6.315503013366993: 1.0,
 7.630769878875693: 1.0,
 7.414877220003843: 1.0,
 8.375481302995023: 0.0,
 8.616251511557154: 0.0,
 9.249725319454638: 0.0,
 10.000407236730474: 0.0,
 11.107756740676738: 1.0,
 9.284647240004157: 0.0,
 9.419154837638663: 1.0,
 13.1846006750577: 0.0,
 10.141707391253329: 0.0,
 14.64454204228: 1.0,
 11.039481015054887: 0.0,
 13.345894066061467: 1.0,
 14.41725112806211: 1.0,
 14.44823141173327: 1.0,
 4.260436285847533: 1.0,
 3.5304923194724296: 0.0,
 3.0556449852651113: 1.0,
 5.589308087105628: 1.0,
 8.987007266611194: 1.0,
 6.493569069048817: 1.0,
 6.177332245765314: 0.0,
 7.574320196733396: 0.0,
 7.803995729898105: 0.0,
 8.393399551292987: 0.0,
 8.543062598135876: 1.0,
 2.051229093199773: 1.0,
 8.517113624111019: 0.0,
 8.636326907992405: 0.0,
 9.26389079713497: 0.0,
 9.361281490490974: 0.0,
 8.141878653040997: 0.0,
 0.429992