In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from os import mkdir
import sys
import pickle



def load_RDS(file_path):
    """Read an RDS file with R's ``readRDS`` and return the resulting R object.

    file_path: path of the .rds file, handed to R unchanged.
    """
    return robjects.r['readRDS'](file_path)



def pandas_to_r(df):
    """Convert a pandas object into its R counterpart.

    The conversion runs inside a local converter that combines rpy2's default
    rules with the pandas rules, so the converter is only active for the
    duration of this call.
    """
    pandas_aware_converter = robjects.default_converter + pandas2ri.converter
    with localconverter(pandas_aware_converter):
        return robjects.conversion.py2rpy(df)



def r_to_pandas(df):
    """Convert an R object into its pandas counterpart.

    The conversion runs inside a local converter that combines rpy2's default
    rules with the pandas rules, so the converter is only active for the
    duration of this call.
    """
    pandas_aware_converter = robjects.default_converter + pandas2ri.converter
    with localconverter(pandas_aware_converter):
        return robjects.conversion.rpy2py(df)


In [None]:
# Location of the .rds dataset that the following cells load and inspect.
path = "/home/colombelli/Documents/datasets/thyroid_log2.rds"

In [None]:
# Read the RDS file into an R object through rpy2.
rdf = load_RDS(path)

In [None]:
# Print, one per line, the column names as reported by R's colnames().
for name in robjects.r['colnames'](rdf):
    print(name)

In [None]:
# Convert the R object into its pandas counterpart.
pdf = r_to_pandas(rdf)

In [None]:
# Print the column labels after the conversion to pandas, for comparison with
# the names R reported above.
for col in pdf.columns: 
    print(col) 

In [1]:
# Make the local EnsembleSelector checkout importable from this notebook.
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, "/home/colombelli/Documents/bioinformatics-ml/EnsembleSelector-v2.0")

from DataManager import DataManager
from EFS import EFS
from Evaluator import Evaluator
import rpy2.robjects.packages as rpackages

# Experiment parameters handed to DataManager in the next cell.
seed = 42
num_bootstraps = 5
num_folds = 3

# Each entry: (script name, language the script was written in, .rds output name).
fs_methods = [
    ("gain-ratio", "r", "gr")
]

# Input dataset and the directory that receives the results.
dataset_path = "/home/colombelli/Documents/datasets/thyroid_log2.rds"
results_path = "/home/colombelli/Documents/bioinformatics-ml/Experiments/debug-res/"

# Load the FSelectorRcpp R package through rpy2.
rpackages.importr('FSelectorRcpp')

# Aggregation strategy name; passed for both aggregation layers when EFS is built.
aggregator = "mean"

In [2]:
# Build the DataManager from the configured paths, bootstrap/fold counts and seed.
# NOTE: per the recorded output, construction asks for confirmation on stdin
# when the results directory already exists.
dm = DataManager(results_path, dataset_path, num_bootstraps, num_folds, seed)

Creating results directory...
Given directory already created, files will be replaced.
Input c to cancel or any other key to continue... 


In [3]:
# Build the ensemble with the same aggregator for both layers and run the
# feature selection over every fold and bootstrap.
ensemble = EFS(dm, fs_methods, aggregator, aggregator)
ensemble.select_features()



################# Fold iteration: 1 #################


Bootstrap:  1 


[1] "Calculating Information Gain Ratio..."
[1] "Processing output..."
[1] "Saving ranking..."


R[write to console]: Error in saveRDS(rankDf, outputPath) : 
  argument "outputPath" is missing, with no default
Calls: <Anonymous> -> saveRDS



RRuntimeError: Error in saveRDS(rankDf, outputPath) : 
  argument "outputPath" is missing, with no default
Calls: <Anonymous> -> saveRDS


In [None]:
# Print the column labels of the DataFrame held by the DataManager.
for col in dm.pd_df.columns:
    print(col)

In [None]:
from Selector import PySelector, RSelector
from Aggregator import Aggregator
from DataManager import DataManager
import time

class EFS:
    """Two-layer ensemble feature selection driven by a DataManager.

    For every fold and every bootstrap of that fold, each feature selector is
    run and the resulting rankings are combined by the first aggregator. The
    per-bootstrap aggregations of a fold are then combined by the second
    aggregator. Both kinds of aggregated ranking are saved through the
    DataManager.
    """

    # fs_methods: a tuple (script name, language which the script was written, .rds output name)
    def __init__(self, data_manager:DataManager, fs_methods, first_aggregator, second_aggregator):
        """Store the data manager and build the selector and aggregator objects.

        data_manager: DataManager providing folds, bootstraps, data and output paths.
        fs_methods: iterable of (script name, language, .rds output name) tuples.
        first_aggregator: aggregator name used to combine the selectors' rankings.
        second_aggregator: aggregator name used to combine the bootstraps' rankings.

        Raises ValueError when an entry of fs_methods names an unsupported language.
        """
        self.dm = data_manager
        self.fs_methods = self.__generate_fselectors_object(fs_methods)
        self.fst_aggregator = Aggregator(first_aggregator, self.dm)
        self.snd_aggregator = Aggregator(second_aggregator, self.dm)


    def __generate_fselectors_object(self, methods):
        """Turn (script, language, rds_name) tuples into selector objects.

        "python" entries become PySelector instances and "r" entries become
        RSelector instances, in the order given.

        Raises ValueError for any other language: silently skipping the entry
        would leave the ensemble running with fewer selectors than requested.
        """
        selector_classes = {"python": PySelector, "r": RSelector}

        fs_methods = []
        for script, language, rds_name in methods:
            if language not in selector_classes:
                raise ValueError(
                    "Unsupported feature selector language {!r} for script {!r}; "
                    "expected 'python' or 'r'".format(language, script)
                )
            fs_methods.append(selector_classes[language](rds_name, script))

        return fs_methods


    def end_time_and_print(self, start):
        """Print the time elapsed since ``start`` (a ``time.time()`` value) as HH:MM:SS.ss."""
        end = time.time()
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)
        print("\nTime taken:")
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
        print("\n")


    def select_features(self):
        """Run every selector on every bootstrap of every fold and aggregate.

        Side effects: updates ``self.dm.current_fold_iteration``, refreshes the
        bootstraps through the DataManager, saves each aggregated ranking via
        ``self.dm.save_aggregated_ranking`` and leaves the rankings of the last
        processed bootstrap/fold in ``self.fst_layer_rankings``,
        ``self.fs_aggregation`` and ``self.snd_layer_rankings``.
        """
        for i in range(self.dm.num_folds):
            print("\n\n################# Fold iteration:", i+1, "#################")
            self.dm.current_fold_iteration = i
            self.dm.update_bootstraps()

            self.snd_layer_rankings = []
            # Each bootstrap entry is a pair; only its first element is used
            # here, as the row labels selected from the DataManager's frame.
            for j, (bootstrap, _) in enumerate(self.dm.current_bootstraps):
                print("\n\nBootstrap: ", j+1, "\n")
                start = time.time()
                output_path = self.dm.get_output_path(i, j)
                bootstrap_data = self.dm.pd_df.loc[bootstrap]

                # First layer: collect one ranking per selector so that the
                # aggregator below actually receives them.
                self.fst_layer_rankings = []
                for fs_method in self.fs_methods:
                    print("")
                    self.fst_layer_rankings.append(
                        fs_method.select(bootstrap_data, output_path)
                    )

                self.fs_aggregation = self.fst_aggregator.aggregate(self.fst_layer_rankings)
                self.dm.save_aggregated_ranking(self.fs_aggregation, output_path)
                self.snd_layer_rankings.append(self.fs_aggregation)
                self.end_time_and_print(start)

            # Second layer: combine the per-bootstrap aggregations of this fold.
            output_path = self.dm.get_output_path(fold_iteration=i)
            final_ranking = self.snd_aggregator.aggregate(self.snd_layer_rankings)
            self.dm.save_aggregated_ranking(final_ranking, output_path)

In [None]:
# Inspect the first-layer rankings currently stored on the ensemble.
ensemble.fst_layer_rankings

In [None]:
# Try a column label made of a special character; the renamed frame is only
# displayed, `pdf` itself is not reassigned.
pdf.rename({'DarkCorner': '*'}, axis='columns')

In [None]:
import urllib.parse

def alnum_encode(text):
    """Encode ``text`` using only letters, digits and underscores.

    The text is percent-encoded with no extra safe characters. The four
    characters that ``quote`` never escapes on its own ('-', '.', '_' and '~')
    are then escaped by hand, and finally every '%' is turned into '_'.
    ``alnum_decode`` reverses the mapping.

    text: the string to encode.
    Returns the encoded string; an empty string encodes to an empty string.
    """
    # The '%' -> '_' replacement must come last: the manual escapes above it
    # introduce '%' characters, and literal underscores must already have been
    # rewritten to '%5f' so that every '_' in the result marks an escape.
    return urllib.parse.quote(text, safe='')\
        .replace('-', '%2d').replace('.', '%2e').replace('_', '%5f')\
        .replace('~', '%7e')\
        .replace('%', '_')

def alnum_decode(underscore_encoded):
    """Reverse ``alnum_encode``: map every '_' back to '%' and percent-decode.

    underscore_encoded: a string produced by ``alnum_encode``.
    Decoding is strict, so escaped bytes that do not form valid UTF-8 raise
    instead of being replaced.
    """
    percent_encoded = underscore_encoded.replace('_', '%')
    return urllib.parse.unquote(percent_encoded, errors='strict')

In [None]:
def encode_df(df):
    """Return a new DataFrame whose column and index labels are alnum-encoded.

    Every column label and every index label is passed through
    ``alnum_encode``; the data itself is untouched and ``df`` is not modified.

    All labels are mapped in a single ``rename`` call. Renaming one label at a
    time copies the frame once per label, and it renames a column twice when
    an already-encoded label happens to equal a later original label.
    """
    return df.rename(columns=alnum_encode, index=alnum_encode)

In [None]:
def decode_df(df):
    """Return a new DataFrame whose column and index labels are alnum-decoded.

    Every column label and every index label is passed through
    ``alnum_decode``; the data itself is untouched and ``df`` is not modified.

    All labels are mapped in a single ``rename`` call. Renaming one label at a
    time copies the frame once per label, and it renames a column twice when
    an already-decoded label happens to equal a later encoded label.
    """
    return df.rename(columns=alnum_decode, index=alnum_decode)

In [None]:
# Build a test frame with one index label and one column label replaced by
# strings made of special characters.
test = pdf.rename(index={'GSM1002105': '@#%@#$&*()'}, columns={'DarkCorner': '*(#$%;.,)'})

In [None]:
# Round trip: encode the labels of the test frame, then decode them again.
enc_test = encode_df(test)
dec_test = decode_df(enc_test)

In [None]:
# Display the frame with encoded labels.
enc_test

In [None]:
# Display the frame after decoding the labels again.
dec_test