In [1]:
import numpy as np
import pandas as pd
import math

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics


def get_x(df):
    return df.loc[:, df.columns != 'class']
    
def get_y(df):
    return df.loc[:, ['class']].T.values[0]


def load_RDS(file_path):

    read_RDS = robjects.r['readRDS']
    return read_RDS(file_path)



def load_csv(file_path):
    return pd.read_csv(file_path, index_col=0)



def pandas_to_r(df):

    with localconverter(robjects.default_converter + pandas2ri.converter):
        r_from_pandas_df = robjects.conversion.py2rpy(df)
    return r_from_pandas_df



def r_to_pandas(df):
    with localconverter(robjects.default_converter + pandas2ri.converter):
            pandas_from_r_df = robjects.conversion.rpy2py(df)
    return pandas_from_r_df


In [2]:
mic_arr = pd.read_csv(
    "/home/colombelli/Documents/datasets/Projeto Bernardo/Agilent/GSE70947/Tabela dos dados normalizados GSE70947.txt", 
    sep=" ", index_col=0)

mic_arr = mic_arr.set_index('GeneName')
del mic_arr['Description']
mic_arr = mic_arr.transpose().rename(columns={np.nan:'class'})
mic_arr["class"] = mic_arr["class"].replace({"Normal":0, "Tumor":1})
mic_arr

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


GeneName,class,U2AF1L4,PSMC3IP,ZCCHC7,AK2,lincRNA:chr7:226042-232442_R,PRAC,YLPM1,SDR16C6P,MLLT6,...,GOLGA8A,CCDC53,IL1A,RXRA,BEX4,TMEM229B,ENST00000508993,UNC13B,MCOLN3,IL34
GSM1823702_252800417016_S01_GE1_107_Sep09_1_2,0,8.87216881108986,8.05084760177513,6.91221522612451,10.7922244505045,9.26767906939989,6.2700961309039,6.69874397542322,6.23049773071545,11.1868625820602,...,9.00187,10.6976,6.2701,12.9817,9.11869,6.3972,6.44548,8.80152,7.62706,6.81734
GSM1823703_252800417016_S01_GE1_107_Sep09_2_1,0,9.48998243066658,7.56373318521904,7.4964243387062,10.4857157405717,9.6296017608601,6.50475661967614,6.598459355902,6.33713007186884,10.8816819732041,...,8.82886,10.792,5.96027,12.8184,8.42436,6.05648,6.13036,8.8952,8.19039,6.96649
GSM1823704_252800416877_S01_GE1_107_Sep09_2_3,0,9.09526637088704,8.24993963233832,7.04041331002757,10.8678326632135,9.50928242700838,6.52173772251195,6.79471607844506,6.38736760355326,10.9252695858618,...,8.62676,10.4861,6.30084,13.3071,8.67975,6.22939,6.44257,9.20426,7.64307,6.6146
GSM1823705_252800416894_S01_GE1_107_Sep09_1_1,0,9.16507960899626,8.13613965194505,6.98011401084642,11.6711627506704,8.94586234142829,6.21647427157496,7.29363747859504,6.28961974483475,10.6948132616429,...,9.67722,11.1456,6.131,13.2346,9.03902,6.131,6.21647,9.17376,7.79019,6.95297
GSM1823706_252800416894_S01_GE1_107_Sep09_1_3,0,8.97229423881034,7.92322383331715,7.01808127166568,11.6598152727585,9.06644230773251,6.11285832077853,7.19361152442885,6.13355251281461,10.9793620369365,...,10.0341,10.9502,6.23066,13.0115,9.6296,6.23066,6.19225,8.57652,7.02202,7.01403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1823993_252800416912_S01_GE1_107_Sep09_1_2,1,9.18171542629342,8.67544854601032,7.42933146343936,11.9571882623437,9.31546390862963,6.3914084243582,6.63901493943572,6.24964828747101,11.7593819179624,...,8.64304,10.0072,6.10396,12.2503,9.36339,6.26183,6.24965,8.0617,6.36963,6.80182
GSM1823994_252800416913_S01_GE1_107_Sep09_2_2,1,8.84968338175536,8.81549784038803,7.46998791010345,11.3423193553585,10.4318428621735,6.4636258903747,6.95921239728392,6.35838923315478,12.2622043246975,...,9.23957,10.6981,6.08639,12.7465,8.78617,6.16995,6.28385,8.04552,6.68053,6.8626
GSM1823995_252800416913_S01_GE1_107_Sep09_1_2,1,9.19278294217826,7.99859686644502,6.80547538146877,11.7581934383918,10.0225170132326,6.376350659134,6.76289096340403,6.19341632664292,12.5035582975485,...,9.07842,10.8204,6.09965,13.0881,8.93042,6.14761,6.20162,8.64554,7.06903,7.08038
GSM1823996_252800416912_S01_GE1_107_Sep09_2_2,1,8.3741706306154,8.35921291956648,7.09181373729081,11.5820320313808,9.84734752053401,6.47338150166635,6.83307545907084,6.67906814357764,11.1958228238605,...,7.88014,9.79678,6.14058,11.9869,9.75255,6.11071,6.16926,8.2529,6.40772,6.42764


In [4]:
for gene in mic_arr.columns:
    try:
        print("contains str?", mic_arr[gene].str.contains('').any())
        mic_arr[gene] = pd.to_numeric(mic_arr[gene])
    except:
        pass

contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str

contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str? True
contains str

In [6]:
class_temp = mic_arr['class']
del mic_arr['class']
mic_arr = pd.concat([mic_arr, class_temp], axis=1)
mic_arr

Unnamed: 0,U2AF1L4,PSMC3IP,ZCCHC7,AK2,lincRNA:chr7:226042-232442_R,PRAC,YLPM1,SDR16C6P,MLLT6,ZNF362,...,CCDC53,IL1A,RXRA,BEX4,TMEM229B,ENST00000508993,UNC13B,MCOLN3,IL34,class
GSM1823702_252800417016_S01_GE1_107_Sep09_1_2,8.872169,8.050848,6.91221522612451,10.7922244505045,9.26767906939989,6.270096,6.69874397542322,6.230498,11.1868625820602,10.742427,...,10.6976,6.2701,12.9817,9.11869,6.3972,6.44548,8.80152,7.62706,6.81734,0
GSM1823703_252800417016_S01_GE1_107_Sep09_2_1,9.489982,7.563733,7.4964243387062,10.4857157405717,9.6296017608601,6.504757,6.598459355902,6.337130,10.8816819732041,10.988549,...,10.792,5.96027,12.8184,8.42436,6.05648,6.13036,8.8952,8.19039,6.96649,0
GSM1823704_252800416877_S01_GE1_107_Sep09_2_3,9.095266,8.249940,7.04041331002757,10.8678326632135,9.50928242700838,6.521738,6.79471607844506,6.387368,10.9252695858618,10.620291,...,10.4861,6.30084,13.3071,8.67975,6.22939,6.44257,9.20426,7.64307,6.6146,0
GSM1823705_252800416894_S01_GE1_107_Sep09_1_1,9.165080,8.136140,6.98011401084642,11.6711627506704,8.94586234142829,6.216474,7.29363747859504,6.289620,10.6948132616429,10.827569,...,11.1456,6.131,13.2346,9.03902,6.131,6.21647,9.17376,7.79019,6.95297,0
GSM1823706_252800416894_S01_GE1_107_Sep09_1_3,8.972294,7.923224,7.01808127166568,11.6598152727585,9.06644230773251,6.112858,7.19361152442885,6.133553,10.9793620369365,10.857679,...,10.9502,6.23066,13.0115,9.6296,6.23066,6.19225,8.57652,7.02202,7.01403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1823993_252800416912_S01_GE1_107_Sep09_1_2,9.181715,8.675449,7.42933146343936,11.9571882623437,9.31546390862963,6.391408,6.63901493943572,6.249648,11.7593819179624,11.021391,...,10.0072,6.10396,12.2503,9.36339,6.26183,6.24965,8.0617,6.36963,6.80182,1
GSM1823994_252800416913_S01_GE1_107_Sep09_2_2,8.849683,8.815498,7.46998791010345,11.3423193553585,10.4318428621735,6.463626,6.95921239728392,6.358389,12.2622043246975,11.009336,...,10.6981,6.08639,12.7465,8.78617,6.16995,6.28385,8.04552,6.68053,6.8626,1
GSM1823995_252800416913_S01_GE1_107_Sep09_1_2,9.192783,7.998597,6.80547538146877,11.7581934383918,10.0225170132326,6.376351,6.76289096340403,6.193416,12.5035582975485,11.388836,...,10.8204,6.09965,13.0881,8.93042,6.14761,6.20162,8.64554,7.06903,7.08038,1
GSM1823996_252800416912_S01_GE1_107_Sep09_2_2,8.374171,8.359213,7.09181373729081,11.5820320313808,9.84734752053401,6.473382,6.83307545907084,6.679068,11.1958228238605,10.514076,...,9.79678,6.14058,11.9869,9.75255,6.11071,6.16926,8.2529,6.40772,6.42764,1


In [7]:
X_train = get_x(mic_arr)
y_train = get_y(mic_arr)

clf = SVC(gamma='auto', probability=True)
clf.fit(X_train, y_train)

# accuracy
clf.score(X_train, y_train)

0.918918918918919