In [1]:
import os
from math import log2

import numpy as np
import pandas as pd

# Root directory that holds the per-dataset folders. It defaults to the
# original local location; set the ASSEMBLER_DATA_DIR environment variable to
# run the notebook elsewhere without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the
# `path + <relative file>` concatenations in later cells rely on.
path = os.path.join(
    os.environ.get("ASSEMBLER_DATA_DIR") or "/home/colombelli/Documents/datasets/assembler/",
    "",
)

# Input files, given relative to `path`.
luad = "luad/LUAD__gene.normalized_RNAseq__tissueTypeAll__20200822175257.txt"
luad_microarr = "luad/LUAD__gene_Array__tissueTypeAll__20201020150206.txt"
prad = "prad/PRAD__gene.normalized_RNAseq__tissueTypeAll__20200822180034.txt"
thca = "thca/THCA__gene.normalized_RNAseq__tissueTypeAll__20200822180826.txt"

In [6]:
def set_new_cols(data):
    """Rename the columns of ``data`` in place to bare gene identifiers.

    Each column label is split on ``'|'``. The field before the first ``'|'``
    becomes the new name; when that field is the placeholder ``'?'`` the
    second field is used instead. Labels without a ``'|'`` are kept as they
    are, including a bare ``'?'`` (there is no second field to fall back to).

    Prints how many labels had ``'?'`` as their first field and, when the
    renaming makes several columns share one name, how many redundant
    columns that produced.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are strings. Its ``columns`` attribute is
        replaced; nothing else is touched.

    Returns
    -------
    None
    """
    print("Getting gene symbols")
    new_cols = []
    count_no_gene_sym = 0

    for gene in data.columns:
        fields = gene.split('|')
        symbol = fields[0]
        if symbol == '?':
            count_no_gene_sym += 1
            # Only fall back to the second field when the label has one.
            new_cols.append(fields[1] if len(fields) > 1 else symbol)
        else:
            new_cols.append(symbol)

    print("Genes without symbol:", count_no_gene_sym)

    # Dropping everything after '|' can map distinct labels onto one name;
    # report it instead of silently creating duplicated columns.
    redundant = len(new_cols) - len(set(new_cols))
    if redundant:
        print("Warning: duplicated column names after renaming:", redundant)

    data.columns = new_cols
    return


def log_transform_data(data):
    """Return ``log2(x + 1)`` of every value in ``data`` as a new DataFrame.

    The computation is vectorised with NumPy rather than calling a Python
    function per cell. Values are cast to float first so that frames holding
    numbers in ``object`` columns are handled as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric values (or objects convertible to float). Not modified.

    Returns
    -------
    pandas.DataFrame
        Float frame with the same index and columns as ``data``.

    Raises
    ------
    ValueError
        If any value is ``<= -1``, i.e. ``x + 1`` is outside the domain of
        the logarithm.
    """
    print("Log transforming")
    shifted = data.astype(float) + 1

    # Fail loudly on out-of-domain input instead of letting NumPy emit
    # NaN / -inf with only a runtime warning. NaN compares False and passes.
    if (shifted <= 0).to_numpy().any():
        raise ValueError("log2(x + 1) is undefined for values <= -1")

    df = np.log2(shifted)
    return df


def set_class_column(data):
    """Add a binary ``'class'`` column to ``data`` derived from its index.

    Every index label is treated as a ``'-'``-separated barcode whose fourth
    field starts with a two-digit sample type code (e.g. ``01A`` -> ``1``).
    The mapping is:

    * codes in [1, 9]   -> class ``1`` (tumoral)
    * codes in [10, 19] -> class ``0`` (normal)
    * anything else     -> class ``0``, reported as a control sample

    Any code other than 1, 6 or 11 is additionally reported as unexpected.
    The class counts are printed before the column is assigned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by barcode strings. Modified in place.

    Returns
    -------
    None

    Raises
    ------
    IndexError, ValueError
        If a barcode has fewer than four fields or its fourth field does not
        start with digits.
    """
    print("Getting classes")
    classes = []

    for barcode in data.index:
        # Take the two leading digits rather than "everything but the last
        # character": a barcode without the trailing vial letter (e.g.
        # "...-01") would otherwise parse as 0 and be labelled as non-tumoral.
        sample_type = int(barcode.split('-')[3][:2])

        if 10 <= sample_type <= 19:    # Codes within [10, 19] are normal sample types
            classes.append(0)
        elif 1 <= sample_type <= 9:    # Codes within [1, 9] are tumoral sample types
            classes.append(1)
        else:
            print("Control sample type found for barcode:", barcode)
            classes.append(0)

        if sample_type not in (1, 6, 11):
            print("Unexpected sample type found for barcode:", barcode)

    np_cls = np.array(classes)
    print("Classes info:\n", np.unique(np_cls, return_counts=True))
    data['class'] = classes
    return


def process_dataset(file_path, log=True):
    """Load a tab-separated expression table and prepare it for analysis.

    The file is expected to hold one row per gene, with the gene label in
    the first column and one column per sample barcode. The table is
    transposed so that samples become rows, the gene labels are simplified
    with ``set_new_cols``, the values are optionally ``log2(x + 1)``
    transformed, and a ``'class'`` column is appended by
    ``set_class_column``.

    Parameters
    ----------
    file_path : str
        Path of the tab-separated input file.
    log : bool, default True
        Whether to apply ``log_transform_data`` to the values.

    Returns
    -------
    pandas.DataFrame
        Samples as rows, genes as columns, plus the ``'class'`` column.
    """
    print("Loading dataset...")
    # Reading the label column as the index keeps the value block purely
    # numeric. Transposing a frame that still contains the string column
    # would turn every value into a Python object (slow and memory hungry).
    df = pd.read_csv(file_path, sep="\t", index_col=0).T

    set_new_cols(df)
    if log:
        df = log_transform_data(df)
    set_class_column(df)
    return df

In [3]:
def save_processed_df(df, path_and_name):
    """Write ``df`` to a CSV file and return the frame read back from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; its index is written as the first CSV column.
    path_and_name : str
        Destination file path.

    Returns
    -------
    pandas.DataFrame
        The content of the written file, loaded with its first column as
        the index.
    """
    print("Saving dataset...")
    df.to_csv(path_and_name)
    reloaded = pd.read_csv(path_and_name, index_col=0)
    return reloaded

In [19]:
# LUAD: load, log2(x + 1)-transform (default log=True) and label the samples,
# then persist the result. The call on the last line returns the frame
# re-read from the CSV, which the notebook displays.
df = process_dataset(path + luad)
save_processed_df(df, path + "luad/luad_log2.csv")

Loading dataset...
Getting gene symbols
Genes without symbol: 29
Log transforming
Getting classes
Unexpected sample type found for barcode: TCGA-50-5066-02A-11R-2090-07
Unexpected sample type found for barcode: TCGA-50-5946-02A-11R-2090-07
Classes info:
 (array([0, 1]), array([ 59, 517]))
Saving dataset...


Unnamed: 0,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,class
TCGA-05-4244-01A-01R-1107-07,0.0,3.460913,3.618474,5.661048,9.731217,0.0,8.435590,1.033652,0.000000,0.0,...,9.018678,5.350285,8.197321,9.907260,0.763921,10.088859,11.471139,9.768648,9.170597,1
TCGA-05-4249-01A-01R-1107-07,0.0,3.034867,3.748848,6.515884,9.853334,0.0,7.191824,1.383939,0.000000,0.0,...,8.172465,5.980428,8.950001,10.204971,4.411650,9.622978,11.199826,10.153700,9.433116,1
TCGA-05-4250-01A-01R-1107-07,0.0,3.043572,2.811142,5.659257,10.156940,0.0,5.720508,0.000000,0.000000,0.0,...,10.033199,5.931168,8.517334,9.722642,4.782796,8.895339,12.408981,10.194168,9.060342,1
TCGA-05-4382-01A-01R-1206-07,0.0,3.624230,3.099968,6.389400,9.658520,0.0,7.913022,0.564232,0.309525,0.0,...,9.558593,5.373036,8.441914,9.888267,6.041142,9.828389,12.725186,10.192589,9.376841,1
TCGA-05-4384-01A-01R-1755-07,0.0,2.079088,2.168064,6.200361,9.137001,0.0,8.104768,0.687867,0.000000,0.0,...,7.275567,6.340285,9.140128,10.368188,3.160501,9.607078,11.706702,10.763482,9.500392,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-NJ-A55O-01A-11R-A262-07,0.0,4.046081,4.584193,6.660876,9.773105,0.0,8.006332,1.115633,0.000000,0.0,...,8.530321,5.386814,8.366562,10.114199,4.209680,9.506548,12.031540,10.382324,9.195202,1
TCGA-NJ-A55R-01A-11R-A262-07,0.0,3.235650,3.501133,6.242622,9.584483,0.0,8.536352,0.783163,0.000000,0.0,...,8.810213,5.692087,8.552138,10.537639,5.744756,9.567679,11.588149,10.958776,9.259183,1
TCGA-NJ-A7XG-01A-12R-A39D-07,0.0,5.458963,6.359535,6.432483,9.406701,0.0,8.391223,0.000000,0.000000,0.0,...,7.424646,4.165373,7.409182,10.259693,3.469157,9.057056,10.963035,10.476904,8.740755,1
TCGA-O1-A52J-01A-11R-A262-07,0.0,3.915148,4.654859,5.957422,10.152691,0.0,7.424699,1.952632,0.000000,0.0,...,8.529122,7.101856,9.138731,10.341686,0.564134,9.613415,12.130652,10.571349,8.903853,1


In [13]:
# LUAD array file: processed without the log transform (log=False). The
# displayed values include entries below -1, for which log2(x + 1) is
# undefined. The result is only inspected here, not saved.
df = process_dataset(path + luad_microarr, log=False)
df

Loading dataset...
Getting gene symbols
Genes without symbol: 0
Getting classes
Classes info:
 (array([1]), array([32]))


Unnamed: 0,15E1.2,2'-PDE,7A5,A1BG,A2BP1,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,...,ZW10,ZWINT,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,class
TCGA-05-4244-01A-01R-1107-07,-0.3095,-0.0786429,1.292,-2.04983,-1.426,0.08975,0.262,1.60533,-0.7175,,...,-2.373,-2.20717,-1.024,-0.2025,-2.575,0.336,-1.16917,-0.0226,-0.704,1
TCGA-05-4249-01A-01R-1107-07,0.7035,-0.06325,3.0055,-0.2454,-1.52133,0.731,0.441,-0.19575,0.604667,0.31,...,-1.05825,-0.4462,1.0065,-0.87175,-2.1655,-0.737125,-0.465,0.402833,-0.611,1
TCGA-05-4250-01A-01R-1107-07,1.8615,-0.088125,1.5545,1.09117,-1.102,-0.097,1.3325,0.29675,0.133,0.259,...,-1.4885,-0.5635,-0.3925,-1.15075,-1.3305,-0.28275,-1.03533,-0.908833,-0.314,1
TCGA-35-3615-01A-01R-0946-07,-0.811,-0.24,1.502,-0.499167,-1.295,-0.2985,1.1495,0.625,-0.548333,0.5365,...,-1.7335,-2.4435,0.1905,0.06,-3.5825,-0.663125,-1.20683,0.156167,-0.433,1
TCGA-35-4122-01A-01R-1107-07,-0.581,-0.0205,3.682,0.992333,-1.8325,0.97425,0.7165,0.35025,0.222,0.5025,...,-1.7305,-2.07017,-0.603,-0.4455,-0.758,0.390125,-0.118,-0.183333,-0.049,1
TCGA-35-4123-01A-01R-1107-07,-0.33425,0.104813,3.518,1.15333,-0.968167,0.50675,0.6385,0.28925,0.0703333,0.333,...,-2.15025,-0.8605,0.565,-0.49675,-2.6335,0.539375,-0.276667,-0.593,0.0215,1
TCGA-44-2655-01A-01R-0946-07,-0.622,0.502062,3.069,-0.245167,-1.19583,2.11075,0.857,0.6195,-0.142833,-0.4815,...,-2.10425,-1.89217,0.711,0.25025,-4.6815,-0.1695,-0.8915,0.1765,-0.2325,1
TCGA-44-2656-01A-02R-0946-07,0.03825,0.0280625,2.68,-0.5105,-2.182,1.878,0.777,0.26225,-0.259667,0.862,...,-1.305,-1.3655,0.51,0.1995,-0.29025,0.08025,-0.0591667,-0.139833,-0.096,1
TCGA-44-2657-01A-01R-1107-07,-1.3545,-0.147625,2.983,0.860833,-0.900333,2.5675,1.079,0.10925,0.0725,0.458,...,-1.24325,-1.707,0.2175,-0.126,-1.4165,0.8455,-0.549,0.306833,-0.2995,1
TCGA-44-2659-01A-01R-0946-07,-1.2365,-0.1745,3.3355,-0.226,-1.141,2.229,1.234,0.061,-0.4025,0.453,...,-1.5955,-2.175,0.988,0.10375,0.36,0.868375,-1.11533,0.0228333,-0.005,1


In [20]:
# PRAD: load, log2(x + 1)-transform (default log=True) and label the samples,
# then persist the result. The call on the last line returns the frame
# re-read from the CSV, which the notebook displays.
df = process_dataset(path + prad)
save_processed_df(df, path + "prad/prad_log2.csv")

Loading dataset...
Getting gene symbols
Genes without symbol: 29
Log transforming
Getting classes
Classes info:
 (array([0, 1]), array([ 52, 498]))
Saving dataset...


Unnamed: 0,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,class
TCGA-2A-A8VL-01A-21R-A37L-07,0.00000,4.443063,4.197960,6.355992,10.357574,0.0,8.304415,0.00000,0.00000,0.0,...,6.619447,5.611432,8.156048,10.106487,5.289281,9.436857,11.170432,10.540779,8.983419,1
TCGA-2A-A8VO-01A-11R-A37L-07,0.00000,4.344871,3.898257,5.979012,9.756570,0.0,8.566700,0.00000,0.00000,0.0,...,7.892424,5.421937,8.586790,9.827936,7.396474,9.564797,11.405844,9.739847,9.325745,1
TCGA-2A-A8VT-01A-11R-A37L-07,0.00000,5.640563,5.911744,6.576262,9.189134,0.0,9.513292,1.07238,0.00000,0.0,...,6.934558,6.835865,9.228571,10.755366,5.507328,10.218937,10.584548,11.023928,10.424722,1
TCGA-2A-A8VV-01A-11R-A37L-07,0.00000,4.506799,4.120004,6.770158,9.845014,0.0,7.896056,0.00000,0.00000,0.0,...,7.203301,5.514473,8.721872,9.990273,6.032943,9.261573,11.579402,10.009504,10.036262,1
TCGA-2A-A8VX-01A-11R-A37L-07,0.00000,1.982765,3.268404,6.613591,9.703645,0.0,8.346399,0.00000,0.00000,0.0,...,8.289457,4.654458,8.490792,9.959399,7.364122,9.470459,11.150269,9.711469,9.557922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZG-A9M4-01A-11R-A41O-07,0.53037,4.356968,4.788816,6.970637,8.821826,0.0,10.983346,0.00000,0.00000,0.0,...,7.285975,6.079766,8.649915,10.310744,6.595229,9.337349,10.516591,10.244216,9.107529,1
TCGA-ZG-A9MC-01A-31R-A41O-07,0.00000,3.821516,3.990038,6.734920,8.998620,0.0,8.032105,0.00000,0.00000,0.0,...,6.715675,6.067512,8.857866,10.538661,6.249352,9.975857,12.065681,10.501288,9.248182,1
TCGA-ZG-A9N3-01A-11R-A41O-07,0.00000,2.889532,2.989956,6.999465,10.571173,0.0,7.422318,0.00000,1.01956,0.0,...,10.968822,5.247259,8.600984,9.878005,7.987055,9.978384,11.302684,9.597213,9.534645,1
TCGA-ZG-A9ND-01A-11R-A41O-07,0.00000,3.523524,4.608378,6.033258,10.214231,0.0,8.016795,0.00000,0.00000,0.0,...,7.727670,5.543836,8.327548,10.107021,7.164772,9.654848,10.508092,10.389611,9.243703,1


In [21]:
# THCA: load, log2(x + 1)-transform (default log=True) and label the samples,
# then persist the result. The call on the last line returns the frame
# re-read from the CSV, which the notebook displays.
df = process_dataset(path + thca)
save_processed_df(df, path + "thca/thca_log2.csv")

Loading dataset...
Getting gene symbols
Genes without symbol: 29
Log transforming
Getting classes
Classes info:
 (array([0, 1]), array([ 59, 513]))
Saving dataset...


Unnamed: 0,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,class
TCGA-4C-A93U-01A-11R-A39I-07,0.0,1.552082,0.000000,7.061192,9.301386,0.0,8.240550,0.568616,0.000000,0.0,...,7.612397,6.100828,7.989480,9.759390,4.759529,10.813808,12.115444,9.765008,9.358773,1
TCGA-BJ-A0YZ-01A-11R-A10U-07,0.0,3.537060,1.529421,7.135796,10.774049,0.0,5.535984,0.502331,0.000000,0.0,...,6.704461,5.683576,8.029609,9.784443,5.880012,9.952747,11.124221,10.541867,9.568781,1
TCGA-BJ-A0Z0-01A-11R-A10U-07,0.0,2.633152,1.884676,7.509491,9.823909,0.0,7.053899,0.480058,0.000000,0.0,...,7.315837,6.285243,8.870615,10.096284,6.728250,11.060050,10.090026,10.662298,10.375914,1
TCGA-BJ-A0Z2-01A-11R-A10U-07,0.0,3.503769,3.470758,7.677577,9.852725,0.0,7.962813,0.963622,0.000000,0.0,...,8.804562,5.870093,8.659544,10.220458,6.267133,10.276796,9.565787,10.355813,10.218733,1
TCGA-BJ-A0Z3-01A-11R-A13Y-07,0.0,1.705845,2.535804,7.395265,9.635258,0.0,7.086891,0.606916,1.032877,0.0,...,7.938185,6.006493,8.534474,10.033615,4.691797,10.668134,11.901242,10.218508,9.914348,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-MK-A4N6-01A-11R-A250-07,0.0,2.283151,2.903308,6.870865,9.459332,0.0,6.728208,0.468114,0.000000,0.0,...,7.534200,5.225445,8.542219,10.028337,5.996825,11.457597,12.177864,10.205503,9.491184,1
TCGA-MK-A4N7-01A-11R-A250-07,0.0,2.710217,3.237288,7.377259,9.593059,0.0,7.820870,0.382723,0.382723,0.0,...,6.821435,5.948554,8.686597,10.166190,4.043808,10.651912,11.481169,10.179101,9.875139,1
TCGA-MK-A4N9-01A-11R-A250-07,0.0,2.498557,3.967113,7.111762,9.655508,0.0,8.506455,0.514703,1.192825,0.0,...,6.612963,5.736031,8.288401,10.334155,2.893401,10.409751,11.909396,10.336070,9.680594,1
TCGA-MK-A84Z-01A-11R-A39I-07,0.0,0.000000,2.953004,7.048441,10.062860,0.0,8.504144,1.026800,0.000000,0.0,...,7.879401,5.910164,8.227827,10.060059,6.820788,10.353965,10.862017,10.106959,9.637037,1
