### Init

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
base = os.path.basename(os.getcwd())
list = os.getcwd().split(os.sep) 
list.pop(list.index(base))
ROOT = '\\'.join(list)
print(ROOT)
DATA_PATH = os.path.join(ROOT, 'datasets\\preprocessed\\mRNA')
mRNA_file = 'clinical_mRNA(protein_coding).csv'

d:\Universita\2 anno magistrale\Progetto BioInf\miRNA_to_age


In [3]:
raw_data = pd.read_csv(os.path.join(DATA_PATH, mRNA_file))

In [124]:
raw_data.columns

Index(['file_name_clinical', 'days_to_death', 'pathologic_stage',
       'age_at_initial_pathologic_diagnosis', 'days_to_last_followup', 'Death',
       'case_id', 'file_name_mRNA', 'gene_id', 'gene_name', 'gene_type',
       'unstranded', 'tpm_unstranded', 'fpkm_unstranded'],
      dtype='object')

In [125]:
raw_data.shape

(781, 14)

## Data preparation

In [4]:
def parse_array(x):
    if isinstance(x, str):
        x = x.strip("[]")
        return np.array(eval(x, {'np':np}))
    
    return np.array(x)

One-hot encoding for pathologic stage and delete columns that have less than 20 samples

In [5]:
from sklearn.preprocessing import OneHotEncoder

print(raw_data.shape)

onehot = OneHotEncoder()
pathologic_stages = raw_data[['pathologic_stage']]
stages = onehot.fit_transform(pathologic_stages).toarray()
stages_cols = onehot.get_feature_names_out(['pathologic_stage'])
tmp_df = pd.DataFrame(stages, columns=stages_cols, index=pathologic_stages.index)

print(tmp_df.shape)
index = tmp_df.sum().index
tmp_df.drop(columns=[i for i in index if tmp_df.sum()[i]<20], inplace=True)
print(tmp_df.shape)

raw_data = pd.concat([raw_data, tmp_df], axis=1)
raw_data.drop(columns=['pathologic_stage'],inplace=True)
print(raw_data.shape)

(781, 14)
(781, 12)
(781, 6)
(781, 19)


Delete age outliers

In [6]:
print(raw_data.shape)

ages_distrib = raw_data['age_at_initial_pathologic_diagnosis'].value_counts()
ages_to_del = ages_distrib[ages_distrib < 5].index.tolist()
mask = raw_data['age_at_initial_pathologic_diagnosis'].isin(ages_to_del)
indexes = raw_data[mask].index.to_list()
raw_data.drop(indexes, inplace=True)
print(raw_data.shape)

(781, 19)
(760, 19)


### Unstranded case

Separate genes reads column (unstranded) into separate columns, one for each gene

In [18]:
if os.path.exists(os.path.join(DATA_PATH, 'genes_reads(unstranded).csv')):
    genes_reads = pd.read_csv(os.path.join(DATA_PATH, 'genes_reads(unstranded).csv'), index_col=0)
    print("Loaded from existing")

else:
    print("Creating new file")
    n_cols = len(raw_data["unstranded"][0].strip("[]").split(","))
    reads = raw_data["unstranded"].apply(parse_array)

    genes_reads = pd.DataFrame(
        np.stack(reads.values),  # Converte la Series di vettori in un array 2D
        index=[f'Sample_{i}' for i in raw_data.index],
        columns=np.array(eval(raw_data['gene_name'][0].strip('[]'), {'np':str}))
    )

    genes_reads.columns = 'gene.' + np.array(genes_reads.columns)

    genes_reads.to_csv(os.path.join(DATA_PATH, 'genes_reads(unstranded).csv'))

print(genes_reads.head())
print(genes_reads.shape)

Creating new file
          gene.TSPAN6  gene.TNMD  gene.DPM1  gene.SCYL3  gene.C1orf112  \
Sample_0         4901          2       2691        2350            791   
Sample_1         2971         24       1956         599            210   
Sample_2         4951        198       3646        1556           1679   
Sample_3         3834          0       3161        1411            637   
Sample_4         1555         21       4679        2352           1122   

          gene.FGR  gene.CFH  gene.FUCA2  gene.GCLC  gene.NFYA  ...  \
Sample_0       119       896        1752       1508       4529  ...   
Sample_1      1199      2451        2001       1707       1268  ...   
Sample_2       171       389        6036        561       2647  ...   
Sample_3       468      4472        2588       1601       1585  ...   
Sample_4       529      2247        2042       2306       3581  ...   

          gene.ACTL10  gene.AC119733.1  gene.AC020765.6  gene.AC010980.1  \
Sample_0            0             

Normalize with Log2

In [19]:
reads_logged = genes_reads.applymap(lambda x: np.log2(float(x) + 1e-3))

  reads_logged = genes_reads.applymap(lambda x: np.log2(float(x) + 1e-3))


Delete reads columns with variance under the 50th percentile of the value

In [20]:
print(genes_reads.shape)

zero_var_cols = reads_logged.var()[reads_logged.var()<reads_logged.var().describe().loc['50%']].index
print(f"Columns to drop:{len(zero_var_cols)}")
reads_logged.drop(columns=[i for i in reads_logged.columns if i in zero_var_cols], inplace=True)
print(reads_logged.shape)

(760, 19962)
Columns to drop:9981
(760, 9973)


### File creation

In [21]:
if os.path.exists(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_log.csv')):
    dataset = pd.read_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_log.csv'))
    print("Loading from existing")

else:
    print("Creating new dataset file")
    dataset = raw_data.copy()
    # dropping metadata and unused columns
    dataset.drop(columns=['unstranded', 'tpm_unstranded', 'fpkm_unstranded', 'gene_name', 'gene_id', 'gene_type', 'file_name_mRNA', 'case_id', 'file_name_clinical'], inplace=True)

    reads_logged.index = dataset.index
    dataset = pd.concat([dataset, reads_logged], axis=1)
    dataset.to_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_log.csv'), index=False)

print(dataset.head())
print(dataset.shape)

Creating new dataset file
   days_to_death  age_at_initial_pathologic_diagnosis  days_to_last_followup  \
0           -1.0                                   71                 1918.0   
1           -1.0                                   53                 1309.0   
2           -1.0                                   59                  238.0   
3           -1.0                                   55                    0.0   
4           -1.0                                   64                  212.0   

   Death  pathologic_stage_Stage I  pathologic_stage_Stage IA  \
0      0                       0.0                        0.0   
1      0                       1.0                        0.0   
2      0                       0.0                        0.0   
3      0                       0.0                        0.0   
4      0                       0.0                        0.0   

   pathologic_stage_Stage IIA  pathologic_stage_Stage IIB  \
0                         1.0            

### TPM unstranded case

In [22]:
if os.path.exists(os.path.join(DATA_PATH, 'genes_reads(tpm_unstranded).csv')):
    genes_reads_tpm = pd.read_csv(os.path.join(DATA_PATH, 'genes_reads(tpm_unstranded).csv'), index_col=0)
    print("Loaded from existing")

else:
    print("Creating new file")
    n_cols = len(raw_data["tpm_unstranded"][0].strip("[]").split(","))
    reads = raw_data["tpm_unstranded"].apply(parse_array)

    genes_reads_tpm = pd.DataFrame(
        np.stack(reads.values),  # Converte la Series di vettori in un array 2D
        index=[f'Sample_{i}' for i in raw_data.index],
        columns=np.array(eval(raw_data['gene_name'][0].strip('[]'), {'np':str}))
    )

    genes_reads_tpm.columns = 'gene.' + np.array(genes_reads_tpm.columns)

    genes_reads_tpm.to_csv(os.path.join(DATA_PATH, 'genes_reads(tpm_unstranded).csv'))

print(genes_reads_tpm.head())
print(genes_reads_tpm.shape)

Creating new file
          gene.TSPAN6  gene.TNMD  gene.DPM1  gene.SCYL3  gene.C1orf112  \
Sample_0      79.8332     0.1001   164.7321     25.2268         9.7898   
Sample_1      34.1691     0.8483    84.5407      4.5400         1.8351   
Sample_2      59.1275     7.2669   163.6360     12.2462        15.2351   
Sample_3      52.3390     0.0000   162.1673     12.6939         6.6071   
Sample_4      21.6296     0.8977   244.5886     21.5601        11.8579   

          gene.FGR  gene.CFH  gene.FUCA2  gene.GCLC  gene.NFYA  ...  \
Sample_0    2.5998    8.2962     45.8721    12.9291    87.8082  ...   
Sample_1   18.4948   16.0230     36.9909    10.3331    17.3574  ...   
Sample_2    2.7390    2.6407    115.8675     3.5264    37.6257  ...   
Sample_3    8.5688   34.7012     56.7876    11.5035    25.7535  ...   
Sample_4    9.8690   17.7660     45.6551    16.8828    59.2865  ...   

          gene.ACTL10  gene.AC119733.1  gene.AC020765.6  gene.AC010980.1  \
Sample_0          0.0             

In [23]:
reads_logged = genes_reads_tpm.applymap(lambda x: np.log2(float(x) + 1e-3))

  reads_logged = genes_reads_tpm.applymap(lambda x: np.log2(float(x) + 1e-3))


In [24]:
print(genes_reads_tpm.shape)

zero_var_cols = reads_logged.var()[reads_logged.var() <= reads_logged.var().describe().loc['50%']].index
print(f"Columns to drop: {len(zero_var_cols)}")
reads_logged.drop(columns=[i for i in genes_reads_tpm.columns if i in zero_var_cols], inplace=True)
print(reads_logged.shape)

(760, 19962)
Columns to drop: 9981
(760, 9972)


In [25]:
if os.path.exists(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_tpm_log.csv')):
    dataset = pd.read_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_tpm_log.csv'))
    print("Loading from existing")

else:
    print("Creating new dataset file")
    dataset = raw_data.copy()
    # dropping metadata and unused columns
    dataset.drop(columns=['unstranded', 'tpm_unstranded', 'fpkm_unstranded', 'gene_name', 'gene_id', 'gene_type', 'file_name_mRNA', 'case_id', 'file_name_clinical'], inplace=True)

    reads_logged.index = dataset.index
    dataset = pd.concat([dataset, reads_logged], axis=1)
    dataset.to_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_tpm_log.csv'), index=False)

print(dataset.head())
print(dataset.shape)

Creating new dataset file
   days_to_death  age_at_initial_pathologic_diagnosis  days_to_last_followup  \
0           -1.0                                   71                 1918.0   
1           -1.0                                   53                 1309.0   
2           -1.0                                   59                  238.0   
3           -1.0                                   55                    0.0   
4           -1.0                                   64                  212.0   

   Death  pathologic_stage_Stage I  pathologic_stage_Stage IA  \
0      0                       0.0                        0.0   
1      0                       1.0                        0.0   
2      0                       0.0                        0.0   
3      0                       0.0                        0.0   
4      0                       0.0                        0.0   

   pathologic_stage_Stage IIA  pathologic_stage_Stage IIB  \
0                         1.0            

### FPKM unstranded case