### Init

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
base = os.path.basename(os.getcwd())
list = os.getcwd().split(os.sep) 
list.pop(list.index(base))
ROOT = '\\'.join(list)
print(ROOT)
DATA_PATH = os.path.join(ROOT, 'datasets\\preprocessed\\mRNA')
mRNA_file = 'clinical_mRNA(protein_coding).csv'

d:\Universita\2 anno magistrale\Progetto BioInf\miRNA_to_age


In [3]:
raw_data = pd.read_csv(os.path.join(DATA_PATH, mRNA_file))

In [4]:
raw_data.columns

Index(['file_name_clinical', 'days_to_death', 'pathologic_stage',
       'age_at_initial_pathologic_diagnosis', 'days_to_last_followup', 'Death',
       'case_id', 'file_name_mRNA', 'gene_id', 'gene_name', 'gene_type',
       'unstranded', 'tpm_unstranded', 'fpkm_unstranded'],
      dtype='object')

In [5]:
raw_data.shape

(781, 14)

## Data preparation

In [6]:
def parse_array(x):
    if isinstance(x, str):
        x = x.strip("[]")
        return np.array(eval(x, {'np':np}))
    
    return np.array(x)

One-hot encoding for pathologic stage and delete columns that have less than 20 samples

In [7]:
from sklearn.preprocessing import OneHotEncoder

print(raw_data.shape)

onehot = OneHotEncoder()
pathologic_stages = raw_data[['pathologic_stage']]
stages = onehot.fit_transform(pathologic_stages).toarray()
stages_cols = onehot.get_feature_names_out(['pathologic_stage'])
tmp_df = pd.DataFrame(stages, columns=stages_cols, index=pathologic_stages.index)

print(tmp_df.shape)
index = tmp_df.sum().index
tmp_df.drop(columns=[i for i in index if tmp_df.sum()[i]<20], inplace=True)
print(tmp_df.shape)

raw_data = pd.concat([raw_data, tmp_df], axis=1)
raw_data.drop(columns=['pathologic_stage'],inplace=True)
print(raw_data.shape)

(781, 14)
(781, 12)
(781, 6)
(781, 19)


Delete age outliers

In [8]:
print(raw_data.shape)

ages_distrib = raw_data['age_at_initial_pathologic_diagnosis'].value_counts()
ages_to_del = ages_distrib[ages_distrib < 5].index.tolist()
mask = raw_data['age_at_initial_pathologic_diagnosis'].isin(ages_to_del)
indexes = raw_data[mask].index.to_list()
raw_data.drop(indexes, inplace=True)
print(raw_data.shape)

(781, 19)
(760, 19)


Deleting rows that are not dead people or patients with high last_days_to_followup (25-th percentile of dead patients days_to_death)

In [9]:
print(raw_data.shape)

dead = raw_data[raw_data['Death'] == 1]
print(f"\nNumber of dead patients: {dead.shape[0]}")
describe = dead.describe()['days_to_death']
print(describe)
alive = raw_data[(raw_data['days_to_last_followup']>describe['25%']) & (raw_data['Death']==0)]
print(f"\nNumber of alive patients with days_to_last_followup > {describe['25%']}: {alive.shape[0]}")

raw_data = pd.concat([dead, alive], axis=0)
print(raw_data.shape)

(760, 19)

Number of dead patients: 70
count      70.000000
mean     1505.542857
std      1071.375950
min         1.000000
25%       639.750000
50%      1158.000000
75%      2370.000000
max      4456.000000
Name: days_to_death, dtype: float64

Number of alive patients with days_to_last_followup > 639.75: 254
(324, 19)


### Unstranded case

Separate genes reads column (unstranded) into separate columns, one for each gene

In [None]:
if os.path.exists(os.path.join(DATA_PATH, 'genes_reads(unstranded).csv')):
    genes_reads = pd.read_csv(os.path.join(DATA_PATH, 'genes_reads(unstranded).csv'), index_col=0)
    print("Loaded from existing")

else:
    print("Creating new file")
    n_cols = len(raw_data["unstranded"][0].strip("[]").split(","))
    reads = raw_data["unstranded"].apply(parse_array)

    genes_reads = pd.DataFrame(
        np.stack(reads.values),  # Converte la Series di vettori in un array 2D
        index=[f'Sample_{i}' for i in raw_data.index],
        columns=np.array(eval(raw_data['gene_name'][0].strip('[]'), {'np':str}))
    )

    genes_reads.columns = 'gene.' + np.array(genes_reads.columns)

    genes_reads.to_csv(os.path.join(DATA_PATH, 'genes_reads(unstranded).csv'))

print(genes_reads.head())
print(genes_reads.shape)

# estimated 40 sec

Creating new file
           gene.TSPAN6  gene.TNMD  gene.DPM1  gene.SCYL3  gene.C1orf112  \
Sample_5          2581          5       2766        2002            799   
Sample_28         3646         12       2340        1355            614   
Sample_40         1357         28       3998        3533            718   
Sample_78         1680         22       1497        1364            276   
Sample_80          667          5       3136        2160           1833   

           gene.FGR  gene.CFH  gene.FUCA2  gene.GCLC  gene.NFYA  ...  \
Sample_5        384      2718        3073       2459       3521  ...   
Sample_28       250      1485        4709       1768       5619  ...   
Sample_40       773      4569        1786       5385       6787  ...   
Sample_78      2243      4825        3120       1903       1138  ...   
Sample_80       421      2088        3069       4202       4368  ...   

           gene.ACTL10  gene.AC119733.1  gene.AC020765.6  gene.AC010980.1  \
Sample_5             

Normalize with Log2

In [12]:
reads_logged = genes_reads.applymap(lambda x: np.log2(float(x) + 1e-3))

  reads_logged = genes_reads.applymap(lambda x: np.log2(float(x) + 1e-3))


Delete reads columns with variance under the 50th percentile of the value

In [13]:
print(genes_reads.shape)

zero_var_cols = reads_logged.var()[reads_logged.var()<reads_logged.var().describe().loc['50%']].index
print(f"Columns to drop:{len(zero_var_cols)}")
reads_logged.drop(columns=[i for i in reads_logged.columns if i in zero_var_cols], inplace=True)
print(reads_logged.shape)

(324, 19962)
Columns to drop:9981
(324, 9973)


### File creation

In [14]:
if os.path.exists(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_log.csv')):
    dataset = pd.read_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_log.csv'))
    print("Loading from existing")

else:
    print("Creating new dataset file")
    dataset = raw_data.copy()
    # dropping metadata and unused columns
    dataset.drop(columns=['unstranded', 'tpm_unstranded', 'fpkm_unstranded', 'gene_name', 'gene_id', 'gene_type', 'file_name_mRNA', 'case_id', 'file_name_clinical'], inplace=True)

    reads_logged.index = dataset.index
    dataset = pd.concat([dataset, reads_logged], axis=1)
    dataset.to_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_log.csv'), index=False)

print(dataset.head())
print(dataset.shape)

Creating new dataset file
    days_to_death  age_at_initial_pathologic_diagnosis  days_to_last_followup  \
5          2763.0                                   46                 2763.0   
28         4456.0                                   50                 4456.0   
40         2520.0                                   55                 2520.0   
78          538.0                                   79                  538.0   
80         2551.0                                   47                 2551.0   

    Death  pathologic_stage_Stage I  pathologic_stage_Stage IA  \
5       1                       1.0                        0.0   
28      1                       0.0                        0.0   
40      1                       1.0                        0.0   
78      1                       0.0                        0.0   
80      1                       0.0                        0.0   

    pathologic_stage_Stage IIA  pathologic_stage_Stage IIB  \
5                          0

### TPM unstranded case

In [None]:
if os.path.exists(os.path.join(DATA_PATH, 'genes_reads(tpm_unstranded).csv')):
    genes_reads_tpm = pd.read_csv(os.path.join(DATA_PATH, 'genes_reads(tpm_unstranded).csv'), index_col=0)
    print("Loaded from existing")

else:
    print("Creating new file")
    n_cols = len(raw_data["tpm_unstranded"][0].strip("[]").split(","))
    reads = raw_data["tpm_unstranded"].apply(parse_array)

    genes_reads_tpm = pd.DataFrame(
        np.stack(reads.values),  # Converte la Series di vettori in un array 2D
        index=[f'Sample_{i}' for i in raw_data.index],
        columns=np.array(eval(raw_data['gene_name'][0].strip('[]'), {'np':str}))
    )

    genes_reads_tpm.columns = 'gene.' + np.array(genes_reads_tpm.columns)

    genes_reads_tpm.to_csv(os.path.join(DATA_PATH, 'genes_reads(tpm_unstranded).csv'))

print(genes_reads_tpm.head())
print(genes_reads_tpm.shape)

# estimated 20 sec

Creating new file
           gene.TSPAN6  gene.TNMD  gene.DPM1  gene.SCYL3  gene.C1orf112  \
Sample_5       31.8307     0.1895   128.1966     16.2711         7.4869   
Sample_28      37.2234     0.3765    89.7803      9.1166         4.7628   
Sample_40      12.2208     0.7749   135.3093     20.9680         4.9129   
Sample_78      32.9920     1.3277   110.4809     17.6526         4.1182   
Sample_80      10.1116     0.2329   178.6637     21.5796        21.1133   

           gene.FGR  gene.CFH  gene.FUCA2  gene.GCLC  gene.NFYA  ...  \
Sample_5     6.3517   19.0536     60.9168    15.9619    51.6843  ...   
Sample_28    3.4233    8.6178     77.2759     9.5005    68.2799  ...   
Sample_40    9.3368   23.3889     25.8533    25.5253    72.7497  ...   
Sample_78   59.0784   53.8601     98.4851    19.6700    26.5997  ...   
Sample_80    8.5600   17.9927     74.7838    33.5287    78.8154  ...   

           gene.ACTL10  gene.AC119733.1  gene.AC020765.6  gene.AC010980.1  \
Sample_5           0.

Apply log2 normalization

In [16]:
reads_logged = genes_reads_tpm.applymap(lambda x: np.log2(float(x) + 1e-3))

  reads_logged = genes_reads_tpm.applymap(lambda x: np.log2(float(x) + 1e-3))


In [17]:
print(genes_reads_tpm.shape)

zero_var_cols = reads_logged.var()[reads_logged.var() <= reads_logged.var().describe().loc['50%']].index
print(f"Columns to drop: {len(zero_var_cols)}")
reads_logged.drop(columns=[i for i in genes_reads_tpm.columns if i in zero_var_cols], inplace=True)
print(reads_logged.shape)

(324, 19962)
Columns to drop: 9981
(324, 9972)


In [18]:
if os.path.exists(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_tpm_log.csv')):
    dataset = pd.read_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_tpm_log.csv'))
    print("Loading from existing")

else:
    print("Creating new dataset file")
    dataset = raw_data.copy()
    # dropping metadata and unused columns
    dataset.drop(columns=['unstranded', 'tpm_unstranded', 'fpkm_unstranded', 'gene_name', 'gene_id', 'gene_type', 'file_name_mRNA', 'case_id', 'file_name_clinical'], inplace=True)

    reads_logged.index = dataset.index
    dataset = pd.concat([dataset, reads_logged], axis=1)
    dataset.to_csv(os.path.join(DATA_PATH, 'clinical_mRNA_normalized_tpm_log.csv'), index=False)

print(dataset.head())
print(dataset.shape)

Creating new dataset file
    days_to_death  age_at_initial_pathologic_diagnosis  days_to_last_followup  \
5          2763.0                                   46                 2763.0   
28         4456.0                                   50                 4456.0   
40         2520.0                                   55                 2520.0   
78          538.0                                   79                  538.0   
80         2551.0                                   47                 2551.0   

    Death  pathologic_stage_Stage I  pathologic_stage_Stage IA  \
5       1                       1.0                        0.0   
28      1                       0.0                        0.0   
40      1                       1.0                        0.0   
78      1                       0.0                        0.0   
80      1                       0.0                        0.0   

    pathologic_stage_Stage IIA  pathologic_stage_Stage IIB  \
5                          0