In [2]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the clinical table and flag rows whose survival information is unusable.
clinical_df = pd.read_csv(
    "/Users/oo/Desktop/multi_omics_data/clinical.cart.2023-11-19/clinical.tsv",
    sep='\t',
)

# The comparisons below test for the literal string '-- ; presumably this is the
# export's placeholder for a missing value (TODO confirm against the data source).
missing_marker = '\'--'
follow_up_missing = clinical_df['days_to_last_follow_up'] == missing_marker
death_missing = clinical_df['days_to_death'] == missing_marker

# Alive but no follow-up time / dead but no time of death.
alive_na = (clinical_df['vital_status'] == 'Alive') & follow_up_missing
dead_na = (clinical_df['vital_status'] == 'Dead') & death_missing
# Neither time field is populated.
both_na = follow_up_missing & death_missing
# Vital status itself is the placeholder.
vital_na = clinical_df['vital_status'] == missing_marker

# A row is excluded if any of the four conditions holds.
na_mask = alive_na | dead_na | both_na | vital_na
survival_cols = ['vital_status', 'days_to_last_follow_up', 'days_to_death']

def dur(df):
    """Pick the survival duration field that matches a record's vital status.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must provide 'vital_status'; 'days_to_last_follow_up' is read for
        'Alive' records and 'days_to_death' for 'Dead' records.

    Returns
    -------
    The selected field's value, unchanged, or None when the status is
    neither 'Alive' nor 'Dead'.
    """
    status = df['vital_status']
    if status == 'Alive':
        return df['days_to_last_follow_up']
    if status == 'Dead':
        return df['days_to_death']
    return None

# Discard the rows flagged by na_mask, then derive one duration value per row.
clinical_df = clinical_df.loc[~na_mask].reset_index(drop=True)
clinical_df['duration'] = clinical_df.apply(dur, axis=1)

# Keep only the rows recorded with this exact treatment type.
is_pharma = clinical_df['treatment_type'] == 'Pharmaceutical Therapy, NOS'
clinical_df_d = clinical_df.loc[is_pharma].reset_index(drop=True)

# Encode vital status as an event indicator: 0 = Alive, 1 = Dead.
dic_status = {'Alive': 0, 'Dead': 1}
clinical_df_d.loc[:, 'vital_status'] = clinical_df_d['vital_status'].map(dic_status)

# Minimal clinical table that the omics loaders attach to each sample.
final_clin = clinical_df_d[['case_id', 'case_submitter_id', 'vital_status', 'duration']]
final_clin


Unnamed: 0,case_id,case_submitter_id,vital_status,duration
0,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,0,1246
1,009be09b-f9f6-43b7-8f45-4a648f8123ce,TCGA-67-3773,0,427
2,01e9888d-b5b9-48f1-8ba6-8a89af108a04,TCGA-NJ-A7XG,0,617
3,0232d299-4cdf-4fd7-9a5e-8d13c208b40c,TCGA-91-6848,0,224
4,028e99e9-5b9a-4954-bb6e-6d4709a3cea8,TCGA-55-6986,0,3261
...,...,...,...,...
469,fe3eeeb6-0db4-46d7-a020-55604d474c12,TCGA-78-7167,1,2681
470,fe714ac0-f874-426a-924b-a2980232b5f7,TCGA-55-8616,0,48
471,ff07ea4b-4e50-410d-99d6-96a351dad7b1,TCGA-55-7570,0,824
472,ff9def3d-17e5-4ef6-b74e-933f11ed6f00,TCGA-78-7146,1,173


In [3]:
def mrna(file_name, folder, final_clin):
    """Load one expression file as a single wide row with clinical columns in front.

    Parameters
    ----------
    file_name : str
        Tab-separated file. Physical rows 0 and 2-5 are skipped, so row 1
        supplies the header; 'gene_id' and 'fpkm_unstranded' must be present.
    folder : str
        Value matched against ``final_clin['case_submitter_id']``.
    final_clin : pandas.DataFrame
        Clinical table containing a 'case_submitter_id' column.

    Returns
    -------
    pandas.DataFrame
        The matching clinical row(s) followed by one column per gene_id
        holding that gene's 'fpkm_unstranded' value.
    """
    expression = pd.read_csv(file_name, sep='\t', skiprows=[0, 2, 3, 4, 5])

    # One row, one column per gene; drop the leftover axis/index labels.
    per_gene = expression[['gene_id', 'fpkm_unstranded']].set_index('gene_id')
    wide_row = per_gene.T.reset_index(drop=True).rename_axis(None, axis=1)

    is_case = final_clin['case_submitter_id'] == folder
    clinical_row = final_clin[is_case].reset_index(drop=True)

    return pd.concat([clinical_row, wide_row], axis=1)

def mirna(file_name, folder, final_clin):
    """Load one miRNA quantification file as a single wide row with clinical columns in front.

    Parameters
    ----------
    file_name : str
        Tab-separated file with 'miRNA_ID' and
        'reads_per_million_miRNA_mapped' columns.
    folder : str
        Value matched against ``final_clin['case_submitter_id']``.
    final_clin : pandas.DataFrame
        Clinical table containing a 'case_submitter_id' column.

    Returns
    -------
    pandas.DataFrame
        The matching clinical row(s) followed by one column per miRNA_ID
        holding its 'reads_per_million_miRNA_mapped' value.
    """
    quantification = pd.read_csv(file_name, sep='\t')

    # One row, one column per miRNA; drop the leftover axis/index labels.
    per_mirna = quantification[['miRNA_ID', 'reads_per_million_miRNA_mapped']].set_index('miRNA_ID')
    wide_row = per_mirna.T.reset_index(drop=True).rename_axis(None, axis=1)

    is_case = final_clin['case_submitter_id'] == folder
    clinical_row = final_clin[is_case].reset_index(drop=True)

    return pd.concat([clinical_row, wide_row], axis=1)

def dnam(file_name):
    """Load a headerless two-column file as a single wide row.

    Parameters
    ----------
    file_name : str
        Tab-separated file without a header line; the first column is used
        as the feature label and the second as its value.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with one column per feature label.
    """
    long_table = pd.read_csv(file_name, sep='\t', header=None)
    long_table.columns = ['features', 'vals']

    # Transpose so features become columns, then drop the leftover labels.
    wide_row = long_table.set_index('features').T
    return wide_row.reset_index(drop=True).rename_axis(None, axis=1)

def cnv(file_name, folder, final_clin):
    """Load one copy-number file as a single wide row with clinical columns in front.

    Parameters
    ----------
    file_name : str
        Tab-separated file with 'gene_id' and 'copy_number' columns.
    folder : str
        Value matched against ``final_clin['case_submitter_id']``.
    final_clin : pandas.DataFrame
        Clinical table containing a 'case_submitter_id' column.

    Returns
    -------
    pandas.DataFrame
        The matching clinical row(s) followed by one column per gene_id
        holding that gene's 'copy_number' value.
    """
    copy_numbers = pd.read_csv(file_name, sep='\t')

    # One row, one column per gene; drop the leftover axis/index labels.
    per_gene = copy_numbers[['gene_id', 'copy_number']].set_index('gene_id')
    wide_row = per_gene.T.reset_index(drop=True).rename_axis(None, axis=1)

    is_case = final_clin['case_submitter_id'] == folder
    clinical_row = final_clin[is_case].reset_index(drop=True)

    return pd.concat([clinical_row, wide_row], axis=1)

In [84]:
# Walk every case folder and collect its mRNA, miRNA, CNV and methylation data.
#
# Outputs of this cell:
#   mrna_lst / mirna_lst / cnv_lst : one single-row DataFrame per file read
#   all_case_ids_temp              : case IDs in the order mRNA files were read
#   dnam_case_ids                  : case IDs in the order methylation files were read
#   DNAm_df                        : one row per methylation file, columns = probes
#                                    of the reference file loaded into `temp`
mrna_lst, mirna_lst, cnv_lst = [], [], []

# Reference methylation file; its probe IDs define DNAm_df's columns.
temp = dnam("/Users/oo/Desktop/omics by case/TCGA-S2-AA1A/72a78e11-422c-4e99-8ee4-8614e6a22a43.methylation_array.sesame.level3betas.txt")

omics_file_path = "/Users/oo/Desktop/omics by case"
all_case_ids = os.listdir(omics_file_path)
all_case_ids_temp = []
dnam_case_ids = []

# Methylation rows are accumulated here and assembled into a frame once, after
# the loop, so the row count always equals the number of files actually read
# (no fixed-size pre-allocation that can overflow or leave empty rows behind).
dnam_rows = []

# Build the lookup once instead of scanning the column on every iteration.
known_case_ids = set(final_clin['case_submitter_id'].values)

for case_id in all_case_ids:
    if (case_id == ".DS_Store") or (case_id not in known_case_ids):
        continue
    case_id_path = os.path.join(omics_file_path, case_id)
    for file in os.listdir(case_id_path):
        file_path = os.path.join(case_id_path, file)
        if file.endswith('augmented_star_gene_counts.tsv'):
            mrna_df = mrna(file_path, case_id, final_clin)
            mrna_lst.append(mrna_df)
            all_case_ids_temp.append(case_id)

        elif file.endswith("mirnas.quantification.txt"):
            mirna_df = mirna(file_path, case_id, final_clin)
            mirna_lst.append(mirna_df)

        elif file.endswith("gene_level_copy_number.v36.tsv"):
            cnv_df = cnv(file_path, case_id, final_clin)
            cnv_lst.append(cnv_df)

        elif file.endswith("level3betas.txt"):
            dnam_df = dnam(file_path)
            # Align to the reference probes: probes absent from this file become
            # NaN, probes absent from the reference are discarded.
            dnam_rows.append(dnam_df.iloc[0].reindex(temp.columns).to_numpy())
            dnam_case_ids.append(case_id)

# Number of methylation rows collected (kept under its historical name).
current_row = len(dnam_rows)
if dnam_rows:
    DNAm_df = pd.DataFrame(np.vstack(dnam_rows), columns=temp.columns)
else:
    DNAm_df = pd.DataFrame(columns=temp.columns)

# DNAm_df carries no case identifier of its own: the only link between its
# rows and the cases is position, with all_case_ids_temp (filled from the mRNA
# files) as the reference order. Fail loudly if the two orders disagree rather
# than letting rows be paired with the wrong case.
if dnam_case_ids != all_case_ids_temp:
    raise ValueError(
        "Methylation files and mRNA files were not read for the same cases in the same order "
        f"({len(dnam_case_ids)} methylation files vs {len(all_case_ids_temp)} mRNA files); "
        "DNAm_df rows cannot be matched to all_case_ids_temp by position."
    )


            

In [110]:
# Probe annotation: keep 'Island' probes with complete annotation whose gene
# group string mentions TSS1500 or TSS200.
anno = pd.read_csv(
    "/Users/oo/Desktop/multi_omics_data/anno.csv",
    usecols=['Name', 'Relation_to_Island', 'UCSC_RefGene_Name', 'UCSC_RefGene_Group'],
)

on_island = anno['Relation_to_Island'] == 'Island'
# Rows with a missing value in any of the four columns are discarded.
new_anno = anno[on_island].dropna(axis=0, how='any').reset_index(drop=True)

# Substring (regex) match on the ';'-joined group string; exact per-gene
# filtering of the individual entries happens later in `combine`.
mentions_tss = new_anno['UCSC_RefGene_Group'].str.contains("TSS1500|TSS200", na=False)
tss_region = (
    new_anno[mentions_tss]
    .reset_index(drop=True)
    .rename(columns={"Name": "ProbeID"})
    .loc[:, ['ProbeID', 'UCSC_RefGene_Name', 'UCSC_RefGene_Group']]
)

def combine(r):
    """List the genes for which a probe is annotated as TSS1500 or TSS200.

    'UCSC_RefGene_Name' and 'UCSC_RefGene_Group' are ';'-separated strings that
    are read as parallel lists: entry i of the group string describes entry i
    of the name string.

    Parameters
    ----------
    r : mapping-like (e.g. one DataFrame row)
        Must provide string values for 'UCSC_RefGene_Name' and
        'UCSC_RefGene_Group'.

    Returns
    -------
    list of str
        Each qualifying gene name once, in order of first appearance (the
        order is deterministic across runs). If the two strings hold a
        different number of entries, only the paired entries are considered
        instead of raising IndexError.
    """
    gene_names = r['UCSC_RefGene_Name'].split(';')
    gene_groups = r['UCSC_RefGene_Group'].split(';')
    promoter_genes = (
        name
        for name, group in zip(gene_names, gene_groups)
        if group in ('TSS1500', 'TSS200')
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(promoter_genes))
def beta2m(beta):
    """Return log2(beta / (1 - beta)) for `beta`."""
    ratio = beta / (1 - beta)
    return np.log2(ratio)
# --- beta -> M-value ---------------------------------------------------------
# Discard rows that never received a sample instead of slicing to a hard-coded
# row count: a single all-NaN row would otherwise make the per-probe dropna
# below remove every column. Index labels are kept, so the remaining rows
# still line up by label with any table built in the same sample order.
DNAm_df_temp = DNAm_df.dropna(axis=0, how='all')
# Keep only probes measured in every remaining sample.
DNAm_df_dropna = DNAm_df_temp.dropna(axis=1, how='any')
# Apply beta2m to the whole frame at once rather than cell by cell
# (DataFrame.applymap is deprecated in recent pandas). The float cast lets the
# numpy ufunc operate even if DNAm_df was stored with object dtype.
DNAm_beta2m = beta2m(DNAm_df_dropna.astype(float))

# --- probe -> gene mapping ---------------------------------------------------
# One row per gene with the list of probes annotated to it by `combine`.
tss_region['combined'] = tss_region.apply(combine, axis=1)
filtered_tss_region = pd.DataFrame(
    tss_region.explode("combined", ignore_index=True)
    .groupby("combined")['ProbeID']
    .apply(list)
).reset_index(names="gene")
unique_genes = filtered_tss_region['gene'].unique()

# --- per-gene mean M-value ---------------------------------------------------
# Every probe in the mapping comes from tss_region, so restrict the matrix to
# those probes once; each per-gene lookup then scans far fewer columns.
tss_mvalues = DNAm_beta2m.loc[:, DNAm_beta2m.columns.isin(tss_region['ProbeID'])]
tss_probe_ids = tss_mvalues.columns

# Collect the per-gene means and assemble the frame in one step instead of
# inserting columns one at a time; genes are visited in the mapping's order.
gene_means = {}
for gene, probe_ids in zip(filtered_tss_region['gene'], filtered_tss_region['ProbeID']):
    measured = tss_probe_ids.isin(probe_ids)
    if measured.any():
        gene_means[gene] = tss_mvalues.loc[:, measured].mean(axis=1)

final_DNAm = pd.DataFrame(gene_means, index=DNAm_beta2m.index).rename_axis("gene", axis=1)
# Drop genes whose mean is undefined for any sample.
final_DNAm = final_DNAm.dropna(axis=1, how='any')

gene,A2BP1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMP,AANAT,AARS,...,ZSWIM5,ZSWIM6,ZSWIM7,ZW10,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,-5.038960,-4.942189,-4.353939,-5.651118,-4.817481,-5.127069,-4.577077,-5.059847,-5.046363,-5.706856,...,-5.797847,-4.759213,-3.936748,-4.961210,-4.466751,-4.996403,-5.518416,-4.415836,-5.260178,-4.887218
1,-2.426691,-4.261985,-4.565753,-5.224075,-3.083779,-4.957586,-4.638329,-4.871841,-5.071070,-5.500130,...,-5.824192,-4.625360,-3.963478,-5.097041,-4.556276,-4.514577,-5.130649,-4.275775,-5.017652,-4.805685
2,-3.787288,-4.054782,-4.501896,-5.459971,-3.129082,-5.022833,-4.535088,-5.166228,-5.182328,-5.870040,...,-6.072285,-4.659499,-4.508261,-5.159637,-4.417827,-3.973910,-5.605817,-4.716152,-5.106786,-4.856202
3,-4.230256,-4.309209,-4.320956,-5.133924,-3.647934,-4.890492,-4.413514,-4.840340,-4.850402,-5.574877,...,-6.040668,-4.481837,-4.470332,-4.970249,-4.629679,-4.117426,-4.696925,-4.204111,-4.913747,-4.437389
4,-5.194576,-4.713163,-4.605171,-5.545901,-4.581840,-5.010922,-4.387020,-4.922054,-5.118992,-5.729604,...,-6.239138,-4.574150,-3.988873,-5.139680,-4.640154,-4.507520,-5.463356,-4.217480,-5.018856,-4.605073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,-1.889993,-5.334190,-5.394890,-5.649494,-5.095115,-5.310122,-4.566384,-5.115516,-5.359388,-5.408680,...,-5.769930,-4.708249,-4.432249,-5.167960,-4.537431,-4.417101,-5.369608,-4.833699,-5.231428,-4.693636
403,-1.942931,-4.536356,-4.756496,-5.385545,-2.424405,-5.109704,-4.625983,-5.073699,-5.238147,-5.263141,...,-5.363035,-4.725285,-4.252756,-5.148390,-4.398996,-4.618878,-5.130324,-4.438440,-5.093543,-4.377848
404,-0.550556,-4.514863,-4.765004,-5.285298,-4.325729,-5.200570,-4.499500,-4.964436,-4.944652,-5.506226,...,-5.984583,-4.621260,-4.264886,-5.002296,-4.606521,-4.517767,-5.057816,-4.242503,-4.954356,-4.635488
405,-1.691567,-4.086494,-4.205698,-5.086347,-2.187440,-5.134623,-4.422246,-5.302710,-5.275923,-5.210305,...,-5.752760,-4.807014,-4.857565,-5.020948,-4.834889,-4.261646,-5.145560,-4.366688,-5.101411,-4.786351


In [159]:
# Put the clinical rows into the order recorded in all_case_ids_temp, place them
# next to the gene-level methylation table, and write the result to disk.
has_omics = final_clin['case_submitter_id'].isin(all_case_ids_temp)
clinical_temp = final_clin[has_omics].copy(deep=True).set_index('case_submitter_id')

# reindex arranges the rows in exactly the order of all_case_ids_temp;
# reset_index turns case_submitter_id back into an ordinary column.
clinical_reorder = clinical_temp.reindex(all_case_ids_temp).reset_index()

# Column-wise concat pairs rows that share the same index label.
concat_DNAm = pd.concat([clinical_reorder, final_DNAm], axis=1)
concat_DNAm.to_csv("/Users/oo/Desktop/final omics data/DNAm.csv")
clinical_reorder

Unnamed: 0,case_submitter_id,case_id,vital_status,duration
0,TCGA-50-5946,c95957a7-1a1a-4c8d-bb61-7c99b500f224,0,1617
1,TCGA-50-8457,d45aee46-838e-44af-8422-46710b3240a8,0,1125
2,TCGA-S2-AA1A,31458638-e19c-43e5-ab13-9c64b3b3681d,0,513
3,TCGA-86-8280,d8faa3a7-6b3f-4e69-8c88-184e41055bd7,0,701
4,TCGA-78-7220,fd5c44ef-ea50-4fba-9e8d-e371cf34ebdb,1,807
...,...,...,...,...
402,TCGA-05-4384,9a50e7e4-831d-489f-87d2-979e987561cc,0,426
403,TCGA-53-7813,42d208bd-cd77-4bfb-ad53-9fc072a87393,0,424
404,TCGA-49-6761,4cd3d483-2283-4c6a-a57a-444216119d34,0,354
405,TCGA-78-8648,5ace9608-d38a-42f2-a877-ec7c9d211808,1,1209


In [146]:
# Stack the per-file single-row frames into one table per omics layer,
# renumbering the rows 0..n-1 as part of the concatenation.
mRNA_df = pd.concat(mrna_lst, axis=0, ignore_index=True)
miRNA_df = pd.concat(mirna_lst, axis=0, ignore_index=True)
CNV_df = pd.concat(cnv_lst, axis=0, ignore_index=True)

In [147]:
def dropna_0s(df, isCNV, threshold=0.2, n_meta_cols=4):
    """Drop feature columns, then rows, that contain too many zeros or NaNs.

    The first `n_meta_cols` columns are treated as metadata and are never
    inspected or dropped; everything to their right is feature data.

    Steps
    -----
    1. Drop every feature column whose zero count, or whose NaN count, exceeds
       `threshold` * number of rows.
    2. Using the remaining features, drop every row whose zero count, or whose
       NaN count, exceeds `threshold` * number of remaining feature columns.
    3. If `isCNV` is truthy, fill the remaining NaNs of each feature column
       with that column's mode (the first one if there are several).

    Zeros and NaNs are counted separately, not added together.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata columns first, feature columns after. The caller's frame is
        left untouched.
    isCNV : bool or int
        Enables step 3.
    threshold : float, default 0.2
        Maximum tolerated fraction of zeros (or of NaNs).
    n_meta_cols : int, default 4
        Number of leading metadata columns.

    Returns
    -------
    pandas.DataFrame
        Metadata and surviving features. Row labels are positions in the
        input (0..n-1), so dropped rows leave gaps in the index.
    """
    # Work on a renumbered copy rather than resetting the caller's index in place.
    df = df.reset_index(drop=True)
    meta_cols = df.iloc[:, :n_meta_cols]
    features = df.iloc[:, n_meta_cols:]

    # --- columns ---
    col_limit = features.shape[0] * threshold
    too_sparse_cols = ((features == 0).sum() > col_limit) | (features.isnull().sum() > col_limit)
    features = features.drop(features.columns[too_sparse_cols], axis=1)

    # --- rows (judged on the surviving columns only) ---
    row_limit = features.shape[1] * threshold
    too_sparse_rows = ((features == 0).sum(axis=1) > row_limit) | (features.isnull().sum(axis=1) > row_limit)
    rows_to_drop = features.index[too_sparse_rows]
    meta_cols = meta_cols.drop(rows_to_drop, axis=0)
    features = features.drop(rows_to_drop, axis=0)

    if isCNV:
        features = features.apply(_fill_with_mode)

    return pd.concat([meta_cols, features], axis=1)


def _fill_with_mode(column):
    """Fill NaNs in `column` with its mode; return it unchanged if it has no NaN or no mode."""
    if not column.isna().any():
        return column
    modes = column.mode()
    if modes.empty:
        # Column holds nothing but NaN: there is no value to fill with.
        return column
    return column.fillna(modes.iloc[0])
# Filter each omics table (mode-filling is enabled for CNV only), then export.
mRNA_drop20na, miRNA_drop20na, CNV_drop20na = (
    dropna_0s(table, is_cnv)
    for table, is_cnv in ((mRNA_df, 0), (miRNA_df, 0), (CNV_df, 1))
)

for filtered_table, csv_path in (
    (mRNA_drop20na, "/Users/oo/Desktop/final omics data/mRNA.csv"),
    (miRNA_drop20na, "/Users/oo/Desktop/final omics data/miRNA.csv"),
    (CNV_drop20na, "/Users/oo/Desktop/final omics data/CNV.csv"),
):
    filtered_table.to_csv(csv_path)
