## Prep for LUAD, LUSC, and Combined LUNG

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler

## LUAD

In [None]:
# Load the LUAD htseq_fpkm TSV and turn it so that every row is one sample.
# (Alternative input kept for reference:
#  r"Z:\HiWi\Popp\TCGA_Breast_2022\TCGA-BRCA.htseq_fpkm.tsv")
url = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\RNAseq\TCGA-LUAD.htseq_fpkm.tsv'
df_LUAD = (
    pd.read_csv(url, sep='\t', index_col=0)
    .T                               # header entries of the file become the row labels
    .rename_axis(index='Sample_ID')  # name the row index
    .dropna(how='all', axis=1)       # after transposing, axis=1 drops columns that are entirely NaN
)
# Prints True when no Sample_ID occurs more than once
print(df_LUAD.index.is_unique)
# Default working frame for the cells below; this is the same object as df_LUAD, not a copy
df = df_LUAD
df_LUAD

## LUSC

In [None]:
# Load the LUSC htseq_fpkm TSV and turn it so that every row is one sample.
url = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUSC\RNAseq\TCGA-LUSC.htseq_fpkm.tsv'
df_LUSC = (
    pd.read_csv(url, sep='\t', index_col=0)
    .T                               # header entries of the file become the row labels
    .rename_axis(index='Sample_ID')  # name the row index
    .dropna(how='all', axis=1)       # after transposing, axis=1 drops columns that are entirely NaN
)
# Prints True when no Sample_ID occurs more than once
print(df_LUSC.index.is_unique)
df_LUSC

In [None]:
#combine datasets
# Stack the LUAD and LUSC samples into one frame.
# A mismatch between the two gene sets must not be skipped silently: `df`
# would otherwise keep whatever was assigned earlier (the LUAD-only frame)
# and every following cell would run on the wrong data without any warning.
gene_diff = df_LUAD.columns.symmetric_difference(df_LUSC.columns)
if len(gene_diff) > 0:
    raise ValueError(
        f"LUAD and LUSC differ in {len(gene_diff)} gene column(s), "
        f"e.g. {list(gene_diff[:5])}; refusing to combine."
    )
# Same genes in a different order are fine: bring LUSC into the LUAD column
# order so the combined frame keeps that order.
df = pd.concat([df_LUAD, df_LUSC[df_LUAD.columns]])
df

In [None]:
#drop every gene column whose values add up to exactly 0
column_totals = df.sum(axis=0)
df = df.loc[:, column_totals.ne(0)]
#duplicate samples are only detected here (the assert fails), not removed
assert df.index.is_unique, "Check Dataframe"
df

In [None]:
#remove genes expressed in <5% of population
# A gene is kept when MORE than 5% of the samples have a value > 0 for it.
# The samples are counted; summing the expression values instead would mix
# expression level with prevalence (a few very high values could rescue a
# rare gene, many small values could discard a common one).
# NOTE from the original analysis: this step "kicked nearly half" of the
# genes with a sum-based criterion -- re-check the number that remains.
threshold = df.shape[0] * 0.05 #5%
n_expressed = (df > 0).sum(axis=0)      # per gene: number of samples with value > 0
expressed_enough = n_expressed > threshold
mask = expressed_enough.index[expressed_enough].tolist()  # names of the kept genes
df = df.loc[:, expressed_enough.to_numpy()]
df

In [None]:
# kick low median deviation
from scipy.stats import median_abs_deviation

# How many of the most variable genes the filter cell below should keep.
N_TOP_GENES = 10000

#median absolute deviation of every column, computed in one vectorised call
#(axis=0 -> one value per column, in column order)
devs = median_abs_deviation(df.to_numpy(), axis=0).tolist()
# The deviations are stored as an extra LAST ROW of df (row label = len(df)).
# The filter cell reads them back with df.iloc[-1] and drops that row again,
# so run this cell exactly once before the filter cell: running it twice
# would add a second helper row and include the first one in the statistics.
df.loc[len(df)] = devs

#sort descending and take the cut-off below the N_TOP_GENES largest values
devs.sort(reverse = True)
# The filter keeps values strictly greater than the cut-off. When no more than
# N_TOP_GENES genes are left there is nothing to cut, so use -inf (keeps every
# gene) instead of failing with an IndexError.
threshold_devs = devs[N_TOP_GENES] if len(devs) > N_TOP_GENES else float("-inf")

In [None]:
# Histogram of the per-gene median absolute deviations collected in `devs`
deviation_values = pd.Series(devs)
deviation_values.hist()

In [None]:
#filter for threshold
import numpy as np
# The last row of df is the helper row with the per-gene deviations.
deviation_row = df.iloc[-1]
mask = deviation_row > threshold_devs            # True for genes above the cut-off
keep = np.flatnonzero(mask.to_numpy())           # positions of those genes
# Keep only the selected columns and leave out the helper row again.
df = df.iloc[:-1, keep.tolist()]

df

In [None]:
#scale 0-1
# fit_transform returns a plain array, so the row and column labels are
# attached again afterwards.
scaled_values = MinMaxScaler().fit_transform(df)
df = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
df

In [None]:
#turn the row index (named Sample_ID by the loading cells) back into a regular column
df = df.reset_index()

In [None]:
# Alternative output targets (uncomment the one that matches the data in `df`):
#df.to_csv(r'Z:\HiWi\Popp\TCGA_Breast_2022\TCGA_BRCA_RNA_seq.csv')
#df.to_csv(r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\LUNG_RNA_seq.csv')
#df.to_csv(r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\LUNG_RNA_seq_all.csv')
#df.to_csv(r"Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\RNAseq\LUAD_RNA_seq_36000_unscaled.csv")
# Raw string: the Windows path contains backslash sequences such as "\H" and
# "\P", which are invalid escape sequences in a normal string literal. The
# resulting path is unchanged.
# NOTE(review): the file name says LUAD -- if the combine cell was executed,
# `df` holds LUAD + LUSC and one of the LUNG targets above should be used.
df.to_csv(r"Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\RNAseq\LUAD_RNA_seq.csv")