In [None]:
import pandas as pd

## LUAD

In [None]:
# Load the TCGA-LUAD mutect2 SNV table, using its first column as the index.
# A forward slash keeps the relative path valid on Windows, Linux and macOS
# (the backslash form only resolves on Windows).
url = 'data/TCGA-LUAD.mutect2_snv.tsv'
df_LUAD = pd.read_csv(url, sep='\t', index_col=0)
df_LUAD = df_LUAD[['gene', 'effect']]  # keep only gene and effect; chr and vaf are dropped
df = df_LUAD  # LUAD-only working frame; the concat cell rebinds df to LUAD + LUSC
df_LUAD

## LUSC

In [None]:
# Load the TCGA-LUSC mutect2 SNV table, using its first column as the index.
# A forward slash keeps the relative path valid on Windows, Linux and macOS
# (the backslash form only resolves on Windows).
url = 'data/TCGA-LUSC.mutect2_snv.tsv'
df_LUSC = pd.read_csv(url, sep='\t', index_col=0)
df_LUSC = df_LUSC[['gene', 'effect']]  # keep only gene and effect; chr and vaf are dropped
df_LUSC

In [None]:
# Stack the LUAD and LUSC mutation tables row-wise into one working frame.
df = pd.concat(objs=[df_LUAD, df_LUSC], axis=0)
df

In [None]:
# Keep only variants whose effect annotation contains one of the listed terms
# (regex alternation, substring match).
include_effect = ['stop_gained', 'stop_lost', 'missense_variant', 'frameshift_variant']
# na=False: a missing effect counts as "no match"; without it the mask would
# contain NaN and boolean indexing would raise a ValueError.
df = df[df.effect.str.contains('|'.join(include_effect), na=False)]
df.shape

In [None]:
# Occurrence matrix: one row per index label of df, one column per gene,
# each cell holding the number of matching rows.
# df_TMB keeps a handle on these unclipped counts for the TMB calculation.
df = df_TMB = pd.crosstab(index=df.index, columns=df.gene)
df

In [None]:
# Collapse repeated mutations in the same gene to a single hit: cap every count at 1.
df = df.clip(lower=None, upper=1)
# Per-gene maximum, shown to confirm that no value exceeds 1.
df.max()

In [None]:
# Keep only gene columns whose column total exceeds 1, i.e. drop genes with a
# total of 0 or 1 (with values capped at 1, that means fewer than two samples).
df = df.loc[:, df.sum(axis=0).gt(1)]

In [None]:
# Sparsity check: fraction of cells in the matrix that are exactly zero.
df.eq(0).to_numpy().mean()

In [None]:
# Move the sample labels out of the index into a regular 'Sample_ID' column.
# Naming the index explicitly removes the dependency on crosstab's fallback
# label ('row_0') or on whatever name the index carried over from the input.
df = (
    df.rename_axis(index='Sample_ID')
      .reset_index()
      .dropna(how='all', axis=1)  # discard columns that are entirely NaN
)

# Sanity check only: prints whether every Sample_ID occurs exactly once.
# Nothing is removed here.
print(df.Sample_ID.is_unique)
df

In [None]:
# Write the binary sample-by-gene matrix to disk.
# A forward slash replaces "\T", which is an invalid escape sequence in a
# non-raw string and a Windows-only path separator.
# NOTE(review): the file name says LUAD, but df is built from LUAD + LUSC — confirm the intended name.
df.to_csv("data/TCGA_LUAD_mutation2.csv")

## Export tumor mutational burden (TMB) per sample

In [None]:
# df_TMB holds the counts saved before clipping to 1, so every mutation in a gene is counted.
# An existing 'TMB' column is excluded from the row sum, so re-running this
# cell does not add the previous TMB value to the mutation count.
# NOTE(review): the divisor 30 is presumably the assumed exome size in Mb — confirm.
df_TMB['TMB'] = df_TMB.drop(columns='TMB', errors='ignore').sum(axis=1) / 30
df_TMB['TMB'].plot.hist()

In [None]:
# Label each sample 'Yes' when its TMB is at or above 8.7, otherwise 'No'.
# NOTE(review): the 8.7 cut-off is unexplained, and the Hypermut flag is dropped
# again by the column selection below, so it never reaches the export — confirm intent.
hypermut = df_TMB['TMB'].map(lambda tmb: 'Yes' if tmb >= 8.7 else 'No').tolist()

# Build the two-column export table: the sample label (taken from the index)
# and its TMB, renumbered with a fresh 0..n-1 index.
df_TMB = (
    df_TMB
    .assign(Hypermut=hypermut, Sample_ID=df_TMB.index.values)
    .loc[:, ['Sample_ID', 'TMB']]
    .reset_index(drop=True)
)
df_TMB

In [None]:
# Write the per-sample TMB table to disk.
# A forward slash replaces "\T", which is an invalid escape sequence in a
# non-raw string and a Windows-only path separator.
# NOTE(review): the file name says BRCA, but the inputs are TCGA-LUAD/LUSC — confirm the intended name.
df_TMB.to_csv('data/TCGA_BRCA_TMB.csv')