# Semi-Supervised_HowTo

In [None]:
import pandas as pd
import signatureanalyzer as sa

# User inputs and run settings -- REPLACE these with your own values.
REF = "cosmic3_exome"                   # reference name handed to signatureanalyzer
REF_PATH = "sa_cosmic3_sbs_exome.tsv"   # tab-separated signature reference table
MAF_PATH = "example.maf"                # tab-separated MAF with the mutations to analyze
HG_PATH = "hg38.2bit"                   # genome file passed as `hgfile` when building spectra
SIG_LIST = [                            # reference columns to use as the known signatures
    "SBS1",
    "SBS2",
    "SBS3",
]
N_RUNS = 10                             # value passed as `nruns`
OUT_DIR = "."                           # where nmf_output.h5 is written and later read back
MAX_ITER = 3000                         # value passed as `max_iter`

# --- Observed data: derive mutational spectra from the MAF -----------------
maf = pd.read_csv(MAF_PATH, sep="\t")
# Only element [1] of the returned value is kept; it is used below as the
# spectra table (columns are treated as samples later in this notebook).
spectra_df = sa.spectra.get_spectra_from_maf(
    maf,
    hgfile=HG_PATH,
    reference=REF,
)[1]

# --- Reference signatures ---------------------------------------------------
ref_df = pd.read_csv(REF_PATH, sep="\t", index_col=0)

# Re-label the spectra rows with the ids produced by the library's mapping
# helper so that they line up with the reference table.
spectra_df.index = sa.utils._map_sbs_id_sigs(spectra_df, ref_df, REF)

# --- W matrix: the reference restricted to the signatures of interest -------
Wref_df = (
    ref_df
    .set_index("Somatic Mutation Type")
    .iloc[:, :-2]        # drop the last two columns (presumably non-signature metadata -- verify)
    .loc[:, SIG_LIST]    # keep only the requested signatures, in SIG_LIST order
)

# Simulate more samples that match the specified mutational signatures.
# Each selected reference signature is added to the input as extra columns
# ("fake samples"), repeated n_fake_samples times.
n_fake_samples = 100  # <-- Change to specify number of duplicates

# Scale the reference signatures by the median of the observed per-column
# totals so the synthetic columns are on the same scale as the real ones.
Wref_weight = spectra_df.sum(axis=0).median()
Wref_df = Wref_df * Wref_weight

# One copy of the scaled reference per i, with columns suffixed "_<i>" so that
# every synthetic column name is unique.
# NOTE: `axis` must be passed by keyword -- passing it positionally to
# pd.concat was deprecated in pandas 1.3 and raises TypeError in pandas >= 2.0.
duplicated_Wref_df = pd.concat(
    [Wref_df.add_suffix(f"_{i}") for i in range(n_fake_samples)],
    axis=1,
)

# Real samples first, synthetic reference samples appended as extra columns.
semi_supervised_spectra_df = pd.concat([spectra_df, duplicated_Wref_df], axis=1)

# Run semi-supervised NMF on the combined (real + synthetic) spectra.
# The source notebook marks the arguments from `max_iter` onward as
# **nmf_kwargs, so they are grouped here and splatted into the call.
nmf_kwargs = {
    "max_iter": MAX_ITER,
    "objective": "poisson",
}
sa.run_spectra(
    semi_supervised_spectra_df,
    outdir=OUT_DIR,
    reference=REF,
    verbose=True,
    nruns=N_RUNS,
    **nmf_kwargs,
)


In [None]:
# Stacked bar plot restricted to the original samples: rows of H that belong
# to the duplicated Wref samples are left out by selecting only the columns
# of the original spectra table.
run_i = 0  # <-- select which run to use
h5_path = f"{OUT_DIR}/nmf_output.h5"
run_i_H = pd.read_hdf(h5_path, key=f"run{run_i}/H")
original_samples = spectra_df.columns.tolist()
sa.pl.stacked_bar(run_i_H.loc[original_samples], ref_type=REF)