# Generate fake data

In [1]:
import functools
import os
import time

import numpy as np
import pandas as pd

In [4]:
# Build the per-sample annotation table with fake clinical attributes.
# SEED fixes numpy's global RNG so that Restart & Run All regenerates the same
# fake data (this cell and every later cell that draws from np.random).
SEED = 0
# Number of fake samples; used for every per-sample draw below so the column
# lengths cannot drift out of sync with the index.
N_SAMPLES = 20
np.random.seed(SEED)

df = pd.DataFrame(index=[f'sample_{i}' for i in range(N_SAMPLES)])
df.index.name = 'sample_id'
# Draws are integers in 0..9 (high is exclusive); values above 5 map to 'male'.
df['gender'] = ['male' if i > 5 else 'female' for i in np.random.randint(low=0, high=10, size=N_SAMPLES)]
# Ages are integers in 30..79.
df['age'] = np.random.randint(low=30, high=80, size=N_SAMPLES)
tissue_origins = ['breast', 'lung', 'skin', 'bone marrow']
# Pick one tissue per sample by drawing a random position in tissue_origins.
df['tissue_origin'] = [tissue_origins[i] for i in np.random.randint(low=0, high=len(tissue_origins), size=N_SAMPLES)]

In [5]:
# Prefix prepended to the per-sample file paths that get recorded in `df`
# (the 'treatments_file' and 'mutations_file' columns). The files themselves
# are written relative to the current working directory, not under this path.
# NOTE(review): presumably the notebook is run from inside this directory so
# the recorded paths resolve from the repository root -- confirm.
output_dir = './example_data/AnnoMate_Tutorial'

In [7]:
# Write one fake treatment-history table per sample and record its path in `df`.
data_dir = 'treatments'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before the first write.
os.makedirs(data_dir, exist_ok=True)
treatments = ['XRT', 'HSCT', 'Anthracycline', 'anti-TNF']
response = ['remission', 'relapse']
for sample_id in df.index:
    fn = f'{data_dir}/{sample_id}.treatments.tsv'
    # Each sample gets 1 to 4 treatment rows (randint's `high` is exclusive).
    n_treatments = np.random.randint(low=1, high=5)
    treat_df = pd.DataFrame(index=range(n_treatments))
    treat_df['treatment_name'] = [treatments[i] for i in np.random.randint(low=0, high=len(treatments), size=n_treatments)]
    treat_df['response'] = [response[i] for i in np.random.randint(low=0, high=len(response), size=n_treatments)]

    # Start times are a random offset plus a cumulative sum of non-negative
    # steps, so tx_start never decreases from one row to the next.
    rand_first = np.random.randint(low=0, high=100)
    rand_step = np.cumsum(np.random.randint(low=0, high=100, size=n_treatments))
    treat_df['tx_start'] = rand_first + rand_step
    treat_df.to_csv(fn, sep='\t', index=False)
    # The recorded path carries the output_dir prefix; the file itself was
    # written to `fn`, relative to the working directory.
    df.loc[sample_id, 'treatments_file'] = f'{output_dir}/{fn}'


In [10]:
# Write one fake mutation/VAF table per sample and record its path in `df`.
mut_vafs_dir = 'mut_vafs'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before the first write.
os.makedirs(mut_vafs_dir, exist_ok=True)
for sample_id in df.index:
    fn = f'{mut_vafs_dir}/{sample_id}.mut_vafs.tsv'

    # Each sample gets 10 to 49 mutation rows (randint's `high` is exclusive).
    n_mutations = np.random.randint(low=10, high=50)
    mut_df = pd.DataFrame(index=range(n_mutations))
    mut_df['gene'] = [f'gene_{i}' for i in range(n_mutations)]
    # Variant allele fraction drawn uniformly from [0, 1).
    mut_df['vaf'] = np.random.random(size=n_mutations)
    mut_df['sample_id'] = sample_id
    mut_df['cov'] = np.random.randint(low=50, high=200, size=n_mutations)
    # Alt count is cov * vaf truncated to an integer; ref count is the
    # remainder, so t_alt_count + t_ref_count always equals cov.
    mut_df['t_alt_count'] = (mut_df['cov'] * mut_df['vaf']).astype(int)
    mut_df['t_ref_count'] = mut_df['cov'] - mut_df['t_alt_count']
    mut_df.to_csv(fn, sep='\t', index=False)
    # The recorded path carries the output_dir prefix; the file itself was
    # written to `fn`, relative to the working directory.
    df.loc[sample_id, 'mutations_file'] = f'{output_dir}/{fn}'
    

In [11]:
# Save the sample manifest as a tab-separated file in the working directory.
# Unlike the per-sample tables, the index is written out, so 'sample_id'
# becomes the first column of the file.
df.to_csv(
    'data_to_review_example.tsv',
    sep='\t',
    index=True,
)