In [None]:
import os
import pathlib

import pandas as pd


# Resolve the project root from the current working directory by dropping any
# path component named exactly "synthetics" (presumably the folder this notebook
# is run from -- confirm). Filtering whole components instead of using
# str.replace() keeps names that merely contain the word (e.g. "synthetics_v2")
# intact and also works with Windows path separators.
base_path = pathlib.Path(
    *(part for part in pathlib.Path(os.getcwd()).parts if part != "synthetics")
)
experiment_path = base_path / 'mice_data_set' / 'out'

# glob() yields entries in arbitrary order; sort so the concatenated row order
# (and therefore every CSV written later) is reproducible between runs.
phenome_files = sorted(experiment_path.glob('pheno_*.csv'))
if not phenome_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No pheno_*.csv files found in {experiment_path}")

phenomes = pd.concat(map(pd.read_csv, phenome_files))
phenomes

In [None]:
def clean_dataframe(df: pd.DataFrame, irrelevant_fields_path='irrelevant-fields.csv') -> pd.DataFrame:
    """Return a cleaned copy of ``df``.

    Steps, in order:
      1. drop every row that contains at least one NaN,
      2. keep only rows where both ``discard`` and ``mixup`` equal ``"no"``,
      3. remove the columns listed in the ``FIELDS`` column of the CSV at
         ``irrelevant_fields_path``,
      4. round numeric values to two decimals.

    Args:
        df: Input table; must contain the ``discard`` and ``mixup`` columns.
        irrelevant_fields_path: Path of a CSV with a ``FIELDS`` column naming
            the columns to remove. Defaults to ``'irrelevant-fields.csv'``,
            resolved against the current working directory.

    Returns:
        A new DataFrame; the remaining columns keep the order they have in ``df``.

    Raises:
        KeyError: If ``discard``, ``mixup`` or ``FIELDS`` is missing.
        FileNotFoundError: If ``irrelevant_fields_path`` does not exist.
    """
    # Drop rows that contain any NaN value.
    # NOTE(review): this runs before the irrelevant columns are removed, so a NaN
    # in a column that is discarded below still removes the row -- confirm intended.
    df = df.dropna()

    # Filter out invalid measurements
    df = df[(df["discard"] == "no") & (df["mixup"] == "no")]

    # Remove non-relevant fields. The column list is derived from the argument
    # (not from a notebook-level variable) and built as an ordered list rather
    # than a set, so the output column order is deterministic.
    irrelevant_fields = set(pd.read_csv(irrelevant_fields_path)['FIELDS'])
    relevant_fields = [column for column in df.columns if column not in irrelevant_fields]
    df = df.filter(items=relevant_fields)

    # Remove unnecessary precision
    df = df.round(2)

    return df

# Rebind `phenomes` to the cleaned table; the cells below read this name.
# NOTE(review): because the raw frame is overwritten, re-running this cell cleans
# already-cleaned data -- re-run the loading cell first if that matters.
phenomes = clean_dataframe(phenomes)
phenomes

In [None]:
# Write synthetic training set to CSV
training_data_dir = base_path / 'mice_data_set' / 'data'
# to_csv() does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) instead of failing with an OSError.
training_data_dir.mkdir(parents=True, exist_ok=True)
phenomes.to_csv(training_data_dir / 'phenomes_train.csv', header=True, index=False)

In [None]:
# Manually batch training from matching covariates and write to CSV
def create_manual_batches(df: pd.DataFrame, output_dir=None, batches=None):
    """Write one CSV file per batch of columns taken from ``df``.

    Batch ``i`` is written to ``<output_dir>/phenomes_batch_<i>.csv`` with a
    header row and without the index. Columns named in a batch but absent from
    ``df`` are left out of that file and reported on stdout, and the progress
    line states the number of columns actually written.

    Args:
        df: Table to split column-wise.
        output_dir: Directory to write into; created if missing. Defaults to
            ``base_path / 'mice_data_set' / 'data'`` (``base_path`` is the
            notebook-level variable).
        batches: List of lists of column names. Defaults to the six manually
            curated groups below.

    Returns:
        None.
    """
    if batches is None:
        batches = [
            ['BMD', 'SW16', 'abBMD', 'TA', 'tibia', 'EDL', 'soleus', 'plantaris', 'gastroc', 'SW6', 'sacweight', 'SW17', 'testisweight', 'methage', 'PPIweight', 'bw1'],
            ['bw2', 'bw3', 'bw0', 'taillength', 'SW3', 'D3ctrtime0to30', 'D3ctrtime0to15', 'SW11', 'fastglucose', 'SW1', 'glucoseage', 'SW19', 'SW7', 'SW22', 'SW14', 'SW24', 'SW20', 'SW10', 'SW9', 'SW4'],
            ['AvToneD3', 'AvContextD2', 'AvToneD1', 'PreTrainD1', 'PPIbox1', 'FCbox1', 'methcage9', 'PPIbox2', 'PPIbox3', 'FCbox2', 'methcage10', 'methcage11', 'FCbox3', 'PPIbox4', 'methcage7', 'methcage8', 'methcage12'],
            ['D3vact0to30', 'D3vact0to15', 'D1vact0to15', 'D1vact0to30', 'D2vact0to15', 'D2vact0to30', 'D2ctrtime0to15', 'D2ctrtime0to30', 'D1ctrtime0to30', 'D1ctrtime0to15'],
            ['D1hact0to15', 'D1hact0to30', 'D2hact0to30', 'D2hact0to15', 'D2TOTDIST5', 'D2TOTDIST15', 'D2TOTDIST10', 'D2totaldist0to15', 'D2totaldist0to30', 'D2TOTDIST20', 'D2TOTDIST30', 'D2TOTDIST25', 'D1TOTDIST25', 'D1TOTDIST30', 'D1TOTDIST20', 'D1TOTDIST15', 'D1TOTDIST10', 'D1totaldist0to15', 'D1totaldist0to30', 'D1TOTDIST5'],
            ['D3TOTDIST5', 'D3TOTDIST30', 'D3TOTDIST25', 'D3TOTDIST20', 'D3TOTDIST15', 'D3totaldist0to30', 'D3totaldist0to15', 'D3TOTDIST10', 'D3hact0to15', 'D3hact0to30'],
        ]

    if output_dir is None:
        output_dir = base_path / 'mice_data_set' / 'data'
    output_dir = pathlib.Path(output_dir)
    # to_csv() does not create missing directories.
    output_dir.mkdir(parents=True, exist_ok=True)

    for idx, batch in enumerate(batches):
        # DataFrame.filter(items=...) silently ignores unknown labels, so work
        # out explicitly which requested fields exist and surface the rest.
        present = [field for field in batch if field in df.columns]
        missing = [field for field in batch if field not in df.columns]
        if missing:
            print(f"batch_{idx}: {len(missing)} fields not found in dataframe, skipped: {missing}")

        print(f"Writing batch_{idx} to disk ({len(present)} fields)")
        df.filter(items=present).to_csv(output_dir / f'phenomes_batch_{idx}.csv', header=True, index=False)

# Write the per-batch CSV files from the current `phenomes` table.
create_manual_batches(phenomes)