In [None]:
import datetime
import os
import pickle

import numpy as np
import pandas as pd

## Data Import

In [None]:
# Location and date stamp of the step-3 preprocessing output
in_dir = "/home/ivm/valid/data/processed_data/step3/"
in_file_date = "2024-10-18"
krea_prefix = in_dir + "krea_" + in_file_date

# Measurement table. Columns as listed by the original author:
# FINNGENID, EVENT_AGE, DATE, VALUE, ABNORM (FinnGen), ABNORM_CUSTOM (reference ranges),
# EDGE (helper flag - 0: first, 1: last, 2: second-to-last measurement)
data = pd.read_csv(krea_prefix + ".csv")
data["DATE"] = pd.to_datetime(data["DATE"])

# Per-individual table: FINNGENID, SEX, plus other metadata columns
metadata = pd.read_csv(krea_prefix + "_meta.csv")

## Quantile Preparation

In [None]:
# Defining quantiles
steps = 0.1 # size of quantile steps
# Bin edges: the quantiles at np.arange(0, 1, steps) plus the overall maximum as the closing edge
quants = np.append(np.quantile(data.VALUE, np.arange(0, 1, steps), method="higher"), data.VALUE.max())
# Adding column with cut data
# duplicates="drop" merges bins whose edges coincide (heavily tied values) instead of raising
# "Bin edges must be unique"; when all edges are distinct the result is unchanged.
data["QUANT"] = pd.cut(data.VALUE, quants, include_lowest=True, duplicates="drop")
# Mapping quantiles to tokens
quant_df = pd.DataFrame({"INTERVAL": data.QUANT.cat.categories}).reset_index(drop=False)
quant_df.index = quant_df.index+3 # bins 0..x become tokens 3..x+3; tokens 1-2 are the sex tokens, 0 is not assigned here
quant_map = dict(zip(quant_df.INTERVAL, quant_df.index))
# Mapping quantiles to tokens in data
data.loc[:,"EVENT"] = data.QUANT.map(quant_map)

## Sex token preparation

In [None]:
# Preparing sex token data
sex_tokens = {"female": 1, "male": 2}
# Explicit copy + plain column assignment: `metadata` is left untouched and the token column
# gets a proper integer dtype. Writing through `.values[:]` on a slice is fragile (the array
# is read-only when pandas copy-on-write is enabled) and keeps the column as object dtype.
static_rows = metadata[["FINNGENID", "SEX"]].copy()
static_rows["SEX"] = static_rows["SEX"].map(sex_tokens)
# Labels outside `sex_tokens` (including missing values) map to NaN, which cannot be stored
# as an integer token - fail here with a clear message instead of much later.
n_unmapped = int(static_rows["SEX"].isna().sum())
if n_unmapped > 0:
    raise ValueError(
        "{} rows in metadata have a SEX value other than 'female'/'male'".format(n_unmapped)
    )
static_rows["EVENT_AGE"] = 0 # Sex tokens at time 0
static_rows = static_rows.rename({"SEX": "EVENT"}, axis=1)

# Adding sex tokens to data
data = data[["FINNGENID", "EVENT", "EVENT_AGE"]]
data = pd.concat([data, static_rows], ignore_index=True)
# Order each individual's events by age
data = data.sort_values(by=["FINNGENID", "EVENT_AGE"]).reset_index(drop=True)

## Mapping IDs

In [None]:
# Give every individual a consecutive integer ID starting at 1, in order of first appearance
fg_map = {fid: position + 1 for position, fid in enumerate(data.FINNGENID.unique())}

data = data.assign(
    FINNGENID=data["FINNGENID"].map(fg_map),
    EVENT_AGE=data["EVENT_AGE"] * 365.25,  # ages -> days (365.25 days per year)
)[["FINNGENID", "EVENT_AGE", "EVENT"]]
data

## Training and Validation set

In [None]:
# Set size of validation set
val_size = 0.1

n_indvs = len(fg_map)

np.random.seed(2813) # fixed seed so the split is reproducible
# fg_map hands out IDs 1..n_indvs, while np.random.permutation(n) yields 0..n-1.
# Shift by one so every entry is a real ID; without the shift the individual with
# ID n_indvs matches neither the training nor the validation filter and is dropped.
indv_idxs = np.random.permutation(n_indvs) + 1
# Number of individuals (not rows) in the validation set: the first n_valid entries of indv_idxs
n_valid = int(n_indvs*val_size)

## Saving Data

In [None]:
# Folder receiving every file written by the cells below
out_dir = "/home/ivm/valid/data/processed_data/delphi/krea/2024-10-30/"
# np.memmap(mode="w+") and open(..., "w") do not create missing parent folders,
# so make sure the (dated) output folder exists before anything is written.
os.makedirs(out_dir, exist_ok=True)

### Data Matrix

In [None]:
def _write_uint32_matrix(frame, path):
    """Store `frame` in a newly created uint32 memmap at `path` (same shape as `frame`) and flush it."""
    matrix = np.memmap(path, dtype="uint32", mode="w+", shape=frame.shape)
    matrix[:] = frame[:]  # values are converted to uint32 on assignment
    matrix.flush()
    return matrix


# Rows of individuals whose ID is among the first n_valid permutation entries form the
# validation set; rows whose ID is among the remaining entries form the training set.
in_train = data.FINNGENID.isin(indv_idxs[n_valid:])
in_val = data.FINNGENID.isin(indv_idxs[:n_valid])
train_data = data.loc[in_train]
val_data = data.loc[in_val]

train_memmap = _write_uint32_matrix(train_data, out_dir + "train.bin")
val_memmap = _write_uint32_matrix(val_data, out_dir + "val.bin")

### Meta data

In [None]:
# Largest token id assigned to a quantile bin
vocab_size = max(quant_map.values())
token_ids = range(vocab_size)

meta = {
    # NOTE(review): the original author found that the consuming code fails unless 2 is added
    # here; the reason is not visible in this notebook - TODO confirm against the consumer.
    "vocab_size": vocab_size + 2,
    # Identity lookups over 0..vocab_size-1; reported as not really used at the moment
    "itos": {token: token for token in token_ids},
    "stoi": {token: token for token in token_ids},
}
print(meta)

with open(out_dir + "krea_meta.pkl", "wb") as fout:
    pickle.dump(meta, fout)

In [None]:
# One line per quantile bin: left edge, right edge, token id
with open(out_dir + "krea_labels.txt", "w") as fout:
    fout.writelines(
        "{},{},{}\n".format(interval.left, interval.right, token)
        for interval, token in quant_map.items()
    )

In [None]:
# One line per individual: original FINNGENID, mapped integer ID, group (0: training, 1: validation)
# Collect the validation IDs into a set once; testing `value in indv_idxs[:n_valid]` inside
# the loop re-slices and linearly scans the array for every individual.
valid_ids = set(indv_idxs[:n_valid].tolist())
with open(out_dir + "fids.txt", "w") as fout:
    for key, value in fg_map.items():
        group = 1 if value in valid_ids else 0
        fout.write("{},{},{}\n".format(key, value, group))