In [1]:
"""
Pipeline to process simulation data and extract features for the inverse modelling task, and save them for future use.
The purpose is to cut down on the time it takes to process the data and extract features.
"""

from pathlib import Path
import json
import pandas as pd
from inverse_modelling_tfo.data_pipelines.fetal_conc_groups import dan_iccps_pencil1, generate_grouping_from_config
from inverse_modelling_tfo.data import config_based_normalization
from inverse_modelling_tfo.data.intensity_interpolation import (
    interpolate_exp,
    get_interpolate_fit_params,
    exp_piecewise_affine,
)
from inverse_modelling_tfo.data.interpolation_function_zoo import *
from inverse_modelling_tfo.features.build_features import (
    FetalACFeatureBuilder,
    RowCombinationFeatureBuilder,
    TwoColumnOperationFeatureBuilder,
    FetalACbyDCFeatureBuilder,
    LogTransformFeatureBuilder,
    ConcatenateFeatureBuilder,
)
from inverse_modelling_tfo.features.data_transformations import (
    LongToWideIntensityTransformation,
    ToFittingParameterTransformation,
)

# Data Setup
# ==========================================================================================
# Input: compiled intensity table; its JSON config shares the same stem.
in_src = Path("/home/rraiyan/simulations/tfo_sim/data/compiled_intensity/synthetic_ppg_base2.pkl")
# in_src = Path(r"/home/rraiyan/simulations/tfo_sim/data/compiled_intensity/weitai_data.pkl")
config_src = in_src.with_suffix(".json")

# Output: processed table; its JSON config is written next to it with the same stem.
out_dest = Path(
    "/home/rraiyan/personal_projects/tfo_inverse_modelling/data/processed_data",
    "sythentic_ppg_base2.pkl",
)
# out_dest = Path(__file__).parent.parent.parent / "data" / "processed_data" / "pulastion_ratio.pkl"
# out_dest = Path(__file__).parent.parent.parent / "data" / "processed_data" / "processed1_max_long_range.pkl"
config_dest = out_dest.with_suffix(".json")

# Load the raw table. Unpickling can execute arbitrary code, so only point this at trusted files.
data = pd.read_pickle(in_src)
# NOTE(review): the return value is discarded, so this presumably normalizes `data` in place - confirm.
config_based_normalization(data, config_src)

In [2]:
# Concentrations are floats that are compared for equality when they are mapped to their
# group, so both the map keys and the data columns are rounded to this many decimals first.
rounding_points = 2


def _round_grouping_keys(raw_map, decimals):
    """Return a copy of `raw_map` whose string keys are converted to rounded floats.

    The keys are rounded through `pandas.Series.round` instead of the builtin `round`.
    The data columns these maps are applied to are rounded with `Series.round`, and the
    two implementations can disagree on values such as 2.675 (2.68 vs 2.67); a
    disagreement would make the later `.map(...)` lookup produce NaN.

    Args:
        raw_map: Mapping whose keys are numbers stored as strings (as loaded from JSON).
        decimals: Number of decimal places to round the keys to.

    Returns:
        dict mapping each rounded float key to the original value. If two keys collide
        after rounding, the later one wins.
    """
    rounded_keys = pd.Series([float(key) for key in raw_map], dtype="float64").round(decimals)
    return dict(zip(rounded_keys.tolist(), raw_map.values()))


with config_src.open("r") as file:
    config = json.load(file)

# JSON object keys are always strings, so they are converted back to floats here.
fetal_grouping_map = _round_grouping_keys(config["fconc_centers"], rounding_points)
maternal_grouping_map = _round_grouping_keys(config["mconc_centers"], rounding_points)

In [12]:
# Sanity check: display the maternal lookup (rounded concentration -> group value from the config).
maternal_grouping_map

{11.0: 0,
 10.72: 0,
 12.25: 1,
 11.94: 1,
 13.5: 2,
 13.16: 2,
 14.75: 3,
 14.38: 3,
 16.0: 4,
 15.6: 4}

$$
\mu_a = c \times \epsilon (S)
$$

$$
I = \int \exp(-\mu_a L) \, p(L) \, dL
$$

In [3]:
# Data Processing
# ==========================================================================================
data = data.drop(columns="Uterus Thickness")

# Optional step (currently disabled): interpolate intensity to remove noise
# data = interpolate_exp(data, weights=(1, 0.6), interpolation_function=exp_piecewise_affine, break_indices=[4, 12, 20])
# data["Intensity"] = data["Interpolated Intensity"]  # Replace OG intensity with interpolated intensity
# data = data.drop(columns="Interpolated Intensity")  # Cleanup

# Define data transformers
data_transformer = LongToWideIntensityTransformation()
# fitting_param_transformer = ToFittingParameterTransformation()

# Transform data (long -> wide) and remember which columns are labels vs. intensity features
# fitting_params = fitting_param_transformer.transform(data)
data = data_transformer.transform(data)
labels = data_transformer.get_label_names()
intensity_columns = data_transformer.get_feature_names()

# Cleanup: drop rows with any missing value
data.dropna(inplace=True)

# Create the concentration grouping columns - used for generating the AC component/which rows to
# choose for pairing. The columns are rounded the same way as the map keys so the lookup matches.
data["FconcCenters"] = data["Fetal Hb Concentration"].round(rounding_points).map(fetal_grouping_map)
data["MconcCenters"] = data["Maternal Hb Concentration"].round(rounding_points).map(maternal_grouping_map)

# A concentration missing from the grouping map becomes NaN here, and groupby drops NaN keys by
# default, so such rows would silently vanish from the pipeline. Fail loudly instead.
unmapped = data[["FconcCenters", "MconcCenters"]].isna().any()
assert not unmapped.any(), f"Concentrations without a group centre in: {unmapped[unmapped].index.tolist()}"
data.head()

Unnamed: 0,Maternal Wall Thickness,Maternal Hb Concentration,Maternal Saturation,Fetal Hb Concentration,Fetal Saturation,10_2.0,15_2.0,19_2.0,24_2.0,28_2.0,...,64_1.0,68_1.0,72_1.0,77_1.0,81_1.0,86_1.0,90_1.0,94_1.0,FconcCenters,MconcCenters
0,4.0,10.725,0.92,14.625,0.2,1e-05,5.70197e-07,6.087089e-08,5.395582e-09,8.157253e-10,...,8.814385e-09,2.703929e-09,1.052361e-09,4.828143e-10,2.410027e-10,1.324828e-10,7.64967e-11,4.169385e-11,0,0
1,4.0,10.725,0.92,14.625,0.22,1e-05,5.701569e-07,6.085519e-08,5.391285e-09,8.143913e-10,...,8.889911e-09,2.736206e-09,1.067033e-09,4.899996e-10,2.446058e-10,1.344247e-10,7.75712e-11,4.229942e-11,0,0
2,4.0,10.725,0.92,14.625,0.24,1e-05,5.701191e-07,6.083985e-08,5.387013e-09,8.130776e-10,...,8.9676e-09,2.769635e-09,1.082202e-09,4.974286e-10,2.483403e-10,1.364313e-10,7.868154e-11,4.292474e-11,0,0
3,4.0,10.725,0.92,14.625,0.26,1e-05,5.700776e-07,6.082448e-08,5.382728e-09,8.117611e-10,...,1.836106e-08,4.967207e-09,1.69561e-09,7.041862e-10,3.297718e-10,1.768897e-10,1.015672e-10,5.443959e-11,0,0
4,4.0,10.725,0.92,14.625,0.28,1e-05,5.700378e-07,6.080918e-08,5.378487e-09,8.104513e-10,...,1.843656e-08,4.99908e-09,1.709842e-09,7.109957e-10,3.331626e-10,1.787136e-10,1.025857e-10,5.500811e-11,0,0


In [4]:
# The two grouping columns describe the sample rather than measure it, so they count as labels too.
labels = labels + [
    "FconcCenters",
    "MconcCenters",
]
# fitting_params['FconcCenters'] = data['FconcCenters']

# Rows that agree on all of these columns belong to one group; each group should be
# combined into a single row.
fixed_columns = ["Maternal Wall Thickness", "Maternal Saturation", "Fetal Saturation", "FconcCenters", "MconcCenters"]
data_test = data.groupby(by=fixed_columns)
data_test.groups

{(4.0, 0.92, 0.2, 0, 0): [0, 26, 260, 286], (4.0, 0.92, 0.2, 0, 1): [520, 546, 780, 806], (4.0, 0.92, 0.2, 0, 2): [1040, 1066, 1300, 1326], (4.0, 0.92, 0.2, 0, 3): [1560, 1586, 1820, 1846], (4.0, 0.92, 0.2, 0, 4): [2080, 2106, 2340, 2366], (4.0, 0.92, 0.22, 0, 0): [1, 27, 261, 287], (4.0, 0.92, 0.22, 0, 1): [521, 547, 781, 807], (4.0, 0.92, 0.22, 0, 2): [1041, 1067, 1301, 1327], (4.0, 0.92, 0.22, 0, 3): [1561, 1587, 1821, 1847], (4.0, 0.92, 0.22, 0, 4): [2081, 2107, 2341, 2367], (4.0, 0.92, 0.24, 0, 0): [2, 28, 262, 288], (4.0, 0.92, 0.24, 0, 1): [522, 548, 782, 808], (4.0, 0.92, 0.24, 0, 2): [1042, 1068, 1302, 1328], (4.0, 0.92, 0.24, 0, 3): [1562, 1588, 1822, 1848], (4.0, 0.92, 0.24, 0, 4): [2082, 2108, 2342, 2368], (4.0, 0.92, 0.26, 0, 0): [3, 29, 263, 289], (4.0, 0.92, 0.26, 0, 1): [523, 549, 783, 809], (4.0, 0.92, 0.26, 0, 2): [1043, 1069, 1303, 1329], (4.0, 0.92, 0.26, 0, 3): [1563, 1589, 1823, 1849], (4.0, 0.92, 0.26, 0, 4): [2083, 2109, 2343, 2369], (4.0, 0.92, 0.28, 0, 0): [4,

In [5]:
# Spot-check four rows, selected by position.
# NOTE(review): the rows displayed differ in Maternal Saturation and Fetal Saturation, so these
# positions do not form one of the groups listed by `data_test.groups` (e.g. [0, 26, 260, 286]).
# Presumably the indices are left over from an earlier dataset - confirm or update.
data.iloc[[220, 231, 330, 341], :]

Unnamed: 0,Maternal Wall Thickness,Maternal Hb Concentration,Maternal Saturation,Fetal Hb Concentration,Fetal Saturation,10_2.0,15_2.0,19_2.0,24_2.0,28_2.0,...,64_1.0,68_1.0,72_1.0,77_1.0,81_1.0,86_1.0,90_1.0,94_1.0,FconcCenters,MconcCenters
220,4.0,10.725,1.0,14.625,0.44,1e-05,5.554418e-07,5.894154e-08,5.182645e-09,7.774925e-10,...,2.467167e-08,6.768962e-09,2.298097e-09,9.398068e-10,4.316883e-10,2.283712e-10,1.296482e-10,6.957945e-11,0,0
231,4.0,10.725,1.0,14.625,0.66,1e-05,5.55037e-07,5.879043e-08,5.140952e-09,7.647686e-10,...,2.604277e-08,7.355647e-09,2.562896e-09,1.067347e-09,4.949911e-10,2.621685e-10,1.483349e-10,8.000646e-11,0,0
330,4.0,11.0,0.94,14.625,0.56,1e-05,5.438391e-07,5.747921e-08,5.031635e-09,7.525459e-10,...,1.990531e-08,5.589755e-09,1.968407e-09,8.335463e-10,3.936723e-10,2.110707e-10,1.205647e-10,6.502893e-11,0,0
341,4.0,11.0,0.94,15.0,0.26,1e-05,5.441946e-07,5.761278e-08,5.068371e-09,7.638247e-10,...,1.841886e-08,4.961673e-09,1.686203e-09,6.979112e-10,3.261623e-10,1.748562e-10,1.004184e-10,5.378024e-11,0,0


In [6]:
# Combine the rows of each group into a single row.
# NOTE(review): the meaning of each positional argument is inferred from the variable names and
# from the resulting "... 1" to "... 4" columns - confirm against RowCombinationFeatureBuilder.
fb1 = RowCombinationFeatureBuilder(
    intensity_columns,
    fixed_columns,
    ["Fetal Hb Concentration", "Maternal Hb Concentration"],
    "comb",
    4,
)
data = fb1(data)
print(len(data))

20150


In [7]:
# Group the combined table again: every group key is now expected to hold exactly one row.
data_test = data.groupby(by=fixed_columns)
data_test.groups

{(4.0, 0.92, 0.2, 0.0, 0.0): [0], (4.0, 0.92, 0.2, 0.0, 1.0): [1], (4.0, 0.92, 0.2, 0.0, 2.0): [2], (4.0, 0.92, 0.2, 0.0, 3.0): [3], (4.0, 0.92, 0.2, 0.0, 4.0): [4], (4.0, 0.92, 0.22, 0.0, 0.0): [5], (4.0, 0.92, 0.22, 0.0, 1.0): [6], (4.0, 0.92, 0.22, 0.0, 2.0): [7], (4.0, 0.92, 0.22, 0.0, 3.0): [8], (4.0, 0.92, 0.22, 0.0, 4.0): [9], (4.0, 0.92, 0.24, 0.0, 0.0): [10], (4.0, 0.92, 0.24, 0.0, 1.0): [11], (4.0, 0.92, 0.24, 0.0, 2.0): [12], (4.0, 0.92, 0.24, 0.0, 3.0): [13], (4.0, 0.92, 0.24, 0.0, 4.0): [14], (4.0, 0.92, 0.26, 0.0, 0.0): [15], (4.0, 0.92, 0.26, 0.0, 1.0): [16], (4.0, 0.92, 0.26, 0.0, 2.0): [17], (4.0, 0.92, 0.26, 0.0, 3.0): [18], (4.0, 0.92, 0.26, 0.0, 4.0): [19], (4.0, 0.92, 0.28, 0.0, 0.0): [20], (4.0, 0.92, 0.28, 0.0, 1.0): [21], (4.0, 0.92, 0.28, 0.0, 2.0): [22], (4.0, 0.92, 0.28, 0.0, 3.0): [23], (4.0, 0.92, 0.28, 0.0, 4.0): [24], (4.0, 0.92, 0.3, 0.0, 0.0): [25], (4.0, 0.92, 0.3, 0.0, 1.0): [26], (4.0, 0.92, 0.3, 0.0, 2.0): [27], (4.0, 0.92, 0.3, 0.0, 3.0): [28], (4.

In [8]:
# Every group must contain exactly one row once the rows have been combined.
group_lengths = [len(row_labels) for row_labels in data_test.groups.values()]
assert all(size == 1 for size in group_lengths), "Grouping is not correct"

In [9]:
# Display the label column names reported by the row-combination builder.
fb1.get_label_names()

['Maternal Wall Thickness',
 'Maternal Saturation',
 'Fetal Saturation',
 'FconcCenters',
 'MconcCenters',
 'Fetal Hb Concentration 1',
 'Maternal Hb Concentration 1',
 'Fetal Hb Concentration 2',
 'Maternal Hb Concentration 2',
 'Fetal Hb Concentration 3',
 'Maternal Hb Concentration 3',
 'Fetal Hb Concentration 4',
 'Maternal Hb Concentration 4']

In [10]:
# Preview the first rows of the table returned by the row-combination builder.
data.head()

Unnamed: 0,10_2.0_1,15_2.0_1,19_2.0_1,24_2.0_1,28_2.0_1,33_2.0_1,37_2.0_1,41_2.0_1,46_2.0_1,50_2.0_1,...,FconcCenters,MconcCenters,Fetal Hb Concentration 1,Maternal Hb Concentration 1,Fetal Hb Concentration 2,Maternal Hb Concentration 2,Fetal Hb Concentration 3,Maternal Hb Concentration 3,Fetal Hb Concentration 4,Maternal Hb Concentration 4
0,1e-05,5.70197e-07,6.087089e-08,5.395582e-09,8.157253e-10,1.618866e-10,5.233838e-11,2.094453e-11,9.338248e-12,4.26047e-12,...,0.0,0.0,14.625,10.725,15.0,10.725,14.625,11.0,15.0,11.0
1,9e-06,4.790927e-07,4.986242e-08,4.374502e-09,6.713407e-10,1.376183e-10,4.547763e-11,1.836634e-11,8.22724e-12,3.753678e-12,...,0.0,1.0,14.625,11.94375,15.0,11.94375,14.625,12.25,15.0,12.25
2,8e-06,4.039807e-07,4.104585e-08,3.572873e-09,5.573694e-10,1.17848e-10,3.971265e-11,1.61667e-11,7.270812e-12,3.316537e-12,...,0.0,2.0,14.625,13.1625,15.0,13.1625,14.625,13.5,15.0,13.5
3,7e-06,3.419308e-07,3.395051e-08,2.93839e-09,4.666053e-10,1.015868e-10,3.483185e-11,1.427906e-11,6.443921e-12,2.937982e-12,...,0.0,3.0,14.625,14.38125,15.0,14.38125,14.625,14.75,15.0,14.75
4,6e-06,2.899843e-07,2.81983e-08,2.431792e-09,3.936778e-10,8.808939e-11,3.067024e-11,1.265046e-11,5.726027e-12,2.608977e-12,...,0.0,4.0,14.625,15.6,15.0,15.6,14.625,16.0,15.0,16.0


In [11]:
# Create Config file
# ==========================================================================================
# NOT AUTOGENERATED! MUST BE DONE MANUALLY FOR EACH PIPELINE
# The label/feature names and the builder description come from `fb1`; the two free-text
# entries are written by hand and must be kept in sync with the steps actually run above.
config = {
    "labels": fb1.get_label_names(),
    "features": fb1.get_feature_names(),
    "feature_builder_txt": str(fb1),
    "preprocessing_description": "Detector Normalization -> Long to Wide -> Row Combination -> Keep 4 intensity columns -> Maternal + Fetal Pulsation",
    "comments": "Data pipeline intended for synthetic ppg generation. Includes both maternal and fetal pulsation. I1 is the stationary case, I2 is fetal pulsation, I3 is maternal pulsation and I4 is a mix(currently we have no use for it)",
}

# Save data and config
# ==========================================================================================
# The table is pickled first; the JSON config is then written alongside it (same stem).
data.to_pickle(out_dest)

with config_dest.open("w+", encoding="utf-8") as config_file:
    json.dump(config, config_file)