# Collection date aggregation

### Imports

In [2]:
import pandas as pd
import copy
import numpy as np
from tqdm import tqdm

### Data loading

Right after loading the metadata, we split it into genomics and non-genomics rows, because we consider genomics metadata to be time-independent.

In [3]:
metadata = pd.read_parquet("processed_metadata.parquet")

# Evaluate the substring match once and reuse it for both subsets. The mask is
# compared against True/False explicitly (instead of being negated with `~`), so
# a row for which `str.contains` yields a missing value matches neither subset.
is_genomics = metadata["omics"].str.contains("genomics")
genomics_rows = metadata[is_genomics == True]
non_genomics_rows = metadata[is_genomics == False]

## Sample collection data collapse

To prepare the data for collapse:
1. group by unique patient

*For each patient:*

2. sort by `sample_collected_date`
3. calculate the difference (in days) between the current sample and the previous one
4. assign 0 to the first sample of each patient

*Reaggregate the patients:*

5. concatenate the ordered patient tables

In [4]:
patient_groupings = non_genomics_rows.groupby("patient_id")
ordered_list = []
# The group key is not needed inside the loop; it is named `_patient_id` rather
# than `id` so the builtin `id()` is not shadowed for the rest of the session.
for _patient_id, current_df in patient_groupings:
    sorted_by_date_df = current_df.sort_values("sample_collected_date", ascending=True)

    # `diff()` has no previous sample to compare the first row against and yields a
    # missing value there; that (and any other missing difference) is stored as 0 days.
    sorted_by_date_df["days_since_last_sample"] = (
        sorted_by_date_df["sample_collected_date"]
        .diff()
        .dt.days.fillna(0)
        .astype(int)
    )
    ordered_list.append(sorted_by_date_df)

# Stack the per-patient tables back into a single frame.
ordered_df = pd.concat(ordered_list, axis=0)

### Investigate single patient

In [6]:
# Columns shown when inspecting the ordered samples of one example patient.
example_columns = [
    "patient_id",
    "sample_id",
    "omics",
    "sample_collected_date",
    "days_since_last_sample",
    "simple_tissue",
]
subset_example_no_genomics = ordered_df[ordered_df["patient_id"] == 9032329]
subset_example_no_genomics[example_columns]

Unnamed: 0,patient_id,sample_id,omics,sample_collected_date,days_since_last_sample,simple_tissue
1095,9032329,FS01338644,transcriptomics,2017-10-19,0,colon
1102,9032329,FS01629014,transcriptomics,2017-10-19,0,small_intestine
1111,9032329,10521576,proteomics,2017-10-19,0,plasma
1094,9032329,10641613,proteomics,2018-09-06,322,plasma
1104,9032329,FS04156255,transcriptomics,2018-09-06,0,small_intestine
1113,9032329,FS04156258,transcriptomics,2018-09-06,0,other
1120,9032329,FS01694799,transcriptomics,2018-09-06,0,colon
1100,9032329,FR20600048,proteomics,2019-05-30,266,plasma
1110,9032329,8051600906,transcriptomics,2019-05-30,0,colon
1121,9032329,8051600883,transcriptomics,2019-05-30,0,other


## Aggregate omic types by interval

The next steps are deployed jointly and encompass:
- Identifying collection intervals, per patient, with respective multi-omics information: a new interval starts whenever a sample was collected more than 7 days after the previous one
- Aggregating the omics types in these periods
- Assigning the tissues to the interval information, according to transcriptomics, as transcriptomics data can have multiple different tissues
- Reconcatenating genomics information according to the `patient_id`, since we are considering genomics data as independent from collection date
- Each sample is now described by the `patient_id` and the `interval_id` and is characterized by the `transcriptomics`, `proteomics`, `genomics_exome` and `genomics_array` columns.

In [7]:
def process_interval_rows(input_rows, interval_id):
    """Reduce the samples of one interval to at most two rows per tissue.

    Every row gets an ``interval_id`` of the form ``"<interval_id>_<simple_tissue>"``.
    For each distinct ``interval_id`` (in order of first appearance) the output
    contains:

    * a copy of the first proteomics row of the whole interval, if there is one,
      with its ``interval_id`` overwritten by the current tissue's identifier so
      that it can later be matched with that tissue's transcriptomics sample;
    * the first transcriptomics row of that tissue, if there is one.

    Rows with any other ``omics`` value are not carried over.

    Parameters
    ----------
    input_rows : list of pandas.Series (or anything ``pd.DataFrame`` accepts)
        Sample rows of a single interval. Must provide the ``simple_tissue`` and
        ``omics`` fields.
    interval_id : int or str
        Identifier of the interval; it is stringified and used as a prefix.

    Returns
    -------
    pandas.DataFrame
        The selected rows. Empty when ``input_rows`` is empty or holds neither
        proteomics nor transcriptomics samples.
    """
    interval_df = pd.DataFrame(input_rows)
    if interval_df.empty:
        # Nothing to select from; without this guard the `simple_tissue` lookup
        # below fails with a KeyError on a column-less frame.
        return pd.DataFrame()

    interval_df["interval_id"] = (
        str(interval_id) + "_" + interval_df["simple_tissue"].astype(str)
    )

    # Proteomics samples are not split by tissue here: the first one of the
    # interval is shared by every tissue of that interval.
    proteomics_df = interval_df[interval_df["omics"] == "proteomics"]
    first_proteomics = proteomics_df.iloc[0] if len(proteomics_df) > 0 else None

    is_transcriptomics = interval_df["omics"] == "transcriptomics"

    tissue_intervals = []
    for current_tissue_interval in interval_df["interval_id"].unique():
        if first_proteomics is not None:
            # A fresh copy per tissue, because `interval_id` is rewritten on it.
            proteomics_row = first_proteomics.copy()
            proteomics_row["interval_id"] = current_tissue_interval
            tissue_intervals.append(proteomics_row)

        in_current_tissue = interval_df["interval_id"] == current_tissue_interval
        tissue_transcriptomics_df = interval_df[in_current_tissue & is_transcriptomics]
        if len(tissue_transcriptomics_df) > 0:
            # Only the first transcriptomics sample of the tissue is kept.
            tissue_intervals.append(tissue_transcriptomics_df.iloc[0])

    return pd.DataFrame(tissue_intervals)


def process_single_patient(input_df, max_gap_days=7):
    """Split one patient's samples into intervals and clean each interval.

    Rows are consumed in the order given. A new interval starts whenever a row's
    ``days_since_last_sample`` is greater than ``max_gap_days``; otherwise the row
    joins the current interval. Because the gap is measured against the previous
    sample, an interval can span more than ``max_gap_days`` in total when its
    samples are chained closely together. Each interval is passed to
    ``process_interval_rows`` together with a running identifier starting at 1.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Samples of a single patient with a ``days_since_last_sample`` column.
    max_gap_days : int, optional
        Largest gap (in days) between consecutive samples that still keeps them
        in the same interval. Defaults to 7.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-interval results; an empty frame when
        ``input_df`` has no rows.
    """
    interval_grouping, interval_rows, interval_id = [], [], 1
    for _, row in input_df.iterrows():
        # Close the running interval when the gap is too large. The
        # `interval_rows` check avoids emitting an empty interval when the very
        # first row already carries a large gap.
        if row["days_since_last_sample"] > max_gap_days and interval_rows:
            interval_grouping.append(process_interval_rows(interval_rows, interval_id))
            interval_rows = []
            interval_id += 1
        interval_rows.append(row)

    # Don't forget the last interval
    if interval_rows:
        interval_grouping.append(process_interval_rows(interval_rows, interval_id))

    if not interval_grouping:
        # `pd.concat` refuses an empty list, so return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(interval_grouping, axis=0)


def _first_sample_id(genomics_rows, omics_type):
    """Return the first ``sample_id`` whose ``omics`` equals ``omics_type``, else NaN."""
    if genomics_rows is None or genomics_rows.empty:
        return np.nan
    matching_ids = genomics_rows.loc[genomics_rows["omics"] == omics_type, "sample_id"]
    if matching_ids.empty:
        return np.nan
    return matching_ids.iloc[0]


def flatten_intervals(input_df, genomics_rows):
    """Collapse each interval of one patient into a single row.

    For every distinct ``interval_id`` in ``input_df`` (in order of first
    appearance) one output row is built:

    * two rows in the interval: they are taken to be one transcriptomics and one
      proteomics sample. The transcriptomics row is the base, gaps in it are
      filled from the proteomics row, and both sample ids are stored in the
      ``transcriptomics`` and ``proteomics`` fields;
    * one row in the interval: its sample id goes to the field matching its
      ``omics`` value and the other field is set to NaN.

    Every output row additionally receives ``genomics_exome`` and
    ``genomics_array``, the first matching ``sample_id`` found in
    ``genomics_rows``. The two identifiers are looked up independently, so a
    patient that has only one kind of genomics data keeps that identifier and
    gets NaN for the other one only.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Interval rows of a single patient with ``interval_id``, ``omics`` and
        ``sample_id`` columns.
    genomics_rows : pandas.DataFrame
        Genomics rows of the same patient (``omics`` and ``sample_id`` columns);
        may be empty.

    Returns
    -------
    pandas.DataFrame
        One row per interval; an empty frame when ``input_df`` has no rows.

    Raises
    ------
    ValueError
        If an interval holds a number of rows other than one or two.
    """
    genomics_exome_id = _first_sample_id(genomics_rows, "genomics_exome")
    genomics_array_id = _first_sample_id(genomics_rows, "genomics_array")

    if input_df.empty:
        return pd.DataFrame()

    all_intervals = []
    for current_interval in input_df["interval_id"].unique():
        current_subset = input_df[input_df["interval_id"] == current_interval]

        # If there are two samples in the interval, they must be from transcriptomics and proteomics
        if current_subset.shape[0] == 2:
            # Descending order on `omics` puts "transcriptomics" before "proteomics".
            current_subset = current_subset.sort_values(by="omics", ascending=False)
            transcriptomics_row = current_subset.iloc[0]
            proteomics_row = current_subset.iloc[1]
            comparison_result = transcriptomics_row.combine_first(proteomics_row)
            comparison_result["transcriptomics"] = transcriptomics_row["sample_id"]
            comparison_result["proteomics"] = proteomics_row["sample_id"]

        # If there is only one sample in the interval, it must be either transcriptomics or proteomics
        elif current_subset.shape[0] == 1:
            comparison_result = current_subset.iloc[0].copy()

            # If the sample is transcriptomics, assign NaN to proteomics
            if comparison_result["omics"] == "transcriptomics":
                comparison_result["transcriptomics"] = comparison_result["sample_id"]
                comparison_result["proteomics"] = np.nan

            # If the sample is proteomics, assign NaN to transcriptomics
            elif comparison_result["omics"] == "proteomics":
                comparison_result["transcriptomics"] = np.nan
                comparison_result["proteomics"] = comparison_result["sample_id"]

        else:
            # Without this branch the row built for the previous interval would
            # be silently appended a second time.
            raise ValueError(
                f"Interval {current_interval!r} has {current_subset.shape[0]} rows; "
                "expected 1 or 2"
            )

        comparison_result["genomics_exome"] = genomics_exome_id
        comparison_result["genomics_array"] = genomics_array_id
        all_intervals.append(comparison_result)
    return pd.DataFrame(all_intervals)


# Collapse every patient separately, then stack the per-patient results.
complete_list = []
unique_patient_ids = list(ordered_df["patient_id"].unique())
for current_patient in tqdm(unique_patient_ids):
    # Time-dependent samples and genomics rows belonging to this patient.
    patient_samples = ordered_df[ordered_df["patient_id"] == current_patient]
    patient_genomics = genomics_rows[genomics_rows["patient_id"] == current_patient]

    interval_grouping_df = process_single_patient(patient_samples)
    complete_list.append(flatten_intervals(interval_grouping_df, patient_genomics))

complete_df = pd.concat(complete_list, axis=0)

  0%|          | 0/1970 [00:00<?, ?it/s]

100%|██████████| 1970/1970 [01:51<00:00, 17.72it/s]


The `sample_id` and `omics` columns no longer make sense, as each row is now identified by the `patient_id` and the `interval_id`, and the `omics` column has been pivoted into one column per omics type.

In [8]:
# `sample_id` and `omics` are dropped from the collapsed table.
cleaner_table = complete_df.drop(columns=["sample_id", "omics"])

# Identifier and per-omics columns of the example patient.
interval_columns = [
    "patient_id",
    "interval_id",
    "transcriptomics",
    "proteomics",
    "genomics_exome",
    "genomics_array",
]
cleaner_table.loc[cleaner_table["patient_id"] == 9032329, interval_columns]

Unnamed: 0,patient_id,interval_id,transcriptomics,proteomics,genomics_exome,genomics_array
1095,9032329,1_colon,FS01338644,10521576,1203065541,203990550113_R06C02
1102,9032329,1_small_intestine,FS01629014,10521576,1203065541,203990550113_R06C02
1111,9032329,1_plasma,,10521576,1203065541,203990550113_R06C02
1094,9032329,2_plasma,,10641613,1203065541,203990550113_R06C02
1104,9032329,2_small_intestine,FS04156255,10641613,1203065541,203990550113_R06C02
1113,9032329,2_other,FS04156258,10641613,1203065541,203990550113_R06C02
1120,9032329,2_colon,FS01694799,10641613,1203065541,203990550113_R06C02
1100,9032329,3_plasma,,FR20600048,1203065541,203990550113_R06C02
1110,9032329,3_colon,8051600906,FR20600048,1203065541,203990550113_R06C02
1121,9032329,3_other,8051600883,FR20600048,1203065541,203990550113_R06C02


In [11]:
# Write the collapsed table to a Parquet file; the relative path is resolved
# against the current working directory.
cleaner_table.to_parquet("collapsed_metadata.parquet")

In [12]:
# Report (rows, columns) of the loaded metadata and of the collapsed result.
# Note: this prints the shape of `complete_df`, which has two more columns than
# the `cleaner_table` derived from it by dropping `sample_id` and `omics`.
print(f"Original metadata shape: {metadata.shape} and final shape: {complete_df.shape}")

Original metadata shape: (11270, 143) and final shape: (5309, 149)
