In [4]:
from pathlib import Path
import subprocess
import json
import util as prep
from IPython import get_ipython
import pandas as pd

In [None]:
# TODO: coldata prep, validation split

In [None]:
# Paths
#
# Every location below is derived from the kernel's current working directory,
# so the notebook has to be run from its own folder (one level below the
# data-prep directory).

# Path.cwd() is used instead of shelling out to `pwd` through IPython: no
# subprocess, no dependency on an IPython session, and it also works on
# platforms that have no `pwd` command.
DATA_PREP_DIR = Path.cwd().parent
BASE_DIR = DATA_PREP_DIR.parent
DATA_URLS_PATH = DATA_PREP_DIR / "geo_data_urls.json"
METADATA_DIR = DATA_PREP_DIR / "metadata"
DATA_DIR = BASE_DIR / "data"
CELFILE_DIR = DATA_DIR / "cel_files"

In [None]:
# Download GEO data
#
# For every entry in the URL manifest: fetch the archive into CELFILE_DIR,
# then unpack it into a sub-directory named after the entry's key.

# Make sure the CEL file directory is there before anything is written to it
CELFILE_DIR.mkdir(parents=True, exist_ok=True)

# Load the key -> download URL mapping from the JSON manifest
with open(DATA_URLS_PATH, "r") as json_file:
    data = json.load(json_file)

for key in data:
    url = data[key]
    tar_path = CELFILE_DIR / f"{key}.tar"
    extract_dir = CELFILE_DIR / key

    print(f"Downloading file from: {url}")
    prep.download_file(url, tar_path)

    print(f"Untarring and unzipping file at: {tar_path}")
    # NOTE(review): delete_archive=True presumably removes the .tar once it
    # has been unpacked - confirm in util.untar_and_unzip.
    prep.untar_and_unzip(tar_path, extract_dir, delete_archive=True)

print("All files downloaded and extracted successfully.")

In [5]:
# Get counts and sample conditions
#
# 1. Run the R preprocessing script on the CEL file directory, handing it
#    counts_dir as its second argument.
# 2. Hand counts_dir, the probe maps and the metadata directory to
#    prep.prep_geo_counts.

counts_dir = DATA_DIR / "all"
counts_dir.mkdir(parents=True, exist_ok=True)

probe_maps_path = METADATA_DIR / "probe_maps"

preprocessing_script = str(DATA_PREP_DIR / "rma_counts.r")
cmd = ["Rscript", preprocessing_script, str(CELFILE_DIR), str(counts_dir)]
# check=True: raise CalledProcessError if Rscript exits non-zero, instead of
# silently carrying on and post-processing missing or partial output.
subprocess.run(cmd, check=True)

prep.prep_geo_counts(counts_dir, probe_maps_path, METADATA_DIR)

In [None]:
# Unify counts
#
# Merge the individual counts tables in counts_dir column-wise into a single
# table, then remove the individual files that were merged.

unified_counts_path = counts_dir / "all_phases_all_genes_counts.tsv"

# Path.glob() returns a one-shot generator. Reading the files used to consume
# it, which left the clean-up loop at the bottom with nothing to iterate over.
# The matches are therefore materialised once (sorted, so the column order is
# reproducible) and the same list is reused for reading and deleting.
#
# "*.tsv" also matches the "*_coldata.tsv" files that the coldata-unification
# step globs from this same directory and, on a re-run, the unified output
# itself. Neither is a counts table, so both are left out of the merge (and
# therefore out of the clean-up).
counts_paths = sorted(
    filepath
    for filepath in counts_dir.glob("*.tsv")
    if filepath != unified_counts_path
    and not filepath.name.endswith("_coldata.tsv")
)
if not counts_paths:
    raise FileNotFoundError(f"No counts files (*.tsv) found in {counts_dir}")

dataframes = [pd.read_csv(filepath, index_col=0, delimiter='\t') for filepath in counts_paths]
merged_counts_df = pd.concat(dataframes, axis=1)

# Check if there are any missing values in the merged dataframe
if merged_counts_df.isnull().any().any():
    print('There are missing values in the merged dataframe.')

# save merged dataframe to tsv
merged_counts_df.to_csv(unified_counts_path, sep='\t')

# delete the individual counts files that were merged above
for filepath in counts_paths:
    filepath.unlink()

In [None]:
# Unify coldata and add column for batch number
#
# Stack the per-series coldata tables row-wise, ordered by batch number, write
# the result to a single file and remove the per-series files.
#
# NOTE(review): despite the heading, nothing in this cell adds a batch column;
# the batch number is only used to order the rows. Confirm whether
# batch_correction.r needs an explicit column and under which name.

unified_coldata_path = counts_dir / "all_phases_all_genes_coldata.tsv"

# Path.glob() is a one-shot generator, so materialise the matches (sorted for
# a reproducible result) before iterating over them.
coldata_paths = sorted(counts_dir.glob("*_coldata.tsv"))

# Batch number per GEO series; batch 1 (GSE4888) is the reference batch.
batch_by_series = {
    "gse4888": 1,
    "gse6364": 2,
    "gse51981": 3,
    "gse29981": 4,
}

coldata_paths_by_batch = {}

for path in coldata_paths:
    # Match on the file name only: matching on the whole path would assign
    # every file to the same batch if a parent directory contained a series id.
    file_name = path.name.lower()
    for series, batch in batch_by_series.items():
        if series in file_name:
            coldata_paths_by_batch[batch] = path
            break

if not coldata_paths_by_batch:
    raise FileNotFoundError(f"No per-series coldata files found in {counts_dir}")

missing_batches = sorted(set(batch_by_series.values()) - set(coldata_paths_by_batch))
if missing_batches:
    print(f"No coldata file found for batch(es): {missing_batches}")

batch_dfs = {
    batch: pd.read_csv(path, index_col=0, delimiter='\t')
    for batch, path in coldata_paths_by_batch.items()
}

merged_coldata_df = pd.concat([batch_dfs[batch] for batch in sorted(batch_dfs)], axis=0)

# Check if there are any missing values in the merged dataframe
if merged_coldata_df.isnull().any().any():
    print('There are missing values in the merged dataframe.')

# save merged dataframe to tsv
merged_coldata_df.to_csv(unified_coldata_path, sep='\t')

# delete individual coldata files - only the ones that were actually merged,
# so any other "*_coldata.tsv" file in the directory is left alone
for filepath in coldata_paths_by_batch.values():
    filepath.unlink()

In [None]:
# Batch correction
#
# Runs batch_correction.r with the unified counts file, the unified coldata
# file and counts_dir as its three arguments.

# NOTE(review): these two levels are defined here but are not passed to the
# R script by this cell.
reference_level = "healthy"
contrast_level = "endometriosis"

bc_script = str(DATA_PREP_DIR / "batch_correction.r")

cmd = ["Rscript", bc_script, str(unified_counts_path), str(unified_coldata_path), str(counts_dir)]
# check=True: raise CalledProcessError if Rscript exits non-zero, so a failed
# batch correction cannot go unnoticed by the cells that follow.
subprocess.run(cmd, check=True)

In [None]:
# Create filtered counts for all matrisome and core matrisome genes

def _mean_counts_for_symbols(counts, symbols):
    """Keep the rows whose 'hgnc_symbol' is in `symbols`, then average the
    rows that share a symbol so each symbol appears once."""
    selected = counts.loc[counts['hgnc_symbol'].isin(symbols)]
    return selected.groupby('hgnc_symbol').mean().reset_index()


# Gene symbol lists, stored under the 'symbols' key of each metadata file
with open(METADATA_DIR / "core_matrisome_genes.json", 'r') as json_file:
    core_gene_sets = json.load(json_file)
matrisome_core_genes = core_gene_sets['symbols']

with open(METADATA_DIR / "all_matrisome_genes.json", 'r') as json_file:
    all_gene_sets = json.load(json_file)
matrisome_all_genes = all_gene_sets['symbols']

counts_df = pd.read_csv(unified_counts_path, sep='\t')
matrisome_out_dir = DATA_DIR / "all"

# Core matrisome subset
core_counts_path = matrisome_out_dir / "all_phases_core_matrisome_counts.tsv"
core_matrisome_counts_df = _mean_counts_for_symbols(counts_df, matrisome_core_genes)
core_matrisome_counts_df.to_csv(core_counts_path, sep='\t', index=False)

# Full matrisome subset
all_counts_path = matrisome_out_dir / "all_phases_all_matrisome_counts.tsv"
all_matrisome_counts_df = _mean_counts_for_symbols(counts_df, matrisome_all_genes)
all_matrisome_counts_df.to_csv(all_counts_path, sep='\t', index=False)

In [None]:
# Separate phases
#
# For every gene set, write one counts table per phase that contains the
# 'hgnc_symbol' column plus the samples listed in that phase's coldata file.

phases = [
    "all_phases",
    "early_secretory",
    "mid_secretory",
    "proliferative"
]

all_phase_counts_fp_per_gene_set = {
    "all_genes": DATA_DIR / "all" / "all_phases_all_genes_counts.tsv",
    "all_matrisome": DATA_DIR / "all" / "all_phases_all_matrisome_counts.tsv",
    "core_matrisome": DATA_DIR / "all" / "all_phases_core_matrisome_counts.tsv"
}

# The coldata does not depend on the gene set, so each phase's column list is
# read once here instead of once per gene set.
columns_by_phase = {}
for phase in phases:
    coldata_fp = DATA_DIR / "all" / f"{phase}_coldata.tsv"
    coldata_df = pd.read_csv(coldata_fp, sep='\t')
    sample_names = coldata_df['sample_name'].tolist()
    columns_by_phase[phase] = ["hgnc_symbol", *sample_names]

for gene_set, all_phase_counts_fp in all_phase_counts_fp_per_gene_set.items():
    all_phase_counts_df = pd.read_csv(all_phase_counts_fp, sep='\t')

    for phase, columns in columns_by_phase.items():
        filtered_counts_df = all_phase_counts_df[columns]
        # Swap the phase tag in the file name only. Replacing it in the whole
        # path string would also rewrite any parent directory whose name
        # happens to contain "all_phases".
        counts_fp = all_phase_counts_fp.with_name(
            all_phase_counts_fp.name.replace("all_phases", phase)
        )
        # For phase == "all_phases" the target is the input file itself, which
        # is overwritten with the coldata-selected columns.
        filtered_counts_df.to_csv(counts_fp, sep='\t', index=False)

In [None]:
"""
Set aside 45 samples for validation (24 healthy, 21 endometriosis) and separate fit/test sets.

The test set contains the 45 validation samples, and the fit set contains all other samples.
"""

n_healthy = 24
n_endometriosis = 21

