In [1]:
from pathlib import Path
import subprocess
import json

from IPython import get_ipython
import pandas as pd

import data_preprocessing as prep

In [2]:
# Paths
#
# Every location below is derived from the current working directory, which is
# assumed to be the project's data-prep folder (the one holding this notebook).

# Path.cwd() replaces the former `pwd` shell call: no subprocess is spawned and
# it does not depend on IPython or on a POSIX shell being available.
DATA_PREP_DIR = Path.cwd()
BASE_DIR = DATA_PREP_DIR.parent
METADATA_DIR = DATA_PREP_DIR / "metadata"
ANALYSIS_DIR = BASE_DIR / "analysis"
DATA_URLS_PATH = METADATA_DIR / "geo_data_urls.json"
DATA_DIR = BASE_DIR / "data"
CELFILE_DIR = DATA_DIR / "cel_files"

In [3]:
# Sanity check: list every configured location and report what exists there

path_report = [
    ("Data prep directory", DATA_PREP_DIR),
    ("Base directory", BASE_DIR),
    ("Data URLs", DATA_URLS_PATH),
    ("Metadata directory", METADATA_DIR),
    ("Data directory", DATA_DIR),
    ("CEL file directory", CELFILE_DIR),
]
separator = '-' * 80

for label, location in path_report:
    print(separator)
    print(f"{label}: {location}")
    # A directory is reported first, then a regular file, otherwise nothing was found
    if location.is_dir():
        status = "Directory."
    elif location.is_file():
        status = "File."
    else:
        status = "Not found."
    print(status)

--------------------------------------------------------------------------------
Data prep directory: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep
Directory.
--------------------------------------------------------------------------------
Base directory: /media/data/lab/gene/characterizing-endometriosis-transcriptome
Directory.
--------------------------------------------------------------------------------
Data URLs: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/metadata/geo_data_urls.json
File.
--------------------------------------------------------------------------------
Metadata directory: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/metadata
Directory.
--------------------------------------------------------------------------------
Data directory: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data
Directory.
---------------------------------------------------------------------

In [4]:
# Download GEO data

# Make sure the CEL file directory exists before anything is written into it
CELFILE_DIR.mkdir(parents=True, exist_ok=True)

# Load the key -> download URL mapping from the JSON file
with open(DATA_URLS_PATH, "r") as json_file:
    geo_urls = json.load(json_file)

# For each entry: fetch the archive next to its target folder, then unpack it
# into a folder named after the key (the archive is deleted afterwards)
for key in geo_urls:
    url = geo_urls[key]
    print(f"Downloading file from: {url}")
    tar_path = CELFILE_DIR / f"{key}.tar"
    extract_dir = CELFILE_DIR / key
    prep.download_file(url, tar_path)
    print(f"Untarring and unzipping file at: {tar_path}")
    prep.untar_and_unzip(tar_path, extract_dir, delete_archive=True)

print("All files downloaded and extracted successfully.")

Downloading file from: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE4888&format=file
Untarring and unzipping file at: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE4888.tar
Downloading file from: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE6364&format=file
Untarring and unzipping file at: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE6364.tar
Downloading file from: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE51981&format=file
Untarring and unzipping file at: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE51981.tar
Downloading file from: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE29981&format=file
Untarring and unzipping file at: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE29981.tar
All files downloaded and extracted successfully.


In [5]:
# Get counts and sample conditions

counts_dir = DATA_DIR / "all"
counts_dir.mkdir(parents=True, exist_ok=True)

probe_maps_path = METADATA_DIR / "probe_maps"

# Run the R preprocessing script, passing it the CEL file directory and the
# counts directory as its two arguments.
preprocessing_script = str(DATA_PREP_DIR / "rma_counts.r")
cmd = ["Rscript", preprocessing_script, str(CELFILE_DIR), str(counts_dir)]
print(f"Executing command:\n{' '.join(cmd)}")
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# R run stops the cell here instead of silently continuing to the next step.
subprocess.run(cmd, check=True)

# NOTE(review): presumably consumes the files the R script left in counts_dir —
# confirm against data_preprocessing.prep_geo_counts.
prep.prep_geo_counts(counts_dir, probe_maps_path, METADATA_DIR)

Executing command:
Rscript /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/rma_counts.r /media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files /media/data/lab/gene/characterizing-endometriosis-transcriptome/data/all


Loading required package: BiocGenerics

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min

Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.



Preprocessing data in the following directories:
/media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE29981
/media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE4888
/media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE51981
/media/data/lab/gene/characterizing-endometriosis-transcriptome/data/cel_files/GSE6364
------------------------------------------------






Background correcting
Normalizing
Calculating Expression
Background correcting
Normalizing
Calculating Expression
Background correcting
Normalizing
Calculating Expression
Background correcting
Normalizing
Calculating Expression


1: replacing previous import ‘AnnotationDbi::tail’ by ‘utils::tail’ when loading ‘hgu133plus2cdf’ 
2: replacing previous import ‘AnnotationDbi::head’ by ‘utils::head’ when loading ‘hgu133plus2cdf’ 


Probe map found: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/metadata/probe_maps/hsapiens_affy_hg_u133_plus_2.tsv
Probe map found: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/metadata/probe_maps/hsapiens_affy_hg_u133_plus_2.tsv
Probe map found: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/metadata/probe_maps/hsapiens_affy_hg_u133_plus_2.tsv
Probe map found: /media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/metadata/probe_maps/hsapiens_affy_hg_u133_plus_2.tsv


In [6]:
# Unify counts

all_series = ["GSE4888", "GSE6364", "GSE51981", "GSE29981"]

# One counts table per series, plus the destination of the merged table
counts_paths = []
for series in all_series:
    counts_paths.append(DATA_DIR / "all" / f"{series}_counts.tsv")
unified_counts_path = counts_dir / "all_phases_all_genes_counts.tsv"

# Read every table with its first column as the index and place them side by side
dataframes = []
for filepath in counts_paths:
    dataframes.append(pd.read_csv(filepath, index_col=0, delimiter='\t'))
merged_counts_df = pd.concat(dataframes, axis=1)

# Report (without stopping) when the merged table contains any missing value
has_missing_values = merged_counts_df.isnull().values.any()
if has_missing_values:
    print('There are missing values in the merged dataframe.')

# Write the merged table as TSV
merged_counts_df.to_csv(unified_counts_path, sep='\t')

# Remove the per-series tables now that the merged one is on disk
for filepath in counts_paths:
    filepath.unlink()

In [8]:
# Copy coldata into data directory
original_unified_coldata_path = METADATA_DIR / "coldata.tsv"
unified_coldata_path = DATA_DIR / "all" / "all_phases_coldata.tsv"

# Read the metadata table as text and write it unchanged to the destination;
# the cell's value is whatever write_text returns
coldata_text = original_unified_coldata_path.read_text()
unified_coldata_path.write_text(coldata_text)

In [9]:
# Keep only the counts columns whose name appears in the coldata index
# (samples can be absent from coldata, e.g. due to ambiguous histology reading)
counts = pd.read_csv(unified_counts_path, sep='\t', index_col=0)
coldata = pd.read_csv(unified_coldata_path, sep='\t', index_col=0)

# Boolean mask over the counts columns: True where the sample is listed in coldata
known_sample_mask = counts.columns.isin(coldata.index)
counts = counts.loc[:, known_sample_mask]

# Overwrite the unified counts file with the reduced table
counts.to_csv(unified_counts_path, sep='\t')

In [10]:
# Batch correction

# NOTE(review): these two levels are defined here but are not passed to the
# R script in this cell — confirm whether a later step relies on them.
reference_level = "healthy"
contrast_level = "endometriosis"

bc_script = str(DATA_PREP_DIR / "batch_correction.r")

# The R script receives the unified counts path and the unified coldata path
cmd = ["Rscript", bc_script, str(unified_counts_path), str(unified_coldata_path)]
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# batch correction cannot go unnoticed. The call stays the last expression so
# the notebook still displays the CompletedProcess.
subprocess.run(cmd, check=True)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: mgcv
Loading required package: nlme

Attaching package: ‘nlme’

The following object is masked from ‘package:dplyr’:

    collapse

This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
Loading required package: genefilter

Attaching package: ‘genefilter’

The following object is masked from ‘package:readr’:

    spec

Loading required package: BiocParallel
Rows: 44341 Columns: 217
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr   (1): hgnc_symbol
dbl (216): GSM109814, GSM109815, GSM109816, GSM109817, GSM109820, GSM109821,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 216 Columns: 4
── Colu

CompletedProcess(args=['Rscript', '/media/data/lab/gene/characterizing-endometriosis-transcriptome/data_prep/batch_correction.r', '/media/data/lab/gene/characterizing-endometriosis-transcriptome/data/all/all_phases_all_genes_counts.tsv', '/media/data/lab/gene/characterizing-endometriosis-transcriptome/data/all/all_phases_coldata.tsv'], returncode=0)

In [27]:
# Create filtered counts for all matrisome and core matrisome genes


def _load_symbols(json_path):
    """Return the value stored under the 'symbols' key of the JSON file at `json_path`."""
    with open(json_path, 'r') as json_file:
        return json.load(json_file)['symbols']


def _mean_counts_for(counts_table, symbols):
    """Keep the rows whose 'symbol' is in `symbols`, then average rows sharing a symbol."""
    selected_rows = counts_table[counts_table['symbol'].isin(symbols)]
    return selected_rows.groupby('symbol').mean().reset_index()


matrisome_core_genes = _load_symbols(ANALYSIS_DIR / "core_matrisome_genes.json")
matrisome_all_genes = _load_symbols(ANALYSIS_DIR / "all_matrisome_genes.json")

counts_df = pd.read_csv(unified_counts_path, sep='\t')

# Core matrisome subset
core_matrisome_counts_path = DATA_DIR / "all" / "all_phases_core_matrisome_counts.tsv"
core_matrisome_counts_df = _mean_counts_for(counts_df, matrisome_core_genes)
core_matrisome_counts_df.to_csv(core_matrisome_counts_path, sep='\t', index=False)

# Full matrisome subset
all_matrisome_counts_path = DATA_DIR / "all" / "all_phases_all_matrisome_counts.tsv"
all_matrisome_counts_df = _mean_counts_for(counts_df, matrisome_all_genes)
all_matrisome_counts_df.to_csv(all_matrisome_counts_path, sep='\t', index=False)

In [31]:
# Separate by phase

phases = [
    "early_secretory",
    "mid_secretory",
    "proliferative"
]

gene_sets = {
    "all_genes": unified_counts_path,
    "all_matrisome": all_matrisome_counts_path,
    "core_matrisome": core_matrisome_counts_path
}

# Inputs (already on disk):
#     - all_phases_all_genes_counts.tsv
#     - all_phases_all_matrisome_counts.tsv
#     - all_phases_core_matrisome_counts.tsv
#     - all_phases_coldata.tsv
#
# Outputs, written to DATA_DIR / "all" for every phase in `phases`:
#     - {phase}_coldata.tsv
#     - {phase}_all_genes_counts.tsv
#     - {phase}_all_matrisome_counts.tsv
#     - {phase}_core_matrisome_counts.tsv

phase_output_dir = DATA_DIR / "all"

coldata_df = pd.read_csv(unified_coldata_path, sep='\t')

# The coldata subset of a phase does not depend on the gene set, so it is
# built and saved once per phase and reused for every counts table below.
coldata_by_phase = {}
for phase in phases:
    phase_coldata_df = coldata_df[coldata_df['phase'] == phase]
    phase_coldata_file = phase_output_dir / f"{phase}_coldata.tsv"
    phase_coldata_df.to_csv(phase_coldata_file, sep='\t', index=False)
    coldata_by_phase[phase] = phase_coldata_df

for gene_set, counts_path in gene_sets.items():
    counts_df = pd.read_csv(counts_path, sep='\t')
    for phase in phases:
        # Keep the 'symbol' column plus one column per sample of this phase
        phase_samples = coldata_by_phase[phase]['sample_name'].tolist()
        phase_counts_df = counts_df[['symbol'] + phase_samples]

        # Save the counts data for each phase and gene set
        phase_counts_file = phase_output_dir / f"{phase}_{gene_set}_counts.tsv"
        phase_counts_df.to_csv(phase_counts_file, sep='\t', index=False)

In [13]:
"""
Set aside 45 samples for validation (24 healthy, 21 endometriosis) and separate fit/test sets.

The test set contains the 45 validation samples, and the fit set contains all other samples.
"""

n_healthy = 24
n_endometriosis = 21

coldata_df = pd.read_csv(unified_coldata_path, sep='\t')

healthy_samples = coldata_df[coldata_df['condition'] == 'healthy']['sample_name'].tolist()
endometriosis_samples = coldata_df[coldata_df['condition'] == 'endometriosis']['sample_name'].tolist()

# Choose 24 healthy and 21 endometriosis samples from the end of the list
validation_samples = healthy_samples[-n_healthy:] + endometriosis_samples[-n_endometriosis:]

# Remove validation samples from the list of all samples
fit_samples = healthy_samples[:-n_healthy] + endometriosis_samples[:-n_endometriosis]

samples = {
    "fit": fit_samples,
    "test": validation_samples
}

# Create fit and test sets
fit_dir = DATA_DIR / "fit"
test_dir = DATA_DIR / "test"
fit_dir.mkdir(exist_ok=True)
test_dir.mkdir(exist_ok=True)

for gene_set, counts_path in gene_sets.items():
    counts_df = pd.read_csv(counts_path, sep='\t')
    for phase in phases + ["all_phases"]:
        # Filter counts columns using the phase column of coldata
        if phase == "all_phases":
            phase_samples = coldata_df['sample_name'].values
        else:
            phase_samples = coldata_df[coldata_df['phase'] == phase]['sample_name'].values
        
        for sample_type in samples:
            # Filter counts and coldata for fit/test samples
            sample_counts_df = counts_df[['symbol'] + [sample for sample in samples[sample_type] if sample in phase_samples]]
            sample_coldata_df = coldata_df[coldata_df['sample_name'].isin(samples[sample_type])]

            # Save files for each phase and sample type
            if sample_type == "fit":
                sample_counts_file = fit_dir / f"{phase}_{gene_set}_counts.tsv"
                sample_coldata_file = fit_dir / f"{phase}_coldata.tsv"
            else:
                sample_counts_file = test_dir / f"{phase}_{gene_set}_counts.tsv"
                sample_coldata_file = test_dir / f"{phase}_coldata.tsv"

            sample_counts_df.to_csv(sample_counts_file, sep='\t', index=False)
            sample_coldata_df.to_csv(sample_coldata_file, sep='\t', index=False)