In [2]:
import shutil
from pathlib import Path
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import numpy as np
import plotly.io as pio
from rpy2 import robjects

from src.visualization import graph

In [3]:
### Constants

# Cohort identifier; it is interpolated into every per-cohort file name below.
COHORT = "CESC"

# Input locations (all relative to the notebook's working directory).
RAW_DATA_DIR = Path("../data/raw")
INTERIM_DATA_DIR = Path("../data/interim/TCGA")
COUNTS_FILE = INTERIM_DATA_DIR / f"counts/{COHORT}.csv.gz"
COLDATA_FILE = INTERIM_DATA_DIR / f"coldata/{COHORT}.csv"
FIGURES_DIR = Path("../reports/figures")
DE_DIR = INTERIM_DATA_DIR / "de"
DE_RESULTS_FILE = DE_DIR / f"DE_results_{COHORT}.csv"

# Output location for the GSEA results (created at the end of this cell).
GSEA_DIR = INTERIM_DATA_DIR / "gsea"

# Fail fast, with a message naming the missing path, if any prerequisite is
# absent. Explicit raises are used instead of bare `assert`s so the checks
# still run under `python -O` and are consistent across all inputs.
# Each entry is (path, optional hint appended to the error message).
_REQUIRED_PATHS = (
    (RAW_DATA_DIR, ""),
    (COUNTS_FILE, " Run `0_preprocess_and_label.ipynb` first."),
    (COLDATA_FILE, ""),
    (FIGURES_DIR, ""),
    (DE_DIR, ""),
    (DE_RESULTS_FILE, ""),
)
for _required_path, _hint in _REQUIRED_PATHS:
    if not _required_path.exists():
        raise FileNotFoundError(f"{_required_path} does not exist.{_hint}")

# Only reached once every input exists; the parent directory is therefore
# already present, so `parents=True` is not needed.
GSEA_DIR.mkdir(exist_ok=True)

In [5]:
# Attach the R package named "src" in the embedded R session, silencing its
# startup messages.
robjects.r('suppressPackageStartupMessages(library("src"))')

# Look up the R function `gsea_go` in the R session.
gsea = robjects.r["gsea_go"]

# The Path constants are converted to plain strings before being handed to R.
de_file = str(DE_RESULTS_FILE)
gsea_dir = str(GSEA_DIR)

# The call is made for its side effects (later cells read files from
# GSEA_DIR); the trailing semicolon keeps the notebook from echoing the
# uninformative R NULL return value as the cell output.
gsea(de_file, gsea_dir);


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

R[write to console]: In addition: 

R[write to console]: 1: replacing previous import ‘data.table::last’ by ‘dplyr::last’ when loading ‘src’ 

R[write to console]: 2: replacing previous import ‘data.table::first’ by ‘dplyr::first’ when loading ‘src’ 

R[write to console]: 3: replacing previous import ‘data.table::between’ by ‘dplyr::between’ when loading ‘src’ 

R[write to console]: 4: replacing previous import ‘dplyr::select’ by ‘AnnotationDbi::select’ when loading ‘src’ 

R[write to console]: 5: replacing previous import ‘AnnotationDbi::select’ by ‘clusterProfiler::select’ when loading ‘src’ 

R[write to console]: 6: replacing previous import ‘ggplot2::last_plot’ by ‘plotly::last_plot’ when loading ‘src’ 

R[write to console]: 7: replacing previous import ‘ggpubr::color_palette’ by ‘enrichplot::color_palette’ when loading ‘src’ 



Processing: CESC 


R[write to console]: 'select()' returned 1:many mapping between keys and columns

R[write to console]: preparing geneSet collections...

R[write to console]: GSEA analysis...

R[write to console]: leading edge analysis...

R[write to console]: done...

R[write to console]: preparing gene to GO mapping data...

R[write to console]: preparing IC data...

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to console]: 'select()' returned 1:1 mapping between keys and columns

R[write to co

Saving 7 x 7 in image


<rpy2.rinterface_lib.sexp.NULLType object at 0x7fec33be3150> [0]

In [6]:
# Place a copy of every PNG found directly inside GSEA_DIR into FIGURES_DIR.
for figure_path in GSEA_DIR.glob("*.png"):
    shutil.copy(figure_path, FIGURES_DIR)

## Gene-concept network

In [4]:
# Load the three tables passed to the plot; each uses its first column as index.
gsea_res = pd.read_csv(
    GSEA_DIR / f"gsea_go_bp_top_terms_{COHORT}.csv",
    index_col=0,
)
de_res = pd.read_csv(DE_RESULTS_FILE, index_col=0)
genes_df = pd.read_csv(RAW_DATA_DIR / "genes.csv.gz", index_col=0)

# Options forwarded to the project's plotting helper; both seeds are fixed
# to 0 so repeated runs use the same seed values.
network_plot_options = {
    "color_metric": "log2FoldChange",
    "pvalue_threshold": 0.01,
    "layout_seed": 0,
    "color_seed": 0,
}
fig = graph.gene_concept_network_plot(
    gsea_res, de_res, genes_df, **network_plot_options
)

# Write the figure as an HTML file in the figures directory.
network_html_path = FIGURES_DIR / f"gene_concept_network_{COHORT}.html"
pio.write_html(fig, str(network_html_path))