# Why is PPV lower when differential expression is lower?

PPV (positive predictive value, i.e. precision) in recent results is a bit lower for `log2_fc == 1.0` than for `log2_fc == 1.5`.

Here I look into the results to find the reason for this.

In [1]:
import logging

# Layout of every log line: timestamp, process id, logger name, level, message.
format_string = "%(asctime)s %(process)d %(name)-12s %(levelname)-8s %(message)s"

# Root logger reports INFO and above unless a named logger is made more verbose.
logging.basicConfig(
    format=format_string,
    datefmt="%Y-%m-%d %H:%M:%S",
    level="INFO",
)

# This notebook's own logger and the "ppv_issue" logger both report DEBUG and above.
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
logging.getLogger("ppv_issue").setLevel("DEBUG")

In [2]:
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Enable the %sql / %%sql magics and point them at DuckDB.
# NOTE(review): presumably `:default:` is DuckDB's shared in-process connection, which
# would be what lets the queries below reference Python objects by name -- confirm.
%load_ext sql
%sql duckdb:///:default:
# Hand query results back as pandas DataFrames.
%config SqlMagic.autopandas = True
# %config SqlMagic.feedback = False
# %config SqlMagic.displaycon = False

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [3]:
from results import gene_stats

In [5]:
%%sql df <<
select
    -- Per-gene statistics from `gene_stats` for the two fold-change settings being
    -- compared; the result is stored in the Python variable `df`.
    origin,
    malignant_means,
    -- Cast to numeric types for downstream grouping. The filters below compare against
    -- quoted literals, so these columns are presumably stored as text -- TODO confirm.
    log2_fc::float as log2_fc,
    run_id::int as run_id,
    gene_symbol,
    perturbed,
    "pval",
    "-log10_pval",
    "pval_adjusted_bh",
    -- Derived here rather than read from the table.
    -1.0 * log10("pval_adjusted_bh") as "-log10_pval_adjusted_bh",
from gene_stats
where
    origin = 'malignant_cibersortx'
    and malignant_means = 'None,None'
    and log2_fc in ('-1.50', '-1.00')
    -- Optional narrowing filters, disabled:
    --and run_id = '0'
    --and gene_symbol like 'HLA%'
-- Positional ordering: origin, malignant_means, log2_fc, run_id.
order by 1, 2, 3, 4
;

*  duckdb:///:default:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Done.


In [7]:
from helpers.running_cibersortx.loading_results import get_arrow_dataset_for_deg_analysis_results

# Location of the DEG analysis results to load as a second source of gene statistics.
DEG_ANALYSIS_RESULTS_URI = "gs://liulab/differential_composition_and_expression/copied/20230505_21h41m44s/deg_analysis/"

# Bound to a plain name so the SQL cell below can query it as `gene_stats_2`.
gene_stats_2 = get_arrow_dataset_for_deg_analysis_results(DEG_ANALYSIS_RESULTS_URI)

In [8]:
%%sql df_2 <<
select
    -- Same selection as the `df` query, but read from `gene_stats_2`;
    -- the result is stored in the Python variable `df_2`.
    origin,
    malignant_means,
    -- log2_fc and run_id are selected without a cast here, and the filter below uses
    -- numeric literals -- presumably this source already stores them as numbers (TODO confirm).
    --log2_fc::float as log2_fc,
    log2_fc,
    --run_id::int as run_id,
    run_id,
    gene_symbol,
    perturbed,
    "pval",
    "-log10_pval",
    "pval_adjusted_bh",
    -- Derived here rather than read from the dataset.
    -1.0 * log10("pval_adjusted_bh") as "-log10_pval_adjusted_bh",
from gene_stats_2
where
    origin = 'malignant_cibersortx'
    and malignant_means = 'None,None'
    and log2_fc in (-1.50, -1.00)
    -- Optional narrowing filters, disabled:
    --and run_id = '0'
    --and gene_symbol like 'HLA%'
-- Positional ordering: origin, malignant_means, log2_fc, run_id.
order by 1, 2, 3, 4
;

*  duckdb:///:default:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Done.


In [6]:
# Sanity check: row count, column dtypes and null counts of the `df` query result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160630 entries, 0 to 160629
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   origin                   160630 non-null  object 
 1   malignant_means          160630 non-null  object 
 2   log2_fc                  160630 non-null  float64
 3   run_id                   160630 non-null  int64  
 4   gene_symbol              160630 non-null  object 
 5   perturbed                160630 non-null  bool   
 6   pval                     160630 non-null  float64
 7   -log10_pval              160630 non-null  float64
 8   pval_adjusted_bh         160630 non-null  float64
 9   -log10_pval_adjusted_bh  160630 non-null  float64
dtypes: bool(1), float64(5), int64(1), object(3)
memory usage: 11.2+ MB


In [7]:
# Raise this logger to DEBUG *before* the import: the module logs a DEBUG
# message while it is being loaded, which would otherwise be filtered out.
logging.getLogger("query_gene_stats").setLevel("DEBUG")
import query_gene_stats

2023-05-25 20:33:00 28789 query_gene_stats DEBUG    loading module query_gene_stats


In [8]:
# Column used as the score/threshold axis; passed to both the precision-recall
# computation and the plotting call below.
threshold_field = "-log10_pval_adjusted_bh"

In [14]:
# One precision-recall curve per (origin, malignant_means, log2_fc, run_id) group,
# using `threshold_field` as the threshold column.
df_precision_recall = query_gene_stats.compute_all_precision_recall_curves(
    df,
    ["origin", "malignant_means", "log2_fc", "run_id"],
    threshold_field,
)

2023-05-25 20:37:35 28789 query_gene_stats DEBUG    loading module query_gene_stats
2023-05-25 20:37:35 28789 query_gene_stats INFO     computing precision-recall curve for group=('malignant_cibersortx', 'None,None', -1.5, 0)
2023-05-25 20:37:35 28789 query_gene_stats DEBUG    p[0]=0.006225487144369047, p[-1]=1.0, r[0]=1.0, r[-1]=0.0, t[0]=-0.0, t[-1]=10.361838990542063
2023-05-25 20:37:35 28789 query_gene_stats INFO     computing precision-recall curve for group=('malignant_cibersortx', 'None,None', -1.5, 1)
2023-05-25 20:37:35 28789 query_gene_stats DEBUG    p[0]=0.006225487144369047, p[-1]=1.0, r[0]=1.0, r[-1]=0.0, t[0]=-0.0, t[-1]=10.530554303487751
2023-05-25 20:37:35 28789 query_gene_stats INFO     computing precision-recall curve for group=('malignant_cibersortx', 'None,None', -1.5, 2)
2023-05-25 20:37:35 28789 query_gene_stats DEBUG    p[0]=0.006225487144369047, p[-1]=1.0, r[0]=1.0, r[-1]=0.0, t[0]=-0.0, t[-1]=10.662868986206044
2023-05-25 20:37:35 28789 query_gene_stats INFO  

In [16]:
# Plot the curves against the threshold; per the DEBUG log, the helper also marks
# -log10(0.1), i.e. FDR = 0.1, with a vertical line.
query_gene_stats.plot_precision_recall_by_threshold(df_precision_recall, threshold_field)

2023-05-25 20:37:41 28789 query_gene_stats DEBUG    adding vertical line at -log10(0.1) for FDR=0.1


In [None]:
%%sql df <<
select
    -- All gene stats: run 0 of every listed `malignant_means` setting, across all
    -- fold-change values. NOTE: this rebinds `df`, replacing the earlier query result.
    origin,
    malignant_means,
    log2_fc::float as log2_fc,
    run_id::int as run_id,
    gene_symbol,
    perturbed,
    "-log10_pval",
    -- Derived here rather than read from the table.
    -1.0 * log10("pval_adjusted_bh") as "-log10_pval_adjusted_bh",
from gene_stats
where
    origin = 'malignant_cibersortx'
    and malignant_means in ('None,None', '0.5,0.8', '0.6,0.8', '0.70,0.72')
    --and log2_fc in ('-1.50', '-1.00')
    and run_id = '0'
    --and gene_symbol like 'HLA%'
-- Positional ordering: origin, malignant_means, log2_fc, run_id.
order by 1, 2, 3, 4
;

# PPV grid for all experiments


In [None]:
# Distribution of -log10_pval: raw vs. BH-adjusted significance per gene,
# one facet per log2_fc value, coloured by whether the gene was perturbed.
import plotly.express as px  # imported here because no visible cell imports it

fig = px.scatter(
    # Output of the preceding "all gene stats" query, which selects exactly the
    # columns used here. The cell previously referenced an undefined name `result`.
    df,
    x="-log10_pval",
    y="-log10_pval_adjusted_bh",
    color="perturbed",
    facet_col="log2_fc",
    hover_name="gene_symbol",
    hover_data=["perturbed", "run_id"],
    # markers=True,
)
# y = 1.0 equals -log10(0.1), i.e. an adjusted p-value of 0.1.
fig.add_hline(y=1.0)
fig