Logic for parsing chembl dataset to curate effector gene list
Load chembl dataset
filter by phase 3+4
keep efo-gene pairings which were successful

Load L2G feature matrix
Load study index
Load credible sets # maybe not needed

For each efo-gene pairing:
    Identify studylocusId-to-gene pairings that match the efo
    Select the entry with the least amount of feature missingness

Filter entries based on feature strength, do not want entries with 1 feature or 2 features only.
    Investigate, how many entries remain after different thresholds? at least 5/4/3 features?

This will be the new effector gene list / gold standard positives

For the negatives:
Join the studylocusId in the entries back to the credible set.
    Using the variantId information in the credible set, join with the gene index to identify other nearby genes.
    These will become gold standard negatives.
    Recycle the logic from current GSN generation to filter using stringDB < 0.7

    Furthermore, apply a similar filter on the GSN feature matrix, retaining only top 1/2/3/4/5 entries with at least 3/4/5 features.

Investigate the schema required for the gold standards
Annotate the GSP + GSN entries accordingly.
Modify the code in l2g.py to utilise GSP + GSN input directly, instead of automatically identifying GSNs each run.




In [5]:
# Generate new gold standards from chembl:
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession

from pyspark.sql import Window
from pyspark.sql.functions import row_number, desc, col

from pyspark.sql.functions import explode
import sys
from gentropy.common.session import Session
from pyspark.sql import functions as f

#session=Session(
#    extended_spark_conf={"spark.driver.memory": "12g", "spark.kryoserializer.buffer.max": "500m","spark.driver.maxResultSize":"3g"}
#    )

from pyspark import SparkConf
from pyspark.sql import SparkSession
app_name = "example_app"
CREDENTIALS = "/Users/xg1/.config/gcloud/service_account_credentials.json" 


GCS_CONNECTOR_CONF = {
    "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "spark.jars": "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar",
    "spark.hadoop.google.cloud.auth.service.account.enable": "true",
    "spark.hadoop.google.cloud.auth.service.account.json.keyfile": CREDENTIALS,
}

extended_spark_conf = {
    "spark.driver.memory": "12g",
    "spark.kryoserializer.buffer.max": "500m",
    "spark.driver.maxResultSize": "3g",
}

# Combine both configurations
combined_conf = {**GCS_CONNECTOR_CONF, **extended_spark_conf}

# Initialize SparkConf with the combined configuration
spark_config = SparkConf().setAll(combined_conf.items())

# Create the Spark session
session = SparkSession.builder.config(conf=spark_config).appName(app_name).getOrCreate()
#spark_config = SparkConf().setAll(GCS_CONNECTOR_CONF.items())
#session = SparkSession.builder.config(conf=spark_config).appName("example").getOrCreate()

def TA_OncoOrNot (diseases):
    ### create a dataframe asigning TA code, names and Oncology/Other 
    taDf = session.createDataFrame(
        data=[
            ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
            ("EFO_0005741", "infectious disease", "Other"),
            ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
            ("EFO_0005932", "animal disease", "Other"),
            ("MONDO_0024458", "disease of visual system", "Other"),
            ("EFO_0000319", "cardiovascular disease", "Other"),
            ("EFO_0009605", "pancreas disease", "Other"),
            ("EFO_0010282", "gastrointestinal disease", "Other"),
            ("OTAR_0000017", "reproductive system or breast disease", "Other"),
            ("EFO_0010285", "integumentary system disease", "Other"),
            ("EFO_0001379", "endocrine system disease", "Other"),
            ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
            ("EFO_0009690", "urinary system disease", "Other"),
            ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
            ("MONDO_0021205", "disease of ear", "Other"),
            ("EFO_0000540", "immune system disease", "Other"),
            ("EFO_0005803", "hematologic disease", "Other"),
            ("EFO_0000618", "nervous system disease", "Other"),
            ("MONDO_0002025", "psychiatric disorder", "Other"),
            ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
            ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
            ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
            ("EFO_0000651", "phenotype", "Other"),
            ("EFO_0001444", "measurement", "Other"),
            ("GO_0008150", "biological process", "Other"),
        ],
        schema=StructType(
            [
                StructField("taId", StringType(), True),
                StructField("taLabel", StringType(), True),
                StructField("taLabelSimple", StringType(), True),
            ]
        ),
    ).withColumn("taRank", f.monotonically_increasing_id())

    ### window over disease to take Oncology VS non oncology
    wByDisease = Window.partitionBy("diseaseId")
  ### explode therapy areas of diseases and joining the dataframe, categorise them between Onco or Others
    return (
        diseases.withColumn("taId", f.explode("therapeuticAreas"))
        .select(f.col("id").alias("diseaseId"), "taId", "parents")
        .join(taDf, on="taId", how="left")
        .withColumn("minRank", f.min("taRank").over(wByDisease))
        .filter(f.col("taRank") == f.col("minRank"))
        .drop("taRank", "minRank")
    )

diseases=session.read.parquet("/Users/xg1/Downloads/otg_releases/diseases.parquet")
TA_diseases=TA_OncoOrNot(diseases).select("diseaseId", "taLabelSimple").persist()

windowSpec = Window.partitionBy("diseaseFromSourceMappedId", "targetId")

[Stage 7:>  (24 + 8) / 89][Stage 10:> (0 + 0) / 200][Stage 12:>   (0 + 0) / 1]

In [2]:
sys.path.append("../../gentropy/src/")
release_path="../../otg_releases"
release_ver="2406"


chembl_evidence=session.read.parquet(
    "/Users/xg1/Downloads/platform_2409_evidence/evidence/sourceId\=chembl").select(
        "targetId", "drugId", "clinicalPhase", 
        "diseaseFromSourceMappedId", 
        "diseaseFromSource", 
        "clinicalStatus").withColumn(
            "maxClinicalPhase", f.max("clinicalPhase").over(windowSpec)).filter(
                f.col("clinicalPhase") == f.col("maxClinicalPhase")).drop("clinicalPhase", "drugId").distinct()

chembl_evidence_noOncology=chembl_evidence.join(TA_diseases.withColumnRenamed(
    "diseaseId", "diseaseFromSourceMappedId"), on="diseaseFromSourceMappedId", how="inner").filter(
    (f.col("taLabelSimple") != "Oncology") | f.col("taLabelSimple").isNull())

chembl_evidence_noOncology.withColumnRenamed(
    "diseaseFromSourceMappedId", "efo_terms").withColumnRenamed(
        "targetId", "geneId").filter(f.col("maxClinicalPhase") >= 3).filter(f.col("clinicalStatus") == "Completed").select("efo_terms", "geneId").distinct().count()

                                                                                

14874

In [3]:
GS=session.read.json(
    f"{release_path}/{release_ver}/locus_to_gene_gold_standard.json")

selected_df = GS.filter(f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])).select(
    "gold_standard_info.gene_id",
    "trait_info.ontology",
    "trait_info.reported_trait_name"
).withColumnRenamed(
    "ontology", "efo_terms").withColumnRenamed(
        "gene_id", "geneId").withColumnRenamed(
            "reported_trait_name", "diseaseFromSource").select(
                "efo_terms", "geneId", "diseaseFromSource").withColumn(
                    "GS_source", f.lit("old_otg_gs"))

chembl_GS=chembl_evidence_noOncology.withColumnRenamed(
    "diseaseFromSourceMappedId", "efo_terms").withColumnRenamed(
        "targetId", "geneId").filter(f.col("maxClinicalPhase") >= 3).filter(f.col("clinicalStatus") == "Completed").select("efo_terms", "geneId", "diseaseFromSource").distinct().withColumn(
                    "GS_source", f.lit("chembl_p3_p4_2409"))

expanded_df = selected_df.withColumn("efo_terms", f.explode("efo_terms"))
new_egl=expanded_df.unionByName(chembl_GS)

#new_egl.write.parquet("/Users/xg1/Downloads/Effector_gene_list.parquet")

In [2]:
new_egl=session.read.parquet("/Users/xg1/Downloads/Effector_gene_list.parquet")
fm=session.read.parquet("gs://ot_orchestration/releases/24.10_freeze4/locus_to_gene_feature_matrix")
study_index=session.read.parquet("gs://ot_orchestration/releases/24.10_freeze4/study_index")
credible_set=session.read.parquet("gs://ot_orchestration/releases/24.10_freeze4/credible_set")

                                                                                

In [3]:
study_to_credible_set=credible_set.select("studyLocusId", "studyId").join(
    study_index.filter(f.col("studyType") == "gwas").select("studyId", "traitFromSource", "traitFromSourceMappedIds").withColumn("traitFromSourceMappedId", f.explode("traitFromSourceMappedIds")).drop("traitFromSourceMappedIds"), on="studyId", how="inner").persist()
#study_to_credible_set.show()



+--------------------+--------------------+--------------------+-----------------------+
|             studyId|        studyLocusId|     traitFromSource|traitFromSourceMappedId|
+--------------------+--------------------+--------------------+-----------------------+
|FINNGEN_R11_H7_CO...|bba52052adb27be98...|Convergent concom...|             HP_0000486|
|FINNGEN_R11_I9_HY...|11aa030e604b3aa3e...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...|53a324ea198571dfc...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...|b8d0c1771bffcd371...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...|05d95d399ce46a802...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...|0afe4f019d1c9a08e...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...|114b72c0f9c2af406...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...|2f116bb8a31b6bbf5...|Hypertension, ess...|            EFO_0000537|
|FINNGEN_R11_I9_HY...

                                                                                

In [4]:
study_to_credible_set_fm=study_to_credible_set.join(fm, on="studyLocusId", how="inner").persist()
#study_to_credible_set_fm.show()

24/11/04 21:38:30 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


ERROR:root:KeyboardInterrupt while sending command.               (0 + 0) / 200]
Traceback (most recent call last):
  File "/Users/xg1/Library/Caches/pypoetry/virtualenvs/gentropy-NMtW8s8F-py3.10/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/xg1/Library/Caches/pypoetry/virtualenvs/gentropy-NMtW8s8F-py3.10/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/xg1/.pyenv/versions/3.10.8/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [7]:
GS_fm=study_to_credible_set_fm.join(new_egl.withColumnRenamed("efo_terms", "traitFromSourceMappedId"), on=["traitFromSourceMappedId", "geneId"], how="inner").persist()

In [8]:
GS_fm.count()

                                                                                

32563

32563 rows in the feature matrix for 21603 unique efo-gene-traitname effector gene list.

In [9]:
GS_fm.select("traitFromSourceMappedId", "geneId").distinct().count()

                                                                                

1282

This comprised of 1282 efo-gene pairings from the EGL.
272 from old otg gold standards.
1077 from chembl 

so there is an overlap of 67 efo-gene pairings.

In [15]:
GS_fm.printSchema()

root
 |-- traitFromSourceMappedId: string (nullable = true)
 |-- geneId: string (nullable = true)
 |-- studyLocusId: string (nullable = true)
 |-- studyId: string (nullable = true)
 |-- traitFromSource: string (nullable = true)
 |-- distanceFootprintMean: float (nullable = true)
 |-- distanceFootprintMeanNeighbourhood: float (nullable = true)
 |-- distanceSentinelFootprint: float (nullable = true)
 |-- distanceSentinelFootprintNeighbourhood: float (nullable = true)
 |-- distanceSentinelTss: float (nullable = true)
 |-- distanceSentinelTssNeighbourhood: float (nullable = true)
 |-- distanceTssMean: float (nullable = true)
 |-- distanceTssMeanNeighbourhood: float (nullable = true)
 |-- eQtlColocClppMaximum: float (nullable = true)
 |-- eQtlColocClppMaximumNeighbourhood: float (nullable = true)
 |-- eQtlColocH4Maximum: float (nullable = true)
 |-- eQtlColocH4MaximumNeighbourhood: float (nullable = true)
 |-- pQtlColocClppMaximum: float (nullable = true)
 |-- pQtlColocClppMaximumNeighbourh

In [10]:
# exclude_columns = ["traitFromSourceMappedId", "geneId", "studyLocusId", "studyId", "traitFromSource", "diseaseFromSource"]
exclude_columns = ["traitFromSourceMappedId", "geneId", "studyLocusId", "studyId", "traitFromSource", "diseaseFromSource", "distanceFootprintMean", "distanceFootprintMeanNeighbourhood", "distanceSentinelFootprint", "distanceSentinelFootprintNeighbourhood", "distanceSentinelTss", "distanceSentinelTssNeighbourhood", "distanceTssMean", "distanceTssMeanNeighbourhood", ]
columns_to_check = [col for col in GS_fm.columns if col not in exclude_columns]

non_zero_condition = sum(f.when((f.col(c) != 0) & (f.col(c).isNotNull()), 1).otherwise(0) for c in columns_to_check) == 0

filtered_df = GS_fm.groupBy("studyLocusId").agg(f.max(non_zero_condition.cast("int")).alias("all_zeros_or_nulls"))

result_count = filtered_df.filter(f.col("all_zeros_or_nulls") == 1).count()

print(f"Number of studyLocusIds where all rows have zero or null values: {result_count}")



Number of studyLocusIds where all rows have zero or null values: 3289


                                                                                

In [None]:
# Define specific columns to exclude
specific_exclude_columns = ["traitFromSourceMappedId", "geneId", "studyLocusId", "studyId", "traitFromSource", "diseaseFromSource"]

# Dynamically exclude columns containing "Neighbourhood"
exclude_columns = specific_exclude_columns + [col for col in GS_fm.columns if "Neighbourhood" in col]

# Filter the columns based on the exclude list
columns_to_check = [col for col in GS_fm.columns if col not in exclude_columns]

# Define the non-zero condition for the selected columns
non_zero_condition = sum(f.when((f.col(c) != 0) & (f.col(c).isNotNull()), 1).otherwise(0) for c in columns_to_check) == 0

# Perform the aggregation and filtering
filtered_df = GS_fm.groupBy("studyLocusId").agg(f.max(non_zero_condition.cast("int")).alias("all_zeros_or_nulls"))

# Count the result
result_count = filtered_df.filter(f.col("all_zeros_or_nulls") == 1).count()

print(f"Number of studyLocusIds where all rows have zero or null values: {result_count}")

3289 rows can be excluded as they have no features other than distance based featured.

In [13]:
GS_fm.filter(f.col("GS_source") == "old_otg_gs").select("traitFromSourceMappedId", "geneId").distinct().count()

                                                                                

272

In [14]:
GS_fm.filter(f.col("GS_source") == "chembl_p3_p4_2409").select("traitFromSourceMappedId", "geneId").distinct().count()

                                                                                

1077

In [16]:
from pyspark.sql import functions as f

exclude_columns = ["traitFromSourceMappedId", "geneId", "studyLocusId", "studyId", "traitFromSource", "diseaseFromSource"]
columns_to_check = [col for col in GS_fm.columns if col not in exclude_columns]

non_zero_count_per_row = sum(f.when((f.col(c) != 0) & (f.col(c).isNotNull()), 1).otherwise(0) for c in columns_to_check)

GS_fm_with_count = GS_fm.withColumn("non_zero_non_null_count", non_zero_count_per_row)

aggregated_df = GS_fm_with_count.groupBy("studyLocusId").agg(f.sum("non_zero_non_null_count").alias("total_non_zero_non_null_count"))

GS_fm_with_count.show()

+-----------------------+---------------+--------------------+------------+--------------------+---------------------+----------------------------------+-------------------------+--------------------------------------+-------------------+--------------------------------+---------------+----------------------------+--------------------+---------------------------------+------------------+-------------------------------+--------------------+---------------------------------+------------------+-------------------------------+-------------+--------------------+---------------------------------+------------------+-------------------------------+----------+-----------------------+-----------+--------------------+--------------------+----------+-----------------------+
|traitFromSourceMappedId|         geneId|        studyLocusId|     studyId|     traitFromSource|distanceFootprintMean|distanceFootprintMeanNeighbourhood|distanceSentinelFootprint|distanceSentinelFootprintNeighbourhood|distanceS

In [17]:
from pyspark.sql import Window

window_spec = Window.partitionBy("traitFromSourceMappedId", "geneId").orderBy(f.col("non_zero_non_null_count").desc())
ranked_df = GS_fm_with_count.withColumn("row_number", f.row_number().over(window_spec))
filtered_df = ranked_df.filter(f.col("row_number") == 1).drop("row_number")

filtered_df.show()

+-----------------------+---------------+--------------------+--------------------+--------------------+---------------------+----------------------------------+-------------------------+--------------------------------------+-------------------+--------------------------------+---------------+----------------------------+--------------------+---------------------------------+------------------+-------------------------------+--------------------+---------------------------------+------------------+-------------------------------+-------------+--------------------+---------------------------------+------------------+-------------------------------+----------+-----------------------+------------+--------------------+--------------------+-----------------+-----------------------+
|traitFromSourceMappedId|         geneId|        studyLocusId|             studyId|     traitFromSource|distanceFootprintMean|distanceFootprintMeanNeighbourhood|distanceSentinelFootprint|distanceSentinelFootprin

In [20]:
gs_source_counts = filtered_df.groupBy("non_zero_non_null_count").count()

# Show the result
gs_source_counts.orderBy(f.col("non_zero_non_null_count")).show(50)



+-----------------------+-----+
|non_zero_non_null_count|count|
+-----------------------+-----+
|                      1|    3|
|                      2|   54|
|                      4|   62|
|                      5|    4|
|                      6|    1|
|                      7|   60|
|                      8|  139|
|                      9|   30|
|                     10|    6|
|                     11|  494|
|                     12|    8|
|                     13|  159|
|                     14|   29|
|                     15|   81|
|                     16|   11|
|                     17|   71|
|                     18|    1|
|                     19|   22|
|                     21|   40|
|                     23|    4|
|                     25|    3|
+-----------------------+-----+



                                                                                

Given that there are 8 distance based features which should be available for all.

Filter the Gold standard set using a count > 10 ?

Can be revised later.

Based on and Feature matrix freeze 4, this returns 923 Goldstandard positives.
    Of which 695 from chembl, 228 from Old OTG GS.

In [30]:
.write.parquet("/Users/xg1/Downloads/feature_matrix_gsp.parquet")filtered_df.filter(f.col("non_zero_non_null_count") > 10)

                                                                                

In [32]:
filtered_df.filter(f.col("non_zero_non_null_count") > 10).select("traitFromSourceMappedId", "geneId", "studyLocusId", "studyId", "traitFromSource", "GS_source").write.parquet("/Users/xg1/Downloads/EGL_GSP.parquet")

                                                                                

Join back to feature matrix, identify all gold standard negatives at each studylocusId

In [22]:
filtered_df.filter(f.col("non_zero_non_null_count") > 10).count()

                                                                                

923

In [24]:
filtered_df.filter(f.col("non_zero_non_null_count") > 10).groupBy("GS_source").count().show()



+-----------------+-----+
|        GS_source|count|
+-----------------+-----+
|chembl_p3_p4_2409|  695|
|       old_otg_gs|  228|
+-----------------+-----+



                                                                                

In [28]:
# Use functions from l2g.py to generate GSN:

from gentropy.common.spark_helpers import get_record_with_maximum_value

gene_interactions=session.read.parquet("gs://genetics_etl_python_playground/static_assets/interaction/")
gene_interactions_formatted=get_record_with_maximum_value(
            gene_interactions,
            ["targetA", "targetB"],
            "scoring",
        ).selectExpr(
            "targetA as geneIdA",
            "targetB as geneIdB",
            "scoring as score",
        )
gene_interactions_formatted.show()

[Stage 320:>                                                        (0 + 1) / 1]

+--------------------+---------------+-----+
|             geneIdA|        geneIdB|score|
+--------------------+---------------+-----+
|ENSANAG00000012804.1|ENSG00000122180| null|
|ENSANAG00000014030.1|ENSG00000185591| null|
|ENSBIXG00005022251.1|ENSG00000168769| null|
|ENSBMUG00000010571.1|ENSG00000168769| null|
|ENSCAFG00845002382.1|ENSG00000168769| null|
|ENSCANG00000005717.1|ENSG00000185591| null|
|ENSCANG00000006865.1|ENSG00000102935| null|
|ENSCATG00000015753.1|ENSG00000168769| null|
|ENSCATG00000016993.1|ENSG00000009709| null|
|ENSCATG00000017690.1|           null| 0.28|
|ENSCATG00000019377.1|ENSG00000106511| null|
|ENSCATG00000019782.1|ENSG00000140443| null|
|ENSCHIG00000002081.1|           null| 0.28|
|ENSCHIG00000009164.1|ENSG00000168769| null|
|ENSCJAG00000029861.3|ENSG00000122180| null|
|ENSCLAG00000020885.1|ENSG00000168769| null|
|ENSCSAG00000020610.1|ENSG00000122691| null|
|ENSCSAG00000022699.1|ENSG00000009709| null|
|ENSCWAG00000006152.1|ENSG00000122180| null|
|ENSEASG00

                                                                                

In [29]:
gene_interactions_formatted.count()

                                                                                

13303672

In [27]:
credible_set.show()

[Stage 316:>                                                        (0 + 1) / 1]

+--------------------+------------+-----------------+----------+---------+--------------------+----+-------------------+--------------+--------------+-------------------------------+-------------+-------------------+--------------------+-----------------+----------------+------------------+------------------+-------------------+----------+---------+----------+--------------------+--------------------+--------------------+---------+
|        studyLocusId|     studyId|        variantId|chromosome| position|              region|beta|             zScore|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource|standardError|subStudyDescription|     qualityControls|finemappingMethod|credibleSetIndex|credibleSetlog10BF|      purityMeanR2|        purityMinR2|locusStart| locusEnd|sampleSize|               ldSet|               locus|          confidence|studyType|
+--------------------+------------+-----------------+----------+---------+--------------------+----+-------------------+--------

                                                                                

In [8]:
study_index.show()

+--------------------+---------------+-----------+---------+--------------------+------------------------+---------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+-----------------+------+---------+--------+-------+---------------------+----------------+------------------+---------------+-------------+--------------------+-----------+--------------------+------------------+---------------+----------+--------------------+-----------+
|             studyId|         geneId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|biosampleFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|initialSampleSize|nCases|nControls|nSamples|cohorts|ldPopulationStructure|discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|           condition|sumStatQCPerformed|sumStatQCValues|dis

                                                                                

In [5]:
fm.show()

24/10/29 14:13:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 16:>                                                         (0 + 1) / 1]

+--------------------+---------------+---------------------+----------------------------------+-------------------------+--------------------------------------+-------------------+--------------------------------+---------------+----------------------------+--------------------+---------------------------------+------------------+-------------------------------+--------------------+---------------------------------+------------------+-------------------------------+-------------+--------------------+---------------------------------+------------------+-------------------------------+----------+-----------------------+------------+--------------------+
|        studyLocusId|         geneId|distanceFootprintMean|distanceFootprintMeanNeighbourhood|distanceSentinelFootprint|distanceSentinelFootprintNeighbourhood|distanceSentinelTss|distanceSentinelTssNeighbourhood|distanceTssMean|distanceTssMeanNeighbourhood|eQtlColocClppMaximum|eQtlColocClppMaximumNeighbourhood|eQtlColocH4Maximum|eQtlColo

                                                                                

In [1]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, desc, col

from pyspark.sql.functions import explode
import sys
from gentropy.common.session import Session
from pyspark.sql import functions as f

sys.path.append("../../gentropy/src/")
release_path="../../otg_releases"
release_ver="2406"

session=Session(
    extended_spark_conf={"spark.driver.memory": "12g", "spark.kryoserializer.buffer.max": "500m","spark.driver.maxResultSize":"3g"}
    )

study_index_path=f"{release_path}/{release_ver}/study_index/"

# load prev. study index
prev_study_index_path="/Users/xg1/prev_studies.parquet/"
prev_study_index=session.spark.read.parquet(prev_study_index_path, recursiveFileLookup=True)
# 57k studies (57,246), of which, 56k have EFO mappings (56,262).

# Recover finngen EFOs from prev.release
    # 2.8k finngen studies from prev.release
    # 2.4k finngen studies from current release.

prev_study_index = prev_study_index.withColumn(
    "study_id",
    f.regexp_replace(f.col("study_id"), "FINNGEN_R6", "FINNGEN_R10")
)
finngen_study_index=session.spark.read.parquet(study_index_path+"finngen/", recursiveFileLookup=True)
finngen_study_index=finngen_study_index.join(
    prev_study_index.withColumnRenamed(
        "study_id", "studyId").withColumnRenamed(
            "trait_efos", "traitFromSourceMappedIds")
        .select("studyId", 
                "n_initial", 
                "traitFromSourceMappedIds", 
                "trait_reported"), 
                on="studyId", how="inner").select(
             "studyId", 
             f.col("n_initial").alias("nSamples"), 
             f.col("trait_reported").alias("traitFromSource"), 
             "traitFromSourceMappedIds", 
             "hasSumstats")

# 1,858 finngen studies maps directly. (~75%)

gwascat_study_index=session.spark.read.parquet(
    study_index_path+"gwas_catalog/", recursiveFileLookup=True).select(
        "studyId", 
        "nSamples", 
        "traitFromSource", 
        "traitFromSourceMappedIds", 
        "hasSumstats")
# 79,872 studies total, 79,861 with efo
# 18,442 contains sumstats.

study_index=gwascat_study_index.union(finngen_study_index).persist()


#"/Users/xg1/Downloads/otg_releases/2406/new_GS_jsons/Distance_and_prev_GS_non_distanceOnly.json"

GS=session.spark.read.json(
    f"{release_path}/{release_ver}/locus_to_gene_gold_standard.json")

GS.filter(f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])).select("gold_standard_info").distinct().count()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/10/28 14:41:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

925

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 49887)
Traceback (most recent call last):
  File "/Users/xg1/.pyenv/versions/3.10.8/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/xg1/.pyenv/versions/3.10.8/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/xg1/.pyenv/versions/3.10.8/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/xg1/.pyenv/versions/3.10.8/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/xg1/Library/Caches/pypoetry/virtualenvs/gentropy-NMtW8s8F-py3.10/lib/python3.10/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/Users/xg1/Library/Caches/pypoetry/virtualenvs/gentropy-NMtW8s8F-py3.1

In [None]:
New_chembl_GS=session.spark.read.parquet("/Users/xg1/Downloads/otg_releases/2406/New_chembl_GS.parquet")
New_chembl_GS.printSchema()

In [6]:
GS=session.spark.read.json(
    f"{release_path}/{release_ver}/locus_to_gene_gold_standard.json")

selected_df = GS.filter(f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])).select(
    "gold_standard_info.gene_id",
    "trait_info.ontology",
    "trait_info.reported_trait_name"
).withColumnRenamed("ontology", "efo_terms").withColumnRenamed("gene_id", "geneId").withColumnRenamed("reported_trait_name", "diseaseFromSource").select("efo_terms", "geneId", "diseaseFromSource")

chembl_GS=chembl_evidence_noOncology.withColumnRenamed(
    "diseaseFromSourceMappedId", "efo_terms").withColumnRenamed(
        "targetId", "geneId").filter(f.col("maxClinicalPhase") >= 3).filter(f.col("clinicalStatus") == "Completed").select("efo_terms", "geneId", "diseaseFromSource").distinct()

expanded_df = selected_df.withColumn("efo_terms", f.explode("efo_terms"))
new_egl=expanded_df.unionByName(chembl_GS)

new_egl.write.parquet("/Users/xg1/Downloads/Effector_gene_list.parquet")


                                                                                

In [15]:
chembl_evidence.withColumnRenamed(
    "diseaseFromSourceMappedId", "efo_terms").withColumnRenamed(
        "targetId", "geneId").filter(f.col("maxClinicalPhase") >= 3).filter(f.col("clinicalStatus") == "Completed").select("efo_terms", "geneId").distinct().count()

                                                                                

20235

In [18]:
chembl_evidence=session.spark.read.parquet(
    "/Users/xg1/Downloads/platform_2409_evidence/evidence/sourceId\=chembl").select(
        "targetId", "drugId", "clinicalPhase", 
        "diseaseFromSourceMappedId", 
        "diseaseFromSource", 
        "clinicalStatus").withColumn(
            "maxClinicalPhase", f.max("clinicalPhase").over(windowSpec)).filter(
                f.col("clinicalPhase") == f.col("maxClinicalPhase")).drop("clinicalPhase", "drugId").distinct()

chembl_evidence_noOncology=chembl_evidence.join(TA_diseases.withColumnRenamed(
    "diseaseId", "diseaseFromSourceMappedId"), on="diseaseFromSourceMappedId", how="inner").filter(
    (f.col("taLabelSimple") != "Oncology") | f.col("taLabelSimple").isNull())

chembl_evidence_noOncology.withColumnRenamed(
    "diseaseFromSourceMappedId", "efo_terms").withColumnRenamed(
        "targetId", "geneId").filter(f.col("maxClinicalPhase") >= 3).filter(f.col("clinicalStatus") == "Completed").select("efo_terms", "geneId").distinct().count()

                                                                                

4426

In [19]:

chembl_evidence_noOncology.withColumnRenamed(
    "diseaseFromSourceMappedId", "efo_terms").withColumnRenamed(
        "targetId", "geneId").filter(f.col("maxClinicalPhase") <= 3).filter(f.col("clinicalStatus") == "Completed").select("geneId").distinct().count()

                                                                                

1170

925

In [24]:
GS.show(truncate=False)

+------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+----------------------------------------------------+----------------------------------------------------------------------------------------------------------+
|association_info                                                  |gold_standard_info                                                                                                                                                                                                    

In [23]:
GS.printSchema()

root
 |-- association_info: struct (nullable = true)
 |    |-- ancestry: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- doi: string (nullable = true)
 |    |-- gwas_catalog_id: string (nullable = true)
 |    |-- neg_log_pval: double (nullable = true)
 |    |-- otg_id: string (nullable = true)
 |    |-- pubmed_id: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- gold_standard_info: struct (nullable = true)
 |    |-- evidence: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- class: string (nullable = true)
 |    |    |    |-- confidence: string (nullable = true)
 |    |    |    |-- curated_by: string (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- pubmed_id: string (nullable = true)
 |    |    |    |-- source: string (nullable = true)
 |    |-- gene_id: string (nullable = true)
 |    |-- highest_confidence: string (nullable = true)
 