In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [2]:
concept_frequency_df = spark.read.option("header", "true").option("delimiter", ",").option("inferschema", "true") \
                            .csv("concept_id_frequency").select(col("standard_concept_id").alias("standard_concept_id"), col("count").alias("freq_count"))

In [3]:
concept_frequency_df.show()

+-------------------+----------+
|standard_concept_id|freq_count|
+-------------------+----------+
|             432807|     17680|
|              80502|     72686|
|             433911|      1278|
|             436972|       204|
|             139188|     16310|
|           46272452|      2717|
|             436176|     10562|
|             141372|      1776|
|             437469|       240|
|           40480500|      1851|
|             434354|        38|
|             440069|      5396|
|             438555|      3333|
|            4134565|     58876|
|             441223|        63|
|             432312|      6259|
|            4120275|      8477|
|             200562|       174|
|             444455|      1423|
|             319041|     10366|
+-------------------+----------+
only showing top 20 rows



In [4]:
total_sum = concept_frequency_df.groupBy().agg(sum("freq_count")).collect()[0][0]
total_sum

197752964

In [5]:
concept_ancestor = spark.read.option("header", "true").option("delimiter", ",").csv("concept_ancestor.csv").filter(col('min_levels_of_separation') >= 1)

In [6]:
concept_ancestor.printSchema()

root
 |-- ancestor_concept_id: string (nullable = true)
 |-- descendant_concept_id: string (nullable = true)
 |-- min_levels_of_separation: string (nullable = true)
 |-- max_levels_of_separation: string (nullable = true)



In [7]:
# left join b/w descendent_concept_id and the standard_concept_id in the concept freq table
freq_df = concept_ancestor.join(concept_frequency_df, concept_ancestor['descendant_concept_id'] == concept_frequency_df['standard_concept_id'], 'left_outer') \
                        .where(col('standard_concept_id').isNotNull()).groupBy('ancestor_concept_id').sum("freq_count") \
                        .withColumnRenamed('ancestor_concept_id', 'concept_id')

In [8]:
freq_df.printSchema()

root
 |-- concept_id: string (nullable = true)
 |-- sum(freq_count): long (nullable = true)



In [9]:
information_content_table = freq_df.orderBy(col('sum(freq_count)').desc()).withColumn("Information_content", (-log(col('sum(freq_count)') / total_sum))) \
                                                                        .withColumn("Probability", col('sum(freq_count)') / total_sum)

In [10]:
information_content_table.show()

+----------+---------------+-------------------+-------------------+
|concept_id|sum(freq_count)|Information_content|        Probability|
+----------+---------------+-------------------+-------------------+
|  36206173|       79967894| 0.9053933660359812|0.40438278335995004|
|  45876249|       67245415|  1.078669757438401| 0.3400475706649838|
|  36300000|       63437173| 1.1369685806144145| 0.3207899983739308|
|  36302170|       63278962|  1.139465675119548| 0.3199899547396923|
|  36303153|       63071493|  1.142749702651699| 0.3189408225507052|
|  36208195|       61776154| 1.1635011627611662|0.31239053387842014|
|    441840|       41086928| 1.5713285777729744|0.20776896168292072|
|  36313966|       41083055| 1.5714228457735622|0.20774937664145454|
|  45876033|       35653536| 1.7131702666800224|0.18029330776554126|
|  40772935|       35368172| 1.7212062754921829|0.17885027503304576|
|  45876002|       33697134| 1.7696058059259916|0.17040014631588532|
|  40796128|       33305379|  1.78

In [11]:
training_data = spark.read.option("header", "true").option("delimiter", ",").csv("pheMLSamplerefset.csv")           

In [12]:
training_data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- GROUND_TRUTH: string (nullable = true)
 |-- PHENOTYPE_ID: string (nullable = true)
 |-- CONCEPT_ID_1: string (nullable = true)
 |-- CONCEPT_ID_2: string (nullable = true)
 |-- CONCEPT_NAME_1: string (nullable = true)
 |-- CONCEPT_NAME_2: string (nullable = true)
 |-- SAME_DOMAIN: string (nullable = true)
 |-- IS_PARENT: string (nullable = true)
 |-- IS_ANCESTOR: string (nullable = true)
 |-- MIN_DISTANCE: string (nullable = true)
 |-- IS_SIBLING_W_SAME_PARENT: string (nullable = true)
 |-- RISK_DIFF: string (nullable = true)
 |-- RISK_RATIO: string (nullable = true)
 |-- RN1: string (nullable = true)



In [13]:
common_anc_ic_df = training_data.join(concept_ancestor, col('concept_id_1') == concept_ancestor['descendant_concept_id']) \
                            .select(col('ancestor_concept_id').alias('ancestor_concept_id_1'), col('concept_id_1'), col('concept_id_2')) \
                        .join(concept_ancestor, col('concept_id_2') == col('descendant_concept_id')) \
                            .select(col('ancestor_concept_id_1'), col('concept_id_1'), col('ancestor_concept_id').alias('ancestor_concept_id_2'),  col('concept_id_2')) \
                        .filter(col("ancestor_concept_id_1") == col("ancestor_concept_id_2")) \
                        .join(information_content_table, col('ancestor_concept_id_1') == col('concept_id')) \
                            .select(col('concept_id').alias("ancestor_concept_id"), col('concept_id_1'), col('concept_id_2'), \
                                    col('sum(freq_count)'), col('Information_content') , col('Probability'))


In [14]:
common_anc_ic_df.printSchema()

root
 |-- ancestor_concept_id: string (nullable = true)
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- sum(freq_count): long (nullable = true)
 |-- Information_content: double (nullable = true)
 |-- Probability: double (nullable = true)



In [15]:
intersection_sum = common_anc_ic_df.groupBy(col('concept_id_1'), col('concept_id_2')).agg(sum('Information_content')).withColumnRenamed("sum(Information_content)", "intersection")


In [16]:
#intersection_sum.cache()

In [17]:
intersection_sum.show()

+------------+------------+------------------+
|concept_id_1|concept_id_2|      intersection|
+------------+------------+------------------+
|      439693|      444406| 971.0343583885979|
|    44792251|    46284592|208.16461241950432|
|    43021835|    46284592|208.16461241950432|
|    43531653|    45771075|  274.059190851704|
|    44792229|    45757445|208.16461241950432|
|    43021854|    43531559|208.16461241950432|
|    43021835|    44792253|208.16461241950432|
|     4322556|    43021835|208.16461241950432|
|    43020456|    46284598|208.16461241950432|
|    44782703|    44792249|208.16461241950432|
|     4322556|    43531653|208.16461241950432|
|    45763855|    46284567|208.16461241950432|
|    45769901|    46270355|208.16461241950432|
|     4128067|    44792231|208.16461241950432|
|    44784639|    44792231|208.16461241950432|
|    44782703|    46284591|208.16461241950432|
|    45769352|    45771045|123.64967907159546|
|     2107095|     2107311|234.47858167750567|
|     2107197

In [18]:
temp_df = training_data.select(col('concept_id_1'), col('concept_id_2'))

In [19]:
unioned_sum = temp_df.join(concept_ancestor, col('concept_id_1') == concept_ancestor['descendant_concept_id']) \
                .select(col('concept_id_1'), col('concept_id_2'), concept_ancestor['ancestor_concept_id']) \
                .union(temp_df.join(concept_ancestor, col('concept_id_2') == concept_ancestor['descendant_concept_id']) \
                .select(col('concept_id_1'), col('concept_id_2'), concept_ancestor['ancestor_concept_id'])) \
                .join(information_content_table, col('ancestor_concept_id') == information_content_table['concept_id']) \
                .select(col('concept_id_1'), col('concept_id_2'), col('ancestor_concept_id'), information_content_table['Information_content']) \
                .groupBy(col('concept_id_1'), col('concept_id_2')).agg(sum(col('Information_content')))

In [20]:
unioned_sum.show()

+------------+------------+------------------------+
|concept_id_1|concept_id_2|sum(Information_content)|
+------------+------------+------------------------+
|     4265212|    44809286|      1221.9532728435065|
|     4265212|    37395518|       1174.659201545537|
|       28356|      321107|      1055.1455737040878|
|      437220|      439738|      1642.0606628402754|
|      134579|      439738|      1644.9780319074039|
|      438691|     4116235|      1248.7015039466341|
|      438692|     4116235|      1312.8245006597324|
|      192384|     2211624|       656.0312833077304|
|      440335|     2211328|      1109.4842748294172|
|     2107494|     2110592|       859.0424171676667|
|     2107860|     2110597|      1309.4682105005272|
|     2107098|     2107970|       899.4218240873519|
|     2107099|     2107965|       849.0984146935807|
|      441593|    44782733|       685.2503411753123|
|    40766092|    46284600|      260.55156710669587|
|     2109069|    40759656|       841.45954605

In [21]:
graph_ic_measure_df = unioned_sum.join(intersection_sum, (unioned_sum['concept_id_1'] == intersection_sum['concept_id_1']) & (unioned_sum['concept_id_2'] == intersection_sum['concept_id_2'])) \
                                    .select(unioned_sum['concept_id_1'], unioned_sum['concept_id_2'], unioned_sum['sum(Information_content)'].alias('Union_sum'), intersection_sum['intersection'].alias('intersection_sum'))

In [22]:
graph_ic_measure = graph_ic_measure_df.withColumn('graph_ic_measure', (col('intersection_sum') / col('Union_sum')))

In [23]:
graph_ic_measure.cache()

DataFrame[concept_id_1: string, concept_id_2: string, Union_sum: double, intersection_sum: double, graph_ic_measure: double]

In [24]:
graph_ic_measure.show()


+------------+------------+------------------+------------------+--------------------+
|concept_id_1|concept_id_2|         Union_sum|  intersection_sum|    graph_ic_measure|
+------------+------------+------------------+------------------+--------------------+
|      132414|      432690|  739.505903245484| 49.27248566031775| 0.06662892810466371|
|      132575|      199752|1196.1671945004316| 79.17703632362709| 0.06619228205526458|
|      132575|    40490918| 709.9528374300353|107.26015123796006| 0.15108067125449073|
|      132718|      133003| 731.2980823060825| 332.5065643835223| 0.45467993480167873|
|      132781|      316135|334.71811430155714| 8.478583121224403|0.025330517707164796|
|      132841|      136928|1208.2687734766744| 365.8430779503427|  0.3027828625394872|
|      132841|      192270| 1758.618969046716|  559.449948625747|  0.3181189094810029|
|      132852|      140666| 735.1839591190453|293.84717867096714|  0.3996920430950068|
|      133003|      443597| 531.28405710853

In [None]:
training_data.join(graph_ic_measure, (training_data['concept_id_1'] == graph_ic_measure['concept_id_1']) & (training_data['concept_id_2'] == graph_ic_measure['concept_id_2']), "left_outer") \
                .select(training_data['concept_id_1'], training_data['concept_id_2'], graph_ic_measure['graph_ic_measure'], training_data['is_connected']) \
                .where("is_connected = 0").show(500)


In [25]:
common_anc_ic_df.printSchema()

root
 |-- ancestor_concept_id: string (nullable = true)
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- sum(freq_count): long (nullable = true)
 |-- Information_content: double (nullable = true)
 |-- Probability: double (nullable = true)



In [26]:
max_ic_df = common_anc_ic_df.groupBy("ancestor_concept_id", "concept_id_1", "concept_id_2").agg(max("Information_content"), max("Probability")) \
                            .withColumnRenamed("max(Information_content)", "information_content").withColumnRenamed("max(Probability)", "MICA_probability")

In [None]:
training_data.join(max_ic_df, (training_data["concept_id_1"] == max_ic_df["concept_id_1"]) 
                   & (training_data["concept_id_2"] == max_ic_df["concept_id_2"]), "left_outer") \
    .select([training_data["concept_id_1"], training_data["concept_id_2"], training_data["is_connected"]] + [max_ic_df[f] for f in max_ic_df.schema.fieldNames()]) \
    .where("is_connected == 1 AND information_content == 0.0").show(500, False)

In [27]:
lin_sim_df = max_ic_df.join(information_content_table, max_ic_df['concept_id_1'] == information_content_table['concept_id'], "left_outer") \
                            .select(col('concept_id_1'), col('concept_id_2'), max_ic_df['information_content'].alias('MICA'), information_content_table['Information_content'].alias('information_content_1'), col('MICA_probability')) \
                        .join(information_content_table, max_ic_df['concept_id_2'] == information_content_table['concept_id'], "left_outer") \
                            .select(col('concept_id_1'), col('concept_id_2'), col('MICA'), col('information_content_1'), col('MICA_probability'), information_content_table['Information_content'].alias('information_content_2'))


In [41]:
lin_sim_df.where("(information_content_1 IS NULL OR information_content_2 IS NULL) AND MICA IS NOT NULL").show()

+------------+------------+------------------+---------------------+--------------------+---------------------+-----------+
|concept_id_1|concept_id_2|              MICA|information_content_1|    MICA_probability|information_content_2|lin_measure|
+------------+------------+------------------+---------------------+--------------------+---------------------+-----------+
|      133024|      194106| 1.142749702651699|                 null|  0.3189408225507052|   14.384030281826613|       null|
|      133024|      194106|1.9166477392922043|                 null| 0.14709925156924575|   14.384030281826613|       null|
|      133024|      194106|2.5638212271406142|                 null| 0.07700990514609936|   14.384030281826613|       null|
|      133024|      194106|2.1319522132688857|                 null| 0.11860552441580598|   14.384030281826613|       null|
|      133024|      194106| 4.124573033981115|                 null| 0.01617039732461355|   14.384030281826613|       null|
|      1

In [29]:
lin_sim_df = lin_sim_df.withColumn('lin_measure', ((2 * col('MICA')) / (col('information_content_1') * col('information_content_2'))))

In [30]:
jiang_cornath_measure = lin_sim_df.withColumn('jiang_measure', (1 - (col('information_content_1') + col('information_content_2') - 2 * col('MICA'))))

In [31]:
jiang_cornath_measure.printSchema()

root
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- MICA: double (nullable = true)
 |-- information_content_1: double (nullable = true)
 |-- MICA_probability: double (nullable = true)
 |-- information_content_2: double (nullable = true)
 |-- lin_measure: double (nullable = true)
 |-- jiang_measure: double (nullable = true)



In [32]:
information_coefficient = jiang_cornath_measure.withColumn('information_coefficient', (col('lin_measure') * (1 - ((1)/(1 + col('MICA'))))))

In [33]:
relevence_measure = information_coefficient.withColumn('relevance_measure', (col('lin_measure') * (1 - col('MICA_probability'))))

In [34]:
output_features = relevence_measure.select(col('concept_id_1'), col('concept_id_2'), col('MICA'), col('lin_measure'), \
                                                  col('jiang_measure'), col('relevance_measure'), col('information_coefficient'))

In [35]:
training_data.join(output_features, (training_data["concept_id_1"] == output_features["concept_id_1"]) 
                   & (training_data["concept_id_2"] == output_features["concept_id_2"]), "left_outer") \
    .select([training_data["concept_id_1"], training_data["concept_id_2"], training_data["is_connected"]] + [output_features[f] for f in output_features.schema.fieldNames()]) \
    .where("is_connected == 1 AND MICA == 0.0").show(500, False)

AnalysisException: 'Cannot resolve column name "is_connected" among (_c0, GROUND_TRUTH, PHENOTYPE_ID, CONCEPT_ID_1, CONCEPT_ID_2, CONCEPT_NAME_1, CONCEPT_NAME_2, SAME_DOMAIN, IS_PARENT, IS_ANCESTOR, MIN_DISTANCE, IS_SIBLING_W_SAME_PARENT, RISK_DIFF, RISK_RATIO, RN1);'

In [36]:
output_features = output_features.join(graph_ic_measure, (output_features['concept_id_1'] == graph_ic_measure['concept_id_1']) & (output_features['concept_id_2'] == graph_ic_measure['concept_id_2']), "left_outer") \
                        .select([output_features[field] for field in output_features.schema.fieldNames()] + [graph_ic_measure["graph_ic_measure"]])

In [37]:
output_features.cache()

DataFrame[concept_id_1: string, concept_id_2: string, MICA: double, lin_measure: double, jiang_measure: double, relevance_measure: double, information_coefficient: double, graph_ic_measure: double]

In [43]:
output_features.show(500)

+------------+------------+------------------+--------------------+-------------------+--------------------+-----------------------+--------------------+
|concept_id_1|concept_id_2|              MICA|         lin_measure|      jiang_measure|   relevance_measure|information_coefficient|    graph_ic_measure|
+------------+------------+------------------+--------------------+-------------------+--------------------+-----------------------+--------------------+
|      133024|      194106| 1.142749702651699|                null|               null|                null|                   null|0.023897257101755286|
|      133024|      194106|1.9166477392922043|                null|               null|                null|                   null|0.023897257101755286|
|      133024|      194106|2.5638212271406142|                null|               null|                null|                   null|0.023897257101755286|
|      133024|      194106|2.1319522132688857|                null|         

In [None]:
output_features.repartition(1).write.option("header", "true").mode("overwrite").csv("similarity_measures.csv")

In [None]:
ic_output = training_data.join(output_features, (training_data["concept_id_1"] == output_features["concept_id_1"]) 
                   & (training_data["concept_id_2"] == output_features["concept_id_2"]), "left_outer") \
    .select([training_data[field] for field in training_data.schema.fieldNames()] + [output_features["MICA"] \
                                .alias("information_content_semantic_sim"), output_features['lin_measure'], \
                                output_features['jiang_measure'], output_features['information_coefficient']])

In [None]:
#ic_output.repartition(1).write.option("header", "true").mode("overwrite").csv("PheML_refset_sample_semantic_sim_new.csv")