In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [2]:
concept_frequency_df = spark.read.option("header", "true").option("delimiter", ",").option("inferschema", "true") \
                            .csv("concept_id_frequency").select(col("standard_concept_id").alias("standard_concept_id"), col("count").alias("freq_count"))

In [3]:
concept_frequency_df.show()

+-------------------+----------+
|standard_concept_id|freq_count|
+-------------------+----------+
|             432807|     17680|
|              80502|     72686|
|             433911|      1278|
|             436972|       204|
|             139188|     16310|
|           46272452|      2717|
|             436176|     10562|
|             141372|      1776|
|             437469|       240|
|           40480500|      1851|
|             434354|        38|
|             440069|      5396|
|             438555|      3333|
|            4134565|     58876|
|             441223|        63|
|             432312|      6259|
|            4120275|      8477|
|             200562|       174|
|             444455|      1423|
|             319041|     10366|
+-------------------+----------+
only showing top 20 rows



In [4]:
total_sum = concept_frequency_df.groupBy().agg(sum("freq_count")).collect()[0][0]
total_sum

197752964

In [5]:
concept_ancestor = spark.read.option("header", "true").option("delimiter", ",").csv("concept_ancestor.csv").filter(col('min_levels_of_separation') >= 1)

In [6]:
concept_ancestor.printSchema()

root
 |-- ancestor_concept_id: string (nullable = true)
 |-- descendant_concept_id: string (nullable = true)
 |-- min_levels_of_separation: string (nullable = true)
 |-- max_levels_of_separation: string (nullable = true)



In [7]:
# left join b/w descendent_concept_id and the standard_concept_id in the concept freq table
freq_df = concept_ancestor.join(concept_frequency_df, concept_ancestor['descendant_concept_id'] == concept_frequency_df['standard_concept_id'], 'left_outer') \
                        .where(col('standard_concept_id').isNotNull()).groupBy('ancestor_concept_id').sum("freq_count") \
                        .withColumnRenamed('ancestor_concept_id', 'concept_id')

In [8]:
freq_df.printSchema()

root
 |-- concept_id: string (nullable = true)
 |-- sum(freq_count): long (nullable = true)



In [9]:
information_content_table = freq_df.orderBy(col('sum(freq_count)').desc()).withColumn("Information_content", (-log(col('sum(freq_count)') / total_sum))) \
                                                                        .withColumn("Probability", col('sum(freq_count)') / total_sum)

In [10]:
information_content_table.show()

+----------+---------------+-------------------+-------------------+
|concept_id|sum(freq_count)|Information_content|        Probability|
+----------+---------------+-------------------+-------------------+
|  36206173|       79967894| 0.9053933660359812|0.40438278335995004|
|  45876249|       67245415|  1.078669757438401| 0.3400475706649838|
|  36300000|       63437173| 1.1369685806144145| 0.3207899983739308|
|  36302170|       63278962|  1.139465675119548| 0.3199899547396923|
|  36303153|       63071493|  1.142749702651699| 0.3189408225507052|
|  36208195|       61776154| 1.1635011627611662|0.31239053387842014|
|    441840|       41086928| 1.5713285777729744|0.20776896168292072|
|  36313966|       41083055| 1.5714228457735622|0.20774937664145454|
|  45876033|       35653536| 1.7131702666800224|0.18029330776554126|
|  40772935|       35368172| 1.7212062754921829|0.17885027503304576|
|  45876002|       33697134| 1.7696058059259916|0.17040014631588532|
|  40796128|       33305379|  1.78

In [11]:
training_data = spark.read.option("header", "true").option("delimiter", ",").csv("PheML_refset_sample_semantic_sim.csv") \
                        

In [12]:
common_anc_ic_df = training_data.join(concept_ancestor, col('concept_id_1') == concept_ancestor['descendant_concept_id']) \
                            .select(col('ancestor_concept_id').alias('ancestor_concept_id_1'), col('concept_id_1'), col('concept_id_2')) \
                        .join(concept_ancestor, col('concept_id_2') == col('descendant_concept_id')) \
                            .select(col('ancestor_concept_id_1'), col('concept_id_1'), col('ancestor_concept_id').alias('ancestor_concept_id_2'),  col('concept_id_2')) \
                        .filter(col("ancestor_concept_id_1") == col("ancestor_concept_id_2")) \
                        .join(information_content_table, col('ancestor_concept_id_1') == col('concept_id')) \
                            .select(col('concept_id').alias("ancestor_concept_id"), col('concept_id_1'), col('concept_id_2'), col('sum(freq_count)'), col('Information_content') , col('Probability'))


In [13]:
common_anc_ic_df.printSchema()

root
 |-- ancestor_concept_id: string (nullable = true)
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- sum(freq_count): long (nullable = true)
 |-- Information_content: double (nullable = true)
 |-- Probability: double (nullable = true)



In [14]:
intersection_sum = common_anc_ic_df.groupBy(col('concept_id_1'), col('concept_id_2')).agg(sum('Information_content')).withColumnRenamed("sum(Information_content)", "intersection")


In [15]:
intersection_sum.cache()

DataFrame[concept_id_1: string, concept_id_2: string, intersection: double]

In [16]:
intersection_sum.show()

+------------+------------+------------------+
|concept_id_1|concept_id_2|      intersection|
+------------+------------+------------------+
|      439693|      444406|242.75858959714952|
|     2107095|     2107311|234.47858167750567|
|     2107197|     2107377|306.38383127697455|
|     2107269|     2107376| 236.6172790106602|
|       77317|      317009|19.046500499155663|
|       75621|      200687|14.740421150960398|
|      317576|      133295|  22.6606571715489|
|      372607|      133295|16.888421529388474|
|      200616|      432919|16.888421529388474|
|       75622|      374906|17.584764764528867|
|       80182|      378726|16.888421529388474|
|       77317|      444406| 26.08511730970354|
|       30683|      374919|14.740421150960398|
|       81893|      201343| 74.09265246814918|
|      133295|       26942|17.584764764528867|
|       75621|      320299|14.740421150960398|
|      194077|       75621| 42.88080735446119|
|      315296|      135215|16.888421529388474|
|      443412

In [17]:
temp_df = training_data.select(col('concept_id_1'), col('concept_id_2'))

In [21]:
unioned_sum = temp_df.join(concept_ancestor, col('concept_id_1') == concept_ancestor['descendant_concept_id']) \
                .select(col('concept_id_1'), col('concept_id_2'), concept_ancestor['ancestor_concept_id']) \
                .union(temp_df.join(concept_ancestor, col('concept_id_2') == concept_ancestor['descendant_concept_id']) \
                .select(col('concept_id_1'), col('concept_id_2'), concept_ancestor['ancestor_concept_id'])) \
                .join(information_content_table, col('ancestor_concept_id') == information_content_table['concept_id']) \
                .select(col('concept_id_1'), col('concept_id_2'), col('ancestor_concept_id'), information_content_table['Information_content']) \
                .groupBy(col('concept_id_1'), col('concept_id_2')).agg(sum(col('Information_content')))

In [22]:
unioned_sum.cache()

DataFrame[concept_id_1: string, concept_id_2: string, sum(Information_content): double]

In [23]:
unioned_sum.show()

+------------+------------+------------------------+
|concept_id_1|concept_id_2|sum(Information_content)|
+------------+------------+------------------------+
|     1101922|       81064|       943.2003673510263|
|     1592300|       81945|       563.4659735737604|
|     1592300|      374906|       826.2704875041296|
|     1592300|       73291|       671.3713985937106|
|     2107098|     2107970|       899.4218240873519|
|     2107410|     2107875|      1027.6055926137956|
|     2107499|     2107411|       842.7117139265118|
|     2107375|     2107410|        927.275250862684|
|       77317|      317009|       782.0803151175878|
|       77317|      444406|      1005.1918825638887|
|      315286|     1502827|       802.8984054544221|
|     2107891|     2107339|       887.9757967273947|
|     2107453|     2107339|       731.6786671448638|
|     2106709|     2107339|        689.370479990665|
|     2107153|     2107339|       712.7850203561471|
|     2107878|     2107339|        933.4315163

In [28]:
graph_ic_measure_df = unioned_sum.join(intersection_sum, (unioned_sum['concept_id_1'] == intersection_sum['concept_id_1']) & (unioned_sum['concept_id_2'] == intersection_sum['concept_id_2'])) \
                                    .select(unioned_sum['concept_id_1'], unioned_sum['concept_id_2'], unioned_sum['sum(Information_content)'].alias('Union_sum'), intersection_sum['intersection'].alias('intersection_sum'))

In [29]:
graph_ic_measure = graph_ic_measure_df.withColumn('graph_ic_measure', (col('intersection_sum') / col('Union_sum')))

In [30]:
graph_ic_measure.cache()

DataFrame[concept_id_1: string, concept_id_2: string, Union_sum: double, intersection_sum: double, graph_ic_measure: double]

In [31]:
graph_ic_measure.show()

+------------+------------+------------------+------------------+--------------------+
|concept_id_1|concept_id_2|         Union_sum|  intersection_sum|    graph_ic_measure|
+------------+------------+------------------+------------------+--------------------+
|     2107098|     2107970| 899.4218240873519| 62.21519370256184| 0.06917243059527929|
|     2107410|     2107875|1027.6055926137956| 207.7275498714375| 0.20214715778557235|
|     2107499|     2107411| 842.7117139265118|206.14805807273237| 0.24462465000303696|
|     2107375|     2107410|  927.275250862684| 284.7800194058572| 0.30711487138356613|
|       77317|      317009| 782.0803151175878|19.046500499155663|0.024353637511374995|
|       77317|      444406|1005.1918825638887| 26.08511730970354| 0.02595038595334618|
|     2107891|     2107339| 887.9757967273947| 59.78206313694599| 0.06732397815038518|
|     2107453|     2107339| 731.6786671448638|  71.7197318781182| 0.09802080489510641|
|     2106709|     2107339|  689.3704799906

In [32]:
max_ic_df = common_anc_ic_df.groupBy("ancestor_concept_id", "concept_id_1", "concept_id_2").agg(max("Information_content"), max("Probability")) \
                            .withColumnRenamed("max(Information_content)", "information_content").withColumnRenamed("max(Probability)", "MICA_probability")

In [33]:
max_ic_df.printSchema()

root
 |-- ancestor_concept_id: string (nullable = true)
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- information_content: double (nullable = true)
 |-- MICA_probability: double (nullable = true)



In [34]:
lin_sim_df = max_ic_df.join(information_content_table, max_ic_df['concept_id_1'] == information_content_table['concept_id']) \
                            .select(col('concept_id_1'), col('concept_id_2'), max_ic_df['information_content'].alias('MICA'), information_content_table['Information_content'].alias('information_content_1'), col('MICA_probability')) \
                        .join(information_content_table, max_ic_df['concept_id_2'] == information_content_table['concept_id']) \
                            .select(col('concept_id_1'), col('concept_id_2'), col('MICA'), col('information_content_1'), col('MICA_probability'), information_content_table['Information_content'].alias('information_content_2'))


In [35]:
lin_sim_df.printSchema()

root
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- MICA: double (nullable = true)
 |-- information_content_1: double (nullable = true)
 |-- MICA_probability: double (nullable = true)
 |-- information_content_2: double (nullable = true)



In [36]:
lin_sim_df = lin_sim_df.withColumn('lin_measure', ((2 * col('MICA')) / (col('information_content_1') * col('information_content_2'))))

In [37]:
jiang_cornath_measure = lin_sim_df.withColumn('jiang_measure', (1 - (col('information_content_1') + col('information_content_2') - 2 * col('MICA'))))

In [38]:
jiang_cornath_measure.printSchema()

root
 |-- concept_id_1: string (nullable = true)
 |-- concept_id_2: string (nullable = true)
 |-- MICA: double (nullable = true)
 |-- information_content_1: double (nullable = true)
 |-- MICA_probability: double (nullable = true)
 |-- information_content_2: double (nullable = true)
 |-- lin_measure: double (nullable = true)
 |-- jiang_measure: double (nullable = true)



In [39]:
information_coefficient = jiang_cornath_measure.withColumn('information_coefficient', (col('lin_measure') * (1 - ((1)/(1 + col('MICA'))))))

In [40]:
relevence_measure = information_coefficient.withColumn('relevance_measure', (col('lin_measure') * (1 - col('MICA_probability'))))

In [41]:
output_features = relevence_measure.select(col('concept_id_1'), col('concept_id_2'), col('MICA'), col('lin_measure'), \
                                                  col('jiang_measure'), col('relevance_measure'), col('information_coefficient'))

In [42]:
output_features = output_features.join(graph_ic_measure, (output_features['concept_id_1'] == graph_ic_measure['concept_id_1']) & (output_features['concept_id_2'] == graph_ic_measure['concept_id_2'])) \
                        .select([output_features[field] for field in output_features.schema.fieldNames()] + [graph_ic_measure["graph_ic_measure"]])

In [43]:
output_features.cache()

DataFrame[concept_id_1: string, concept_id_2: string, MICA: double, lin_measure: double, jiang_measure: double, relevance_measure: double, information_coefficient: double, graph_ic_measure: double]

In [44]:
output_features.show()

+------------+------------+------------------+--------------------+-------------------+--------------------+-----------------------+-------------------+
|concept_id_1|concept_id_2|              MICA|         lin_measure|      jiang_measure|   relevance_measure|information_coefficient|   graph_ic_measure|
+------------+------------+------------------+--------------------+-------------------+--------------------+-----------------------+-------------------+
|      443344|      314665|3.3614508133174006| 0.04397821382847052|  -17.0138800581896| 0.04245283383095776|   0.033894822839814646|0.05145749343665868|
|      443344|      314665| 1.142749702651699|0.014950714309587061|-21.451282279521003|0.010182321189966766|   0.007973364462759302|0.05145749343665868|
|      443344|      314665|2.1319522132688857|0.027892554588561498| -19.47287725828663|0.024584343524288666|   0.018986749937267202|0.05145749343665868|
|      443344|      314665|1.1369685806144145|0.014875079283152369| -21.4628445235

In [45]:
output_features.repartition(1).write.option("header", "true").mode("overwrite").csv("similarity_measures.csv")

In [47]:
ic_output = training_data.join(output_features, (training_data["concept_id_1"] == output_features["concept_id_1"]) 
                   & (training_data["concept_id_2"] == output_features["concept_id_2"]), "left_outer") \
    .select([training_data[field] for field in training_data.schema.fieldNames()] + [output_features["MICA"] \
                                .alias("information_content_semantic_sim"), output_features['lin_measure'], \
                                output_features['jiang_measure'], output_features['information_coefficient']])

In [None]:
#ic_output.repartition(1).write.option("header", "true").mode("overwrite").csv("PheML_refset_sample_semantic_sim_new.csv")