In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import Row
import pandas as pd
import itertools
from pyspark.sql.functions import col

### Read concept ancestor file
#### Cast the string columns to integers and build the direct child-to-parent lookup (`direct_child_parent_df`, a Python dict keyed by child concept id)

In [None]:
# Cast the two concept-id columns and the two separation-level columns to IntegerType.
# withColumn with an existing column name replaces that column in place.
concept_ancestor = (
    concept_ancestor
    .withColumn("ancestor_concept_id", col("ancestor_concept_id").cast(IntegerType()))
    .withColumn("descendant_concept_id", col("descendant_concept_id").cast(IntegerType()))
    .withColumn("min_levels_of_separation", col("min_levels_of_separation").cast(IntegerType()))
    .withColumn("max_levels_of_separation", col("max_levels_of_separation").cast(IntegerType()))
)

In [None]:
# Build a driver-side dict from the rows whose min_levels_of_separation is 1.
# Each row is keyed by its second field with its first field as the value;
# presumably these are descendant_concept_id -> ancestor_concept_id
# (i.e. child -> parent) -- TODO confirm the column order of concept_ancestor.
direct_child_parent_df = (
    concept_ancestor
    .filter(col("min_levels_of_separation") == 1)
    .rdd
    .map(lambda row: (row[1], row[0]))
    .groupByKey()       # one entry per key, holding an iterable of its values
    .collectAsMap()     # collected to the driver as a plain dict
)

### Define the functions  

In [2]:
def recur_connect_path_bottom_up(concept_id, direct_child_parent_df):
    """Enumerate every dot-separated path that ends at ``concept_id``.

    The lookup is walked upwards recursively: for each parent of
    ``concept_id`` the parent's own paths are computed first, and
    ``concept_id`` is appended to each of them.

    Args:
        concept_id: Key to look up in ``direct_child_parent_df``.
        direct_child_parent_df: Mapping from a concept id to an iterable of
            its direct parent ids.

    Returns:
        list[str]: Paths such as ``"1.2.3"``, each ending in ``concept_id``.
        The list is empty when ``concept_id`` is not a key of the mapping.
    """
    if concept_id not in direct_child_parent_df:
        return []

    suffix = '.' + str(concept_id)
    paths = []
    for parent_id in direct_child_parent_df[concept_id]:
        ancestor_paths = recur_connect_path_bottom_up(parent_id, direct_child_parent_df)
        if ancestor_paths:
            # Extend every path that reaches the parent by one more step.
            paths.extend(str(ancestor_path) + suffix for ancestor_path in ancestor_paths)
        else:
            # The parent has no recorded parents itself, so it starts the path.
            paths.append(str(parent_id) + suffix)
    return paths

In [3]:
def calculate_sim(node_path_1, node_path_2):
    """Score how much of their leading segments two dot-separated paths share.

    Both paths are split on ``"."`` and compared position by position from
    the start; counting stops at the first differing segment or when the
    shorter path runs out.

    Args:
        node_path_1: Dot-separated path string, e.g. ``"1.2.3"``.
        node_path_2: Dot-separated path string.

    Returns:
        ``2 * shared / (len_1 + len_2)``, where ``shared`` is the number of
        matching leading segments and ``len_1``/``len_2`` are the segment
        counts of the two paths.
    """
    ids_1 = node_path_1.split(".")
    ids_2 = node_path_2.split(".")

    shared_distance = 0
    # zip stops at the shorter path, which is where the comparison ends anyway.
    for id_1, id_2 in zip(ids_1, ids_2):
        if id_1 != id_2:
            break
        shared_distance += 1

    total_length = len(ids_1) + len(ids_2)
    return (shared_distance * 2) / total_length

In [4]:
def calculate_pairwise_sim(node_paths_1, node_paths_2):
    """Find the best-scoring pair of paths across two collections.

    Every path in ``node_paths_1`` is scored against every path in
    ``node_paths_2`` with ``calculate_sim``; the first pair reaching the
    highest score is kept.

    Args:
        node_paths_1: Iterable of dot-separated path strings.
        node_paths_2: Iterable of dot-separated path strings.

    Returns:
        tuple: ``(max_score, best_node_path_1, best_node_path_2)``.
        ``(0.0, '', '')`` is returned when either collection is empty or no
        pair scores above zero.
    """
    # Start from a float, not the int 0: when this value is returned unchanged
    # it goes through a FloatType() Spark UDF, which turns a Python int into NULL.
    max_score = 0.0
    best_node_path_1 = ''
    best_node_path_2 = ''
    # The nested loops do nothing when either collection is empty, so no
    # explicit emptiness check is needed.
    for node_path_1 in node_paths_1:
        for node_path_2 in node_paths_2:
            score = calculate_sim(node_path_1, node_path_2)
            if score > max_score:
                max_score = score
                best_node_path_1 = node_path_1
                best_node_path_2 = node_path_2
    return (max_score, best_node_path_1, best_node_path_2)

### Create the udf for semantic similarity calculation 

In [5]:
def _semantic_score(concept_id_1, concept_id_2):
    """Return the best path-similarity score between two concept ids.

    The paths of each concept are built from the driver-side
    ``direct_child_parent_df`` lookup, then the highest pairwise score is
    taken from ``calculate_pairwise_sim``.

    Args:
        concept_id_1: First concept id (one value per row).
        concept_id_2: Second concept id (one value per row).

    Returns:
        float: The maximum similarity score for the pair.
    """
    node_paths_1 = recur_connect_path_bottom_up(concept_id_1, direct_child_parent_df)
    node_paths_2 = recur_connect_path_bottom_up(concept_id_2, direct_child_parent_df)
    best_score = calculate_pairwise_sim(node_paths_1, node_paths_2)[0]
    # A FloatType() UDF yields NULL for a Python int, so a "no match" score
    # of integer 0 must be coerced to 0.0 before it is returned to Spark.
    return float(best_score)


semantic_score = udf(_semantic_score, FloatType())

### Run the udf
#### df is the dataframe with two columns: concept_id_1 and concept_id_2
#### Or you can pass the two concept id columns directly to the semantic_score udf

In [None]:
# Show both concept id columns alongside their computed similarity score.
df.select(
    col("concept_id_1"),
    col("concept_id_2"),
    semantic_score(col("concept_id_1"), col("concept_id_2")).alias("semantic_score"),
).show()