In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window
import os
import copy
import configparser

In [56]:
class Concept:
    """A concept described by an id, a name and optional synonyms."""

    def __init__(self, concept_id: int, concept_name: str, concept_synonyms=None):
        """Store the identifying fields of the concept.

        Args:
            concept_id: Identifier of the concept.
            concept_name: Name of the concept.
            concept_synonyms: Optional synonyms for the concept; kept as given
                (``None`` when not supplied).
        """
        self.concept_id = concept_id
        # Keep the name that was passed in; assigning concept_id here would
        # silently discard it.
        self.concept_name = concept_name
        self.concept_synonyms = concept_synonyms

In [57]:
class Feature:
    """Base class for a named feature computed on a pair of concepts."""

    def __init__(self, name: str):
        """Remember the name of the feature."""
        self.name = name

    def description(self):
        """Return a one-sentence description containing the feature name."""
        template = "This feature is {}"
        return template.format(self.name)

    def compute(self, concept_1: Concept, concept_2: Concept):
        """Placeholder for the feature computation; the base class returns None."""
        return None

In [58]:
class SemanticSimilarityFeature(Feature):
    """Feature that scores two concepts by the overlap of their ancestor paths.

    On construction, the rows of ``dbo.concept_ancestor`` with
    ``min_levels_of_separation=1`` are loaded over JDBC and collected into an
    in-memory mapping. Each concept is then expanded into dot-separated paths
    of concept ids, and two concepts are scored by the longest shared path
    prefix relative to the path lengths.
    """

    def __init__(self, name: str, base_url: str, configs=None, spark_session=None):
        """Load the direct-parent mapping used by the similarity computation.

        Args:
            name: Name of the feature.
            base_url: JDBC url of the database holding ``dbo.concept_ancestor``.
            configs: JDBC connection properties passed to the Spark reader.
                Defaults to an empty dict.
            spark_session: Spark session used for the read. When omitted, the
                session returned by ``SparkSession.builder.getOrCreate()`` is
                used.
        """
        Feature.__init__(self, name)
        self.base_url = base_url
        # Build a fresh dict per instance; a ``dict()`` default in the signature
        # would be one object shared by every instance created without configs.
        self.configs = {} if configs is None else configs

        # Do not depend on a notebook-level ``spark`` variable being defined.
        session = spark_session if spark_session is not None \
            else SparkSession.builder.getOrCreate()

        # Maps the value in column 1 of each row to the group of values found
        # in column 0. NOTE(review): presumably column 0 is the ancestor id and
        # column 1 the descendant id, i.e. child -> direct parents; confirm
        # against the table layout. The attribute is a plain mapping despite
        # the ``_df`` suffix, which is kept for compatibility.
        self.direct_child_parent_df = session.read \
            .jdbc(base_url, "dbo.concept_ancestor", properties=self.configs) \
            .where("min_levels_of_separation=1") \
            .rdd.map(lambda r: (r[1], r[0])).groupByKey().collectAsMap()

    def compute(self, concept_1: Concept, concept_2: Concept):
        """Return ``(score, best_path_1, best_path_2)`` for the two concepts."""
        return self.calculate_concept_sim(concept_1.concept_id, concept_2.concept_id)

    def calculate_concept_sim(self, concept_id_1, concept_id_2):
        """Expand both concept ids into paths and return the best pairwise score.

        Returns:
            The tuple produced by ``calculate_pairwise_sim``.
        """
        node_paths_1 = self.recur_connect_path_bottom_up(concept_id_1)
        node_paths_2 = self.recur_connect_path_bottom_up(concept_id_2)
        return self.calculate_pairwise_sim(node_paths_1, node_paths_2)

    def recur_connect_path_bottom_up(self, concept_id):
        """Return every path leading to ``concept_id`` as dot-separated ids.

        Each path ends with ``concept_id`` and is built by recursively
        prepending the paths of its entries in ``direct_child_parent_df``. A
        concept without an entry in the mapping yields an empty list.
        """
        node_paths = []
        if concept_id in self.direct_child_parent_df:
            for parent_concept_id in self.direct_child_parent_df[concept_id]:
                parent_node_paths = self.recur_connect_path_bottom_up(parent_concept_id)
                # A parent with no paths of its own starts the path itself.
                prefixes = parent_node_paths if parent_node_paths else [parent_concept_id]
                for prefix in prefixes:
                    node_paths.append(str(prefix) + '.' + str(concept_id))
        return node_paths

    @staticmethod
    def calculate_pairwise_sim(node_paths_1, node_paths_2):
        """Find the highest-scoring pair of paths across the two lists.

        Returns:
            ``(max_score, best_node_path_1, best_node_path_2)``; the result is
            ``(0, '', '')`` when either list is empty or no pair scores above 0.
        """
        max_score = 0
        best_node_path_1 = ''
        best_node_path_2 = ''
        for node_path_1 in node_paths_1:
            for node_path_2 in node_paths_2:
                score = SemanticSimilarityFeature.calculate_sim(node_path_1, node_path_2)
                # Strict comparison keeps the first pair reaching the maximum.
                if max_score < score:
                    max_score = score
                    best_node_path_1 = node_path_1
                    best_node_path_2 = node_path_2
        return (max_score, best_node_path_1, best_node_path_2)

    @staticmethod
    def calculate_sim(node_path_1, node_path_2):
        """Score two dot-separated paths by the length of their common prefix.

        Returns:
            ``2 * shared / (len_1 + len_2)`` where ``shared`` is the number of
            leading ids the two paths have in common and ``len_1`` / ``len_2``
            are the number of ids in each path.
        """
        node_path_1_ids = node_path_1.split(".")
        node_path_2_ids = node_path_2.split(".")

        shared_distance = 0
        # zip stops at the shorter path; counting stops at the first mismatch.
        for id_1, id_2 in zip(node_path_1_ids, node_path_2_ids):
            if id_1 != id_2:
                break
            shared_distance += 1
        return (shared_distance * 2) / (len(node_path_1_ids) + len(node_path_2_ids))

In [24]:
# Two sample concepts used to exercise the features.
concept_1 = Concept(concept_id=1, concept_name="hypertension")
concept_2 = Concept(concept_id=2, concept_name="hypertensive disorder")

In [63]:
# Parse the database connection properties from the ini file.
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means nothing was loaded. Fail here
# with a clear message instead of a later KeyError on 'base_url'.
if not config.read("omop_database_properties.ini"):
    raise FileNotFoundError("Could not read omop_database_properties.ini")
# The connection settings are taken from the parser's defaults.
properties = config.defaults()
base_url = properties["base_url"]

In [66]:
# Display the parsed connection properties.
# NOTE(review): this echoes the 'user' and 'password' entries into the saved
# notebook output; clear the output before sharing if they hold real values.
properties

OrderedDict([('base_url',
              'jdbc:jtds:sqlserver://url;useNTLMv2=true;domain=NYH;databaseName=ohdsi_cumc_deid_pending'),
             ('driver', 'net.sourceforge.jtds.jdbc.Driver'),
             ('user', 'DB_USER'),
             ('password', 'DB_PASSWORD')])

In [52]:

# Build the feature; construction reads dbo.concept_ancestor over JDBC.
semanticSimilarityFeature = SemanticSimilarityFeature(
    name="semantic similarity",
    base_url=base_url,
    configs=properties,
)

Py4JJavaError: An error occurred while calling o34.jdbc.
: java.sql.SQLException: Unknown server host name 'url'.
	at net.sourceforge.jtds.jdbc.JtdsConnection.<init>(JtdsConnection.java:427)
	at net.sourceforge.jtds.jdbc.Driver.connect(Driver.java:184)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:63)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:54)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:56)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:210)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:35)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
	at org.apache.spark.sql.DataFrameReader.jdbc(DataFrameReader.scala:238)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: url
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:184)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:589)
	at net.sourceforge.jtds.jdbc.SharedSocket.createSocketForJDBC3(SharedSocket.java:288)
	at net.sourceforge.jtds.jdbc.SharedSocket.<init>(SharedSocket.java:251)
	at net.sourceforge.jtds.jdbc.JtdsConnection.<init>(JtdsConnection.java:331)
	... 22 more
