In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from features.feature import *
from features.lexical_similarity_feature import *
from features.semantic_sim_feature import *
from features.embedding_feature import *
from features.cooccurrence_feature import *
from features.information_content_feature import *
import configparser
import os

In [18]:
# Load resource locations from the INI file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list instead of failing later
# with an unexplained KeyError.
CONFIG_FILE = "resource_property_file.ini"
config = configparser.ConfigParser()
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        "Could not read configuration file {!r} (cwd: {})".format(CONFIG_FILE, os.getcwd())
    )
properties = config.defaults()
if "resource_folder" not in properties:
    raise KeyError(
        "'resource_folder' is missing from the [DEFAULT] section of {!r}".format(CONFIG_FILE)
    )
# Joining with '' guarantees a trailing path separator, because later cells
# build file paths by plain string concatenation onto resource_folder.
resource_folder = os.path.join(properties["resource_folder"], '')

# Every later cell reads data through `spark`, but nothing in this notebook
# defines it. getOrCreate() reuses the active session when the kernel already
# provides one and creates a session otherwise, so Restart & Run All works.
spark = SparkSession.builder.getOrCreate()

## Load the training example

In [13]:
# Read the labelled sample reference set from CSV; the first row is treated as
# the header and column types are inferred by Spark.
sample_refset_path = resource_folder + "pheMLSamplerefset.csv"
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
dataset = csv_reader.csv(sample_refset_path)

## Generate the lexical similarity features

In [None]:
# Build the lexical-similarity feature annotator.
# NOTE(review): the string argument is presumably the feature's display name;
# confirm against the LexicalSimilarityFeature class (pulled in via a star import).
lexicalSimilarityFeature = LexicalSimilarityFeature("lexical similarity")

In [None]:
# Rebind `dataset` to the output of the lexical-similarity annotator.
# Because the input name is overwritten, re-running this cell feeds the
# already-annotated frame back into annotate().
dataset = lexicalSimilarityFeature.annotate(dataset)

## Generate the cooccurrence features

In [None]:
# Load one co-occurrence matrix per time window (parquet, under resource_folder).
# The reads happen in the listed order and are unpacked into one name per window.
(
    cooccurrence_visit_df,
    cooccurrence_60_df,
    cooccurrence_180_df,
    cooccurrence_360_df,
    cooccurrence_lifetime_df,
) = [
    spark.read.parquet(resource_folder + matrix_name)
    for matrix_name in (
        "cooccurrence_matrix_visit",
        "cooccurrence_matrix_60",
        "cooccurrence_matrix_180",
        "cooccurrence_matrix_360",
        "cooccurrence_matrix_lifetime",
    )
]

In [None]:
# Wrap each co-occurrence matrix in an OmopCooccurrenceFeature, pairing every
# matrix with its feature label. Construction order matches the listing order.
(
    cooccurrence_visit_feature,
    cooccurrence_60_feature,
    cooccurrence_180_feature,
    cooccurrence_360_feature,
    cooccurrence_lifetime_feature,
) = [
    OmopCooccurrenceFeature(feature_label, matrix_df)
    for feature_label, matrix_df in (
        ("cooccurrence visit", cooccurrence_visit_df),
        ("cooccurrence 60 days", cooccurrence_60_df),
        ("cooccurrence 180 days", cooccurrence_180_df),
        ("cooccurrence 360 days", cooccurrence_360_df),
        ("cooccurrence lifetime", cooccurrence_lifetime_df),
    )
]

In [None]:
# Apply the co-occurrence annotators one after another, shortest window first.
# `dataset` is rebound after every step, so each annotator receives the output
# of the previous one.
for cooccurrence_feature in (
    cooccurrence_visit_feature,
    cooccurrence_60_feature,
    cooccurrence_180_feature,
    cooccurrence_360_feature,
    cooccurrence_lifetime_feature,
):
    dataset = cooccurrence_feature.annotate(dataset)

## Generate semantic similarity and information content features

In [None]:
# The concept_ancestor table is shared by both annotators built in this cell.
concept_ancestor_path = resource_folder + "concept_ancestor"
concept_ancestor = spark.read.parquet(concept_ancestor_path)
semanticSimilarityFeature = SemanticSimilarityFeature(
    "semantic similarity",
    concept_ancestor,
)

# The information-content annotator takes its own table plus concept_ancestor.
information_content_path = resource_folder + "information_content"
information_content = spark.read.parquet(information_content_path)
informationContentFeature = InformationContentFeature(
    "information content",
    information_content,
    concept_ancestor,
)

In [None]:
# Run the semantic-similarity annotator first, then the information-content
# annotator, rebinding `dataset` after each step.
for ontology_feature in (semanticSimilarityFeature, informationContentFeature):
    dataset = ontology_feature.annotate(dataset)

## Generate embedding similarity features

In [None]:
# Lifetime-window embedding: read the weight table first, then the vocabulary
# table, and hand both to the EmbeddingFeature annotator.
weights_lifetime, vocab_lifetime = [
    spark.read.parquet(resource_folder + relative_path)
    for relative_path in (
        "embedding_time_window_lifetime/weight",
        "embedding_time_window_lifetime/concept_occurrence_lifetime",
    )
]
embeddingFeature_lifetime = EmbeddingFeature(
    "lifetime cooccur embedding",
    vocab_lifetime,
    weights_lifetime,
)

In [None]:
# "5" time-window embedding: read the weight table first, then the vocabulary
# table, and hand both to the EmbeddingFeature annotator.
weights_5, vocab_5 = [
    spark.read.parquet(resource_folder + relative_path)
    for relative_path in (
        "embedding_time_window_5/weight",
        "embedding_time_window_5/concept_occurrence_5",
    )
]
embeddingFeature_5 = EmbeddingFeature(
    "5 year cooccur embedding",
    vocab_5,
    weights_5,
)

In [None]:
# Visit-window embedding: read the weight table first, then the vocabulary
# table, and hand both to the EmbeddingFeature annotator.
weights_visit, vocab_visit = [
    spark.read.parquet(resource_folder + relative_path)
    for relative_path in (
        "embedding_time_window_visit/weight",
        "embedding_time_window_visit/concept_occurrence_visit",
    )
]
embeddingFeature_visit = EmbeddingFeature(
    "visit cooccur embedding",
    vocab_visit,
    weights_visit,
)

In [None]:
# Apply the three embedding annotators in order (lifetime, 5 year, visit),
# rebinding `dataset` after each step.
for embedding_feature in (
    embeddingFeature_lifetime,
    embeddingFeature_5,
    embeddingFeature_visit,
):
    dataset = embedding_feature.annotate(dataset)

## Fill nulls with default values and write the feature set

In [None]:
# Print the schema of the annotated dataset. This is a convenient place to
# check the column names that the fillna mapping in the next cell refers to.
dataset.printSchema()

In [None]:
# Per-column replacement values for nulls. 'min_distance' uses -1; every other
# listed column uses the string '0'.
# NOTE(review): the '0' defaults are strings, not numbers. Confirm they are
# applied as intended to the feature columns, and that every key below matches
# a column in the schema printed above.
null_defaults = {'min_distance': -1}
null_defaults.update(dict.fromkeys(
    [
        'risk_ratio',
        'risk_diff',
        'distance',
        'ratio',
        'jaro',
        'jaro_wrinkler',
        'fuzz_partial_ratio',
        'cooccurrence_lifetime',
        'cooccurrence_5_year',
        'cooccurrence_visit',
        'lifetime_cooccur_embedding_cosine',
        '5_year_cooccur_embedding_cosine',
        'visit_cooccur_embedding_cosine',
        'semantic_similarity',
        'mica_information_content',
        'lin_measure',
        'jiang_measure',
        'relevance_measure',
        'information_coefficient',
        'graph_ic_measure',
    ],
    '0',
))

# Write the null-filled feature set as CSV with a header row, replacing any
# previous output at the same location.
features_output_path = resource_folder + "pheMLSamplerefset_features.csv"
features_writer = dataset.fillna(null_defaults).write.mode('overwrite').option("header", "true")
features_writer.csv(features_output_path)