In [None]:
# Star import stays first so the explicit imports below take precedence over
# any same-named symbols it exports.
from functions import *

import json
import os
import unittest

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
# ---------------------------------------------------------------------------
# Notebook-wide settings read by the tests below.
# ---------------------------------------------------------------------------

# Passed as the ``forward`` argument of the encoders; test_feature_vector also
# uses it to choose between type_generate (True) and type_generate_R (False).
forward = True
# Passed as the ``iri`` argument of json_to_encoding / document_to_encoding.
iri = False
# Passed to the type generators in test_feature_vector
# (node flag first, then edge flag).
specific_types_edge = False
specific_types_node = False
# Passed to the type generators and to count_prov_types in test_feature_vector.
level = 4

# Passed as the ``qualified_names`` argument of json_to_encoding.
qualified_names = {
    "xsd:QName",
    "prov:QUALIFIED_NAME"
}

# Dataset locations. The defaults are the original author's local paths; set
# PROV_TEST_JSON_DIR / PROV_TEST_JSON_FILE to run the notebook elsewhere
# without editing code.
# os.path.join(..., '') guarantees a trailing separator because the tests
# build file paths by concatenating file names onto this directory.
path_to_json = os.path.join(
    os.environ.get(
        'PROV_TEST_JSON_DIR',
        '/Users/cuiyeshuai/Documents/UG modules/submission/openprov/',
    ),
    '',
)
json_file = os.environ.get(
    'PROV_TEST_JSON_FILE',
    '/Users/cuiyeshuai/Documents/UG modules/submission/datasets/CM-Buildings/Building729.0.json',
)

In [None]:
class TestFunctions(unittest.TestCase):
    """Checks for the PROV encoding, prov-type generation and feature-vector helpers.

    Every test reads the notebook-level settings (``iri``, ``forward``,
    ``level``, ``qualified_names``, ``path_to_json``, ``json_file``, ...) and
    calls helpers brought into scope by ``from functions import *``.
    """

    @staticmethod
    def _read_text(path):
        """Return the whole content of ``path``; the file is closed even if reading fails."""
        with open(path, 'r') as handle:
            return handle.read()

    @staticmethod
    def _dataset_files():
        """Return the sorted paths of the files in ``path_to_json`` whose names end with ``0.json``."""
        return sorted(
            os.path.join(path_to_json, name)
            for name in os.listdir(path_to_json)
            if name.endswith('0.json')
        )

    @staticmethod
    def _count_records(document, record_types):
        """Count the entries stored in ``document`` under each key of ``record_types`` that is present."""
        return sum(
            len(document.get(rec_type))
            for rec_type in record_types
            if document.get(rec_type) is not None
        )

    def test_node_and_edge_number(self):
        """The encoding has as many nodes and edges as the decoded JSON lists under PROV_NODE / PROV_EDGE."""
        data = self._read_text(json_file)
        prov_encoding = json_to_encoding(data, iri=iri, forward=forward, qualified_names=qualified_names)
        nodes, edges = prov_encoding[0], prov_encoding[1]

        node_number = len(list(nodes.keys()))
        # One edge per (start, end) pair held in the second component of the encoding.
        edge_number = sum(1 for start in edges for _ in edges[start])

        document = json.loads(data)
        node_number_file = self._count_records(document, PROV_NODE)
        edge_number_file = self._count_records(document, PROV_EDGE)

        self.assertEqual(node_number, node_number_file)
        self.assertEqual(edge_number, edge_number_file)

    def test_node_label(self):
        """A known node of the sample file is encoded with the expected kind and type set."""
        data = self._read_text(json_file)
        prov_encoding = json_to_encoding(data, iri=iri, forward=forward, qualified_names=qualified_names)
        node = prov_encoding[0]["BuildingVerification463"]
        self.assertEqual(node[0], "activity")
        # NOTE(review): with iri = True the expected type is presumably the full IRI
        # "http://www.orchid.ac.uk/ontologies/collabmap.owl#BuildingVerification" - confirm.
        self.assertEqual(node[1], frozenset(["collabmap:BuildingVerification"]))

    def test_equal_encoding(self):
        """document_to_encoding and json_to_encoding agree on the sample file."""
        data = self._read_text(json_file)
        # The path itself (not the text) is handed to ProvDocument.deserialize here.
        prov = document_to_encoding(ProvDocument.deserialize(json_file), iri=iri, forward=forward)
        prov1 = json_to_encoding(data, iri=iri, forward=forward, qualified_names=qualified_names)
        self.assertEqual(prov, prov1)

    def test_equal_encoding_all(self):
        """document_to_encoding and json_to_encoding agree on every dataset file."""
        paths = self._dataset_files()
        self.assertTrue(paths, "no '*0.json' files found in path_to_json")
        for path in paths:
            # subTest reports the offending file and keeps checking the others.
            with self.subTest(file=os.path.basename(path)):
                data = self._read_text(path)
                prov = document_to_encoding(ProvDocument.deserialize(content=data), iri=iri, forward=forward)
                prov1 = json_to_encoding(data, iri=iri, forward=forward, qualified_names=qualified_names)
                self.assertEqual(prov, prov1)

    def test_type_generation(self):
        """type_generate on the forward encoding equals type_generate_R on the reverse encoding."""
        paths = self._dataset_files()
        self.assertTrue(paths, "no '*0.json' files found in path_to_json")
        for path in paths:
            with self.subTest(file=os.path.basename(path)):
                data = self._read_text(path)
                prov = json_to_encoding(data, iri=iri, forward=True, qualified_names=qualified_names)
                prov1 = json_to_encoding(data, iri=iri, forward=False, qualified_names=qualified_names)
                prov_type = type_generate(prov, 5, True, True)
                prov_type_R = type_generate_R(prov1, 5, True, True)
                self.assertEqual(prov_type, prov_type_R)

    def test_type_generation_mixed(self):
        """type_generate and type_generate_mixed give the same result on the forward encoding."""
        paths = self._dataset_files()
        self.assertTrue(paths, "no '*0.json' files found in path_to_json")
        for path in paths:
            with self.subTest(file=os.path.basename(path)):
                data = self._read_text(path)
                prov = json_to_encoding(data, iri=iri, forward=True, qualified_names=qualified_names)
                prov_type = type_generate(prov, 5, False, False)
                prov_type_mixed = type_generate_mixed(prov, 5, False, False)
                self.assertEqual(prov_type, prov_type_mixed)

    def test_feature_vector(self):
        """Each sparse feature vector carries exactly the counts of its graph's prov-type table."""
        # NOTE(review): this glob points at a different local directory than
        # path_to_json / json_file - confirm that this is intended.
        json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
        conf = SparkConf().setAppName("spark").setMaster("local[*,20]").set("spark.driver.memory", "10g")
        sc = SparkContext(conf=conf)
        # Always stop the context, otherwise re-running the tests in the same
        # kernel fails because a SparkContext is already active.
        self.addCleanup(sc.stop)
        sc.setLogLevel("ERROR")
        spark = SparkSession(sc)

        file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
        # (file_name, encoding of the document)
        encoding_rdd = file_and_path_rdd.map(
            lambda x: (x[0].split("/")[-1], json_to_encoding(x[1], iri, forward, qualified_names))
        )
        # (file_name, prov_types of nodes)
        if forward:
            types_rdd = encoding_rdd.map(
                lambda x: (x[0], type_generate(x[1], level, specific_types_node, specific_types_edge))
            )
        else:
            types_rdd = encoding_rdd.map(
                lambda x: (x[0], type_generate_R(x[1], level, specific_types_node, specific_types_edge))
            )
        # (file_name, prov_types occurrence in the graph)
        types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level, x[1])))
        # All prov_types in this collection of graphs
        all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
        # Number of distinct prov_types
        types_count = len(all_types)
        # index_map for prov_types, prov_type -> index
        index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
        # Construct feature vectors for each graph
        sparse_matrix_rdd = types_count_rdd.map(lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))

        counted = types_count_rdd.take(100)
        vectors = sparse_matrix_rdd.take(100)
        self.assertTrue(counted, "no graphs were loaded from json_folder")
        self.assertEqual(len(counted), len(vectors))

        # Compare whatever was returned (up to 100 graphs) instead of assuming
        # that exactly 100 files exist.
        for (count_name, counts), (vector_name, vector) in zip(counted, vectors):
            with self.subTest(file=count_name):
                self.assertEqual(count_name, vector_name)
                sum_a = 0
                for prov_type in counts.keys():
                    self.assertEqual(counts[prov_type], vector[index_map[prov_type]])
                    sum_a += counts[prov_type]
                sum_b = sum(vector[position] for position in range(len(vector)))
                # Equal totals mean the vector holds no counts beyond those in the table.
                self.assertEqual(sum_a, sum_b)

In [None]:
# Run the tests defined in this notebook and report each one by name.
unittest.main(
    argv=[''],      # explicit argv so unittest does not parse the kernel's own sys.argv
    verbosity=2,
    exit=False,     # keep the kernel alive: do not call sys.exit() when the run finishes
)