diff --git a/.github/workflows/assemble-files.yml b/.github/workflows/assemble-files.yml index 7d582919..3eb6ef0c 100644 --- a/.github/workflows/assemble-files.yml +++ b/.github/workflows/assemble-files.yml @@ -85,7 +85,7 @@ jobs: - name: Download SemTK shell: bash run: | - curl -LSfs https://github.com/ge-semtk/semtk/releases/download/v2.5.0-20221117/semtk-opensource-v2.5.0-20221117-dist.tar.gz -o RACK/rack-box/files/semtk.tar.gz + curl -LSfs https://github.com/ge-semtk/semtk/releases/download/v2.5.0-20230110/semtk-opensource-v2.5.0-20230110-dist.tar.gz -o RACK/rack-box/files/semtk.tar.gz - name: Download CSS stylesheet shell: bash diff --git a/.github/workflows/build-virtual-machine.yml b/.github/workflows/build-virtual-machine.yml index 61cdcd1a..aa84b010 100644 --- a/.github/workflows/build-virtual-machine.yml +++ b/.github/workflows/build-virtual-machine.yml @@ -52,7 +52,7 @@ jobs: - name: Split rack-box virtual machine run: | cd RACK/rack-box - mv output-output-virtualbox-ovf rack-box-${{ inputs.version }} + mv output-virtualbox-ovf rack-box-${{ inputs.version }} zip -r rack-box-${{ inputs.version }}.zip rack-box-${{ inputs.version }} split -b 1500m rack-box-${{ inputs.version }}.zip rack-box-${{ inputs.version }}.zip rm rack-box-${{ inputs.version }}.zip @@ -66,6 +66,10 @@ jobs: RACK/rack-box/GitHub-Release-README.md RACK/rack-box/rack-box-${{ inputs.version }}.zip* + # softprops/action-gh-release has many issues and PRs filed + # against it; replace it with "gh release upload" if CI fails + # run: gh release upload ${{ github.event.release.tag_name }} RACK/rack-box/GitHub-Release-README.md RACK/rack-box/rack-box-${{ inputs.version }}.zip* --clobber + - name: Upload split virtual machine to release uses: softprops/action-gh-release@v1 if: github.event_name == 'release' diff --git a/.github/workflows/continuous.yml b/.github/workflows/continuous.yml index bac317f1..ffacb07a 100644 --- a/.github/workflows/continuous.yml +++ b/.github/workflows/continuous.yml @@ -51,7 +51,7 @@ jobs: ./assist/bin/check - name: Lint shell scripts - uses: ludeeus/action-shellcheck@1.1.0 + uses: ludeeus/action-shellcheck@2.0.0 env: SHELLCHECK_OPTS: -x -P assist/databin -e SC1008 diff --git a/.gitignore b/.gitignore index f1258253..79322083 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,5 @@ cli/.project /Turnstile-Example/Turnstile-IngestionPackage/CounterApplicationImplementation/*.o rack-ui/cache/ rack-ui/.project +EntityResolution/.project +EntityResolution/Resolutions/Summary.csv diff --git a/EntityResolution/CheckBar.py b/EntityResolution/CheckBar.py new file mode 100644 index 00000000..85d3ba3d --- /dev/null +++ b/EntityResolution/CheckBar.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 + +from tkinter import * + +class Checkbar(Frame): + def __init__(self, parent=None, picks=[], side=LEFT, anchor=W, command=None): + Frame.__init__(self, parent) + self.command = command + self.vars = {} + self.buttons = [] + for pick in picks: + self.buttons.append(Checkbutton(self, text=pick, command=lambda pick=pick: self.callback(pick))) + self.buttons[-1].pack(side=side, anchor=anchor, expand=YES) + self.vars[pick] = True + self.buttons[-1].select() + def callback(self, pick): + self.vars[pick] = not self.vars[pick] + self.command() + def state(self): + return self.vars diff --git a/EntityResolution/CreateIngestion.py b/EntityResolution/CreateIngestion.py new file mode 100755 index 00000000..0ed5888e --- /dev/null +++ b/EntityResolution/CreateIngestion.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import DataAccess 
as da
+import tkinter.filedialog as fd
+import shutil
+import os
+import os.path
+DEBUG = False
+def Debug(*args):
+    if DEBUG:
+        print(*args)
+#####################################
+# Queries
+#####################################
+
+#####################################
+# helper Functions
+#####################################
+def createIngestion(decisions):
+    saveLocation = fd.asksaveasfilename(filetypes=[("Manifest File","*.zip")], defaultextension=".zip")
+    print("Saving Manifest File to {}".format(saveLocation))
+    tempFolder = os.path.splitext(saveLocation)[0]
+    shutil.copytree("manifest_template", tempFolder)
+
+    with open(os.path.join(tempFolder, "resolutions","SAME_AS.csv"), "w") as outfile:
+        outfile.write("primary_identifier,primary_THING_type,secondary_identifier,secondary_THING_type\n")
+        for p in decisions:
+            #print(decisions[p] )
+            if decisions[p] != 4 and decisions[p] != 5:
+                for s in decisions[p]:
+                    if decisions[p][s] == 2 or decisions[p][s] == 3:
+                        print("Primary:{}".format(p))
+                        print("Secondary:{}".format(s))
+                        outfile.write('"{}","{}!","{}","{}!"\n'.format(da.getIdentifier(p), da.getType(p), da.getIdentifier(s), da.getType(s)))
+    shutil.make_archive(tempFolder, 'zip', tempFolder)
+
diff --git a/EntityResolution/DataAccess.py b/EntityResolution/DataAccess.py
new file mode 100644
index 00000000..13818f1d
--- /dev/null
+++ b/EntityResolution/DataAccess.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+import os
+import json
+import semtk3
+import os.path
+import time
+import RACK_CONSTANTS as rc
+def cacheData(e):
+    guid = e.split("#")[-1]
+    graph = "http://rack001/Data"
+    res = semtk3.query_raw_sparql(rc.dataQuery\
+        .replace("{{GUID}}",guid) \
+        .replace("{{GRAPH}}",graph),\
+        result_type=semtk3.RESULT_TYPE_GRAPH_JSONLD)
+    with open("cache/"+guid+".json", "w") as dataFile:
+        json.dump(res, dataFile, indent = 4)
+
+def getRelationships(e):
+    relationships = []
+    guid = e.split("#")[-1]
+    data = getData(e)["@graph"]
+    if type(data) == list:
+        identData = {}
+        for el in data:
+            identData[el['@id']] = el['PROV_S:identifier']
+        for el in data:
+            if el['@id'][6:] == guid:
+                for p in el:
+                    if type(el[p]) == dict:
+                        relationships.append((p, identData[el[p]['@id']], "Outgoing"))
+            else:
+                for p in el:
+                    if type(el[p]) == dict:
+                        relationships.append((p, el['PROV_S:identifier'], "Incoming"))
+    return relationships
+
+def getDataProperties(e):
+    dataProperties = []
+    guid = e.split("#")[-1]
+    data = getData(e)["@graph"]
+    if type(data) == list:
+        for el in data:
+            if el['@id'][6:] == guid:
+                for p in el:
+                    if type(el[p]) != dict:
+                        dataProperties.append((p, el[p]))
+                break
+    else:
+        for p in data:
+            if type(data[p]) != dict:
+                dataProperties.append((p, data[p]))
+    return dataProperties
+
+def getDescription(e):
+    guid = e.split("#")[-1]
+    data = getData(e)["@graph"]
+    if type(data) == list:
+        for el in data:
+            if el['@id'][6:] == guid:
+                if 'PROV_S:description' in el:
+                    return el['PROV_S:description']
+                else:
+                    return None
+    else:
+        if 'PROV_S:description' in data:
+            return data['PROV_S:description']
+        else:
+            return None
+
+def getType(e):
+    guid = e.split("#")[-1]
+    data = getData(e)["@graph"]
+    context = None
+    if "@context" in getData(e):
+        context = getData(e)["@context"]
+    elif "@context" in data:
+        context = data['@context']
+    else:
+        print("ERROR: Could not find context from data graph!!!")
+        print("{}".format(e))
+    if type(data) == list:
+        for el in data:
+            if el['@id'][6:] == guid:
+                if '@type' in el:
+                    ns, _type = el['@type'].split(":")
+                    return context[ns]+_type
+                else:
+                    return None
+    else:
+        if '@type' in data:
+            ns, _type = data['@type'].split(":")
+            return context[ns]+_type
+        else:
+            return None
+
+def getIdentifier(e):
+    guid = e.split("#")[-1]
+    data = getData(e)["@graph"]
+    if type(data) == list:
+        for el in data:
+            if el['@id'][6:] == guid:
+                if 'PROV_S:identifier' in el:
+                    return el['PROV_S:identifier']
+                else:
+                    return None
+    else:
+        if 'PROV_S:identifier' in data:
+            return data['PROV_S:identifier']
+        else:
+            return None
+
+def getData(e):
+    guid = e.split("#")[-1]
+    data = None
+    if not os.path.exists("cache/"+guid+".json"):
+        cacheData(e)
+    #This handles the multiprocessing case where one worker has created the cache file but not yet populated it with data
+    while os.path.getsize("cache/"+guid+".json") == 0:
+        time.sleep(0.1)
+    with open("cache/"+guid+".json", "r") as dataFile:
+        data = json.load(dataFile)
+    if "@graph" not in data:
+        data = {"@graph":data}
+    return data
+
+if __name__ == "__main__":
+    semtk3.upload_owl("Model.owl", rc.connStringSource2, model_or_data=semtk3.SEMTK3_CONN_DATA, conn_index = 0)
diff --git a/EntityResolution/Entity.py b/EntityResolution/Entity.py
new file mode 100644
index 00000000..6e343dce
--- /dev/null
+++ b/EntityResolution/Entity.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import DataAccess as da
+import tkinter as tk
+from tkinter import ttk
+class Entity(tk.Frame):
+
+    uri = None
+    def __init__(self, updateCallback):
+        super().__init__()
+        self.updateCallback = updateCallback
+        self.propertyString = ''
+
+        self.properties = ttk.Treeview(self, selectmode='browse')
+        self.properties["columns"]=["Property","Value"]
+        self.properties["show"]="headings"
+        self.properties.heading("Property", text="Property")
+        self.properties.heading("Value", text="Value")
+        self.properties.column("Property", width=200, stretch=tk.NO)
+        self.properties.bind('<<TreeviewSelect>>', self.selectProperty)
+        self.properties.pack(fill=tk.X, expand=True)
+
+        self.relationships = ttk.Treeview(self, selectmode='none')
+        self.relationships["columns"]=["Relationship","Identifier","Direction"]
+        self.relationships["show"]="headings"
+        self.relationships.heading("Identifier", text="Identifier")
+        self.relationships.heading("Relationship", text="Relationship")
+        self.relationships.heading("Direction", text="Direction")
+        self.relationships.column("Relationship", width=200, stretch=tk.NO)
+        self.relationships.pack(fill=tk.X, expand=True)
+    '''===================================================
+    Callback for selecting a property of an Entity
+    ==================================================='''
+    def selectProperty(self,a):
+        currItem = self.properties.focus()
+        if self.properties.item(currItem)['values'] != "": #if nothing is selected then we just stop, otherwise update the property string then run the callback to update the text box
+            self.propertyString = self.properties.item(currItem)['values'][1]
+            self.updateCallback()
+
+    def update(self, e):
+
+        self.propertyString = ''
+        # Clear ListView
+        for item in self.relationships.get_children():
+            self.relationships.delete(item)
+        for item in self.properties.get_children():
+            self.properties.delete(item)
+        print(e)
+        if e != None:
+            properties = da.getDataProperties(e)
+            relationships = da.getRelationships(e)
+            ## Update Relationships and Properties
+            for k in properties:
+                self.properties.insert("", 'end', values=k)
+            for k in relationships:
+                self.relationships.insert("", 'end', values=k)
diff --git a/EntityResolution/EntityResolver.py b/EntityResolution/EntityResolver.py
deleted file mode 100755
index 
a5509206..00000000 --- a/EntityResolution/EntityResolver.py +++ /dev/null @@ -1,297 +0,0 @@ -#!/usr/bin/env python3 -import os -import json -from difflib import SequenceMatcher -import csv -from colorama import Fore, Back, Style -import semtk3 - -DEBUG = False -REPORT_TEMPLATE = """{}{}: - Primary Identifier : {} - Secondary Identifier : {} - Description Ratio : {}""" -def Debug(*args): - if DEBUG: - print(*args) - - -class RequirementResolution: - requirementData = {} - resolutionData = {} - - def __init__(self): - # Loading raw requirement data - allDescriptions = "" - with open(os.path.join(".", "rawRequirements.csv")) as csvFile: - for row in csv.DictReader(csvFile): - if row["REQUIREMENT"] not in self.requirementData: - self.requirementData[row["REQUIREMENT"]] = {} - self.requirementData[row["REQUIREMENT"]]["identifier"]=row["identifier"] - self.requirementData[row["REQUIREMENT"]]["description"]=row["description"] - allDescriptions += row["identifier"]+"\n" - self.requirementData[row["REQUIREMENT"]]["REQUIREMENT_type"]=row["REQUIREMENT_type"] - self.requirementData[row["REQUIREMENT"]]["dataInsertedBy_identifier"]=list() - self.requirementData[row["REQUIREMENT"]]["dataInsertedBy_identifier"].append(row["dataInsertedBy_identifier"]) - # Loading existing resolution data - self.bagOfWords ={} - for w in self.cleanString(allDescriptions).split(" "): - if w not in self.bagOfWords: - self.bagOfWords[w] = 0 - self.bagOfWords[w] += 1 - - self.identifierStopWords = list() - for w in sorted(self.bagOfWords, key=self.bagOfWords.get, reverse=True): - #print(w, self.bagOfWords[w]) - if self.bagOfWords[w] / len(self.requirementData) > 0.05 and not w.isnumeric(): - self.identifierStopWords.append(w) - - print(self.identifierStopWords) - - with open(os.path.join(".","resolutionData.csv")) as csvFile: - for row in csv.DictReader(csvFile): - if row["primaryEntity"] not in self.resolutionData: - self.resolutionData[row["primaryEntity"]] = list() - self.resolutionData[row["primaryEntity"]].append([row["EntityResolution_type"],row["secondaryEntity"]]) - self.resolutionData ={} - - def cleanString(self, string): - cleanString = "" - for c in string: - if c.isalnum(): - cleanString+=c - else: - cleanString += " " - while cleanString.find(" ")!=-1: - cleanString = cleanString.replace(" "," ") - return cleanString.upper().rstrip(" ").lstrip(" ") - - def deepCleanString(self, string): - string =self.cleanString(string) - NewString = "" - for t in string.split(" "): - if t not in self.identifierStopWords: - NewString+= t + " " - - return NewString.lstrip() - - ###################################################################### - # Function check to see if the descriptions have discriminator words - ###################################################################### - def checkForDiscriminators(self, reqP, reqS) - descriptionP = self.cleanString(self.requirementData[reqP]["description"]).rstrip() - descriptionS = self.cleanString(self.requirementData[reqS]["description"]).rstrip() - - added = list() - removed = list() - matched = list() - stopped = list() - for p in descriptionP.split(" "): - if p not in descriptionS.split(" "): - if p not in self.identifierStopWords: - removed.append(p) - else: - stopped.append(p) - else: - if p not in self.identifierStopWords: - matched.append(p) - else: - stopped.append(p) - for s in descriptionS.split(" "): - if s not in descriptionP.split(" "): - if p not in self.identifierStopWords: - added.append(s) - else: - stopped.append(s) - - - discriminators == 
[["GPS1","GPS2"],["IRS1","IRS2"]] - - - return - ###################################################################### - # Function does a final categorization of potential matches - ###################################################################### - def FinalCategorization(self, reqP, reqS): - identifierP = self.deepCleanString(self.requirementData[reqP]["identifier"]) - identifierS = self.deepCleanString(self.requirementData[reqS]["identifier"]) - descriptionP = self.cleanString(self.requirementData[reqP]["description"]) - descriptionS = self.cleanString(self.requirementData[reqS]["description"]) - print(identifierP,"<->",identifierS) - if identifierP == identifierS: - print(REPORT_TEMPLATE.format(Fore.GREEN, "AssumedSameAs", self.requirementData[reqP]["identifier"], self.requirementData[reqS]["identifier"],"N/A")) - if reqP not in self.resolutionData: - self.resolutionData[reqP] = list() - self.resolutionData[reqP].append(["AssumedSameAs",reqS]) - return - - simRatio = 0.0 - if descriptionP !="" and descriptionS != "": - matcher = SequenceMatcher(None, descriptionP, descriptionS) - simRatio = matcher.ratio() - - if simRatio > 0.9: - print(REPORT_TEMPLATE.format(Fore.GREEN, "AssumedSameAs", self.requirementData[reqP]["identifier"], self.requirementData[reqS]["identifier"],simRatio)) - - if reqP not in self.resolutionData: - self.resolutionData[reqP] = list() - self.resolutionData[reqP].append(["AssumedSameAs",reqS]) - - elif simRatio < 0.5 and simRatio!=0.0: - print(REPORT_TEMPLATE.format(Fore.RED, "AssumedDifferent", self.requirementData[reqP]["identifier"], self.requirementData[reqS]["identifier"],simRatio)) - - if reqP not in self.resolutionData: - self.resolutionData[reqP] = list() - self.resolutionData[reqP].append(["AssumedDifferent",reqS]) - else: - print(REPORT_TEMPLATE.format(Fore.YELLOW, "PossibleSameAs", self.requirementData[reqP]["identifier"], self.requirementData[reqS]["identifier"],simRatio)) - - if reqP not in self.resolutionData: - self.resolutionData[reqP] = list() - self.resolutionData[reqP].append(["PossibleSameAs",reqS]) - ###################################################################### - # Function returns True if two requirements are part of the same data ingestion - ###################################################################### - def checkDataInsertedBy(self, reqP, reqS): - dataInsertedByP = self.requirementData[reqP]["dataInsertedBy_identifier"] - dataInsertedByS = self.requirementData[reqS]["dataInsertedBy_identifier"] - for i in dataInsertedByP: - if i in dataInsertedByS: - return True - return False - - ###################################################################### - # Function returns True if two requirements have disimilar identifiers - ###################################################################### - def checkIdentifier(self, reqP, reqS): - identifierP = self.cleanString(self.requirementData[reqP]["identifier"]).rstrip() - identifierS = self.cleanString(self.requirementData[reqS]["identifier"]).rstrip() - - #Requirements have the same identifier following cleaning so they are a match - if identifierP == identifierS: - return False - matcher = SequenceMatcher(None, identifierP, identifierS) - - #Sequence matcher ratio is less than 0.8 so it can be assumed that they disimilar - ratio = matcher.real_quick_ratio() - if ratio< 0.8: - return True - - added = list() - removed = list() - matched = list() - stopped = list() - for p in identifierP.split(" "): - if p not in identifierS.split(" "): - if p not in 
self.identifierStopWords: - removed.append(p) - else: - stopped.append(p) - else: - if p not in self.identifierStopWords: - matched.append(p) - else: - stopped.append(p) - for s in identifierS.split(" "): - if s not in identifierP.split(" "): - if p not in self.identifierStopWords: - added.append(s) - else: - stopped.append(s) - # Matched minus stop words - if len(added) == 0 and len(removed) == 0: - return False - return True - - ###################################################################### - # Function returns True if two requirements are not of compatible types - ###################################################################### - def checkForValidTypes(self, reqP, reqS): - typeP = self.requirementData[reqP]["REQUIREMENT_type"] - typeS = self.requirementData[reqS]["REQUIREMENT_type"] - if typeP == typeS and reqP>reqS: - return False - elif typeP != typeS: - if typeS == "http://arcos.rack/REQUIREMENTS#REQUIREMENT": - return False - else: - return True - - ###################################################################### - # Function returns True if two requirements already have resolution data - ###################################################################### - def checkForResolutionData(self, reqP, reqS): - return False - - def findPossibleSameAs(self): - i = 0 - for reqP in self.requirementData: - i+=1 - print(Style.RESET_ALL) - print("===========================") - print("{} {}/{} :: Found {}".format(self.requirementData[reqP]["identifier"], i, len(self.requirementData), len(self.resolutionData))) - for reqS in self.requirementData: - # Check - if reqP == reqS: # Same entitiy so move to the next - Debug(Fore.YELLOW, "Requirements are the Same") - elif self.checkForResolutionData(reqP, reqS): - Debug(Fore.YELLOW, "Requirements already have Resolution Data") - elif self.checkForValidTypes(reqP, reqS): - Debug(Fore.YELLOW, "Requirements have in compatible types.") - elif self.checkDataInsertedBy(reqP, reqS): - Debug(Fore.YELLOW, "Requirements are from the same data ingestion") - elif self.checkIdentifier(reqP, reqS): - Debug(Fore.YELLOW, "Requirements have disimiliar identifiers") - else: - self.FinalCategorization(reqP, reqS) - - def loadResolutionData(self): - with open(os.path.join(".", "resolutionData.json"), "r") as jsonFile: - self.resolutionData = json.load(jsonFile) - - - def writeResolutionData(self): - # Loading raw requirement data - with open(os.path.join(".", "resolutionData.json"), "w") as jsonFile: - json_object = json.dumps(self.resolutionData, indent = 4) - jsonFile.write(json_object) - with open(os.path.join(".", "resolutionData.csv"), "w") as csvFile: - writer = csv.DictWriter(csvFile,fieldnames=["EntityResolution_type","primaryEntity","secondaryEntity","primaryIdentifier","secondaryIdentifier","primaryDescription","secondaryDescription"]) - writer.writeheader() - for req in self.resolutionData: - #self.resolutionData[reqP].append(["AssumedAssumedSameAs",reqS]) - for res in self.resolutionData[req]: - reqP = self.requirementData[req] - reqS = self.requirementData[res[1]] - rowDict = {"EntityResolution_type":res[0], - "primaryEntity":req, - "secondaryEntity":res[1], - "primaryIdentifier":reqP["identifier"], - "secondaryIdentifier":reqS["identifier"], - "primaryDescription":reqP["description"], - "secondaryDescription":reqS["description"]} - writer.writerow(rowDict) - - def resolveGraph(self): - print(len(self.requirementData)) - conn_str = '''{"name":"RACK local fuseki Apache Phase 
2","domain":"","enableOwlImports":false,"model":[{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/model"}],"data":[{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/data"},{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/mitre-cwe"},{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/nist-800-53"}]} -{"name":"RACK","domain":"","enableOwlImports":false,"model":[{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/model"}],"data":[{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/data"},{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/turnstiledata"}]} -{"name":"RACK1","domain":"","enableOwlImports":false,"model":[{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/model"}],"data":[{"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/data"}]}''' - semtk3.set_connection_override(conn_str) - for req in self.resolutionData: - for res in self.resolutionData[req]: - if res[0] == "AssumedSameAs": - primaryEntity = req - secondaryEntity = res[1] - print(Fore.GREEN, "semtk3.combine_entities(",self.requirementData[secondaryEntity]["REQUIREMENT_type"],",",secondaryEntity,",",primaryEntity,")",Style.RESET_ALL) - semtk3.combine_entities(self.requirementData[secondaryEntity]["REQUIREMENT_type"], primaryEntity, secondaryEntity,None, None) - -if __name__ =="__main__": - T = RequirementResolution() - T.findPossibleSameAs() - - #T.loadResolutionData() - - T.writeResolutionData() - #T.resolveGraph() - diff --git a/EntityResolution/Gui.py b/EntityResolution/Gui.py new file mode 100755 index 00000000..4f9ae5c4 --- /dev/null +++ b/EntityResolution/Gui.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +from MainWindow import * + + +if __name__ =="__main__": + mw = MainWindow() + mw.mainloop() diff --git a/EntityResolution/MainWindow.py b/EntityResolution/MainWindow.py new file mode 100755 index 00000000..16973c76 --- /dev/null +++ b/EntityResolution/MainWindow.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +import DataAccess as da +import os +import json +from difflib import SequenceMatcher +import csv +import tkinter as tk +from tkinter import ttk +from tkinter.messagebox import askyesno +import semtk3 +import os.path +import RACK_CONSTANTS as rc +import CreateIngestion as ci +from SelectClassWindow import SelectClass +from Entity import * + +CONFIRMED_DIFFERENT =0 +ASSUMED_DIFFERENT = 1 +ASSUMED_SAME_AS=2 +CONFIRMED_SAME_AS = 3 +CONFIRMED_COMBINED = 4 +ASSUMED_COMBINED =5 +class MainWindow(tk.Tk): + def __init__(self): + super().__init__() + #========================================================================== + # Work around related to the Treeview Coloring issue with some version of python for windows + # https://bugs.python.org/issue36468 + #========================================================================== + def fixed_map(option): + # Fix for setting text colour for Tkinter 8.6.9 + # From: https://core.tcl.tk/tk/info/509cafafae + # + # Returns the style map for 'option' with any styles starting with + # ('!disabled', '!selected', ...) filtered out. + + # style.map() returns an empty list for missing options, so this + # should be future-safe. 
+ return [elm for elm in style.map('Treeview', query_opt=option) if + elm[:2] != ('!disabled', '!selected')] + + style = ttk.Style() + style.map('Treeview', foreground=fixed_map('foreground'), + background=fixed_map('background')) + #========================================================================== + + + self.title('RACK Entity Resolution Tool') + self.primary = '' + + ## Menu + self.menubar = tk.Menu(self) + self.fileMenu = tk.Menu(self.menubar) + self.fileMenu.add_command(label="Load Data...", command=self.loadData) + self.fileMenu.add_command(label="Save Data", command=self.saveData) + self.fileMenu.add_separator() + self.fileMenu.add_command(label="Exit",command=self.close) + self.menubar.add_cascade(label="File", menu=self.fileMenu) + + self.rackMenu = tk.Menu(self) + self.rackMenu.add_command(label="Create Ingestion Data", command=self.push) + self.rackMenu.add_command(label="Start Resolution...", command=self.pull) + self.menubar.add_cascade(label="RACK", menu=self.rackMenu) + + + self.config(menu=self.menubar) + + ## Primary Frame + self.primaryFrame = tk.Frame() + self.primaryFrame.grid(column=0, row=0, rowspan=2,sticky='ew', padx=10,pady=10) + + + ## Secondary Frame + self.secondaryFrame = tk.Frame() + self.secondaryFrame.grid(column=2, row=0,rowspan=2,sticky='ew', padx=10,pady=10) + + ## Primary Treeview + self.primaryTree = ttk.Treeview(self.primaryFrame, selectmode="browse", height=10) + self.primaryTree["columns"]=["Identifier","Score"] + self.primaryTree["show"]="headings" + self.primaryTree.heading("Identifier", text="Primary") + self.primaryTree.heading("Score", text="Best Score") + self.primaryTree.column("Score", width=100, stretch=tk.NO) + self.primaryTree.bind('', self.selectPrimary) + self.primaryTree.tag_configure('confirmedCombined', background="red") + self.primaryTree.tag_configure('assumedCombined', background="#E38699") + self.primaryTree.tag_configure('confirmedRemaining', background="green") + self.primaryTree.tag_configure('assumedRemaining', background="#89E0A8") + self.primaryTree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + + ## Primary Scrollbar + self.primaryScrollbar = tk.Scrollbar(self.primaryFrame, orient=tk.VERTICAL) + self.primaryScrollbar.pack(side=tk.RIGHT, fill=tk.Y) + + self.primaryTree.config(yscrollcommand=self.primaryScrollbar.set) + self.primaryScrollbar.config(command=self.primaryTree.yview) + + ## Secondary Treeview + self.secondaryTree = ttk.Treeview(self.secondaryFrame, selectmode="browse", height=10) + self.secondaryTree["columns"]=["Identifier","Score"] + self.secondaryTree["show"]="headings" + self.secondaryTree.heading("Identifier", text="Secondary") + self.secondaryTree.heading("Score", text="Score") + self.secondaryTree.column("Score", width=100, stretch=tk.NO) + self.secondaryTree.bind('', self.selectSecondary) + self.secondaryTree.tag_configure('confirmedSameAs', background="green") + self.secondaryTree.tag_configure('confirmedDifferent', background="red") + self.secondaryTree.tag_configure('assumedSameAs', background="#89E0A8") + self.secondaryTree.tag_configure('assumedDifferent', background="#E38699") + self.secondaryTree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + + ## Compare Text + self.compareText = tk.Text(self, height=5) + self.compareText.grid(row=2,column=0, columnspan=3,sticky='ew', padx=10,pady=10) + + ## Secondary Scrollbar + self.secondaryScrollbar = tk.Scrollbar(self.secondaryFrame, orient=tk.VERTICAL) + self.secondaryScrollbar.pack(side=tk.RIGHT, fill=tk.Y) + + 
self.secondaryTree.config(yscrollcommand=self.secondaryScrollbar.set) + self.secondaryScrollbar.config(command=self.secondaryTree.yview) + + ## Primary Entity + self.primaryEntity = Entity(self.updateCompare) + self.primaryEntity.grid(row=4,column=0,sticky='ew', padx=10,pady=10) + + ## Secondary Entity + self.secondaryEntity = Entity(self.updateCompare) + self.secondaryEntity.grid(row=4,column=2,sticky='ew', padx=10,pady=10) + + ## Different Button + self.differentButton = ttk.Button(self, text="Confirmed Different", command=self.confirmDifferent) + self.differentButton.grid(row=5, column=0,sticky='ew', padx=10,pady=10) + + ## Same as Button + self.sameAsButton = ttk.Button(self, text="Confirmed Same As", command=self.confirmSameAs) + self.sameAsButton.grid(row=5, column=2,sticky='ew', padx=10,pady=10) + + ## Assumed SameAs Threshold + self.sameAsLabel = ttk.Label(self, text="Assumed\nSame As\nThreshold", justify=tk.CENTER) + self.sameAsLabel.grid(column=4, row=0, padx=10,pady=10) + self.sameAsScale= tk.Scale(self, resolution=-1) + self.sameAsScale.grid(column=4, row=1, rowspan=4, sticky='ns') + + ## Different Threshold + self.differentLabel = ttk.Label(self, text="Assumed\nDifferent\nThreshold", justify=tk.CENTER) + self.differentLabel.grid(column=3, row=0, padx=10,pady=10) + self.differentScale = tk.Scale(self, resolution=-1) + self.differentScale.grid(column=3, row=1, rowspan=4, sticky='ns') + + ## Update Button + self.updateButton = ttk.Button(self, text="Update", command=self.assumptions) + self.updateButton.grid(row=5, column=3, columnspan=2,sticky='ew', padx=10,pady=10) + + self.grid_columnconfigure(0,weight=1) + self.grid_columnconfigure(1,weight=0) + self.grid_columnconfigure(2,weight=1) + semtk3.set_connection_override(rc.connStringSource) + def push(self): + ci.createIngestion(self.decisions) + + def pull(self): + all_ok = semtk3.check_services(); + if not all_ok: + print("Semtk services are not properly running on localhost") + return + # Get User selected PROV-S subclass to perform entity resolution on + classes = SelectClass() + print("Selected:", classes) + classStr = "" + for c in classes: + classStr += "<"+c+"> " + + tab = semtk3.query_raw_sparql(rc.instanceQuery.replace("<{{Types}}>", classStr)) + instances = {} + for r in tab.get_rows(): + if r[1] not in instances: + instances[r[1]] = [] + instances[r[1]].append(r[0]) + relations = {} + for i in instances: + relations[i] = [] + tab = semtk3.query_raw_sparql(rc.subClassQuery.replace("{{Type}}", i)) + for c in tab.get_column("super"): + if c not in relations[i]: + relations[i].append(c) + secondaryDict = {} + for r in relations: + secondaryDict [r] = list(instances[r]) + for i in relations[r]: + if i in instances: + secondaryDict[r] += list(instances[i]) + else: + print('No instances of {} found.'.format(i)) + for k in instances: + print("count:", k, len(instances[k])) + + for k in secondaryDict: + print('possible matches:', k, len(secondaryDict[k])) + primaryDict = {} + for k in instances: + for i in instances[k]: + primaryDict[i] = secondaryDict[k] + + import ResolveThings + ResolveThings.run(primaryDict) + self.loadData() + '''=================================================== + Callback for selecting close menu button + ===================================================''' + def close(self): + print("Close") + del(self) + '''=================================================== + Callback for selecting save data menu button + ===================================================''' + def saveData(self): + with 
open("Decisions.json","w") as decisionFile: + json.dump(self.decisions, decisionFile, indent=4) + '''=================================================== + Callback for selecting load data menu button + ===================================================''' + def loadData(self): + self.summary ={} + self.decisions = {} + self.maxScore = 0.0 + + with open("Resolutions/Summary.csv","r") as summaryFile: + reader = csv.DictReader(summaryFile) + for row in reader: + self.summary[row["Primary"]] = float(row["Score"]) + if os.path.exists("Decisions.json"): + if askyesno("Load Existing Decisions", "Existing decisions file was found. Do you want to load previous decisions?"): + with open("Decisions.json","r") as decisionFile: + self.decisions = json.load(decisionFile) + ## Clear Primary Tree + for item in self.primaryTree.get_children(): + self.primaryTree.delete(item) + for p in sorted(self.summary.items(), key=lambda x:x[1],reverse=True): + bestMatch = p[1] + if bestMatch > self.maxScore: + self.maxScore = bestMatch + identifier = da.getIdentifier(p[0]) + self.primaryTree.insert("", 'end', text=p[0], values =(identifier, "{:.3f}".format(bestMatch))) + + self.sameAsScale.configure(to=self.maxScore) + self.sameAsScale.set(self.maxScore) + + self.differentScale.set(0) + self.differentScale.configure(to=self.maxScore) + self.updatePrimary() + self.updateSecondary() + self.updateCompare() + '''=================================================== + Callback for selecting a primary entity + ===================================================''' + def selectPrimary(self, a): + currItem = self.primaryTree.focus() + self.primary = self.primaryTree.item(currItem)['text'] + self.updateSecondary() + self.updateCompare() + '''=================================================== + Callback for selecting a secondary entity + ===================================================''' + def selectSecondary(self,a): + currItem = self.secondaryTree.focus() + secondary = self.secondaryTree.item(currItem)['text'] + if secondary != '': + self.secondaryEntity.update(secondary) + else: + self.secondaryEntity.update(None) + self.updateCompare() + + '''=================================================== + Callback for updating the compare Text box + ===================================================''' + def updateCompare(self): + + self.compareText.delete("1.0","end") + primary = self.primaryEntity.propertyString + secondary = self.secondaryEntity.propertyString + s = SequenceMatcher(None, primary,secondary) + for code in s.get_opcodes(): + if code[0] == "equal": + self.compareText.insert("end", primary[code[1]:code[2]],('equal')) + elif code[0] == "delete": + self.compareText.insert("end", primary[code[1]:code[2]],('delete')) + elif code[0] == "insert": + self.compareText.insert("end", secondary[code[3]:code[4]],('insert')) + elif code[0] == "replace": + self.compareText.insert("end", primary[code[1]:code[2]],('delete')) + self.compareText.insert("end", secondary[code[3]:code[4]],('insert')) + self.compareText.tag_config("equal", background="white", foreground="black") + self.compareText.tag_config("delete", background="white", foreground="red") + self.compareText.tag_config("insert", background="white", foreground="green") + + def updatePrimary(self): + for item in self.primaryTree.get_children(): + p = self.primaryTree.item(item)['text'] + if p in self.decisions and self.decisions[p] != None: + if self.decisions[p] == ASSUMED_COMBINED: + tags = ("assumedCombined",) + elif self.decisions[p] == CONFIRMED_COMBINED: + tags = 
("confirmedCombined",) + else: + for s in self.decisions[p]: + if self.decisions[p][s] == CONFIRMED_SAME_AS: + tags = ("confirmedRemaining",) + break + tags = ("assumedRemaining",) + self.primaryTree.item(item, tags = tags) + + def updateSecondary(self): + ## Clear secondary Tree + for item in self.secondaryTree.get_children(): + self.secondaryTree.delete(item) + if self.primary != '': + self.resolution = {} + with open("Resolutions/"+self.primary.split("#")[-1]+".json","r") as resFile: + self.resolution = json.load(resFile) + + for p in sorted(self.resolution.items(), key=lambda x:x[1],reverse=True): + identifier = da.getIdentifier(p[0]) + tags = () + if self.primary in self.decisions: + if type(self.decisions[self.primary]) == int: + # this object is either ASSUMED_COMBINED or CONFIRMED_COMBINED so no need to populate the seconary object list + break + elif p[0] in self.decisions[self.primary]: #Get the tag from the decisions + tags = (self.decisions[self.primary][p[0]],) + self.secondaryTree.insert("",'end', text=p[0], values=(identifier, "{:.3f}".format(p[1])), tags = tags) + + self.primaryEntity.update(self.primary) + self.secondaryEntity.update(None) + else: + self.primaryEntity.update(None) + self.secondaryEntity.update(None) + + '''=================================================== + Callback for selecting a the confirmed same as button + ===================================================''' + def confirmSameAs(self): + currItem = self.primaryTree.focus() + primary = self.primaryTree.item(currItem)['text'] + + currItem = self.secondaryTree.focus() + secondary = self.secondaryTree.item(currItem)['text'] + print(primary, secondary) + + if primary not in self.decisions: + self.decisions[primary] = {} + self.decisions[primary][secondary] = CONFIRMED_SAME_AS + self.decisions[secondary] = CONFIRMED_COMBINED + self.updatePrimary() + self.updateSecondary() + self.updateCompare() + + '''=================================================== + Callback for selecting a the confirmed different button + ===================================================''' + def confirmDifferent(self): + currItem = self.primaryTree.focus() + primary = self.primaryTree.item(currItem)['text'] + + currItem = self.secondaryTree.focus() + secondary = self.secondaryTree.item(currItem)['text'] + + if primary not in self.decisions: + self.decisions[primary] = {} + self.decisions[primary][secondary] = CONFIRMED_DIFFERENT + self.updatePrimary() + self.updateSecondary() + self.updateCompare() + '''=================================================== + Callback for selecting a the Update Assumptions button + ===================================================''' + def assumptions(self): + i = 0 + for p in self.summary: + print(i, "/", len(self.summary)) + i+=1 + print(p) + temp = {} + with open("Resolutions/"+p.split("#")[-1]+".json","r") as resFile: + temp = json.load(resFile) + + ## Check if there is some confirmed + if p not in self.decisions: + self.decisions[p] = {} + for s in temp: + if type(self.decisions[p]) is not dict: # Not a dictionary this was previously combined + if self.decisions[p] == ASSUMED_COMBINED: # Reset if this was done by Assumption + self.decisions[p] = {} + if self.decisions[p] ==CONFIRMED_COMBINED: # () + continue + else: + print("Unexpected Decisions Setting: Key : {} : Value:{}".format(p, self.decisions[p])) + continue + if s in self.decisions[p]: + if self.decisions[p][s] == CONFIRMED_DIFFERENT or self.decisions[p][s] == CONFIRMED_SAME_AS \ + or self.decisions[p][s] == CONFIRMED_COMBINED : + 
continue + else: + del self.decisions[p][s] + if temp[s] < self.differentScale.get(): + self.decisions[p][s] = ASSUMED_DIFFERENT + elif temp[s] >= self.sameAsScale.get(): + self.decisions[p][s] = ASSUMED_SAME_AS + for p in self.decisions: + if type(self.decisions[p]) is dict: + for s in self.decisions[p]: + if self.decisions[p][s] == ASSUMED_SAME_AS: + self.decisions[s]= ASSUMED_COMBINED + elif self.decisions[p][s] == CONFIRMED_SAME_AS: + self.decisions[s]= CONFIRMED_COMBINED + self.updatePrimary() + self.updateSecondary() + + +if __name__ =="__main__": + mw = MainWindow() + mw.mainloop() diff --git a/EntityResolution/RACK_CONSTANTS.py b/EntityResolution/RACK_CONSTANTS.py new file mode 100644 index 00000000..4d7c695c --- /dev/null +++ b/EntityResolution/RACK_CONSTANTS.py @@ -0,0 +1,117 @@ + +connStringSource = """ +{ "name":"RACK local fuseki Apache Phase 2 Resolved", + "domain":"", + "enableOwlImports":false, + "model":[ + {"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/model"} + ], + "data":[ + {"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/Data"} + ] +}""" +connStringResolved = """ +{ "name":"RACK local fuseki Apache Phase 2 Resolved", + "domain":"", + "enableOwlImports":false, + "model":[ + {"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/model"} + ], + "data":[ + {"type":"fuseki","url":"http://localhost:3030/RACK","graph":"http://rack001/ResolvedData"} + ] +}""" + +entityTypeQuery = '''prefix rdf: +prefix PROV_S: +prefix rdfs: +select distinct ?directSub + FROM + where { ?directSub rdfs:subClassOf ?super. + values ?super{PROV_S:ENTITY} .} +''' + +activityTypeQuery = '''prefix rdf: +prefix PROV_S: +prefix rdfs: +select distinct ?directSub + FROM + where { ?directSub rdfs:subClassOf ?super. + values ?super{PROV_S:ACTIVITY} .} +''' + +agentTypeQuery = '''prefix rdf: +prefix PROV_S: +prefix rdfs: +select distinct ?directSub + FROM + where { ?directSub rdfs:subClassOf ?super. + values ?super{PROV_S:AGENT} .} +''' + +classQuery = '''prefix rdf: +prefix PROV_S: +prefix rdfs: +select distinct ?directSub + FROM + where { ?directSub rdfs:subClassOf ?super. + values ?super{<{{Type}}>} .} +''' + +subClassQuery = '''prefix rdf: +prefix PROV_S: +prefix rdfs: +select distinct ?super + FROM + where { ?directSub rdfs:subClassOf ?super. + values ?directSub{<{{Type}}>} .} +''' + +instanceQuery = '''prefix rdf: +prefix PROV_S: +prefix rdfs: +select distinct ?instance ?super + FROM + where { ?instance a ?super. + values ?super{<{{Types}}>} .} +''' + +dataQuery = """prefix rdf: +prefix semtk: +prefix XMLSchema: +prefix PROV_S: +prefix rdfs: +CONSTRUCT { + ?THING a ?THING_type . + ?THING ?dp ?o . + + ?THING ?p ?OBJ . + ?OBJ a ?OBJ_type . + ?OBJ PROV_S:identifier ?OBJ_identifier . + + ?OBJ2 ?ap ?THING . + ?OBJ2 a ?OBJ2_type . + ?OBJ2 PROV_S:identifier ?OBJ2_identifier . +} + FROM + FROM +where { + ?THING a ?THING_type . + ?THING_type rdfs:subClassOf* PROV_S:THING . + FILTER ( ?THING IN ( ) ) . + optional{ + ?THING ?p ?OBJ . + ?OBJ a ?OBJ_type . + ?OBJ PROV_S:identifier ?OBJ_identifier . + ?OBJ_type rdfs:subClassOf* PROV_S:THING . + } + optional{ + ?THING ?dp ?o + } + optional{ + ?OBJ2 ?ap ?THING . + ?OBJ2 a ?OBJ2_type . + ?OBJ2 PROV_S:identifier ?OBJ2_identifier . + ?OBJ2_type rdfs:subClassOf* PROV_S:THING . 
+    }
+}"""
diff --git a/EntityResolution/ResolutionEngine.py b/EntityResolution/ResolutionEngine.py
new file mode 100755
index 00000000..6832b7b1
--- /dev/null
+++ b/EntityResolution/ResolutionEngine.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+import os
+import json
+from colorama import Fore, Style
+import multiprocessing
+import os.path
+DEBUG = False
+def Debug(*args):
+    if DEBUG:
+        print(*args)
+
+
+######################################
+# 0.0 == confirmedDifferent
+# 0.0 < assumedDifferent <= 0.5
+# 0.5 < possibleSameAs <= 0.9
+# 0.8 < assumedSameAs < 1.0
+# 1.0 == confirmedSameAs
+######################################
+
+reportString = """{} / {} - {}
+    Best Match:{}
+    Score:{}{}{}
+------------------------------------------------------------------------"""
+
+class ResolutionEngine:
+    entityList = None
+    ruleList = None
+    resolutions = None
+    processed = 0
+    sourceConnection = None
+    resolvedConnection = None
+    logString = ""
+
+    def __init__(self):
+        self.entityList = list()
+        self.ruleList = list()
+
+    def __runRules__(self, eP, eS):
+        Score = 1
+        for ruleType, rule in self.ruleList:
+            applicable, level = rule(eP, eS)
+            if applicable:
+                if ruleType == "Absolute":
+                    return level
+                else:
+                    Score += level
+        return Score
+
+    def addEntities(self, entityUriList):
+        self.entityList = entityUriList
+
+    def addAbsoluteRule(self, ruleFunction):
+        self.ruleList.append(["Absolute", ruleFunction])
+
+    def addRelativeRule(self, ruleFunction):
+        self.ruleList.append(["Relative", ruleFunction])
+
+    def work(self, eP):
+        print("Running Analysis on {}".format(eP))
+        maxScore = 0
+        bestMatch = None
+        resolutions = {}
+        Score = 0.0
+        for eS in self.entityList[eP]:
+            if eS != eP:
+                Score = self.__runRules__(eP, eS)
+                resolutions[eS] = Score
+                if Score > maxScore:
+                    maxScore = Score
+                    bestMatch = eS
+        color = Fore.WHITE
+        if Score > 4:
+            color = Fore.GREEN
+        elif Score > 2:
+            color = Fore.YELLOW
+
+        with open("Resolutions/"+eP.split("#")[-1]+".json", "w") as out:
+            json.dump(resolutions, out, indent=4)
+
+        with open("Resolutions/Summary.csv", "a") as out:
+            out.write("{},{},{}\n".format(eP, bestMatch, maxScore))
+
+        print(reportString.format(len(os.listdir("Resolutions")), len(self.entityList), eP, bestMatch, color, str(maxScore), Style.RESET_ALL))
+
+    def runAllAnalysis(self):
+
+        ######################################################################
+        print("Initializing Resolution Dictionary..")
+        for f in os.listdir("Resolutions"):
+            os.remove(os.path.join("Resolutions", f))
+        with open("Resolutions/Summary.csv", "w") as out:
+            out.write("Primary,Best Match,Score\n")
+        print("    Initialization Done.")
+        ######################################################################
+
+        ######################################################################
+        print("Running Analysis..")
+        #for k in self.entityList.keys():
+        #    self.work(k)
+        print("    analyzing {} things for commonality.".format(len(self.entityList)))
+        with multiprocessing.Pool() as pool:
+            pool.map(self.work, self.entityList.keys())
+        print("    Analysis Complete.")
+        ######################################################################
+
+
diff --git a/EntityResolution/ResolveThings.py b/EntityResolution/ResolveThings.py
new file mode 100755
index 00000000..3096f3f4
--- /dev/null
+++ b/EntityResolution/ResolveThings.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+import DataAccess as da
+import ResolutionEngine as re
+from difflib import SequenceMatcher
+data = {}
+entities = {}
+DEBUG = False
+def Debug(*args):
+    if DEBUG:
+        print(*args)
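+# Scoring note (a sketch of the behaviour in ResolutionEngine.__runRules__, not a spec):
+# each candidate pair starts at a score of 1; an applicable "Absolute" rule returns its
+# level immediately (0.0 for same-ingestion pairs, 5.0 for identical identifiers below),
+# while applicable "Relative" rules add their level (SequenceMatcher ratios scaled by 2).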
+##################################### +# Queries +##################################### + +##################################### +# helper Functions +##################################### +def cleanString(string): + cleanString = "" + for c in string: + if c.isalnum(): + cleanString+=c + else: + cleanString += " " + while cleanString.find(" ")!=-1: + cleanString = cleanString.replace(" "," ") + + return cleanString.upper().rstrip(" ").lstrip(" ") +##################################### +# Rules Definitions +##################################### + +def fuzzyDescriptionCompare(e1,e2): + Debug("descriptionCompare") + if da.getDescription(e1) != None and da.getDescription(e2) != None: + t1 = cleanString(da.getDescription(e1)) + t2 = cleanString(da.getDescription(e2)) + matcher = SequenceMatcher(None, t1, t2) + return True, matcher.ratio()*2 + else: + return False, 1.0 +def fuzzyIdentifierCompare(e1,e2): + Debug("fuzzyIdentifierCompare") + global data + t1 = cleanString(da.getIdentifier(e1)) + t2 = cleanString(da.getIdentifier(e2)) + matcher = SequenceMatcher(None, t1, t2) + return True, matcher.ratio() * 2 + +def identifierCompare(e1,e2): + Debug("identifierCompare") + global data + t1 = cleanString(da.getIdentifier(e1)) + t2 = cleanString(da.getIdentifier(e2)) + + if t1 == t2: + return True, 5.0 + else: + return False, 5.0 + +def getDataInsertedBy(e): + guid = e.split("#")[-1] + data = da.getData(e) + ## Create a Hash based on the GUID and find the base Element + elements={} + baseElement = None + if "@graph" not in data: # No Graph tag means there is a single element at the root. + baseElement = data + return list() + + for el in data["@graph"]: + elements[el["@id"].split(":")[-1]] = el + if el["@id"].split(":")[-1] == guid: + baseElement = el + dataInsertedByIdentifiers = [] + ## Get dataInsertedBy Elements and collect the identifiers + if type(baseElement['PROV_S:dataInsertedBy']) is dict: + dataInsertedByIdentifiers.append(elements[baseElement['PROV_S:dataInsertedBy']['@id'].split(":")[-1]]['PROV_S:identifier']) + elif type(baseElement['PROV_S:dataInsertedBy']) is list: + for i in baseElement['PROV_S:dataInsertedBy']: + dataInsertedByIdentifiers.append(elements[i['@id'].split(":")[-1]]['PROV_S:identifier']) + else: + print("***** ERRROR 1 *****") + return dataInsertedByIdentifiers + + +def dataInsertedByCheck(e1,e2): + Debug("dataInsertedByCheck") + global data + t1 = getDataInsertedBy(e1) + t2 = getDataInsertedBy(e2) + + # If the requirements were inserted by the same ingestion then assum the entities are different + for i1 in t1: + for i2 in t2: + if i1 == i2: + return True, 0.0 + + return False, 1.0 + +def run(entities): + resEngine = re.ResolutionEngine() + resEngine.addEntities(entities) + resEngine.addAbsoluteRule(dataInsertedByCheck) + resEngine.addAbsoluteRule(identifierCompare) + resEngine.addRelativeRule(fuzzyIdentifierCompare) + resEngine.addRelativeRule(fuzzyDescriptionCompare) + + resEngine.runAllAnalysis() + +if __name__ == "__main__": + run(True) diff --git a/EntityResolution/SelectClassWindow.py b/EntityResolution/SelectClassWindow.py new file mode 100644 index 00000000..6de5dc75 --- /dev/null +++ b/EntityResolution/SelectClassWindow.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +#import os + +from CheckBar import Checkbar +import tkinter as tk +from tkinter import ttk +#from tkinter.messagebox import askyesno +import semtk3 +#import os.path +import RACK_CONSTANTS as rc +#from Entity import * +results = [] +class ClassWindow(tk.Tk): + def __init__(self): + 
super().__init__() + self.title('Select Classes') + self.superClassSelections = Checkbar(self, picks=['Activity', 'Agent', 'Entity'], command=self.update) + self.superClassSelections.grid(row=0, column=1) + + + self.classesTree = ttk.Treeview(self, selectmode=None, height=40) + self.classesTree["columns"]=["Class","Parent"] + self.classesTree["show"]="headings" + self.sortOrder = {} + for h in self.classesTree["columns"]: + self.classesTree.heading(h, text=h, command=lambda h=h: self.treeview_sort_column(h)) + self.sortOrder[h] = True + self.classesTree.grid(row=1, column=0, columnspan=3, sticky='nsew') + + self.updateButton = ttk.Button(self, text="Done", command=self.done) + self.updateButton.grid(row=2, column=1) + + self.grid_columnconfigure(0,weight=1) + self.grid_columnconfigure(1,weight=0) + self.grid_columnconfigure(2,weight=1) + + self.grid_rowconfigure(0,weight=0) + self.grid_rowconfigure(1,weight=1) + self.grid_rowconfigure(2,weight=0) + self.queryRack() + self.update() + + def treeview_sort_column(self, col): + tv = self.classesTree + l = [(tv.set(k, col), k) for k in tv.get_children('')] + l.sort(reverse=self.sortOrder[col] ) + + # rearrange items in sorted positions + for index, (val, k) in enumerate(l): + tv.move(k, '', index) + + # reverse sort next time + self.sortOrder[col] = not self.sortOrder[col] + + + def done(self): + global results + results = [] + selections = [] + for s in self.classesTree.selection(): + selections.append(self.classesTree.item(s)['text']) + results = getSubclass(selections) + self.destroy() + def queryRack(self): + all_ok = semtk3.check_services(); + if not all_ok: + print("Semtk services are not properly running on localhost") + return + self.classes = {'Entity':[], 'Activity':[], 'Agent':[], } + + tab = semtk3.query_raw_sparql(rc.entityTypeQuery) + for c in tab.get_column("directSub"): + if c not in self.classes['Entity']: + self.classes['Entity'].append(c) + + tab = semtk3.query_raw_sparql(rc.agentTypeQuery) + for c in tab.get_column("directSub"): + if c not in self.classes['Agent']: + self.classes['Agent'].append(c) + + tab = semtk3.query_raw_sparql(rc.activityTypeQuery) + for c in tab.get_column("directSub"): + if c not in self.classes['Activity']: + self.classes['Activity'].append(c) + + def update(self): + for item in self.classesTree.get_children(): + self.classesTree.delete(item) + + s = self.superClassSelections.state() + print(s) + if s['Entity']: + for c in self.classes['Entity']: + self.classesTree.insert("",'end', text=c, values=(c, 'Entity' )) + if s['Activity']: + for c in self.classes['Activity']: + self.classesTree.insert("",'end', text=c, values=(c, 'Activity' )) + if s['Agent']: + for c in self.classes['Agent']: + self.classesTree.insert("",'end', text=c, values=(c, 'Agent' )) + +def getSubclass(classes): + all_ok = semtk3.check_services(); + if not all_ok: + print("Semtk services are not properly running on localhost") + return + subclasses = [] + for s in classes: + subclasses.append(s) + tab = semtk3.query_raw_sparql(rc.classQuery.replace("{{Type}}", s)) + for c in tab.get_column("directSub"): + if c not in classes: + subclasses.append(c) + return subclasses + + +def SelectClass(): + c = ClassWindow() + c.wait_window() + return results +if __name__ == "__main__": + print(SelectClass()) + diff --git a/EntityResolution/TestData/Create-RACK-DATA.py b/EntityResolution/TestData/Create-RACK-DATA.py new file mode 100755 index 00000000..1fd5f7e9 --- /dev/null +++ b/EntityResolution/TestData/Create-RACK-DATA.py @@ -0,0 +1,51 @@ 
+#!/bin/python3 +import os.path +import shutil +from Evidence import createEvidenceFile, createCDR +import Evidence.Add as Add +if __name__ == "__main__": + ################################################################# + if os.path.exists(os.path.join(".","Package-1")): + shutil.rmtree(os.path.join(".","Package-1")) + createEvidenceFile(ingestionTitle="Package 1") + Add.TESTING.TEST(identifier="{L1-TST-1}", description="This is test L1-1.", verifies_identifier="{L1-REQ-1}") + Add.TESTING.TEST(identifier="{L1-TST-2}", description="This is test L1-2.", verifies_identifier="{L1-REQ-1}") + Add.REQUIREMENTS.REQUIREMENT(identifier="{L1-REQ-1}") + createCDR() + os.rename(os.path.join(".","RACK-DATA"), os.path.join(".","Package-1")) + + ################################################################# + if os.path.exists(os.path.join(".","Package-2")): + shutil.rmtree(os.path.join(".","Package-2")) + createEvidenceFile(ingestionTitle="Package 2") + Add.REQUIREMENTS.REQUIREMENT(identifier="L1-REQ-1", description="This is requirement L1-1.", satisfies_identifier="L0-REQ-1") + Add.REQUIREMENTS.REQUIREMENT(identifier="L1-REQ-2", description="This is requirement L1-2.", satisfies_identifier="L0-REQ-1") + Add.REQUIREMENTS.REQUIREMENT(identifier="L0-REQ-1") + createCDR() + os.rename(os.path.join(".","RACK-DATA"), os.path.join(".","Package-2")) + + ################################################################# + if os.path.exists(os.path.join(".","Package-3")): + shutil.rmtree(os.path.join(".","Package-3")) + createEvidenceFile(ingestionTitle="Package 3") + Add.REQUIREMENTS.REQUIREMENT(identifier="[L1-REQ-2]", satisfies_identifier="[L0-REQ-2]") + Add.REQUIREMENTS.REQUIREMENT(identifier="[L1-REQ-3]", satisfies_identifier="[L0-REQ-2]") + Add.REQUIREMENTS.REQUIREMENT(identifier="[L0-REQ-2]") + createCDR() + os.rename(os.path.join(".","RACK-DATA"), os.path.join(".","Package-3")) + + ################################################################# + if os.path.exists(os.path.join(".","Resolutions-1")): + shutil.rmtree(os.path.join(".","Resolutions-1")) + createEvidenceFile(ingestionTitle="Resolutions-1") + Add.RESOLUTIONS.SAME_AS(primary_identifier="L1-REQ-1", secondary_identifier="{L1-REQ-1}") + createCDR() + os.rename(os.path.join(".","RACK-DATA"), os.path.join(".","Resolutions-1")) + + ################################################################# + if os.path.exists(os.path.join(".","Resolutions-2")): + shutil.rmtree(os.path.join(".","Resolutions-2")) + createEvidenceFile(ingestionTitle="Resolutions-2") + Add.RESOLUTIONS.SAME_AS(primary_identifier="L1-REQ-2", secondary_identifier="[L1-REQ-2]") + createCDR() + os.rename(os.path.join(".","RACK-DATA"), os.path.join(".","Resolutions-2")) diff --git a/EntityResolution/TestData/Load-Package1.sh b/EntityResolution/TestData/Load-Package1.sh new file mode 100755 index 00000000..c1c7efa3 --- /dev/null +++ b/EntityResolution/TestData/Load-Package1.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2020, General Electric Company and Galois, Inc. +set -eu +BASEDIR=$(cd "$(dirname "$0")"; pwd) +echo "$BASEDIR" +if ! 
command -v rack > /dev/null +then + cat <<-END + ERROR: rack cli tool not found in PATH + + Installation instructions are available at + https://github.com/ge-high-assurance/RACK/wiki/RACK-CLI#install-dependencies + or locally in README.md + + If you've already installed RACK CLI, please activate your virtual environment + + macOS/Linux: source venv/bin/activate + Windows: venv\\Scripts\\activate.bat + PowerShell: venv\\Scripts\\Activate.ps1 + END + exit 1 +fi + +# suppress RACK cli warnings about missing columns +export LOG_LEVEL=ERROR + +echo "Ingesting Package 1 ..." +rack data import --clear "$BASEDIR"/Package-1/import.yaml + diff --git a/EntityResolution/TestData/Load-Package2.sh b/EntityResolution/TestData/Load-Package2.sh new file mode 100755 index 00000000..392c7941 --- /dev/null +++ b/EntityResolution/TestData/Load-Package2.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2020, General Electric Company and Galois, Inc. +set -eu +BASEDIR=$(cd "$(dirname "$0")"; pwd) +echo "$BASEDIR" +if ! command -v rack > /dev/null +then + cat <<-END + ERROR: rack cli tool not found in PATH + + Installation instructions are available at + https://github.com/ge-high-assurance/RACK/wiki/RACK-CLI#install-dependencies + or locally in README.md + + If you've already installed RACK CLI, please activate your virtual environment + + macOS/Linux: source venv/bin/activate + Windows: venv\\Scripts\\activate.bat + PowerShell: venv\\Scripts\\Activate.ps1 + END + exit 1 +fi + +# suppress RACK cli warnings about missing columns +export LOG_LEVEL=ERROR + +echo "Ingesting Package 2 ..." +rack data import "$BASEDIR"/Package-2/import.yaml + diff --git a/EntityResolution/TestData/Load-Package3.sh b/EntityResolution/TestData/Load-Package3.sh new file mode 100755 index 00000000..404ef928 --- /dev/null +++ b/EntityResolution/TestData/Load-Package3.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2020, General Electric Company and Galois, Inc. +set -eu +BASEDIR=$(cd "$(dirname "$0")"; pwd) +echo "$BASEDIR" +if ! command -v rack > /dev/null +then + cat <<-END + ERROR: rack cli tool not found in PATH + + Installation instructions are available at + https://github.com/ge-high-assurance/RACK/wiki/RACK-CLI#install-dependencies + or locally in README.md + + If you've already installed RACK CLI, please activate your virtual environment + + macOS/Linux: source venv/bin/activate + Windows: venv\\Scripts\\activate.bat + PowerShell: venv\\Scripts\\Activate.ps1 + END + exit 1 +fi + +# suppress RACK cli warnings about missing columns +export LOG_LEVEL=ERROR + +echo "Ingesting Package-3..." +rack data import "$BASEDIR"/Package-3/import.yaml + diff --git a/EntityResolution/TestData/Load-Resolutions-1.sh b/EntityResolution/TestData/Load-Resolutions-1.sh new file mode 100755 index 00000000..a2db4af2 --- /dev/null +++ b/EntityResolution/TestData/Load-Resolutions-1.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2020, General Electric Company and Galois, Inc. +set -eu +BASEDIR=$(cd "$(dirname "$0")"; pwd) +echo "$BASEDIR" +if ! 
command -v rack > /dev/null +then + cat <<-END + ERROR: rack cli tool not found in PATH + + Installation instructions are available at + https://github.com/ge-high-assurance/RACK/wiki/RACK-CLI#install-dependencies + or locally in README.md + + If you've already installed RACK CLI, please activate your virtual environment + + macOS/Linux: source venv/bin/activate + Windows: venv\\Scripts\\activate.bat + PowerShell: venv\\Scripts\\Activate.ps1 + END + exit 1 +fi + +# suppress RACK cli warnings about missing columns +export LOG_LEVEL=ERROR + +echo "Ingesting Resolution Data ..." +rack data import "$BASEDIR"/Resolutions-1/import.yaml + diff --git a/EntityResolution/TestData/Load-Resolutions-2.sh b/EntityResolution/TestData/Load-Resolutions-2.sh new file mode 100755 index 00000000..cf650d50 --- /dev/null +++ b/EntityResolution/TestData/Load-Resolutions-2.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2020, General Electric Company and Galois, Inc. +set -eu +BASEDIR=$(cd "$(dirname "$0")"; pwd) +echo "$BASEDIR" +if ! command -v rack > /dev/null +then + cat <<-END + ERROR: rack cli tool not found in PATH + + Installation instructions are available at + https://github.com/ge-high-assurance/RACK/wiki/RACK-CLI#install-dependencies + or locally in README.md + + If you've already installed RACK CLI, please activate your virtual environment + + macOS/Linux: source venv/bin/activate + Windows: venv\\Scripts\\activate.bat + PowerShell: venv\\Scripts\\Activate.ps1 + END + exit 1 +fi + +# suppress RACK cli warnings about missing columns +export LOG_LEVEL=ERROR + +echo "Ingesting Resolution Data ..." +rack data import "$BASEDIR"/Resolutions-2/import.yaml + diff --git a/EntityResolution/TestData/Package-1/PROV_S_ACTIVITY1.csv b/EntityResolution/TestData/Package-1/PROV_S_ACTIVITY1.csv new file mode 100644 index 00000000..69b3ecc5 --- /dev/null +++ b/EntityResolution/TestData/Package-1/PROV_S_ACTIVITY1.csv @@ -0,0 +1,2 @@ +identifier +Package 1 diff --git a/EntityResolution/TestData/Package-1/PROV_S_ACTIVITY2.csv b/EntityResolution/TestData/Package-1/PROV_S_ACTIVITY2.csv new file mode 100644 index 00000000..173fba18 --- /dev/null +++ b/EntityResolution/TestData/Package-1/PROV_S_ACTIVITY2.csv @@ -0,0 +1,3 @@ +identifier,dataInsertedBy_identifier,description,startedAtTime,title,endedAtTime +Package 1,Package 1,Data that was ingested using the ARCOS Scraping Tool Kit.,2022-12-08 14:39:56,Package 1, +Package 1,Package 1,,,,2022-12-08 14:39:56 diff --git a/EntityResolution/TestData/Package-1/REQUIREMENTS_REQUIREMENT1.csv b/EntityResolution/TestData/Package-1/REQUIREMENTS_REQUIREMENT1.csv new file mode 100644 index 00000000..cc5f7e47 --- /dev/null +++ b/EntityResolution/TestData/Package-1/REQUIREMENTS_REQUIREMENT1.csv @@ -0,0 +1,2 @@ +identifier +{L1-REQ-1} diff --git a/EntityResolution/TestData/Package-1/REQUIREMENTS_REQUIREMENT2.csv b/EntityResolution/TestData/Package-1/REQUIREMENTS_REQUIREMENT2.csv new file mode 100644 index 00000000..46ed5d48 --- /dev/null +++ b/EntityResolution/TestData/Package-1/REQUIREMENTS_REQUIREMENT2.csv @@ -0,0 +1,2 @@ +identifier,dataInsertedBy_identifier +{L1-REQ-1},Package 1 diff --git a/EntityResolution/TestData/Package-1/TESTING_TEST1.csv b/EntityResolution/TestData/Package-1/TESTING_TEST1.csv new file mode 100644 index 00000000..56cc9c6d --- /dev/null +++ b/EntityResolution/TestData/Package-1/TESTING_TEST1.csv @@ -0,0 +1,3 @@ +identifier +{L1-TST-1} +{L1-TST-2} diff --git a/EntityResolution/TestData/Package-1/TESTING_TEST2.csv 
b/EntityResolution/TestData/Package-1/TESTING_TEST2.csv new file mode 100644 index 00000000..09594707 --- /dev/null +++ b/EntityResolution/TestData/Package-1/TESTING_TEST2.csv @@ -0,0 +1,3 @@ +identifier,dataInsertedBy_identifier,description,verifies_identifier +{L1-TST-1},Package 1,This is test L1-1.,{L1-REQ-1} +{L1-TST-2},Package 1,This is test L1-2.,{L1-REQ-1} diff --git a/EntityResolution/TestData/Package-1/import.yaml b/EntityResolution/TestData/Package-1/import.yaml new file mode 100644 index 00000000..b78e80c1 --- /dev/null +++ b/EntityResolution/TestData/Package-1/import.yaml @@ -0,0 +1,11 @@ +data-graph: "http://rack001/data" +ingestion-steps: +#Phase1: Identifiers Only +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY1.csv"} +- {class: "http://arcos.rack/REQUIREMENTS#REQUIREMENT", csv: "REQUIREMENTS_REQUIREMENT1.csv"} +- {class: "http://arcos.rack/TESTING#TEST", csv: "TESTING_TEST1.csv"} + +#Phase2: All Evidence +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY2.csv"} +- {class: "http://arcos.rack/REQUIREMENTS#REQUIREMENT", csv: "REQUIREMENTS_REQUIREMENT2.csv"} +- {class: "http://arcos.rack/TESTING#TEST", csv: "TESTING_TEST2.csv"} diff --git a/EntityResolution/TestData/Package-2/PROV_S_ACTIVITY1.csv b/EntityResolution/TestData/Package-2/PROV_S_ACTIVITY1.csv new file mode 100644 index 00000000..10d87e4b --- /dev/null +++ b/EntityResolution/TestData/Package-2/PROV_S_ACTIVITY1.csv @@ -0,0 +1,2 @@ +identifier +Package 2 diff --git a/EntityResolution/TestData/Package-2/PROV_S_ACTIVITY2.csv b/EntityResolution/TestData/Package-2/PROV_S_ACTIVITY2.csv new file mode 100644 index 00000000..33d78f1a --- /dev/null +++ b/EntityResolution/TestData/Package-2/PROV_S_ACTIVITY2.csv @@ -0,0 +1,3 @@ +identifier,dataInsertedBy_identifier,description,startedAtTime,title,endedAtTime +Package 2,Package 2,Data that was ingested using the ARCOS Scraping Tool Kit.,2022-12-08 14:39:56,Package 2, +Package 2,Package 2,,,,2022-12-08 14:39:56 diff --git a/EntityResolution/TestData/Package-2/REQUIREMENTS_REQUIREMENT1.csv b/EntityResolution/TestData/Package-2/REQUIREMENTS_REQUIREMENT1.csv new file mode 100644 index 00000000..c37e11d0 --- /dev/null +++ b/EntityResolution/TestData/Package-2/REQUIREMENTS_REQUIREMENT1.csv @@ -0,0 +1,4 @@ +identifier +L1-REQ-1 +L1-REQ-2 +L0-REQ-1 diff --git a/EntityResolution/TestData/Package-2/REQUIREMENTS_REQUIREMENT2.csv b/EntityResolution/TestData/Package-2/REQUIREMENTS_REQUIREMENT2.csv new file mode 100644 index 00000000..322570d8 --- /dev/null +++ b/EntityResolution/TestData/Package-2/REQUIREMENTS_REQUIREMENT2.csv @@ -0,0 +1,4 @@ +identifier,dataInsertedBy_identifier,description,satisfies_identifier +L1-REQ-1,Package 2,This is requirement L1-1.,L0-REQ-1 +L1-REQ-2,Package 2,This is requirement L1-2.,L0-REQ-1 +L0-REQ-1,Package 2,, diff --git a/EntityResolution/TestData/Package-2/import.yaml b/EntityResolution/TestData/Package-2/import.yaml new file mode 100644 index 00000000..a8efb989 --- /dev/null +++ b/EntityResolution/TestData/Package-2/import.yaml @@ -0,0 +1,9 @@ +data-graph: "http://rack001/data" +ingestion-steps: +#Phase1: Identifiers Only +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY1.csv"} +- {class: "http://arcos.rack/REQUIREMENTS#REQUIREMENT", csv: "REQUIREMENTS_REQUIREMENT1.csv"} + +#Phase2: All Evidence +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY2.csv"} +- {class: "http://arcos.rack/REQUIREMENTS#REQUIREMENT", csv: "REQUIREMENTS_REQUIREMENT2.csv"} diff --git 
a/EntityResolution/TestData/Package-3/PROV_S_ACTIVITY1.csv b/EntityResolution/TestData/Package-3/PROV_S_ACTIVITY1.csv new file mode 100644 index 00000000..ed0a9665 --- /dev/null +++ b/EntityResolution/TestData/Package-3/PROV_S_ACTIVITY1.csv @@ -0,0 +1,2 @@ +identifier +Package 3 diff --git a/EntityResolution/TestData/Package-3/PROV_S_ACTIVITY2.csv b/EntityResolution/TestData/Package-3/PROV_S_ACTIVITY2.csv new file mode 100644 index 00000000..d792488e --- /dev/null +++ b/EntityResolution/TestData/Package-3/PROV_S_ACTIVITY2.csv @@ -0,0 +1,3 @@ +identifier,dataInsertedBy_identifier,description,startedAtTime,title,endedAtTime +Package 3,Package 3,Data that was ingested using the ARCOS Scraping Tool Kit.,2022-12-08 14:39:56,Package 3, +Package 3,Package 3,,,,2022-12-08 14:39:56 diff --git a/EntityResolution/TestData/Package-3/REQUIREMENTS_REQUIREMENT1.csv b/EntityResolution/TestData/Package-3/REQUIREMENTS_REQUIREMENT1.csv new file mode 100644 index 00000000..a34b930d --- /dev/null +++ b/EntityResolution/TestData/Package-3/REQUIREMENTS_REQUIREMENT1.csv @@ -0,0 +1,4 @@ +identifier +[L1-REQ-2] +[L1-REQ-3] +[L0-REQ-2] diff --git a/EntityResolution/TestData/Package-3/REQUIREMENTS_REQUIREMENT2.csv b/EntityResolution/TestData/Package-3/REQUIREMENTS_REQUIREMENT2.csv new file mode 100644 index 00000000..ef1c733b --- /dev/null +++ b/EntityResolution/TestData/Package-3/REQUIREMENTS_REQUIREMENT2.csv @@ -0,0 +1,4 @@ +identifier,dataInsertedBy_identifier,satisfies_identifier +[L1-REQ-2],Package 3,[L0-REQ-2] +[L1-REQ-3],Package 3,[L0-REQ-2] +[L0-REQ-2],Package 3, diff --git a/EntityResolution/TestData/Package-3/import.yaml b/EntityResolution/TestData/Package-3/import.yaml new file mode 100644 index 00000000..a8efb989 --- /dev/null +++ b/EntityResolution/TestData/Package-3/import.yaml @@ -0,0 +1,9 @@ +data-graph: "http://rack001/data" +ingestion-steps: +#Phase1: Identifiers Only +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY1.csv"} +- {class: "http://arcos.rack/REQUIREMENTS#REQUIREMENT", csv: "REQUIREMENTS_REQUIREMENT1.csv"} + +#Phase2: All Evidence +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY2.csv"} +- {class: "http://arcos.rack/REQUIREMENTS#REQUIREMENT", csv: "REQUIREMENTS_REQUIREMENT2.csv"} diff --git a/EntityResolution/TestData/Resolutions-1/PROV_S_ACTIVITY1.csv b/EntityResolution/TestData/Resolutions-1/PROV_S_ACTIVITY1.csv new file mode 100644 index 00000000..933e9b41 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-1/PROV_S_ACTIVITY1.csv @@ -0,0 +1,2 @@ +identifier +Resolutions-1 diff --git a/EntityResolution/TestData/Resolutions-1/PROV_S_ACTIVITY2.csv b/EntityResolution/TestData/Resolutions-1/PROV_S_ACTIVITY2.csv new file mode 100644 index 00000000..85020507 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-1/PROV_S_ACTIVITY2.csv @@ -0,0 +1,3 @@ +identifier,dataInsertedBy_identifier,description,startedAtTime,title,endedAtTime +Resolutions-1,Resolutions-1,Data that was ingested using the ARCOS Scraping Tool Kit.,2022-12-08 14:39:56,Resolutions-1, +Resolutions-1,Resolutions-1,,,,2022-12-08 14:39:56 diff --git a/EntityResolution/TestData/Resolutions-1/RESOLUTIONS_SAME_AS1.csv b/EntityResolution/TestData/Resolutions-1/RESOLUTIONS_SAME_AS1.csv new file mode 100644 index 00000000..2a8effa7 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-1/RESOLUTIONS_SAME_AS1.csv @@ -0,0 +1 @@ +identifier diff --git a/EntityResolution/TestData/Resolutions-1/RESOLUTIONS_SAME_AS2.csv b/EntityResolution/TestData/Resolutions-1/RESOLUTIONS_SAME_AS2.csv new 
file mode 100644 index 00000000..6b3fdbca --- /dev/null +++ b/EntityResolution/TestData/Resolutions-1/RESOLUTIONS_SAME_AS2.csv @@ -0,0 +1,2 @@ +identifier,dataInsertedBy_identifier,primary_identifier,secondary_identifier +,Resolutions-1,L1-REQ-1,{L1-REQ-1} diff --git a/EntityResolution/TestData/Resolutions-1/import.yaml b/EntityResolution/TestData/Resolutions-1/import.yaml new file mode 100644 index 00000000..c8fed863 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-1/import.yaml @@ -0,0 +1,9 @@ +data-graph: "http://rack001/data" +ingestion-steps: +#Phase1: Identifiers Only +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY1.csv"} +- {class: "http://arcos.rack/RESOLUTIONS#SAME_AS", csv: "RESOLUTIONS_SAME_AS1.csv"} + +#Phase2: All Evidence +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY2.csv"} +- {class: "http://arcos.rack/RESOLUTIONS#SAME_AS", csv: "RESOLUTIONS_SAME_AS2.csv"} diff --git a/EntityResolution/TestData/Resolutions-2/PROV_S_ACTIVITY1.csv b/EntityResolution/TestData/Resolutions-2/PROV_S_ACTIVITY1.csv new file mode 100644 index 00000000..ebf03678 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-2/PROV_S_ACTIVITY1.csv @@ -0,0 +1,2 @@ +identifier +Resolutions-2 diff --git a/EntityResolution/TestData/Resolutions-2/PROV_S_ACTIVITY2.csv b/EntityResolution/TestData/Resolutions-2/PROV_S_ACTIVITY2.csv new file mode 100644 index 00000000..f2f8ec6a --- /dev/null +++ b/EntityResolution/TestData/Resolutions-2/PROV_S_ACTIVITY2.csv @@ -0,0 +1,3 @@ +identifier,dataInsertedBy_identifier,description,startedAtTime,title,endedAtTime +Resolutions-2,Resolutions-2,Data that was ingested using the ARCOS Scraping Tool Kit.,2022-12-08 14:39:56,Resolutions-2, +Resolutions-2,Resolutions-2,,,,2022-12-08 14:39:56 diff --git a/EntityResolution/TestData/Resolutions-2/RESOLUTIONS_SAME_AS1.csv b/EntityResolution/TestData/Resolutions-2/RESOLUTIONS_SAME_AS1.csv new file mode 100644 index 00000000..2a8effa7 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-2/RESOLUTIONS_SAME_AS1.csv @@ -0,0 +1 @@ +identifier diff --git a/EntityResolution/TestData/Resolutions-2/RESOLUTIONS_SAME_AS2.csv b/EntityResolution/TestData/Resolutions-2/RESOLUTIONS_SAME_AS2.csv new file mode 100644 index 00000000..6409b459 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-2/RESOLUTIONS_SAME_AS2.csv @@ -0,0 +1,2 @@ +identifier,dataInsertedBy_identifier,primary_identifier,secondary_identifier +,Resolutions-2,L1-REQ-2,[L1-REQ-2] diff --git a/EntityResolution/TestData/Resolutions-2/import.yaml b/EntityResolution/TestData/Resolutions-2/import.yaml new file mode 100644 index 00000000..c8fed863 --- /dev/null +++ b/EntityResolution/TestData/Resolutions-2/import.yaml @@ -0,0 +1,9 @@ +data-graph: "http://rack001/data" +ingestion-steps: +#Phase1: Identifiers Only +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY1.csv"} +- {class: "http://arcos.rack/RESOLUTIONS#SAME_AS", csv: "RESOLUTIONS_SAME_AS1.csv"} + +#Phase2: All Evidence +- {class: "http://arcos.rack/PROV-S#ACTIVITY", csv: "PROV_S_ACTIVITY2.csv"} +- {class: "http://arcos.rack/RESOLUTIONS#SAME_AS", csv: "RESOLUTIONS_SAME_AS2.csv"} diff --git a/EntityResolution/getData.json b/EntityResolution/getData.json new file mode 100644 index 00000000..6063a85f --- /dev/null +++ b/EntityResolution/getData.json @@ -0,0 +1,314 @@ +{ + "version": 3, + "sparqlConn": { + "name": "RACK local fuseki", + "domain": "", + "enableOwlImports": false, + "model": [ + { + "type": "fuseki", + "url": "http://localhost:3030/RACK", + 
"graph": "http://rack001/model" + } + ], + "data": [ + { + "type": "fuseki", + "url": "http://localhost:3030/RACK", + "graph": "http://rack001/data" + } + ] + }, + "sNodeGroup": { + "version": 20, + "limit": 0, + "offset": 0, + "sNodeList": [ + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", + "Constraints": "", + "SparqlID": "?dataInsertedBy_identifier", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [], + "fullURIName": "http://arcos.rack/PROV-S#ACTIVITY", + "SparqlID": "?ACTIVITY", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE", + "binding": "?dataInsertedBy_ACTIVITY", + "isBindingReturned": false + }, + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", + "Constraints": "", + "SparqlID": "?definedIn_identifier", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [], + "fullURIName": "http://arcos.rack/FILE#FILE", + "SparqlID": "?FILE", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE", + "binding": "?definedIn_FILE", + "isBindingReturned": false + }, + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#description", + "Constraints": "", + "SparqlID": "?description", + "isReturned": true, + "optMinus": 1, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + }, + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", + "Constraints": "", + "SparqlID": "?identifier", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + }, + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#title", + "Constraints": "", + "SparqlID": "?title", + "isReturned": true, + "optMinus": 1, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [ + { + "SnodeSparqlIDs": [ + "?FILE" + ], + "OptionalMinus": [ + 1 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://arcos.rack/FILE#FILE" + ], + "ConnectBy": "definedIn", + "Connected": true, + "UriConnectBy": "http://arcos.rack/FILE#definedIn" + }, + { + "SnodeSparqlIDs": [ + "?ACTIVITY" + ], + "OptionalMinus": [ + 1 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://arcos.rack/PROV-S#ACTIVITY" + ], + "ConnectBy": "dataInsertedBy", + "Connected": true, + "UriConnectBy": "http://arcos.rack/PROV-S#dataInsertedBy" + } + ], + "fullURIName": "http://arcos.rack/PROV-S#THING", + "SparqlID": "?THING", + "isReturned": true, + "isRuntimeConstrained": true, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE", + "isTypeReturned": true + } + ], + "orderBy": [], + "groupBy": [], + "unionHash": {}, + 
"queryType": "CONSTRUCT", + "columnOrder": [] + }, + "importSpec": { + "version": "1", + "baseURI": "", + "columns": [ + { + "colId": "col_0", + "colName": "description" + }, + { + "colId": "col_1", + "colName": "identifier" + }, + { + "colId": "col_2", + "colName": "title" + }, + { + "colId": "col_3", + "colName": "definedIn_identifier" + }, + { + "colId": "col_4", + "colName": "dataInsertedBy_identifier" + } + ], + "dataValidator": [], + "texts": [], + "transforms": [ + { + "transId": "trans_0", + "name": "rm_null", + "transType": "replaceAll", + "arg1": "^(null|Null|NULL)$", + "arg2": "" + } + ], + "nodes": [ + { + "sparqlID": "?THING", + "type": "http://arcos.rack/PROV-S#THING", + "URILookupMode": "createIfMissing", + "mapping": [], + "props": [ + { + "URIRelation": "http://arcos.rack/PROV-S#description", + "mapping": [ + { + "colId": "col_0", + "transformList": [ + "trans_0" + ] + } + ] + }, + { + "URIRelation": "http://arcos.rack/PROV-S#identifier", + "URILookup": [ + "?THING" + ], + "mapping": [ + { + "colId": "col_1", + "transformList": [ + "trans_0" + ] + } + ] + }, + { + "URIRelation": "http://arcos.rack/PROV-S#title", + "mapping": [ + { + "colId": "col_2", + "transformList": [ + "trans_0" + ] + } + ] + } + ] + }, + { + "sparqlID": "?FILE", + "type": "http://arcos.rack/FILE#FILE", + "URILookupMode": "noCreate", + "mapping": [], + "props": [ + { + "URIRelation": "http://arcos.rack/PROV-S#identifier", + "URILookup": [ + "?FILE" + ], + "mapping": [ + { + "colId": "col_3", + "transformList": [ + "trans_0" + ] + } + ] + } + ] + }, + { + "sparqlID": "?ACTIVITY", + "type": "http://arcos.rack/PROV-S#ACTIVITY", + "URILookupMode": "noCreate", + "mapping": [], + "props": [ + { + "URIRelation": "http://arcos.rack/PROV-S#identifier", + "URILookup": [ + "?ACTIVITY" + ], + "mapping": [ + { + "colId": "col_4", + "transformList": [ + "trans_0" + ] + } + ] + } + ] + } + ] + }, + "plotSpecs": [] +} \ No newline at end of file diff --git a/EntityResolution/manifest_template/manifest.yaml b/EntityResolution/manifest_template/manifest.yaml new file mode 100644 index 00000000..7a6cb667 --- /dev/null +++ b/EntityResolution/manifest_template/manifest.yaml @@ -0,0 +1,11 @@ +name: 'Entity Resolution Data' + +footprint: + model-graphs: + - http://rack001/model + data-graphs: + - http://rack001/data + +steps: + - nodegroups: nodegroups + - data: resolutions/import.yaml diff --git a/EntityResolution/manifest_template/nodegroups/ingest_SAME_AS.json b/EntityResolution/manifest_template/nodegroups/ingest_SAME_AS.json new file mode 100644 index 00000000..7730e3d1 --- /dev/null +++ b/EntityResolution/manifest_template/nodegroups/ingest_SAME_AS.json @@ -0,0 +1,323 @@ +{ + "version": 3, + "sparqlConn": { + "name": "RACK local fuseki copy", + "domain": "", + "enableOwlImports": false, + "model": [ + { + "type": "fuseki", + "url": "http://localhost:3030/RACK", + "graph": "http://rack001/model" + } + ], + "data": [ + { + "type": "fuseki", + "url": "http://localhost:3030/RACK", + "graph": "http://rack001/data" + } + ] + }, + "sNodeGroup": { + "version": 20, + "limit": 0, + "offset": 0, + "sNodeList": [ + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", + "Constraints": "", + "SparqlID": "?secondary_identifier", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [], + "fullURIName": 
"http://arcos.rack/PROV-S#THING", + "SparqlID": "?THING_0", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE", + "isTypeReturned": true, + "binding": "?secondary_THING", + "isBindingReturned": false + }, + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", + "Constraints": "", + "SparqlID": "?primary_identifier", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [], + "fullURIName": "http://arcos.rack/PROV-S#THING", + "SparqlID": "?THING", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE", + "isTypeReturned": true, + "binding": "?primary_THING", + "isBindingReturned": false + }, + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", + "Constraints": "", + "SparqlID": "?dataInsertedBy_identifier", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [], + "fullURIName": "http://arcos.rack/PROV-S#ACTIVITY", + "SparqlID": "?ACTIVITY", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE", + "binding": "?dataInsertedBy_ACTIVITY", + "isBindingReturned": false + }, + { + "propList": [], + "nodeList": [ + { + "SnodeSparqlIDs": [ + "?ACTIVITY" + ], + "OptionalMinus": [ + 1 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://arcos.rack/PROV-S#ACTIVITY" + ], + "ConnectBy": "dataInsertedBy", + "Connected": true, + "UriConnectBy": "http://arcos.rack/PROV-S#dataInsertedBy" + }, + { + "SnodeSparqlIDs": [ + "?THING" + ], + "OptionalMinus": [ + 0 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://arcos.rack/PROV-S#THING" + ], + "ConnectBy": "primary", + "Connected": true, + "UriConnectBy": "http://arcos.rack/RESOLUTIONS#primary" + }, + { + "SnodeSparqlIDs": [ + "?THING_0" + ], + "OptionalMinus": [ + 0 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://arcos.rack/PROV-S#THING" + ], + "ConnectBy": "secondary", + "Connected": true, + "UriConnectBy": "http://arcos.rack/RESOLUTIONS#secondary" + } + ], + "fullURIName": "http://arcos.rack/RESOLUTIONS#SAME_AS", + "SparqlID": "?SAME_AS", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE" + } + ], + "orderBy": [], + "groupBy": [], + "unionHash": {}, + "columnOrder": [] + }, + "importSpec": { + "version": "1", + "baseURI": "", + "columns": [ + { + "colId": "col_0", + "colName": "dataInsertedBy_identifier" + }, + { + "colId": "col_1", + "colName": "primary_identifier" + }, + { + "colId": "col_2", + "colName": "secondary_identifier" + }, + { + "colId": "col_3", + "colName": "primary_THING_type" + }, + { + "colId": "col_4", + "colName": "secondary_THING_type" + } + ], + "dataValidator": [], + "texts": [], + "transforms": [ + { + "transId": "trans_0", + "name": "rm_null", + "transType": "replaceAll", + "arg1": "^(null|Null|NULL)$", + "arg2": "" + } + ], + "nodes": [ + { + "sparqlID": 
"?SAME_AS", + "type": "http://arcos.rack/RESOLUTIONS#SAME_AS", + "mapping": [], + "props": [] + }, + { + "sparqlID": "?ACTIVITY", + "type": "http://arcos.rack/PROV-S#ACTIVITY", + "URILookupMode": "noCreate", + "mapping": [], + "props": [ + { + "URIRelation": "http://arcos.rack/PROV-S#identifier", + "URILookup": [ + "?ACTIVITY" + ], + "mapping": [ + { + "colId": "col_0", + "transformList": [ + "trans_0" + ] + } + ] + } + ] + }, + { + "sparqlID": "?THING", + "type": "http://arcos.rack/PROV-S#THING", + "URILookupMode": "noCreate", + "mapping": [], + "props": [ + { + "URIRelation": "http://arcos.rack/PROV-S#identifier", + "URILookup": [ + "?THING" + ], + "mapping": [ + { + "colId": "col_1", + "transformList": [ + "trans_0" + ] + } + ] + } + ], + "type_restriction": { + "URILookup": [ + "?THING" + ], + "mapping": [ + { + "colId": "col_3" + } + ] + } + }, + { + "sparqlID": "?THING_0", + "type": "http://arcos.rack/PROV-S#THING", + "URILookupMode": "noCreate", + "mapping": [], + "props": [ + { + "URIRelation": "http://arcos.rack/PROV-S#identifier", + "URILookup": [ + "?THING_0" + ], + "mapping": [ + { + "colId": "col_2", + "transformList": [ + "trans_0" + ] + } + ] + } + ], + "type_restriction": { + "URILookup": [ + "?THING_0" + ], + "mapping": [ + { + "colId": "col_4" + } + ] + } + } + ] + }, + "plotSpecs": [] +} diff --git a/EntityResolution/manifest_template/nodegroups/store_data.csv b/EntityResolution/manifest_template/nodegroups/store_data.csv new file mode 100644 index 00000000..477b7e29 --- /dev/null +++ b/EntityResolution/manifest_template/nodegroups/store_data.csv @@ -0,0 +1,2 @@ +ID,comments,creator,jsonFile,itemType +ingest_SAME_AS,Nodegroup used by Entity Resolution to ingest SAME_AS relationship as generated by the Entity Resolution Tool,Entity Resolution,ingest_SAME_AS.json,PrefabNodeGroup diff --git a/EntityResolution/manifest_template/resolutions/import.yaml b/EntityResolution/manifest_template/resolutions/import.yaml new file mode 100644 index 00000000..a2719f17 --- /dev/null +++ b/EntityResolution/manifest_template/resolutions/import.yaml @@ -0,0 +1,4 @@ +data-graph: "http://rack001/data" +ingestion-steps: + +- {nodegroup: "ingest_SAME_AS", csv: "SAME_AS.csv"} diff --git a/RACK-Ontology/OwlModels/import.yaml b/RACK-Ontology/OwlModels/import.yaml index 75ed1460..904cb39b 100644 --- a/RACK-Ontology/OwlModels/import.yaml +++ b/RACK-Ontology/OwlModels/import.yaml @@ -8,6 +8,7 @@ files: - CLAIM.owl - CONFIDENCE.owl - DOCUMENT.owl +- EntityResolution.owl - FILE.owl - HARDWARE.owl - HAZARD.owl @@ -20,3 +21,4 @@ files: - SOFTWARE.owl - SYSTEM.owl - TESTING.owl +- RESOLUTIONS.owl diff --git a/RACK-Ontology/ontology/ARP-4754A/import.yaml b/RACK-Ontology/ontology/ARP-4754A/import.yaml index 0df46f35..45523976 100644 --- a/RACK-Ontology/ontology/ARP-4754A/import.yaml +++ b/RACK-Ontology/ontology/ARP-4754A/import.yaml @@ -1,13 +1,13 @@ data-graph: "http://rack001/arp-4754a" ingestion-steps: #Phase1: Identifiers Only -- {nodegroup: "ingest_AGENT", csv: "AGENT_1.csv"} -- {nodegroup: "ingest_SPECIFICATION", csv: "SPECIFICATION_1.csv"} -- {nodegroup: "ingest_SECTION", csv: "SECTION_1.csv"} -- {nodegroup: "ingest_OBJECTIVE", csv: "OBJECTIVE_1.csv"} +- {class: "http://arcos.rack/PROV-S#AGENT", csv: "AGENT_1.csv"} +- {class: "http://arcos.rack/DOCUMENT#SPECIFICATION", csv: "SPECIFICATION_1.csv"} +- {class: "http://arcos.rack/DOCUMENT#SECTION", csv: "SECTION_1.csv"} +- {class: "http://arcos.rack/PROCESS#OBJECTIVE", csv: "OBJECTIVE_1.csv"} #Phase2: The rest of the data -- {nodegroup: "ingest_AGENT", 
csv: "AGENT_2.csv"} -- {nodegroup: "ingest_SPECIFICATION", csv: "SPECIFICATION_2.csv"} -- {nodegroup: "ingest_SECTION", csv: "SECTION_2.csv"} -- {nodegroup: "ingest_OBJECTIVE", csv: "OBJECTIVE_2.csv"} +- {class: "http://arcos.rack/PROV-S#AGENT", csv: "AGENT_2.csv"} +- {class: "http://arcos.rack/DOCUMENT#SPECIFICATION", csv: "SPECIFICATION_2.csv"} +- {class: "http://arcos.rack/DOCUMENT#SECTION", csv: "SECTION_2.csv"} +- {class: "http://arcos.rack/PROCESS#OBJECTIVE", csv: "OBJECTIVE_2.csv"} diff --git a/RACK-Ontology/ontology/CAPEC/import.yaml b/RACK-Ontology/ontology/CAPEC/import.yaml index 34e3650c..a1c01ae9 100644 --- a/RACK-Ontology/ontology/CAPEC/import.yaml +++ b/RACK-Ontology/ontology/CAPEC/import.yaml @@ -1,7 +1,7 @@ data-graph: "http://rack001/capec" ingestion-steps: #Phase1: Identifiers Only -- {nodegroup: "ingest_THREAT", csv: "CAPEC1.csv"} +- {class: "http://arcos.rack/SECURITY#THREAT", csv: "CAPEC1.csv"} #Phase2: The rest of the data -- {nodegroup: "ingest_THREAT", csv: "CAPEC2.csv"} +- {class: "http://arcos.rack/SECURITY#THREAT", csv: "CAPEC2.csv"} diff --git a/RACK-Ontology/ontology/DO-178C/import.yaml b/RACK-Ontology/ontology/DO-178C/import.yaml index 3ea55538..6ae0de5a 100644 --- a/RACK-Ontology/ontology/DO-178C/import.yaml +++ b/RACK-Ontology/ontology/DO-178C/import.yaml @@ -1,12 +1,12 @@ data-graph: "http://rack001/do-178c" ingestion-steps: #Phase1: Identifiers Only -- {nodegroup: "ingest_AGENT", csv: "AGENT_1.csv"} -- {nodegroup: "ingest_SPECIFICATION", csv: "SPECIFICATION_1.csv"} -- {nodegroup: "ingest_SECTION", csv: "SECTION_1.csv"} -- {nodegroup: "ingest_OBJECTIVE", csv: "OBJECTIVE_1.csv"} +- {class: "http://arcos.rack/PROV-S#AGENT", csv: "AGENT_1.csv"} +- {class: "http://arcos.rack/DOCUMENT#SPECIFICATION", csv: "SPECIFICATION_1.csv"} +- {class: "http://arcos.rack/DOCUMENT#SECTION", csv: "SECTION_1.csv"} +- {class: "http://arcos.rack/PROCESS#OBJECTIVE", csv: "OBJECTIVE_1.csv"} #Phase2: The rest of the data -- {nodegroup: "ingest_SPECIFICATION", csv: "SPECIFICATION_2.csv"} -- {nodegroup: "ingest_SECTION", csv: "SECTION_2.csv"} -- {nodegroup: "ingest_OBJECTIVE", csv: "OBJECTIVE_2.csv"} +- {class: "http://arcos.rack/DOCUMENT#SPECIFICATION", csv: "SPECIFICATION_2.csv"} +- {class: "http://arcos.rack/DOCUMENT#SECTION", csv: "SECTION_2.csv"} +- {class: "http://arcos.rack/PROCESS#OBJECTIVE", csv: "OBJECTIVE_2.csv"} diff --git a/RACK-Ontology/ontology/DO-330/import.yaml b/RACK-Ontology/ontology/DO-330/import.yaml index c1fe3382..0f0563d8 100644 --- a/RACK-Ontology/ontology/DO-330/import.yaml +++ b/RACK-Ontology/ontology/DO-330/import.yaml @@ -1,12 +1,12 @@ data-graph: "http://rack001/do-330" ingestion-steps: #Phase1: Identifiers Only -- {nodegroup: "ingest_AGENT", csv: "AGENT_1.csv"} -- {nodegroup: "ingest_SPECIFICATION", csv: "SPECIFICATION_1.csv"} -- {nodegroup: "ingest_SECTION", csv: "SECTION_1.csv"} -- {nodegroup: "ingest_OBJECTIVE", csv: "OBJECTIVE_1.csv"} +- {class: "http://arcos.rack/PROV-S#AGENT", csv: "AGENT_1.csv"} +- {class: "http://arcos.rack/DOCUMENT#SPECIFICATION", csv: "SPECIFICATION_1.csv"} +- {class: "http://arcos.rack/DOCUMENT#SECTION", csv: "SECTION_1.csv"} +- {class: "http://arcos.rack/PROCESS#OBJECTIVE", csv: "OBJECTIVE_1.csv"} #Phase2: The rest of the data -- {nodegroup: "ingest_SPECIFICATION", csv: "SPECIFICATION_2.csv"} -- {nodegroup: "ingest_SECTION", csv: "SECTION_2.csv"} -- {nodegroup: "ingest_OBJECTIVE", csv: "OBJECTIVE_2.csv"} +- {class: "http://arcos.rack/DOCUMENT#SPECIFICATION", csv: "SPECIFICATION_2.csv"} +- {class: 
"http://arcos.rack/DOCUMENT#SECTION", csv: "SECTION_2.csv"} +- {class: "http://arcos.rack/PROCESS#OBJECTIVE", csv: "OBJECTIVE_2.csv"} diff --git a/RACK-Ontology/ontology/EntityResolution.sadl b/RACK-Ontology/ontology/EntityResolution.sadl new file mode 100644 index 00000000..42e48a0a --- /dev/null +++ b/RACK-Ontology/ontology/EntityResolution.sadl @@ -0,0 +1,30 @@ +uri "http://research.ge.com/semtk/EntityResolution" alias EntityResolution. + + +SameAs is a top-level class, + described by target with a single value of type class, + described by duplicate with a single value of type class. + + + +// -- range for examples +// THING is a class. + + +// -- simple example of subclassing SameAs with your own type(SAME_AS1) and range (THING) +// +// SAME_AS1 is a type of SameAs. +// target of SAME_AS1 only has values of type THING. +// duplicate of SAME_AS1 only has values of type THING. + + +// -- example of subclassing SameAs with your own type (SAME_AS2) +// and changing the names of the properties to 'primary' and 'secondary' +// and setting the range to THING +// +// SAME_AS2 is a type of SameAs, +// described by primary with a single value of type THING, +// described by secondary with a single value of type THING. +// +// primary is a type of target. +// secondary is a type of duplicate. \ No newline at end of file diff --git a/RACK-Ontology/ontology/PROV-S.sadl b/RACK-Ontology/ontology/PROV-S.sadl index 46f3eb39..8270c0ad 100644 --- a/RACK-Ontology/ontology/PROV-S.sadl +++ b/RACK-Ontology/ontology/PROV-S.sadl @@ -19,15 +19,17 @@ uri "http://arcos.rack/PROV-S" alias provs (note "a basic Implementation of PROV Data Model standard in SADL based on table https://www.w3.org/TR/prov-dm/#relations-at-a-glance"). +NODE is a class. + dataInsertedBy (note "The activity that caused this data to be added to RACK") describes NODE with values of type ACTIVITY. -THING (note "A piece of data stored in RACK") is a class. + +THING (note "A piece of data stored in RACK") is a type of NODE. identifier (note "identifier is any data item that is used to associate items on when loading into the data store.") describes THING with values of type string. identifier describes THING with at most 1 value. title (note "A short, human-readable identifying label.") describes THING with values of type string. title describes THING with at most 1 value. description (note "A free-form, multi-line, human-readable explanation of this data element.") describes THING with values of type string. description describes THING with at most 1 value. - dataInsertedBy (note "The activity that caused this data to be added to RACK") describes THING with values of type ACTIVITY. ENTITY (note "An entity is a physical, digital, conceptual, or other kind of thing with some fixed aspects; entities may be real or imaginary.") is a type of THING. diff --git a/RACK-Ontology/ontology/RESOLUTIONS.sadl b/RACK-Ontology/ontology/RESOLUTIONS.sadl new file mode 100644 index 00000000..57a24114 --- /dev/null +++ b/RACK-Ontology/ontology/RESOLUTIONS.sadl @@ -0,0 +1,30 @@ +/* Copyright (c) 20202, General Electric Company, Galois, Inc. + * + * All Rights Reserved + * + * This material is based upon work supported by the Defense Advanced Research + * Projects Agency (DARPA) under Contract No. FA8750-20-C-0203. + * + * Any opinions, findings and conclusions or recommendations expressed in this + * material are those of the author(s) and do not necessarily reflect the views + * of the Defense Advanced Research Projects Agency (DARPA). 
+ */ + +/************** edit history ***************** + * + * + *********************************************/ + +uri "http://arcos.rack/RESOLUTIONS" alias Rs. +import "http://arcos.rack/PROV-S". +import "http://research.ge.com/semtk/EntityResolution". + + +SAME_AS (note "Used to create curation relationships between two nodes. When two THINGs are connected via the SAME_AS relationship it means that the THINGs are actually describing the same thing. SAME_AS relationships will be collapsed into a single THING by the resolution process.") is a type of NODE. + primary (note "The primary THING is the one which will remain after the merge process; any conflicts will be resolved by using the primary's value, for example the resulting identifier will be the identifier from the primary.") describes SAME_AS with a single value of type THING. + secondary (note "The secondary THINGs are the entities that will be removed during the resolution process; any attributes that do not conflict will be copied to the primary.") describes SAME_AS with values of type THING. + +// Make SAME_AS compatible with semTK entity resolution functions. +SAME_AS is a type of EntityResolution:SameAs. +primary is a type of EntityResolution:target. +secondary is a type of EntityResolution:duplicate. diff --git a/ScrapingToolKit/Evidence/__init__.py b/ScrapingToolKit/Evidence/__init__.py index 596ed55c..b128dcd3 100644 --- a/ScrapingToolKit/Evidence/__init__.py +++ b/ScrapingToolKit/Evidence/__init__.py @@ -77,11 +77,11 @@ def createCDR(dataGraph="http://rack001/data"): if c.find("identifier").text not in loaded: outwriter.writerow([c.find("identifier").text]) loaded.append(c.find("identifier").text) - # Check to see if the header item - for k in headers: - if c.find(k) is not None: - if k not in usedHeaders: - usedHeaders.append(k) + # Check to see if the header item + for k in headers: + if c.find(k) is not None: + if k not in usedHeaders: + usedHeaders.append(k) else: log("Identifier not found.") diff --git a/assist/bin/checks/bdu.pl b/assist/bin/checks/bdu.pl index 33c1bc86..b2cdd180 100644 --- a/assist/bin/checks/bdu.pl +++ b/assist/bin/checks/bdu.pl @@ -36,21 +36,21 @@ :- use_module(utils(float_equality)). prolog:message(bad_BDU_sum(Thing, Sum)) --> - [ 'BDU sum for ~w is ~:f, expected 1.0'-[Thing, Sum] ]. + [ 'CE-121: BDU sum for ~w is ~:f, expected 1.0'-[Thing, Sum] ]. prolog:message(multiple_beliefs(A, B1, B2)) --> - [ '~w has two (or more) belief values (such as ~w and ~w)'-[A, B1, B2] ]. + [ 'CE-122: ~w has two (or more) belief values (such as ~w and ~w)'-[A, B1, B2] ]. prolog:message(multiple_disbeliefs(A, B1, B2)) --> - [ '~w has two (or more) disbelief values (such as ~w and ~w)'-[A, B1, B2] ]. + [ 'CE-123: ~w has two (or more) disbelief values (such as ~w and ~w)'-[A, B1, B2] ]. prolog:message(multiple_uncertainties(A, B1, B2)) --> - [ '~w has two (or more) uncertainties values (such as ~w and ~w)'-[A, B1, B2] ]. + [ 'CE-124: ~w has two (or more) uncertainty values (such as ~w and ~w)'-[A, B1, B2] ]. prolog:message(no_belief(Thing)) --> - [ '~w has a disbelief or uncertainty value, but no belief'-[Thing] ]. + [ 'CE-125: ~w has a disbelief or uncertainty value, but no belief'-[Thing] ]. prolog:message(no_disbelief(Thing)) --> - [ '~w has a belief or uncertainty value, but no disbelief'-[Thing] ]. + [ 'CE-126: ~w has a belief or uncertainty value, but no disbelief'-[Thing] ]. prolog:message(no_uncertainty(Thing)) --> - [ '~w has a belief or disbelief value, but no uncertainty'-[Thing] ].
+ [ 'CE-127: ~w has a belief or disbelief value, but no uncertainty'-[Thing] ]. % Belief-Disbelief-Uncertainty metrics should: % * have all three values diff --git a/assist/bin/checks/interfaceChecks.pl b/assist/bin/checks/interfaceChecks.pl index 0938093a..9db7acd1 100644 --- a/assist/bin/checks/interfaceChecks.pl +++ b/assist/bin/checks/interfaceChecks.pl @@ -26,7 +26,8 @@ % Similar to "nodegroups/query/query dataVer INTERFACE without destination SYSTEM.json" % check_INTERFACE_no_dest_SYSTEM(IFACE) :- - check_has_no_rel('http://arcos.rack/SYSTEM#INTERFACE', + check_has_no_rel('I1', + 'http://arcos.rack/SYSTEM#INTERFACE', 'http://arcos.rack/SYSTEM#destination', 'http://arcos.rack/SYSTEM#SYSTEM', IFACE). @@ -39,7 +40,8 @@ % Similar to "nodegroups/query/query dataVer INTERFACE without source SYSTEM.json" % check_INTERFACE_no_src_SYSTEM(IFACE) :- - check_has_no_rel('http://arcos.rack/SYSTEM#INTERFACE', + check_has_no_rel('I2', + 'http://arcos.rack/SYSTEM#INTERFACE', 'http://arcos.rack/SYSTEM#source', 'http://arcos.rack/SYSTEM#SYSTEM', IFACE). diff --git a/assist/bin/checks/sbvt_checks.pl b/assist/bin/checks/sbvt_checks.pl index 3914a073..ee8f49d5 100644 --- a/assist/bin/checks/sbvt_checks.pl +++ b/assist/bin/checks/sbvt_checks.pl @@ -30,7 +30,8 @@ % dataVer SBVT_Result without confirms_SBVT_Test.json" % check_Result_not_confirmed(I) :- - check_has_no_rel('http://arcos.AH-64D/Boeing#SBVT_Result', + check_has_no_rel('SBVT1', + 'http://arcos.AH-64D/Boeing#SBVT_Result', 'http://arcos.rack/TESTING#confirms', 'http://arcos.AH-64D/Boeing#SBVT_Test', I). @@ -50,7 +51,8 @@ % where the latter additionally qualifies the target of the former. % check_no_Test_requirement(I) :- - check_has_no_rel('http://arcos.AH-64D/Boeing#SBVT_Test', + check_has_no_rel('SBVT2', + 'http://arcos.AH-64D/Boeing#SBVT_Test', 'http://arcos.rack/TESTING#verifies', 'http://arcos.AH-64D/Boeing#SRS_Req', %% 'http://arcos.rack/REQUIREMENTS#REQUIREMENT', diff --git a/assist/bin/checks/software_checks.pl b/assist/bin/checks/software_checks.pl index 70a8261f..e0287569 100644 --- a/assist/bin/checks/software_checks.pl +++ b/assist/bin/checks/software_checks.pl @@ -26,20 +26,25 @@ % Similar to "nodegroups/query/query dataVer SOFTWARE without partOf SOFTWARE.json" % check_SOFTWARE_COMPONENT_contained(I) :- - check_has_no_rel('http://arcos.rack/SOFTWARE#SWCOMPONENT', - 'http://arcos.rack/SOFTWARE#subcomponentOf', + check_has_no_rel('S1', + 'http://arcos.rack/SOFTWARE#SWCOMPONENT', + 'http://arcos.rack/SOFTWARE#partOf', 'http://arcos.rack/SOFTWARE#SWCOMPONENT', I). %! check_SOFTWARE_COMPONENT_impact is det. % -% Checks every SOFTWARE partOf target is a SOFTWARE. -% Always succeeds, emits warnings. +% Checks every SWCOMPONENT has an associated REQUIREMENT if that +% SWCOMPONENT is a MODULE. % -% Similar to "nodegroups/query/query dataVer SOFTWARE without partOf SOFTWARE.json" +% Similar to "nodegroups/query/query dataVer unlinked SWCOMPONENT.json" % check_SOFTWARE_COMPONENT_impact(I) :- - check_has_no_rel('http://arcos.rack/SOFTWARE#SWCOMPONENT', + rack_data_instance('http://arcos.rack/SOFTWARE#SWCOMPONENT', I), + rdf(I, 'http://arcos.rack/SOFTWARE#componentType', CT), + rack_instance_ident(CT, "Module"), + check_has_no_rel('S2', + 'http://arcos.rack/SOFTWARE#SWCOMPONENT', 'http://arcos.rack/PROV-S#wasImpactedBy', 'http://arcos.rack/REQUIREMENTS#REQUIREMENT', I). 
diff --git a/assist/bin/checks/srs_checks.pl b/assist/bin/checks/srs_checks.pl index deb1bb70..f851f690 100644 --- a/assist/bin/checks/srs_checks.pl +++ b/assist/bin/checks/srs_checks.pl @@ -20,17 +20,21 @@ %! check_SRS_insertion_source is det. % -% Checks that no SRS_Req is inserted by any activity other than -% "SRS Data Ingestion". Always succeeds, emits warnings. +% Checks that at least one "insertedBy" activity for an SRS_Req is the "SRS +% Data Ingestion". Always succeeds, emits warnings. % % Similar to "nodegroups/query/query dataVer SRS_Req dataInsertedBy other than SRS Data Ingestion.json" -% + check_SRS_insertion_source(I) :- T = 'http://arcos.AH-64D/Boeing#SRS_Req', rack_data_instance(T, I), rdf(I, 'http://arcos.rack/PROV-S#dataInsertedBy', A), + must_have_srs_data_ingestion(T,I,A). + +must_have_srs_data_ingestion(T,I,A) :- + rack_instance_ident(A, "SRS Data Ingestion"), !. +must_have_srs_data_ingestion(T,I,A) :- rack_instance_ident(A, AName), - \+ AName = 'SRS Data Ingestion', rack_instance_ident(I, IN), rdf(A, rdf:type, ATy), print_message(warning, invalid_srs_req_inserter(T, I, IN, ATy, A, AName)). @@ -64,7 +68,8 @@ % Similar to "nodegroups/query/query dataVer SRS_Req without description.json" % check_SRS_Req_description(I) :- - check_has_no_rel('http://arcos.AH-64D/Boeing#SRS_Req', + check_has_no_rel('SRS1', + 'http://arcos.AH-64D/Boeing#SRS_Req', 'http://arcos.rack/PROV-S#description', I). @@ -77,8 +82,9 @@ % Similar to "nodegroups/query/query dataVer SubDD_Req without satisfies SRS_Req.json" % check_SubDD_Req_satisfies_SRS_Req(I) :- - check_has_no_rel('http://arcos.AH-64D/Boeing#SubDD_Req', - 'http://arcos.rack/TESTING#satisifes', + check_has_no_rel('SRS2', + 'http://arcos.AH-64D/Boeing#SubDD_Req', + 'http://arcos.rack/REQUIREMENTS#satisfies', 'http://arcos.AH-64D/Boeing#SRS_Req', I). @@ -87,18 +93,23 @@ { prefix_shorten(ITy, SIT), prefix_shorten(Inst, SII), prefix_shorten(InsTy, STT), - prefix_shorten(InsI, STI) + prefix_shorten(InsI, STI), + rdf_literal_val_type(InstIdent, IName, _), + rdf_literal_val_type(InsN, InsName, _) }, - [ '~w instance ~w (~w) inserted by invalid ACTIVITY: ~w ~w (~w)'-[ - SIT, SII, InstIdent, STT, STI, InsN ] ]. + [ 'CE-132: ~w instance ~w inserted by invalid ACTIVITY: ~w ~w~n Instance Domain: ~w~n Activity Domain: ~w~n'-[ + SIT, IName, STT, InsName, SII, STI ] ]. prolog:message(invalid_srs_req_satisfies(ITy, Inst, InstIdent, TgtTy, Tgt, TgtIdent)) --> { prefix_shorten(Inst, SI), prefix_shorten(ITy, ST), prefix_shorten(Tgt, SR), - prefix_shorten(TgtTy, SRT) + prefix_shorten(TgtTy, SRT), + rdf_literal_val_type(InstIdent, IName, _), + rdf_literal_val_type(TgtIdent, TName, _) + }, - [ '~w instance ~w (~w) satisifes something not a PIDS_Req or CSID_Req: ~w ~w (~w)'-[ - ST, SI, InstIdent, SRT, SR, TgtIdent ] ]. + [ 'CE-133: ~w instance ~w satisfies something not a PIDS_Req or CSID_Req: ~w ~w~n Instance Domain: ~w~n Satisfies Domain: ~w~n'-[ + ST, IName, SRT, TName, SI, SR ] ]. %! check_SRS is det. diff --git a/assist/bin/checks/system_checks.pl b/assist/bin/checks/system_checks.pl index 1c94bda9..4cf8c158 100644 --- a/assist/bin/checks/system_checks.pl +++ b/assist/bin/checks/system_checks.pl @@ -26,7 +26,8 @@ % Similar to "nodegroups/query/query dataVer SYSTEM without partOf SYSTEM.json" % check_SYSTEM_partOf_SYSTEM(I) :- - check_has_no_rel('http://arcos.rack/SYSTEM#SYSTEM', + check_has_no_rel('SYS1', + 'http://arcos.rack/SYSTEM#SYSTEM', 'http://arcos.rack/SYSTEM#partOf', 'http://arcos.rack/SYSTEM#SYSTEM', I).
diff --git a/assist/bin/rack/check.pl b/assist/bin/rack/check.pl index c1ebb2cb..b0ed496a 100644 --- a/assist/bin/rack/check.pl +++ b/assist/bin/rack/check.pl @@ -204,20 +204,22 @@ % True for any SrcClass that has no Prop relationship to any target % instance. Returns the SrcInst that this occurs for as well as % generating a warning. -check_has_no_rel(SrcClass, Prop, SrcInst) :- +check_has_no_rel(Context, SrcClass, Prop, SrcInst) :- + is_valid_property(Context, SrcClass, Prop), rack_data_instance(SrcClass, SrcInst), none_of(SrcInst, rack_instance_relationship(SrcClass, Prop)), rack_instance_ident(SrcInst, SrcName), - print_message(warning, missing_any_tgt(SrcClass, SrcInst, SrcName, Prop)). + print_message(warning, missing_any_tgt(Context, SrcClass, SrcInst, SrcName, Prop)). % True for any SrcClass that has no Prop relationship to an instance % of the specific target class. Returns the SrcInst that this occurs % for as well as generating a warning. -check_has_no_rel(SrcClass, Prop, TgtClass, SrcInst) :- +check_has_no_rel(Context, SrcClass, Prop, TgtClass, SrcInst) :- + is_valid_property(Context, SrcClass, Prop), rack_data_instance(SrcClass, SrcInst), none_of(SrcInst, rack_instance_relationship(SrcClass, Prop, TgtClass)), rack_instance_ident(SrcInst, SrcName), - print_message(warning, missing_tgt(SrcClass, SrcInst, SrcName, Prop, TgtClass)), + print_message(warning, missing_tgt(Context, SrcClass, SrcInst, SrcName, Prop, TgtClass)), % -- if the above fails, it's probably useful to see if there are % *any* targets of Src--[Rel]--> check_also_has_no_rel(SrcClass, Prop). @@ -226,6 +228,13 @@ check_has_no_rel(SrcClass, Rel, SrcInst), !. check_also_has_no_rel(_, _). +is_valid_property(_, SrcClass, Property) :- + rdf(Property, rdfs:domain, PropClass), + rdf_reachable(SrcClass, rdfs:subClassOf, PropClass), !. +is_valid_property(Context, SrcClass, Property) :- + print_message(error, invalid_property_in_check(Context, SrcClass, Property)), + fail. + % Sometimes there will be things in SADL like: % @@ -300,9 +309,9 @@ prolog:message(class_missing_note(Class)) --> - [ 'No Note/Description for class ~w'-[Class] ]. + [ 'CE-100: No Note/Description for class ~w'-[Class] ]. prolog:message(not_prov_s_thing_class(Class)) --> - [ 'Not a subclass of PROV-S#THING: ~w'-[Class] ]. + [ 'CE-101: Not a subclass of PROV-S#THING: ~w'-[Class] ]. prolog:message(num_classes(What, Count)) --> [ 'There are ~:d RACK ~w.'-[Count, What] ]. prolog:message(cardinality_violation(InstType, Instance, InstanceIdent, Property, Specified, Actual)) --> @@ -310,38 +319,42 @@ prefix_shorten(InstType, ST), prefix_shorten(Property, SP) }, - [ '~w ~w (~w) . ~w has ~d values but an allowed cardinality of ~d~n'-[ - ST, SI, InstanceIdent, SP, Actual, Specified] ]. + [ 'CE-103: ~w ~w . ~w has ~d values but an allowed cardinality of ~d~n Domain: ~w~n'-[ + ST, InstanceIdent, SP, Actual, Specified, SI ] ]. prolog:message(min_cardinality_violation(InstType, Instance, IName, Property, Specified, Actual)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), - prefix_shorten(Property, SP) + prefix_shorten(Property, SP), + rdf_literal_val_type(IName, InstName, _) }, - [ '~w ~w (~w) . ~w has ~d values but a minimum allowed cardinality of ~d~n'-[ - ST, SI, IName, SP, Actual, Specified] ]. + [ 'CE-104: ~w ~w . ~w has ~d values but a minimum allowed cardinality of ~d~n Domain: ~w~n'-[ + ST, InstName, SP, Actual, Specified, SI] ]. 
prolog:message(max_cardinality_violation(InstType, Instance, IName, Property, Specified, Actual)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), - prefix_shorten(Property, SP) + prefix_shorten(Property, SP), + rdf_literal_val_type(IName, InstName, _) }, - [ '~w ~w (~w) . ~w has ~d values but a maximum allowed cardinality of ~d~n'-[ - ST, SI, IName, SP, Actual, Specified] ]. + [ 'CE-105: ~w ~w . ~w has ~d values but a maximum allowed cardinality of ~d~n Domain: ~w~n'-[ + ST, InstName, SP, Actual, Specified, SI ] ]. prolog:message(maybe_restriction(InstType, Instance, IName, Property, Actual)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), - prefix_shorten(Property, SP) + prefix_shorten(Property, SP), + rdf_literal_val_type(IName, InstName, _) }, - [ '~w ~w (~w) . ~w must have only zero or one instance, but has ~d~n'-[ - ST, SI, IName, SP, Actual] ]. + [ 'CE-106: ~w ~w . ~w must have only zero or one instance, but has ~d~n Domain: ~w~n'-[ + ST, InstName, SP, Actual, SI ] ]. prolog:message(invalid_value_in_enum(InstType, Instance, IName, Property, Value, Valid)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), prefix_shorten(Property, SP), prefix_shorten(Value, SV), - maplist(prefix_shorten, Valid, SL) + maplist(prefix_shorten, Valid, SL), + rdf_literal_val_type(IName, InstName, _) }, - [ '~w ~w (~w) . ~w value of ~w is invalid, allowed enumerations: ~w~n'-[ - ST, SI, IName, SP, SV, SL] ]. + [ 'CE-107: ~w ~w . ~w value of ~w is invalid, allowed enumerations: ~w~n Domain: ~w~n'-[ + ST, InstName, SP, SV, SL, SI ] ]. prolog:message(value_outside_range(InstType, Instance, IName, Property, Ty, V, MinV, MaxV)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), @@ -349,44 +362,52 @@ (rdf_equal(xsd:T, Ty) ; T = Ty), (rdf_equal(Val^^Ty, V) ; Val = V), (rdf_equal(Min^^Ty, MinV) ; Min = MinV), - (rdf_equal(Max^^Ty, MaxV) ; Max = MaxV) + (rdf_equal(Max^^Ty, MaxV) ; Max = MaxV), + rdf_literal_val_type(IName, InstName, _) }, - [ '~w, ~w (~w) . ~w value of ~w is outside ~w range [~w .. ~w]~n'-[ - ST, SI, IName, SP, Val, T, Min, Max ] ]. + [ 'CE-108: ~w, ~w . ~w value of ~w is outside ~w range [~w .. ~w]~n Domain: ~w~n'-[ + ST, InstName, SP, Val, T, Min, Max, SI ] ]. prolog:message(multiple_types_for_instance(Instance, Types)) --> { prefix_shorten(Instance, SI), maplist(prefix_shorten, Types, STys) }, - [ 'Instance ~w has multiple types: ~w~n'-[SI, STys] ]. + [ 'CE-109: Instance ~w has multiple types: ~w~n'-[SI, STys] ]. prolog:message(property_value_wrong_type(InstType, Instance, IName, Property, DefType, Val, ValType)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), prefix_shorten(Property, SP), prefix_shorten(DefType, SDTy), prefix_shorten(ValType, SVTy), - prefix_shorten(Val, SV) + prefix_shorten(Val, SV), + rdf_literal_val_type(IName, InstName, _) }, - [ '~w instance property ~w (~w) . ~w of ~w should be a ~w but is a ~w'-[ - ST, SI, IName, SP, SV, SVTy, SDTy ] ]. + [ 'CE-110: ~w instance property ~w . ~w of ~w should be a ~w but is a ~w~n Domain: ~w'-[ + ST, InstName, SP, SV, SVTy, SDTy, SI ] ]. 
prolog:message(property_value_wrong_type_in(InstType, Instance, IName, Property, DefType, Val, ValTypes)) --> { prefix_shorten(Instance, SI), prefix_shorten(InstType, ST), prefix_shorten(Property, SP), prefix_shorten(DefType, SDTy), findall(SVT, (member(VT, ValTypes), prefix_shorten(VT, SVT)), SVTys), - prefix_shorten(Val, SV) + prefix_shorten(Val, SV), + rdf_literal_val_type(IName, InstName, _) + }, + [ 'CE-111: ~w instance property ~w . ~w of ~w should be one of ~w but is a ~w~n Domain: ~w'-[ + ST, InstName, SP, SV, SVTys, SDTy, SI ] ]. +prolog:message(missing_any_tgt(Context, SrcClass, SrcInst, SrcIdent, Rel)) --> + [ 'CE-112-~w: ~w ~w has no ~w target relationships~n Domain: ~w'-[ + Context, SrcClass, SrcIdent, Rel, SrcInst] ]. +prolog:message(missing_tgt(Context, SrcClass, SrcInst, SrcIdent, Rel, TgtClass)) --> + { rdf_literal_val_type(SrcIdent, IdentName, _) }, - [ '~w instance property ~w (~w) . ~w of ~w should be one of ~w but is a ~w'-[ - ST, SI, IName, SP, SV, SVTys, SDTy ] ]. -prolog:message(missing_any_tgt(SrcClass, SrcInst, SrcIdent, Rel)) --> - [ '~w ~w (~w) has no ~w target relationships'-[ - SrcClass, SrcInst, SrcIdent, Rel] ]. -prolog:message(missing_tgt(SrcClass, SrcInst, SrcIdent, Rel, TgtClass)) --> - [ '~w ~w (~w) missing the ~w target of type ~w'-[ - SrcClass, SrcInst, SrcIdent, Rel, TgtClass] ]. + [ 'CE-113-~w: ~w ~w missing the ~w target of type ~w~n Domain: ~w~n'-[ + Context, SrcClass, IdentName, Rel, TgtClass, SrcInst ] ]. prolog:message(invalid_domain(SrcClass, Property, DefinedClass)) --> - [ 'Property ~w was referenced on class ~w, but that property is defined for the unrelated class ~w'-[ + [ 'CE-114: Property ~w was referenced on class ~w, but that property is defined for the unrelated class ~w~n'-[ Property, SrcClass, DefinedClass] ]. prolog:message(invalid_subclass_domain(SrcClass, Property, ParentProperty, DefinedClass)) --> - [ 'Property ~w was referenced on class ~w, but that property is a sub-type of ~w, which is defined for the unrelated class ~w'-[ + [ 'CE-115: Property ~w was referenced on class ~w, but that property is a sub-type of ~w, which is defined for the unrelated class ~w~n'-[ Property, SrcClass, ParentProperty, DefinedClass] ]. +prolog:message(invalid_property_in_check(Context, SrcClass, Property)) --> + [ 'CE-116-~w: INVALID CHECK for ~w property on ~w class!~n'-[ + Context, Property, SrcClass] ]. diff --git a/assist/databin/ar b/assist/databin/ar index f3fa8bb8..68a35b3d 100755 --- a/assist/databin/ar +++ b/assist/databin/ar @@ -33,7 +33,7 @@ fi if (( creating )) ; then outf=${!archive_file_idx} - rackf="$(dirname ${outf})/.$(basename ${outf}).rack" + rackf="$(dirname "${outf}")/.$(basename "${outf}").rack" ( export IFS="," diff --git a/cli/README.md b/cli/README.md index 4ac8bd00..a7a28919 100644 --- a/cli/README.md +++ b/cli/README.md @@ -389,6 +389,9 @@ steps: - manifest: another.yaml - model: model-manifest.yaml - data: data-manifest.yaml + - copygraph: + from-graph: 'http://rack001/data' + to-graph: 'uri://DefaultGraph' ``` The `name` and `description` fields are informational and are used to @@ -401,11 +404,19 @@ using the `--clear` flag. The `steps` section is required. It describes the sequential process of loading this ingestion package. This section must be a list of singleton -maps. Each map should have exactly one key describing which kind of -data should be imported. These keys will point to the same kind of -file as you'd use loading this kind of data individually. 
For example -a `data` section uses the same configuration file as `rack data import` -and a `model` section uses the same configuration file as `rack model import`. +maps. There are currently 5 kinds of steps you can use in a manifest: + +- `manifest` steps take a relative path argument and recursively + import that manifest file. +- `model` steps take a relative path argument and invoke + `rack model import` on that file. +- `nodegroups` steps take a relative path argument and invoke + `rack nodegroups import` on that directory. +- `data` steps take a relative path argument and invoke + `rack data import` on that file. +- `copygraph` steps take a dictionary specifying a `from-graph` URI + and a `to-graph` URI and perform a merge copying triples from the + from graph into the to graph. All file paths are resolved relative to the location of the manifest YAML file. diff --git a/cli/docker_start.sh b/cli/docker_start.sh new file mode 100755 index 00000000..8aba8a07 --- /dev/null +++ b/cli/docker_start.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2086,SC2128,SC2048 + +default_version=v11 + +if [ "$1" == "--help" ] || [ "$1" == "-?" ] || [ "$1" == "help" ] ; then + echo "This tool can be used to (re-)start a RACK docker image (using" + echo "either the docker or podman commands, depending on which is" + echo "installed). The optional command-line argument specifies which" + echo "version of the RACK image to start; the default is ${default_version}" + exit 0 +fi + +set -e -o pipefail + +image="gehighassurance/rack-box:${1:-$default_version}" + +cmd_exists () { type -p $1 > /dev/null 2>&1; } +# shellcheck disable=SC1075 +if cmd_exists podman +then cmd=podman +else if cmd_exists docker + then cmd=docker + else >&2 echo 'Cannot find docker or podman installation!'; exit 1; + fi +fi + +name=$(echo ${image} | cut -d/ -f2 | sed -e so:o-o) + +containerName () { # $1 is image name + grep $1 <(${cmd} container ls $2 --format '{{ .Image}}~{{ .Names}}') | cut -d~ -f2 +} + +mapfile -t existing < <(containerName ${image} -a) + +if containerName ${image} >/dev/null +then echo RACK docker image already running + +elif [ 0 == ${#existing[*]} ] +then ${cmd} run --detach --name ${name} \ + -p 3030:3030 \ + -p 8050:8050 \ + -p 8080:80 \ + -p 12050-12091:12050-12091 \ + ${image} + echo Started RACK container image: ${image} + echo Stop container by typing: + echo " $ ${cmd} container stop $(containerName ${image})" + +elif [ 1 == ${#existing[*]} ] +then echo Restarting stopped RACK container + echo Note: To discard and start a fresh container, type: + echo " $ ${cmd} container stop ${existing}" + echo " $ ${cmd} container rm ${existing}" + ${cmd} start ${existing} +else echo Multiple stopped RACK containers exist: ${existing[*]} + echo Not sure which one to restart.
Remove extras, or manually start one via: + echo " $ ${cmd} container start NAME" + false # this is considered an error result +fi diff --git a/cli/optimize.sh b/cli/optimize.sh index a34da993..03abcbeb 100755 --- a/cli/optimize.sh +++ b/cli/optimize.sh @@ -30,4 +30,5 @@ echo "Now printing the contents of stats.opt:" cat "${STATS}" echo "Restarting Fuseki" +chown fuseki "${STATS}" systemctl start fuseki diff --git a/cli/rack/__init__.py b/cli/rack/__init__.py index 8d1c507f..e7a9efbd 100755 --- a/cli/rack/__init__.py +++ b/cli/rack/__init__.py @@ -20,19 +20,19 @@ import csv from enum import Enum, unique from io import StringIO -import json import logging from os import environ from pathlib import Path import re import sys -from typing import Any, Callable, Dict, List, Optional, NewType, TypeVar, cast +from typing import Any, Callable, Dict, List, Optional, NewType, Set, TypeVar, cast from types import SimpleNamespace +import tempfile +import shutil # library imports -import colorama from colorama import Fore, Style -from jsonschema import ValidationError, validate +from jsonschema import validate from tabulate import tabulate import requests import semtk3 @@ -73,6 +73,7 @@ def __str__(self) -> str: return self.value DEFAULT_BASE_URL: Url = Url("http://localhost") +DEFAULT_OPTIMIZE_URL: Url = Url("http://localhost:8050/optimize") MODEL_GRAPH: Url = Url("http://rack001/model") DEFAULT_DATA_GRAPH = Url("http://rack001/data") @@ -349,13 +350,179 @@ def utility_copygraph_driver(base_url: Url, triple_store: Optional[Url], triple_ semtk3.set_host(base_url) triple_store = triple_store or DEFAULT_TRIPLE_STORE triple_store_type = triple_store_type or DEFAULT_TRIPLE_STORE_TYPE - + @with_status(f'Copying {str_highlight(from_graph)} to {str_highlight(to_graph)}') def go() -> dict: return semtk3.copy_graph(from_graph, to_graph, triple_store, triple_store_type, triple_store, triple_store_type) go() -def ingest_manifest_driver(manifest_path: Path, base_url: Url, triple_store: Optional[Url], triple_store_type: Optional[str], clear: bool, default_graph: bool) -> None: +class IngestionBuilder: + def __init__(self) -> None: + self.fresh: int = 0 + self.model_graphs: Set[str] = set() + self.data_graphs: Set[str] = set() + self.manifests: Set[Path] = set() + + def next_fresh(self) -> int: + result = self.fresh + self.fresh = result + 1 + return result + + def model( + self, + from_path: Path, + to_path: Path, + ) -> None: + with open(from_path, mode='r', encoding='utf-8-sig') as f: + obj = yaml.safe_load(f) + + frombase = from_path.parent + tobase = to_path.parent + + files = obj['files'] + for (i,file) in enumerate(files): + file = Path(file) + shutil.copyfile(frombase.joinpath(file), tobase.joinpath(file.name), follow_symlinks=True) + files[i] = file.name + + c = obj.get('model-graphs') + if c is None: + self.model_graphs.add(str(MODEL_GRAPH)) + elif isinstance(c, str): + self.model_graphs.add(c) + elif isinstance(c, list): + self.model_graphs.update(c) + + with open(to_path, mode='w', encoding='utf-8-sig', newline='\n') as out: + yaml.safe_dump(obj, out) + + def data( + self, + from_path: Path, + to_path: Path, + ) -> None: + with open(from_path, mode='r', encoding='utf-8-sig') as f: + obj = yaml.safe_load(f) + frombase = from_path.parent + tobase = to_path.parent + + for step in obj['ingestion-steps']: + if 'owl' in step: + path = Path(step['owl']) + shutil.copyfile(frombase.joinpath(path), tobase.joinpath(path.name), follow_symlinks=True) + step['owl'] = path.name + if 'csv' in step: + path = 
Path(step['csv']) + shutil.copyfile(frombase.joinpath(path), tobase.joinpath(path.name), follow_symlinks=True) + step['csv'] = path.name + + self.data_graphs.add(obj['data-graph']) + + c = obj.get('extra-data-graphs') + if c is not None: + self.data_graphs.update(c) + + c = obj.get('model-graphs') + if c is not None: + if isinstance(c, str): + self.model_graphs.add(c) + elif isinstance(c, list): + self.model_graphs.update(c) + + with open(to_path, mode='w', encoding='utf-8-sig', newline='\n') as out: + yaml.safe_dump(obj, out) + + def nodegroups( + self, + from_path: Path, + to_path: Path, + ) -> None: + shutil.copyfile(from_path.joinpath('store_data.csv'), to_path.joinpath('store_data.csv'), follow_symlinks=True) + with open(from_path.joinpath('store_data.csv'), 'r') as f: + for row in csv.DictReader(f): + json = row['jsonFile'] + shutil.copyfile(from_path.joinpath(json), to_path.joinpath(json), follow_symlinks=True) + + def manifest( + self, + from_path: Path, + to_path: Path, + ) -> None: + + with open(from_path, mode='r', encoding='utf-8-sig') as f: + obj = yaml.safe_load(f) + + # Handle multiple inclusions of the same manifest to simplify + full_path = from_path.absolute() + if full_path in self.manifests: + print(f'Pruning duplicate manifest {from_path}') + del obj['steps'] + else: + self.manifests.add(full_path) + base_path = from_path.parent + for step in obj.get('steps',[]): + if 'manifest' in step: + path = step['manifest'] + dirname = Path(f'{self.next_fresh():02}_manifest') + subdir = to_path.parent.joinpath(dirname) + topath = subdir.joinpath(Path(path).name) + subdir.mkdir(exist_ok=False) + self.manifest(base_path.joinpath(path), topath) + step['manifest'] = str(dirname.joinpath(Path(path).name)) + elif 'model' in step: + path = step['model'] + dirname = Path(f'{self.next_fresh():02}_model') + subdir = to_path.parent.joinpath(dirname) + topath = subdir.joinpath(Path(path).name) + subdir.mkdir(exist_ok=False) + self.model(base_path.joinpath(path), topath) + step['model'] = str(dirname.joinpath(Path(path).name)) + elif 'data' in step: + path = step['data'] + dirname = Path(f'{self.next_fresh():02}_data') + subdir = to_path.parent.joinpath(dirname) + topath = subdir.joinpath(Path(path).name) + subdir.mkdir(exist_ok=False) + self.data(base_path.joinpath(path), topath) + step['data'] = str(dirname.joinpath(Path(path).name)) + elif 'nodegroups' in step: + path = step['nodegroups'] + dirname = Path(f'{self.next_fresh():02}_nodegroups') + subdir = to_path.parent.joinpath(dirname) + subdir.mkdir(exist_ok=False) + self.nodegroups(base_path.joinpath(path), subdir) + step['nodegroups'] = str(dirname) + + with open(to_path, mode='w', encoding='utf-8-sig', newline='\n') as out: + yaml.safe_dump(obj, out) + +def build_manifest_driver( + manifest_path: Path, + zipfile_path: Path +) -> None: + + with tempfile.TemporaryDirectory() as outdir: + builder = IngestionBuilder() + builder.manifest(manifest_path, Path(outdir).joinpath(f'manifest.yaml')) + shutil.make_archive(str(zipfile_path), 'zip', outdir) + + for x in builder.model_graphs: + print(f'Model graph: {x}') + + for x in builder.data_graphs: + print(f'Data graph: {x}') + + +def ingest_manifest_driver( + manifest_path: Path, + base_url: Url, + triple_store: Optional[Url], + triple_store_type: Optional[str], + clear: bool, + default_graph: bool, + top_level: bool = True, + optimization_url: Optional[Url] = None) -> None: + with open(manifest_path, mode='r', encoding='utf-8-sig') as manifest_file: manifest = Manifest.fromYAML(manifest_file) 
@@ -373,7 +540,7 @@ def ingest_manifest_driver(manifest_path: Path, base_url: Url, triple_store: Opt clear_driver(base_url, modelgraphs, datagraphs, triple_store, triple_store_type, Graph.MODEL) if not datagraphs == []: clear_driver(base_url, modelgraphs, datagraphs, triple_store, triple_store_type, Graph.DATA) - + if not manifest.getNodegroupsFootprint() == []: delete_nodegroups_driver(manifest.getNodegroupsFootprint(), True, True, True, base_url) @@ -396,10 +563,40 @@ def ingest_manifest_driver(manifest_path: Path, base_url: Url, triple_store: Opt store_nodegroups_driver(stepFile, base_url) elif StepType.MANIFEST == step_type: stepFile = base_path / step_data - ingest_manifest_driver(stepFile, base_url, triple_store, triple_store_type, False, default_graph) + ingest_manifest_driver(stepFile, base_url, triple_store, triple_store_type, False, default_graph, False) elif StepType.COPYGRAPH == step_type: utility_copygraph_driver(base_url, triple_store, triple_store_type, step_data[0], step_data[1]) + if top_level: + if manifest.getCopyToDefaultGraph(): + defaultGraph = Url("uri://DefaultGraph") + + if clear: + clear_driver(base_url, [defaultGraph], None, triple_store, triple_store_type, Graph.MODEL) + for graph in manifest.getModelgraphsFootprint(): + utility_copygraph_driver(base_url, triple_store, triple_store_type, graph, defaultGraph) + for graph in manifest.getDatagraphsFootprint(): + utility_copygraph_driver(base_url, triple_store, triple_store_type, graph, defaultGraph) + + if manifest.getPerformEntityResolution(): + @with_status(f'Executing entity resolution') + def go() -> dict: + return semtk3.combine_entities_in_conn(conn=sparql_connection(base_url, [defaultGraph], defaultGraph, [], triple_store, triple_store_type)) + go() + + if manifest.getPerformOptimization(): + invoke_optimization(optimization_url) + +def invoke_optimization(url: Optional[Url]) -> None: + url = url or DEFAULT_OPTIMIZE_URL + @with_status(f'Optimizing triplestore') + def go() -> None: + response = requests.get(str(url)).json() + if not response['success']: + raise Exception(response['message']) + go() + + def ingest_data_driver(config_path: Path, base_url: Url, model_graphs: Optional[List[Url]], data_graphs: Optional[List[Url]], triple_store: Optional[Url], triple_store_type: Optional[str], clear: bool) -> None: """Use an import.yaml file to ingest multiple CSV files into the data graph.""" with open(config_path, mode='r', encoding='utf-8-sig') as config_file: @@ -428,9 +625,9 @@ def ingest_data_driver(config_path: Path, base_url: Url, model_graphs: Optional[ if model_graphs is None: c = config.get('model-graphs') if c is not None: - if type(c) == str: + if isinstance(c, str): model_graphs = [Url(c)] - elif type(c) == list: + elif isinstance(c, list): model_graphs = [Url(x) for x in c] conn = sparql_connection(base_url, model_graphs, data_graph, extra_data_graphs, triple_store, triple_store_type) @@ -498,9 +695,9 @@ def ingest_owl_driver(config_path: Path, base_url: Url, model_graphs: Optional[L if model_graphs is None: c = config.get('model-graphs') if c is not None: - if type(c) == str: + if isinstance(c, str): model_graphs = [Url(c)] - elif type(c) == list: + elif isinstance(c, list): model_graphs = [Url(x) for x in c] conn = sparql_connection(base_url, model_graphs, None, [], triple_store, triple_store_type) @@ -673,7 +870,11 @@ def dispatch_utility_copygraph(args: SimpleNamespace) -> None: def dispatch_manifest_import(args: SimpleNamespace) -> None: """Implementation of manifest import subcommand""" - 
ingest_manifest_driver(Path(args.config), args.base_url, args.triple_store, args.triple_store_type, args.clear, args.default_graph)
+    ingest_manifest_driver(Path(args.config), args.base_url, args.triple_store, args.triple_store_type, args.clear, args.default_graph, True, args.optimize_url)
+
+def dispatch_manifest_build(args: SimpleNamespace) -> None:
+    """Implementation of manifest build subcommand"""
+    build_manifest_driver(Path(args.config), Path(args.zipfile))
 
 def dispatch_data_import(args: SimpleNamespace) -> None:
     """Implementation of the data import subcommand"""
@@ -731,6 +932,7 @@ def get_argument_parser() -> argparse.ArgumentParser:
     manifest_parser = subparsers.add_parser('manifest', help='Ingestion package automation')
     manifest_subparsers = manifest_parser.add_subparsers(dest='command')
     manifest_import_parser = manifest_subparsers.add_parser('import', help='Import ingestion manifest')
+    manifest_build_parser = manifest_subparsers.add_parser('build', help='Build ingestion package zip file')
 
     data_parser = subparsers.add_parser('data', help='Import or export CSV data')
     data_subparsers = data_parser.add_subparsers(dest='command')
@@ -766,8 +968,13 @@ def get_argument_parser() -> argparse.ArgumentParser:
     manifest_import_parser.add_argument('config', type=str, help='Manifest YAML file')
     manifest_import_parser.add_argument('--clear', action='store_true', help='Clear footprint before import')
     manifest_import_parser.add_argument('--default-graph', action='store_true', help='Load whole manifest into default graph')
+    manifest_import_parser.add_argument('--optimize-url', type=str, help='RACK UI optimization endpoint (e.g. http://localhost:8050/optimize)')
     manifest_import_parser.set_defaults(func=dispatch_manifest_import)
 
+    manifest_build_parser.add_argument('config', type=str, help='Manifest YAML file')
+    manifest_build_parser.add_argument('zipfile', type=str, help='Ingestion package output file')
+    manifest_build_parser.set_defaults(func=dispatch_manifest_build)
+
     data_import_parser.add_argument('config', type=str, help='Configuration YAML file')
     data_import_parser.add_argument('--model-graph', type=str, action='append', help='Model graph URL')
     data_import_parser.add_argument('--data-graph', type=str, action='append', help='Data graph URL')
diff --git a/cli/rack/manifest.py b/cli/rack/manifest.py
index deb30dbc..b59ca1cb 100644
--- a/cli/rack/manifest.py
+++ b/cli/rack/manifest.py
@@ -12,6 +12,11 @@
     'properties': {
         'name': {'type': 'string'},
         'description': {'type': 'string'},
+
+        'copy-to-default-graph': {'type': 'boolean'},
+        'perform-entity-resolution': {'type': 'boolean'},
+        'perform-triplestore-optimization': {'type': 'boolean'},
+
         'footprint': {
             'type': 'object',
             'additionalProperties': False,
@@ -97,6 +102,9 @@ def __init__(self, name: str, description: Optional[str] = None) -> None:
         self.datagraphsFootprint: List[Url] = []
         self.nodegroupsFootprint: List[str] = []
         self.steps: List[Tuple[StepType, Any]] = []
+        self.performOptimization: bool = False
+        self.performEntityResolution: bool = False
+        self.copyToDefaultGraph: bool = False
 
     def getName(self) -> str:
         return self.name
@@ -104,6 +112,18 @@ def getName(self) -> str:
     def getDescription(self) -> Optional[str]:
         return self.description
 
+    def getPerformOptimization(self) -> bool:
+        """Return True when this manifest file prescribes running the triplestore optimizer"""
+        return self.performOptimization
+
+    def getPerformEntityResolution(self) -> bool:
+        """Return True when this manifest prescribes running entity resolution"""
+        return
self.performEntityResolution + + def getCopyToDefaultGraph(self) -> bool: + """Return True when this manifest prescribes copying the footprint to the default graph""" + return self.copyToDefaultGraph + def addModelgraphFootprint(self, modelgraph: Url) -> None: self.modelgraphsFootprint.append(modelgraph) @@ -141,6 +161,10 @@ def fromYAML(src: Any) -> 'Manifest': manifest = Manifest(obj.get('name'), obj.get('description')) + manifest.copyToDefaultGraph = obj.get('copy-to-default-graph', False) + manifest.performEntityResolution = obj.get('perform-entity-resolution', False) + manifest.performOptimization = obj.get('perform-triplestore-optimization', False) + footprint = obj.get('footprint', {}) for datagraph in footprint.get('data-graphs', []): manifest.addDatagraphFootprint(Url(datagraph)) diff --git a/cli/requirements.txt b/cli/requirements.txt index a9d908dc..0b02291e 100644 --- a/cli/requirements.txt +++ b/cli/requirements.txt @@ -1,6 +1,6 @@ ase==3.22.1 attrs==21.4.0 -certifi==2022.6.15 +certifi==2022.12.7 chardet==5.0.0 colorama==0.4.5 idna==3.3 @@ -12,7 +12,7 @@ PyYAML==5.4.1 requests==2.28.1 Pillow==9.0.1 plotly==5.9.0 -semtk-python3 @ git+https://github.com/ge-semtk/semtk-python3@b07d2219679e46d588884fc0e4427da5a6f28a1e +semtk-python3 @ git+https://github.com/ge-semtk/semtk-python3@3794a10ba5c2065b145d88f074a7e52028c21cdb six==1.16.0 tabulate==0.8.10 urllib3==1.26.10 diff --git a/manifests/entity_resolution.yaml b/manifests/entity_resolution.yaml new file mode 100644 index 00000000..ad8613ab --- /dev/null +++ b/manifests/entity_resolution.yaml @@ -0,0 +1,17 @@ +name: 'Entity Resolution' +copy-to-default-graph: true +perform-entity-resolution: true +perform-triplestore-optimization: true +footprint: + model-graphs: + - http://rack001/model + data-graphs: + - http://rack001/data + +steps: + - manifest: rack.yaml + - data: ../EntityResolution/TestData/Package-1/import.yaml + - data: ../EntityResolution/TestData/Package-2/import.yaml + - data: ../EntityResolution/TestData/Package-3/import.yaml + - data: ../EntityResolution/TestData/Resolutions-1/import.yaml + - data: ../EntityResolution/TestData/Resolutions-2/import.yaml diff --git a/nodegroups/queries/query dataVer SBVT_Test without REQUIREMENT.json b/nodegroups/queries/query dataVer SBVT_Test without REQUIREMENT.json index 0c0a61a5..39d1900f 100644 --- a/nodegroups/queries/query dataVer SBVT_Test without REQUIREMENT.json +++ b/nodegroups/queries/query dataVer SBVT_Test without REQUIREMENT.json @@ -27,8 +27,9 @@ { "propList": [], "nodeList": [], - "fullURIName": "http://arcos.AH-64D/Boeing#SBVT_Test", - "SparqlID": "?SBVT_Test", + "NodeName": "REQUIREMENT", + "fullURIName": "http://arcos.rack/REQUIREMENTS#REQUIREMENT", + "SparqlID": "?REQUIREMENT", "isReturned": false, "isRuntimeConstrained": false, "valueConstraint": "", @@ -38,11 +39,10 @@ { "propList": [ { - "valueTypes": [ - "string" - ], - "rangeURI": "http://www.w3.org/2001/XMLSchema#string", - "domainURI": "http://arcos.rack/PROV-S#identifier", + "KeyName": "identifier", + "ValueType": "string", + "relationship": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://arcos.rack/PROV-S#identifier", "Constraints": "", "SparqlID": "?identifier", "isReturned": true, @@ -55,7 +55,7 @@ "nodeList": [ { "SnodeSparqlIDs": [ - "?SBVT_Test" + "?REQUIREMENT" ], "OptionalMinus": [ 2 @@ -66,16 +66,17 @@ "DeletionMarkers": [ false ], - "range": [ - "http://arcos.AH-64D/Boeing#SBVT_Test" - ], - "ConnectBy": "confirms", + "KeyName": "verifies", + "ValueType": "ENTITY", + 
"UriValueType": "http://arcos.rack/PROV-S#ENTITY", + "ConnectBy": "verifies", "Connected": true, - "UriConnectBy": "http://arcos.rack/TESTING#confirms" + "UriConnectBy": "http://arcos.rack/TESTING#verifies" } ], - "fullURIName": "http://arcos.AH-64D/Boeing#SBVT_Result", - "SparqlID": "?SBVT_Result", + "NodeName": "SBVT_Test", + "fullURIName": "http://arcos.AH-64D/Boeing#SBVT_Test", + "SparqlID": "?SBVT_Test", "isReturned": false, "isRuntimeConstrained": false, "valueConstraint": "", @@ -84,9 +85,7 @@ } ], "orderBy": [], - "groupBy": [], - "unionHash": {}, - "columnOrder": [] + "unionHash": {} }, "importSpec": { "version": "1", @@ -97,18 +96,18 @@ "transforms": [], "nodes": [ { - "sparqlID": "?SBVT_Result", - "type": "http://arcos.AH-64D/Boeing#SBVT_Result", + "sparqlID": "?SBVT_Test", + "type": "http://arcos.AH-64D/Boeing#SBVT_Test", "mapping": [], "props": [] }, { - "sparqlID": "?SBVT_Test", - "type": "http://arcos.AH-64D/Boeing#SBVT_Test", + "sparqlID": "?REQUIREMENT", + "type": "http://arcos.rack/REQUIREMENTS#REQUIREMENT", "mapping": [], "props": [] } ] }, - "plotSpecs": [] + "plotSpecs": null } \ No newline at end of file diff --git a/rack-box/Docker-Hub-README.md b/rack-box/Docker-Hub-README.md index 8062bbfb..b59a7f27 100644 --- a/rack-box/Docker-Hub-README.md +++ b/rack-box/Docker-Hub-README.md @@ -11,10 +11,16 @@ You may need to increase the resources given to Docker in order to run a RACK bo If you do see these resource settings, make the following changes: 1. Increase the number of CPUs to 4 if you have enough CPUs (2 may be enough if you don't have many CPUs). -2. Increase the amount of Memory to 4.00 GB (or more if you have plenty of RAM). +2. Increase the amount of Memory to 20 GB (16 GB may be enough if you don't have much RAM). 3. Click the Apply & Restart button to restart Docker with the new resource settings. -Now you are ready to start your RACK box. Type the following command to run your RACK box on your computer: +Now you are ready to start your RACK box. If you are running Unix or Mac, you can use the cli command: + +```shell +./cli/docker_start.sh +``` + +Otherwise, type the following command to run your RACK box on your computer: ```shell docker run --detach -p 3030:3030 -p 8050:8050 -p 8080:80 -p 12050-12091:12050-12091 gehighassurance/rack-box:v11 diff --git a/rack-box/GitHub-Release-README.md b/rack-box/GitHub-Release-README.md index f6d73d03..eecc6f75 100644 --- a/rack-box/GitHub-Release-README.md +++ b/rack-box/GitHub-Release-README.md @@ -14,12 +14,12 @@ Here are very brief instructions how to run a RACK box using a Linux container. Here are very brief instructions how to run a RACK box using a virtual machine. You will find more detailed [instructions](https://github.com/ge-high-assurance/RACK/wiki/03-Run-a-RACK-Box-VM) in the RACK Wiki. -1. Download the split VirtualBox zip files. -2. Concatenate the split VirtualBox zip files together. +1. Download the split zip files. +2. Concatenate the split zip files together. 3. Unzip the newly concatenated zip file. 4. Start VirtualBox. -5. Import the VirtualBox VM from the rack-box-virtualbox-v11 folder. -6. Start the VM. +5. Import the virtual machine from the unpacked .ovf file. +6. Start the virtual machine. 7. Visit in your browser to view the RACK box's welcome page. 
--- diff --git a/rack-box/README.md b/rack-box/README.md index 0f5e0c27..7ef8fc0d 100644 --- a/rack-box/README.md +++ b/rack-box/README.md @@ -76,6 +76,10 @@ although we will mention each file here as well: --exclude=.github --exclude=assist --exclude=cli --exclude=rack-box --exclude=tests --exclude=tools RACK`) +- `focal64\*`: Unpack a recent Ubuntu vagrant box here (`curl -LOSfs + https://app.vagrantup.com/ubuntu/boxes/focal64/versions/20221021.0.0/providers/virtualbox.box + && tar -xf virtualbox.box -C RACK/rack-box/focal64`) + Once you have put these files into the `files` subdirectory, skip to [Build the rack-box images](#Build-the-rack-box-images) for the next step. diff --git a/rack-box/http/user-data b/rack-box/http/user-data index 91224323..234b8fbf 100644 --- a/rack-box/http/user-data +++ b/rack-box/http/user-data @@ -27,8 +27,10 @@ autoinstall: - python3 - python3-pip - strace + - sudo - swi-prolog - unzip - vim late-commands: - echo 'ubuntu ALL=(ALL) NOPASSWD:ALL' > /target/etc/sudoers.d/ubuntu + - echo 'rackui ALL=(ALL) NOPASSWD:ALL' > /target/etc/sudoers.d/rackui diff --git a/rack-box/rack-box-hyperv.json b/rack-box/rack-box-hyperv.json index 923f13c0..bca8d82d 100644 --- a/rack-box/rack-box-hyperv.json +++ b/rack-box/rack-box-hyperv.json @@ -1,10 +1,11 @@ { "variables": { + "comment": "Note we used Hyper-V only for 1 week and this template may no longer work", "cpus": "4", "headless": "false", "http_proxy": "{{env `http_proxy`}}", "https_proxy": "{{env `https_proxy`}}", - "memory": "8192", + "memory": "20480", "no_proxy": "{{env `no_proxy`}}", "version": "dev", "vm_name": "rack-box-{{user `version`}}" diff --git a/rack-box/rack-box-virtualbox.json b/rack-box/rack-box-virtualbox.json index 8635ce04..f8bbd1ed 100644 --- a/rack-box/rack-box-virtualbox.json +++ b/rack-box/rack-box-virtualbox.json @@ -4,7 +4,7 @@ "headless": "false", "http_proxy": "{{env `http_proxy`}}", "https_proxy": "{{env `https_proxy`}}", - "memory": "8192", + "memory": "20480", "no_proxy": "{{env `no_proxy`}}", "version": "dev", "vm_name": "rack-box-{{user `version`}}" diff --git a/rack-box/scripts/install.sh b/rack-box/scripts/install.sh index 79079aca..007d9673 100644 --- a/rack-box/scripts/install.sh +++ b/rack-box/scripts/install.sh @@ -28,6 +28,7 @@ apt-get install -yqq \ python3 \ python3-pip \ strace \ + sudo \ swi-prolog \ unzip \ vim @@ -90,6 +91,8 @@ systemctl start fuseki cd /home/"${USER}"/RACK/rack-ui python3 -m pip install -r ./requirements.txt adduser --system --group --no-create-home --disabled-password rackui +usermod -aG sudo rackui +echo "rackui ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/rackui mkdir /etc/rackui chown rackui.rackui /etc/rackui envsubst < rackui.service > /etc/systemd/system/rackui.service @@ -98,6 +101,8 @@ systemctl enable rackui # Initialize SemTK environment variables cd "/home/${USER}/semtk-opensource" +# This lets rackui see and run scripts that live in /home/ubuntu/RACK/cli +chmod 755 "/home/${USER}" chmod 755 ./*.sh export SERVER_ADDRESS=localhost export SERVICE_HOST=localhost @@ -140,6 +145,11 @@ while ! 
curl http://localhost:3030/$/ping &>/dev/null; do sleep 10 done +# Configure Fuseki to time out queries after 5 minutes + +sed -e 's/^ # ja:co/ja:co/' -i /etc/fuseki/config.ttl +sed -e 's/"30000"/"300000"/' -i /etc/fuseki/config.ttl + # Create the RACK dataset curl -Ss -d 'dbName=RACK' -d 'dbType=tdb' 'http://localhost:3030/$/datasets' diff --git a/rack-ui/app.py b/rack-ui/app.py index bdc75565..23b955cc 100644 --- a/rack-ui/app.py +++ b/rack-ui/app.py @@ -4,14 +4,18 @@ import dash from dash import Dash, DiskcacheManager, html, dcc, callback, Input, Output import dash_bootstrap_components as dbc -from pages import home, load, verify +from pages import home, load, verify, utility from pages.helper import * +from flask import Flask +import json +import platform # diskcache for non-production apps when developing locally (fine for our Docker application). Needed for @dash.callback with background=True -cache = diskcache.Cache(get_temp_dir() + "/cache") +cache = diskcache.Cache(f"{get_temp_dir()}/cache") background_callback_manager = DiskcacheManager(cache) -app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP], background_callback_manager=background_callback_manager) +server = Flask(__name__) +app = Dash(server=server, external_stylesheets=[dbc.themes.BOOTSTRAP], background_callback_manager=background_callback_manager) app.title = 'RACK UI' # menu @@ -28,6 +32,7 @@ dbc.NavLink("Home", href="/", active="exact"), dbc.NavLink("Load", href="/load", active="exact"), dbc.NavLink("Verify", href="/verify", active="exact"), + dbc.NavLink("Utility", href="/utility", active="exact"), ], vertical=True, pills=True, ) @@ -49,7 +54,7 @@ ) # validate using this layout (includes components from pages) -app.validation_layout = html.Div([app.layout, load.layout, verify.layout]) +app.validation_layout = html.Div([app.layout, load.layout, verify.layout, utility.layout]) @callback(Output('page-content', 'children'), @@ -61,8 +66,28 @@ def display_page(pathname): return load.layout elif pathname == '/verify': return verify.layout + elif pathname == '/utility': + return utility.layout else: return '404' + +# endpoint to run triplestore optimization script +# (runs as part of the Dash app, alongside but separate from the UI) +@server.route('/optimize') +def optimize(): + try: + if platform.system() == "Windows": + raise Exception("RACK UI triplestore optimization endpoint is not supported on Windows") + command = "sudo ../cli/optimize.sh" + completed_process = run_subprocess(command, get_temp_dir_unique("optimize")) + if completed_process.returncode == 0: + return json.dumps({'success': True}) + else: + raise Exception(f"Optimize script returned exit code {completed_process.returncode}") + except Exception as e: + return json.dumps({'success': False, 'message': get_error_message(e)}) + + if __name__ == '__main__': app.run_server(host="0.0.0.0", debug=False) \ No newline at end of file diff --git a/rack-ui/assets/style.css b/rack-ui/assets/style.css index 6041f017..eecdda2e 100644 --- a/rack-ui/assets/style.css +++ b/rack-ui/assets/style.css @@ -26,11 +26,48 @@ button { /* style for scrolling status area */ div.scrollarea { - margin-top: 100px; + margin-top: 50px; white-space: pre-wrap; border-style: none; - height: 500px; + height: 400px; width: 1200px; overflow-y: auto; display: flex; flex-direction: column-reverse; } + +/* put a space between a checkbox and the text next to it */ +input[type=checkbox] { + margin-right: 10px; +} + + +/* dropdown menus */ +.ddm { + background-color: powderblue; + color: 
black; + border: none; + padding: 0px; +} +/* avoid color change when hovering */ +.ddm:hover { + background-color:powderblue; + color: black; +} + + +/* load options accordion */ +.accordion-button { + padding: 0px; +} +.accordion-button:not(.collapsed) { + background-color: white; + color: black; + box-shadow: none; +} +.accordion-button:is(.collapsed) { + box-shadow: none; +} +.accordion-body { + padding-top: 0px; + padding-bottom: 0px; +} diff --git a/rack-ui/pages/helper.py b/rack-ui/pages/helper.py index 1a26adfc..21384c30 100644 --- a/rack-ui/pages/helper.py +++ b/rack-ui/pages/helper.py @@ -9,6 +9,7 @@ import uuid import semtk3 import rack +import subprocess # configuration BASE_URL = "http://localhost" @@ -23,13 +24,17 @@ def get_temp_dir() -> str: def get_temp_dir_unique(prefix) -> str: """ Get a unique subdirectory within the temp dir, e.g. /tmp/ingest_9d40551e-f31f-4530-8c90-ca3e0acc4257""" - return os.path.join(get_temp_dir(), prefix + "_" + str(uuid.uuid4())) + return os.path.join(get_temp_dir(), f"{prefix}_{uuid.uuid4()}") def get_error_trace(e) -> str: """ Get error trace string """ trace = traceback.format_exception(None, e, e.__traceback__) return trace[-1] +def get_error_message(e) -> str: + """ Get error message """ + return str(e) + def get_trigger(): """ Get the input that triggered a callback @@ -47,3 +52,11 @@ def get_graph_info(): conn_str = rack.sparql_connection(BASE_URL, None, None, [], TRIPLE_STORE, TRIPLE_STORE_TYPE) graph_info_table = semtk3.get_graph_info(conn_str, True, False) # True to exclude internal SemTK graphs, False to get counts too return graph_info_table + +def run_subprocess(command, status_filepath=None): + """ Launch a process using a given command. Pipe output to file if provided """ + if status_filepath is not None: + command = f"{command} > {status_filepath} 2>&1" + completed_process = subprocess.run(command, shell=True, capture_output=True) + print(completed_process) # useful to see exit code + return completed_process \ No newline at end of file diff --git a/rack-ui/pages/home.py b/rack-ui/pages/home.py index 18a474b4..7735d553 100644 --- a/rack-ui/pages/home.py +++ b/rack-ui/pages/home.py @@ -5,17 +5,21 @@ from .helper import * import pandas as pd +# name of default graph +DEFAULT_GRAPH_NAME = "uri://DefaultGraph" + def layout(): """ Provide the layout in a function, so that it is refreshed every time the page is displayed """ # get table with graph names and triple counts df = pd.DataFrame(get_graph_info().get_pandas_data()) df.rename(columns={'graph': 'Graph', 'triples': '# Triples'}, inplace=True) # rename columns for display + df = df.replace(DEFAULT_GRAPH_NAME,'Optimized graph') - layout = html.Div(children=[ + layout = dbc.Spinner(html.Div(children=[ html.H2('Welcome to RACK.'), dcc.Markdown('Current graphs in RACK:', style={"margin-top": "50px"}), dbc.Table.from_dataframe(df, color="primary", bordered=True, size="sm", style={"width": "auto"}), - ]) + ])) return layout diff --git a/rack-ui/pages/load.py b/rack-ui/pages/load.py index f251c7bd..cde90b6d 100644 --- a/rack-ui/pages/load.py +++ b/rack-ui/pages/load.py @@ -18,18 +18,38 @@ # name of default manifest file within ingestion package MANIFEST_FILE_NAME = "manifest.yaml" -# div showing load details/options and load/cancel buttons -load_div = html.Div( +# display strings +CLEAR_BEFORE_LOADING_STR = "Clear before loading" + +# div showing load details and buttons to load data or open SPARQLgraph +load_div = dbc.Spinner(html.Div( [ + # package metadata (from manifest) 
dcc.Markdown("", id="load-div-message"), - dcc.RadioItems([], value="manifest-graphs", id="load-graph-radio", labelStyle={'display': 'block'}, inputStyle={"margin-right": "10px"}), # choose to load to manifest-specified or default graphs - html.Button("Load", id="load-button", n_clicks=0), # load button - html.Button("Cancel", id="cancel-load-button", n_clicks=0) # cancel button + + # load options + dbc.Accordion([ + dbc.AccordionItem( + dcc.Checklist([CLEAR_BEFORE_LOADING_STR], [CLEAR_BEFORE_LOADING_STR], id="load-options-checklist"), + title="Options")], + start_collapsed=True, flush=True, style={"width": "250px"}), + + # load/view buttons + dbc.Row([ + dbc.Col([html.Button("Load data", id="load-button", n_clicks=0)], width="auto"), # load button + dbc.Tooltip("Load the above data into RACK", target="load-button"), + dbc.Col(dbc.DropdownMenu([ + dbc.DropdownMenuItem("Target graphs", href="", target="_blank", id="sparqlgraph-button"), + dbc.DropdownMenuItem("Optimized graph", href="", target="_blank", id="sparqlgraph-default-button") + ], id="view-dropdown", label="View data", toggle_class_name="ddm"), width="auto"), + dbc.Tooltip("After loading, view data in SPARQLgraph", target="view-dropdown") + ]) + ], id="load-div", hidden=True, style={"margin-top": "50px"}, -) +)) # dialog indicating unzip error (e.g. no manifest) unzip_error_dialog = dbc.Modal( @@ -57,10 +77,12 @@ layout = html.Div([ html.H2("Load data"), dcc.Markdown("_Load data into RACK_"), - html.Button(id="turnstile-button", children="Load Turnstile data"), # button to load turnstile - dbc.Tooltip("Load the Turnstile sample data provided with RACK", target="turnstile-button"), - dcc.Upload(html.Button(id="select-button", children="Load ingestion package"), id='select-button-upload', accept=".zip", multiple=False), # button to show upload dialog to pick ingestion package - dbc.Tooltip("Load an ingestion package (in .zip format) from your local machine", target="select-button"), + dbc.Row([ + dbc.Col(html.Button(id="turnstile-button", children="Select Turnstile data"), width="auto"), # button to load turnstile + dbc.Col(dcc.Upload(html.Button(id="select-button", children="Select ingestion package"), id='select-button-upload', accept=".zip", multiple=False), width="auto") # button to show upload dialog to pick ingestion package + ]), + dbc.Tooltip("Select the Turnstile sample data provided with RACK", target="turnstile-button"), + dbc.Tooltip("Select an ingestion package (in .zip format) from your local machine", target="select-button"), load_div, html.Div(id="status-div", className="scrollarea"), # displays ingestion status unzip_error_dialog, @@ -74,9 +96,10 @@ @dash.callback( output=[ - Output("load-div-message", "children"), - Output("load-graph-radio", "options"), - Output("manifest-filepath", "data"), + Output("load-div-message", "children"), # package information to display to the user before confirming load + Output("sparqlgraph-button", "href"), # set the SG button link + Output("sparqlgraph-default-button", "href"), # set the SG button link (default graph) + Output("manifest-filepath", "data"), Output("unzip-error-dialog-body", "children"), Output("status-filepath", "data"), # store a status file path Output("select-button-upload", "contents")], # set to None after extracting, else callback ignores re-uploaded file @@ -87,6 +110,7 @@ running=[ (Output("select-button", "disabled"), True, False), # disable the button while running (Output("turnstile-button", "disabled"), True, False), # disable the button while running + 
(Output("load-button", "disabled"), True, False), # disable the button while running ], prevent_initial_call=True ) @@ -100,29 +124,39 @@ def run_unzip(zip_file_contents, turnstile_clicks): zip_str = io.BytesIO(base64.b64decode(zip_file_contents.split(',')[1])) zip_obj = ZipFile(zip_str, 'r') zip_obj.extractall(path=tmp_dir) # unzip the package - manifest_paths = glob.glob(tmp_dir + '/**/' + MANIFEST_FILE_NAME, recursive=True) + manifest_paths = glob.glob(f"{tmp_dir}/**/{MANIFEST_FILE_NAME}", recursive=True) if len(manifest_paths) == 0: - raise Exception("Cannot load ingestion package: does not contain manifest file " + MANIFEST_FILE_NAME) + raise Exception(f"Cannot load ingestion package: does not contain manifest file {MANIFEST_FILE_NAME}") if len(manifest_paths) > 1: - raise Exception("Cannot load ingestion package: contains multiple default manifest files: " + str(manifests)) + raise Exception(f"Cannot load ingestion package: contains multiple default manifest files: {manifests}") manifest_path = manifest_paths[0] else: manifest_path = "../Turnstile-Example/Turnstile-IngestionPackage/manifest.yaml" - manifest = get_manifest(manifest_path) - manifest_graphs_option = "Load to " + str(manifest.getModelgraphsFootprint()) + " " + str(manifest.getDatagraphsFootprint()) - radio_choices = [{'label': manifest_graphs_option, 'value': 'manifest-graphs'}, {'label': 'Load to default graph (for optimized performance)', 'value': 'default-graph'}] + + # generate SPARQLgraph link + sg_link = semtk3.get_sparqlgraph_url(SPARQLGRAPH_BASE_URL, conn_json_str=manifest.getConnection()) + sg_link_default = semtk3.get_sparqlgraph_url(SPARQLGRAPH_BASE_URL, conn_json_str=manifest.getDefaultGraphConnection()) + + # gather displayable information about the package + package_description = "" + if manifest.getDescription() != None and manifest.getDescription().strip() != '': + package_description = f"({manifest.getDescription()})" + additional_actions = [] + if manifest.getCopyToDefaultGraph(): additional_actions.append("copy to optimized graph") + if manifest.getPerformEntityResolution(): additional_actions.append("resolve entities") + if manifest.getPerformOptimization(): additional_actions.append("optimize triple store") + package_info = f"Data: `{manifest.getName()} {package_description}` \n" + \ + f"Target model graphs: `{', '.join(manifest.getModelgraphsFootprint())}` \n" + \ + f"Target data graphs: `{', '.join(manifest.getDatagraphsFootprint())}` \n" + \ + f"Additional actions: `{', '.join(additional_actions) if len(additional_actions) > 0 else 'None'}`" # generate a file in which to capture the ingestion status status_filepath = get_temp_dir_unique("output") - selected_message = "You have selected package '" + manifest.getName() + "'" - if manifest.getDescription() != None and manifest.getDescription().strip() != "": - selected_message = selected_message + " (" + manifest.getDescription() + ")" - except Exception as e: - return "", [], None, get_error_trace(e), None, None - return selected_message, radio_choices, manifest_path, None, status_filepath, None + return "", None, None, None, get_error_trace(e), None, None + return package_info, sg_link, sg_link_default, manifest_path, None, status_filepath, None @dash.callback( @@ -130,18 +164,19 @@ def run_unzip(zip_file_contents, turnstile_clicks): Output("last-loaded-graphs", "data")], # remember graphs loaded (used in the Verify tab) NOTE this Store is from app.py layout - using it here disables prevent_initial_call=True inputs=Input("load-button", "n_clicks"), 
# triggered by user clicking load button state=[ - State("load-graph-radio", "value"), # load to manifest or default graphs State("status-filepath", "data"), - State("manifest-filepath", "data")], + State("manifest-filepath", "data"), + State("load-options-checklist", "value")], # user-selected load options from the checklist background=True, # background callback running=[ (Output("select-button", "disabled"), True, False), # disable button while running (Output("turnstile-button", "disabled"), True, False), # disable button while running + (Output("load-button", "disabled"), True, False), # disable button while running (Output("status-interval", "disabled"), False, True) # enable the interval component while running ], prevent_initial_call=True # NOTE won't work because last-loaded-graphs is in the layout before load-button (see https://dash.plotly.com/advanced-callbacks#prevent-callback-execution-upon-initial-component-render) ) -def run_ingest(load_button_clicks, manifest_or_default_graphs, status_filepath, manifest_filepath): +def run_ingest(load_button_clicks, status_filepath, manifest_filepath, load_options): """ Ingest the selected zip file """ @@ -149,33 +184,26 @@ def run_ingest(load_button_clicks, manifest_or_default_graphs, status_filepath, if load_button_clicks == 0: raise dash.exceptions.PreventUpdate + clear = (CLEAR_BEFORE_LOADING_STR in load_options) # clear the graph before loading (or not), depending on UI checkbox selection + try: # avoid a ConnectionError if SemTK services are not fully up yet if semtk3.check_services() == False: raise Exception("Cannot reach SemTK Services (wait for startup to complete, or check for failures)") - use_default_graph = (manifest_or_default_graphs == "default-graph") - f = open(status_filepath, "a") with redirect_stdout(f), redirect_stderr(f): # send command output to temporary file rack.logger.setLevel("ERROR") - rack.ingest_manifest_driver(Path(manifest_filepath), BASE_URL, TRIPLE_STORE, TRIPLE_STORE_TYPE, True, use_default_graph) # process the manifest - - # get connection from manifest, construct SPARQLGraph URL - manifest = get_manifest(manifest_filepath) - if use_default_graph: - conn_str = manifest.getDefaultGraphConnection() - else: - conn_str = manifest.getConnection() - sparqlgraph_url_str = semtk3.get_sparqlgraph_url(SPARQLGRAPH_BASE_URL, conn_json_str=conn_str) + rack.ingest_manifest_driver(Path(manifest_filepath), BASE_URL, TRIPLE_STORE, TRIPLE_STORE_TYPE, clear, False) # process the manifest # store list of loaded graphs + manifest = get_manifest(manifest_filepath) last_loaded_graphs = manifest.getModelgraphsFootprint() + manifest.getDatagraphsFootprint() time.sleep(1) except Exception as e: return get_error_trace(e), [] # show done dialog with error - return [dcc.Markdown("Loaded ingestion package."), html.A("Open in SPARQLGraph UI", href=sparqlgraph_url_str, target="_blank", style={"margin-top": "100px"})], last_loaded_graphs + return [dcc.Markdown("Data was loaded successfully.")], last_loaded_graphs @callback(Output("status-div", "children"), @@ -198,19 +226,16 @@ def update_status(n, status_filepath): ####### simple callbacks to show/hide components ####### @callback(Output("load-div", "hidden"), - Input("load-graph-radio", "options"), # triggered by setting load graph radio options + Input("load-div-message", "children"), Input("load-button", "n_clicks"), - Input("cancel-load-button", "n_clicks"), prevent_initial_call=True ) -def manage_load_div(radio_options, load_clicks, cancel_clicks): +def 
manage_load_div(load_message, load_clicks): """ Show or hide the load div """ - if (get_trigger() in ["load-button.n_clicks", "cancel-load-button.n_clicks"]): - return True # load or cancel button pressed, hide div - elif radio_options == []: - return True # no radio options provided, don't show div + if len(load_message) > 0: + return False # show the div else: - return False # radio options provided, show div + return True # hide the div @callback(Output("unzip-error-dialog", "is_open"), Input("unzip-error-dialog-body", "children"), diff --git a/rack-ui/pages/utility.py b/rack-ui/pages/utility.py new file mode 100644 index 00000000..e5ac5df2 --- /dev/null +++ b/rack-ui/pages/utility.py @@ -0,0 +1,72 @@ +""" Content for the utility page """ + +import time +import platform +from dash import html, dcc, callback, Input, Output, State +import dash_bootstrap_components as dbc +from .helper import * + + +# dialog confirming triple store restart done +restart_done_dialog = dbc.Spinner(dbc.Modal( + [ + dbc.ModalBody("MESSAGE PLACEHOLDER", id="restart-done-dialog-body"), # message + dbc.ModalFooter([ + html.Button("Close", id="restart-done-button", n_clicks=0) # close button + ]), + ], + id="restart-done-dialog", + is_open=False, + backdrop=False, +)) + +# page elements +layout = html.Div([ + html.H2('RACK Utilities'), + dcc.Markdown("_Utility functions for RACK_"), + html.Button("Restart triple store", id="restart-button", n_clicks=0), + restart_done_dialog +]) + +####### callbacks ###################################### + +@dash.callback( + output=Output("restart-done-dialog-body", "children"), + inputs=Input("restart-button", "n_clicks"), # triggered by clicking restart button + background=True, # background callback + running=[ + (Output("restart-button", "disabled"), True, False), # disable the button while running + ], + prevent_initial_call=True +) +def run_restart(n_clicks): + """ + Restart the triple store + """ + try: + # determine if we can restart fuseki + if run_subprocess("systemctl is-enabled fuseki").returncode != 0: + raise Exception("Triple store restart not supported in this deployment") + + # restart fuseki + completed_process = run_subprocess("sudo systemctl restart fuseki", get_temp_dir_unique("restart-fuseki")) + if completed_process.returncode == 0: + return dcc.Markdown("Restarted the triple store.") + else: + raise Exception("Error restarting the triple store") + except Exception as e: + return get_error_trace(e) # show done dialog with error + +####### simple callbacks to show/hide components ####### + +@callback(Output("restart-done-dialog", "is_open"), + Input("restart-done-dialog-body", "children"), + Input("restart-done-button", "n_clicks"), + prevent_initial_call=True + ) +def manage_restart_done_dialog(children, n_clicks): + """ Show or hide the done dialog after restarting triple store """ + if (get_trigger() == "restart-done-button.n_clicks"): + return False # button pressed, hide the dialog + else: + return True # child added, show the dialog diff --git a/rack-ui/pages/verify.py b/rack-ui/pages/verify.py index fe629a25..f8b0435e 100644 --- a/rack-ui/pages/verify.py +++ b/rack-ui/pages/verify.py @@ -2,12 +2,15 @@ import time import platform -import subprocess from dash import html, dcc, callback, Input, Output, State import dash_bootstrap_components as dbc import semtk3 +import json from .helper import * +# name of default graph +DEFAULT_GRAPH_NAME = "uri://DefaultGraph" + # dialog confirming ASSIST verification done verify_assist_done_dialog = dbc.Modal( [ @@ 
-24,17 +27,19 @@ ) # div showing graphs list -verify_report_options_div = html.Div( +verify_report_options_div = dbc.Spinner(html.Div( [ dcc.Markdown("Select graphs to include in report:"), - dcc.Checklist([], [], id="verify-graph-checklist", labelStyle={'display': 'block'}, inputStyle={"margin-right": "10px"}), # choose which graphs to verify - html.Button("Continue", id="verify-report-continue-button", n_clicks=0), # button to open SPARQLgraph report - html.Button("Cancel", id="verify-report-cancel-button", n_clicks=0) # button to cancel + dcc.Checklist([], [], id="verify-graph-checklist", labelStyle={'display': 'block'}), # choose which graphs to verify + dbc.Row([ + dbc.Col(html.Button("Continue", id="verify-report-continue-button", n_clicks=0), width="auto"), # button to open SPARQLgraph report + dbc.Col(html.Button("Cancel", id="verify-report-cancel-button", n_clicks=0), width="auto") # button to cancel + ]) ], id="verify-report-options-div", hidden=True, style={"margin-top": "50px"}, -) +)) # dialog indicating an error generating the SPARQLgraph report (e.g. no graphs selected) verify_report_error_dialog = dbc.Modal( @@ -53,9 +58,11 @@ layout = html.Div([ html.H2('Verify Data'), dcc.Markdown("_Run verification routines on the data loaded in RACK_"), - html.Button("Verify using ASSIST", id="verify-assist-button", n_clicks=0), # button to verify using ASSIST + dbc.Row([ + dbc.Col(html.Button("Verify using ASSIST", id="verify-assist-button", n_clicks=0), width="auto"), # button to verify using ASSIST + dbc.Col(html.Button("Verify using report", id="verify-report-button"), width="auto") # button to verify using SPARQLgraph report + ]), dbc.Tooltip("Run the ASSIST tool and download an error report", target="verify-assist-button"), - html.Button("Verify using report", id="verify-report-button"), # button to verify using SPARQLgraph report dbc.Tooltip("Open SPARQLgraph and run data verification report on selected graphs", target="verify-report-button"), verify_report_options_div, html.Div(id="assist-status-div", className="scrollarea"), # displays status @@ -108,8 +115,8 @@ def run_assist(status_filepath): if platform.system() == "Windows": raise Exception("Not yet supported on Windows. (PROLOG checking is available through LINUX/Docker.)") else: - # runs on all graphs in the triple store, minus an exclusion list of internal SemTK graphs (e.g. demo data) - subprocess.call("../assist/bin/check -v -m " + TRIPLE_STORE_BASE_URL + "/ > " + status_filepath + " 2>&1", shell=True) + command = f"../assist/bin/check -v -m {TRIPLE_STORE_BASE_URL}/" # ASSIST tool. 
Runs on all graphs, minus exclusion list of internal SemTK graphs + run_subprocess(command, status_filepath) # TODO returns error code 1 even though seems successful time.sleep(1) return [dcc.Markdown("Completed ASSIST verification.")], False @@ -172,7 +179,9 @@ def show_report_options(button_clicks, last_loaded_graphs): Show list of graphs for verification report, with the last loaded graphs pre-selected """ # get list of graphs populated in the triple store - graphs_list = get_graph_info().get_column(0) + graphs_list_values = get_graph_info().get_column(0) # list of graphs, including uri://DefaultGraph + graphs_list_labels = list(map(lambda x: x.replace(DEFAULT_GRAPH_NAME, 'Optimized graph'), graphs_list_values.copy())) # display default graph as "Optimized graph" + graphs_list = [{'label': label, 'value': val} for label, val in zip(graphs_list_labels, graphs_list_values)] # these are the graphs last loaded - check the checkboxes for these if last_loaded_graphs == None: