<a href="https://colab.research.google.com/github/IanCostello/tools/blob/ValidationTool/import-validation-helper/ImportValidatorMaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Validation Helper
This Colab notebook introduces a few tools to check your template MCF, StatVars, and CSV. 

A summary of features is as follows.

*   MCF format checking (no improperly defined nodes).
*   StatVar reference checking (makes sure that all references either exist locally or in the knowledge graph).
*   TMCF and CSV column validation.
*   Description spell checking.
*   ASCII encoding checking.

### Usage summary:
1.   Runtime -> Run All
2.   Authenticate with BigQuery in second cell
3.   Scroll to bottom, select three files to validate from your local computer.


# 1) At the top of the page, go to "Runtime -> Run All".

In [None]:
import re
import pandas as pd
!pip install --upgrade -q pyspellchecker
!pip install --upgrade -q pygsheets

from spellchecker import SpellChecker
import subprocess

from google.colab import auth
from google.cloud import bigquery
import gspread
from oauth2client.client import GoogleCredentials


# 2) Authenticate BQ here.
BigQuery is used to check the references in your files against the knowledge graph (KG).

In [None]:
#@title Do you have BigQuery Access? (Internal Googler)
# Set to False to skip authentication; the reference check in
# TriplesChecks.ensure_all_references_exist returns early when this is False.
bq_access = True

if bq_access:
  # Authenticate this Colab session before the BigQuery client is created in
  # a later cell.
  auth.authenticate_user()

## Helper Functions

In [None]:
# Setup BQ client
# `client` stays None when BigQuery access is disabled.
client = None
if bq_access:
  project_id = "google.com:datcom-store-dev"
  client = bigquery.Client(project=project_id)

# Setup logging
# NOTE: the gspread authorization below is commented out, so nothing is
# logged from this cell.
# gc = gspread.authorize(GoogleCredentials.get_application_default())

# Enum definition
from enum import Enum
class PrecheckError(Enum):
  """Severity level carried as the first element of every check error tuple.

  The string value is what validate_prechecks prints in front of the message.
  """
  CRITICAL = "Critical"
  WARN = "Warn"

# Helpers
# Memoizes validateNodeStructure results, keyed by a hash of the file contents.
cache = {}

def validateNodeStructure(mcf_contents):
  """Parses MCF text into nodes and collects malformed lines.

  Nodes are separated by one or more blank (empty or whitespace-only) lines.
  Every other line must look like 'property: constraint'. Windows (CRLF) and
  bare CR line endings are accepted. Results are memoized in the module-level
  `cache`, so identical contents are only parsed once.

  Args:
    mcf_contents: Full text of an MCF or TMCF file.

  Returns:
    Tuple (mcf_nodes, errors). mcf_nodes is a list of dicts mapping property
    to constraint, one per node that has at least one well-formed line.
    errors is a list of (PrecheckError, error name, message) tuples, one per
    malformed line.
  """
  # See if these contents have been processed already
  hash_of_contents = "validateNodeStructure_" + str(hash(mcf_contents))
  if hash_of_contents in cache:
    return cache[hash_of_contents]

  # Lines in an MCF file are separated as property: constraint
  mcf_line = re.compile(r"^(\w+): (.*)$")

  mcf_nodes = []
  errors = []
  current_mcf_node = {}

  # Normalize line endings first: with CRLF input a literal "\n\n" never
  # occurs, which would merge every node into one and flag each "\r" line.
  lines = mcf_contents.replace("\r\n", "\n").replace("\r", "\n").split("\n")

  for line in lines:
    # A blank line closes the node being built; runs of blank lines are fine.
    if not line.strip():
      if current_mcf_node:
        mcf_nodes.append(current_mcf_node)
        current_mcf_node = {}
      continue

    parsed_line = mcf_line.match(line)

    if parsed_line is None:
      errors.append((PrecheckError.CRITICAL, "MalformedLine", f"Malformed MCF Line '{line}'"))
    else:
      # Property = Constraint
      current_mcf_node[parsed_line.group(1)] = parsed_line.group(2)

  # The last node is not necessarily followed by a blank line.
  if current_mcf_node:
    mcf_nodes.append(current_mcf_node)

  # Add to cache
  cache[hash_of_contents] = (mcf_nodes, errors)

  return mcf_nodes, errors
  
def get_nodes_with_property(mcf_contents, prop, constraint):
  """Returns the parsed nodes whose `prop` value equals `constraint` exactly.

  Args:
    mcf_contents: Full text of an MCF or TMCF file.
    prop: Property name to look up on each node.
    constraint: Value the property must be equal to.

  Returns:
    List of node dicts (property -> constraint), in file order.
  """
  # Malformed-line errors are not needed here, only the parsed nodes.
  parsed_nodes, _ = validateNodeStructure(mcf_contents)
  return [
      candidate for candidate in parsed_nodes
      if prop in candidate and candidate[prop] == constraint
  ]

def remove_prefix(s):
  """Strips whitespace and one leading 'dcs:', 'dcid:' or 'schema:' prefix.

  Only a single prefix is removed, so 'dcs:dcid:X' becomes 'dcid:X'. Strings
  without a known prefix are returned stripped but otherwise untouched.
  """
  trimmed = s.strip()
  for prefix in ('dcs:', 'dcid:', 'schema:'):
    if trimmed.startswith(prefix):
      return trimmed[len(prefix):]
  return trimmed

def cmp_nodes(n1, n2):
  """Returns True when both ids are equal once remove_prefix() is applied."""
  left = remove_prefix(n1)
  right = remove_prefix(n2)
  return left == right

def get_newly_defined_nodes(mcf_contents, typeOf = ""):
  """Returns the ids of the nodes defined in the given MCF text.

  A node counts as defined when it has both a 'Node' and a 'typeOf' property.

  Args:
    mcf_contents: Full text of an MCF file.
    typeOf: When non-empty, only nodes whose typeOf (prefix removed) equals
      this value are returned.

  Returns:
    List of node ids with surrounding whitespace and any leading namespace
    prefix removed, in file order.
  """
  mcf_nodes, _ = validateNodeStructure(mcf_contents)

  # Use the shared helper rather than chained str.replace() calls: replace()
  # removes 'dcs:'/'dcid:' anywhere inside the id and leaves whitespace behind.
  return [
      remove_prefix(node['Node'])
      for node in mcf_nodes
      if "Node" in node and "typeOf" in node
      and (typeOf == "" or typeOf == remove_prefix(node['typeOf']))
  ]


## Definition of Tests

In [None]:
class TriplesChecks():
  """Defines the various tests that run on the combined contents of TMCF,
  uploaded csv, and statistical variable files.

  validate_prechecks discovers every function defined on this class with
  inspect.getmembers and calls it as a plain function, so the tests take no
  `self`, and helper functions must not be added here (they would be run as
  tests too).

  To add a test: Make a new method with the following args.
    Args:
      df -> Dataframe of uploaded CSV
      tmcf_contents -> String of TMCF text content
      stat_vars_content -> String of Statistical Variables file
    Yields:
      Yields tuple of the precheck error level enum, error name, and an error message
  """

  def ensure_ascii(_, tmcf_contents, stat_vars_content):
    """Checks to ensure that files contents are solely ascii characters."""
    # '*' rather than '+': an empty file contains no non-ascii characters.
    ascii_only = re.compile(r"[\x00-\x7F]*")

    for file_name, contents in \
        [("TMCF", tmcf_contents), ("Statistical Variables", stat_vars_content)]:

      if ascii_only.fullmatch(contents) is None:
        yield (PrecheckError.CRITICAL, "NonAsciiInFile",
          f"{file_name} file contains non-ascii characters.")

  def tmcf_csv_column_checks(df, tmcf_contents, stat_vars_content):
    """Handles column inconsistencies between tmcf and csv.

    Yields a critical error for every TMCF column reference (C:Table->Column)
    missing from the CSV, and a warning for every CSV column the TMCF never
    references.
    """
    column_matches = re.compile(r"C:\w+->(\w+)")
    tmcf_columns = column_matches.findall(tmcf_contents)
    csv_columns = df.columns
    for column in tmcf_columns:
      if column not in csv_columns:
        yield (PrecheckError.CRITICAL, "ColInTMCFMissingFromCSV",
          f"Referenced column {column} in TMCF not found in CSV.")

    # Set lookup for the reverse direction; the list above keeps file order.
    referenced_columns = set(tmcf_columns)
    for column in csv_columns:
      if column not in referenced_columns:
        yield (PrecheckError.WARN, "UnusedColumnPresent",
          f"Unused column {column} present in CSV.")

  def ensure_mcf_not_malformed(_, tmcf_contents, stat_vars_content):
    """Ensures lines of MCF files are properly defined.
    Passes: Node: E:WorldBank->E0
    Fails: Node E:WorldBank->E0
    """
    # Grab error field of tuple
    yield from validateNodeStructure(tmcf_contents)[1]
    yield from validateNodeStructure(stat_vars_content)[1]

  def ensure_nodes_properly_referenced(_, tmcf_contents, stat_vars_content):
    """Ensures that constraint field of mcf files are references or constants.

    All patterns are applied with fullmatch(). The previous '^a|b|c$' form
    only anchored the first and last alternatives, so a value merely had to
    start with something valid (e.g. '5abc' passed as an integer).
    """
    tmcf_nodes, _ = validateNodeStructure(tmcf_contents)
    stat_var_nodes, _ = validateNodeStructure(stat_vars_content)

    # Ensure that each property is a string, integer, boolean, tmcf reference, or schema reference
    tmcf_match = re.compile(
        r'"[^"]+"|E:\w+->E\d+|C:\w+->\w+|\d+|True|False'
        r'|(?:dcs|dcid|schema):\w+')

    # Ensure that each property is a string, integer, boolean, schema reference
    # (or comma separated list of them), or quantity range
    stvr_match = re.compile(
        r'"[^"]+"|\d+|True|False'
        r'|(?:(?:dcs|dcid|schema):[A-Za-z0-9_\-/]+,? ?)+'
        r'|\[\S+ (?:\d+|\d+ \d+|\d+ \+|- \d+)\]')
    tmcf_node_match = re.compile(r"E:\S+->E\d+")
    stvr_node_match = re.compile(r"dcid:\S+")

    for node_list, node_prop_regex, property_regex in \
        [(tmcf_nodes, tmcf_node_match, tmcf_match),
         (stat_var_nodes, stvr_node_match, stvr_match)]:
      for node in node_list:
        for prop, constraint in node.items():
          if prop == "Node":
            if node_prop_regex.fullmatch(constraint) is None:
              yield (PrecheckError.CRITICAL, "MalformedNode",
                     f"Malformed Node Property '{prop}: {constraint}'")

          # Validate properties of TMCF
          elif prop[0].islower():
            # Trailing whitespace after a value is harmless; ignore it.
            if property_regex.fullmatch(constraint.strip()) is None:
              yield (PrecheckError.WARN, "MisformedReference",
                     f"Misformed Reference: '{prop}: {constraint}'")

          # All properties besides Node should be lower case
          else:
            yield (PrecheckError.WARN, "LowerProperties",
                f"All MCF Properties besides Node should be lowercase. Triggered for '{prop}'.")

  def spell_check_descriptions(_, tmcf_contents, stat_vars_content):
    """Provides spell checking on all description fields.

    Yields at most one warning per file, listing every word of its quoted
    description values that the spell checker does not know.
    """
    description_field_parser = re.compile(r'description: "([^"]*)"')
    spell = SpellChecker()
    sets_to_check = [("TMCF", tmcf_contents),
                      ("Statistical Variables", stat_vars_content)]

    for set_name, text in sets_to_check:
      potential_mispellings = set()
      for description in description_field_parser.findall(text):
        potential_mispellings |= spell.unknown(spell.split_words(description))

      if potential_mispellings:
        # Sorted so the report is stable between runs.
        yield (PrecheckError.WARN, "Misspelling",
            f"Potential Misspelling(s) in {set_name}: {sorted(potential_mispellings)}")

  def ensure_all_references_exist(df, tmcf_contents, stat_vars_content):
    """Warns about references that exist neither locally nor in the KG.

    Collects every dcs:/dcid: reference in the statistical variables file,
    drops the ones defined in that same file, and looks the rest up in
    BigQuery. Skipped entirely when `bq_access` is False.
    """
    if not bq_access:
      return

    # Get locally defined instances.
    new_references = set(get_newly_defined_nodes(stat_vars_content))

    # Get all referenced instances in stat vars. Commas are excluded so that
    # 'dcs:A, dcs:B' yields 'A' and 'B' rather than 'A,' and 'B'.
    ref_finder = re.compile(r"(?:dcs|dcid):([^\s,]+)")
    references = set(ref_finder.findall(stat_vars_content))

    # Get all references that are not locally defined
    global_references = sorted(references - new_references)
    if not global_references:
      # Nothing to look up; avoids sending a query with an empty id list.
      return

    # Query database. The ids are passed as a query parameter instead of
    # being pasted into the SQL text, so quoting inside an id cannot break
    # or alter the query.
    instance_query = """
    SELECT distinct id
    FROM `google.com:datcom-store-dev.dc_v3_clustered.Instance`
    WHERE id IN UNNEST(@ids)
    """
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("ids", "STRING", global_references)])
    obj_instances = set(
        client.query(instance_query, job_config=job_config).to_dataframe()['id'])

    missing_references = [
        ref for ref in global_references if ref not in obj_instances]

    if missing_references:
      yield (PrecheckError.WARN, "UndefinedReference",
                f"Potential Undefined References: {missing_references}")

  def ensure_all_statvars_defined(df, tmcf_contents, stat_vars_content):
    """Ensures every variableMeasured in the TMCF is defined in the MCF.

    Handles both direct references (variableMeasured: Name) and indirect ones
    through a CSV column (variableMeasured: C:Table->Column), in which case
    every distinct value of that column is checked.
    """
    # MULTILINE so '$' matches at the end of each line; without it only a
    # reference on the very last line of the file was ever checked.
    direct_ref = re.compile(r'variableMeasured:\s(\w+)[ \t\r]*$', re.MULTILINE)
    indirect_ref = re.compile(r'variableMeasured:\sC:\w+->(\w+)')
    defined_nodes = {
        remove_prefix(n) for n in get_newly_defined_nodes(stat_vars_content)}
    for ref in direct_ref.findall(tmcf_contents):
      if remove_prefix(ref) not in defined_nodes:
        yield (PrecheckError.CRITICAL, "TMCFNodeRefNotInMCF",
              f"Node '{ref}' referenced in TMCF, undefined in MCF.")

    for col_ref in indirect_ref.findall(tmcf_contents):
      if col_ref not in df.columns:
        yield (PrecheckError.CRITICAL, "ColInTMCFNotInCSV",
              f"Column '{col_ref}' referenced in TMCF, not in CSV.")
      else:
        # Empty cells are skipped and other values compared as text, so a
        # NaN or numeric cell is reported instead of raising on .strip().
        for ref in df[col_ref].dropna().astype(str).unique():
          if remove_prefix(ref) not in defined_nodes:
            yield (PrecheckError.CRITICAL, "ReferencedFieldNotInMcf",
                  f"Node '{ref}' referenced in TMCF through '{col_ref}' column in CSV, undefined in MCF.")

  def ensure_dcid_not_too_long(_, __, stat_vars_content):
    """Ensures no node id in the statistical variables file exceeds 256 chars.

    The length is measured on the id itself: the 'Node:' key, surrounding
    whitespace and the namespace prefix (e.g. 'dcid:') are not counted.
    """
    max_dcid_length = 256
    for line in stat_vars_content.split("\n"):
      stripped_line = line.strip()
      if not stripped_line.startswith('Node:'):
        continue

      dcid = remove_prefix(stripped_line[len('Node:'):])
      if len(dcid) > max_dcid_length:
        yield (PrecheckError.CRITICAL, "MalformedNode",
              f"The following node is too long: '{stripped_line}'\nMax dcid length is 256.")



In [None]:
from optparse import OptionParser
import inspect

def validate_prechecks(df, tmcf_contents, stat_vars_content, demo=False):
  """Runs every check defined on TriplesChecks and prints what they report.

  For each check that yields at least one error, prints a header naming the
  check, one '<level> - <message>' line per error, and a blank line. Checks
  that report nothing produce no output.

  Args:
      df -> Dataframe of uploaded CSV
      tmcf_contents -> String of TMCF text content
      stat_vars_content -> String of Statistical Variables file
      demo -> Only referenced by the disabled usage logging; currently has no effect
  """
  # NOTE: usage logging (appending the gcloud account and every error to a
  # Google Sheet through gspread) used to run here and is currently disabled.
  checks = inspect.getmembers(TriplesChecks, predicate=inspect.isfunction)

  for check_name, check in checks:
    # Checks are generators; drain them so the header is only printed when
    # there is something to report.
    found = list(check(df, tmcf_contents, stat_vars_content))
    if not found:
      continue

    print(f"Error In Test {check_name}")
    for problem in found:
      severity, message = problem[0], problem[2]
      print(f"{severity.value} - {message}")
    print("")

# Sample Upload Demonstrating Common Errors

In [None]:
# Sample Input
# Statistical variable MCF with deliberate mistakes: curly (non-ASCII) quotes
# around the names, misspelled descriptions, and a second node whose dcid is
# longer than the 256 character limit.
stat_vars_content = \
"""
Node: dcid:Tourism
name: “Tourism“
typeOf: dcid:TravelPurposeEnum
description: "Ptential mispeling in my description." 

Node: dcid:ThisIsAnExtremelyLongStatVarName__Count_MortalityEvent_From75To79Years_MalignantImmunoproliferativeDiseasesAndCertainOtherB-CellLymphomas_MultipleMyelomaAndMalignantPlasmaCellNeoplasms_OtherAndUnspecifiedMalignantNeoplasmsOfLymphoid_HematopoieticAndRelatedTissue_Female_AsFractionOf_Count_Person
name: “Tourism“
typeOf: dcid:TravelPurposeEnum
description: "Ptential mispeling in my description." 
"""

# Template MCF with deliberate mistakes: the first node line is missing the
# colon after 'Node', and 'BadProperty' is capitalized with an unprefixed,
# unquoted value.
tmcf_contents = \
"""
Node E:WorldBank->E0
typeOf: dcs:StatVarObservation
variableMeasured: C:WorldBank->StatisticalVariable
observationDate: C:WorldBank->Year
observationPeriod: "P1Y"
observationAbout: E:WorldBank->E1
value: C:WorldBank->Value

Node: E:WorldBank->E1
typeOf: dcs:Country
countryAlpha3Code: C:WorldBank->IsoCode
BadProperty: someFieldThatShouldBeAReferenceButIsInterpretedAsAString
"""

# Stand-in for the CSV: it lacks the 'StatisticalVariable' column the TMCF
# references and carries an unused 'Foo' column.
df = pd.DataFrame.from_dict({"Value": [4], "IsoCode": ["USA"], "Year":[2018], "Foo": ['bar']})
df

In [None]:
# Run every check against the deliberately broken sample input.
validate_prechecks(df, tmcf_contents, stat_vars_content, demo=True)

# Real Validation Code

Upload three files.

- StatisticalVariable -> Needs to end in .mcf
- Template MCF -> Needs to end in .tmcf
- CSV File -> Needs to end in .csv

In [None]:
from google.colab import files
from io import StringIO

LARGE_FILE_FLAG = False # Set this to true for large files
selected_files = files.upload()

# Parse out files
df, tmcf_text, stat_var_text = None, None, None

# With LARGE_FILE_FLAG set only the first 100 CSV rows are read;
# nrows=None makes pandas read the whole file.
csv_row_limit = 100 if LARGE_FILE_FLAG else None

for file_name, contents in selected_files.items():
  # Route on the file extension (case-insensitive), as the upload
  # instructions describe. A substring test would also treat names such as
  # 'data.csv.tmcf' as a CSV.
  lowered_name = file_name.lower()
  if lowered_name.endswith(".csv"):
    df = pd.read_csv(StringIO(contents.decode()), nrows=csv_row_limit)
  elif lowered_name.endswith(".tmcf"):
    tmcf_text = contents.decode()
  elif lowered_name.endswith(".mcf"):
    stat_var_text = contents.decode()
  else:
    print(f"Ignoring '{file_name}': expected a file ending in .csv, .tmcf or .mcf.")

# Say which inputs are missing now, rather than leaving the validation cell
# to fail on a None value.
for missing_kind, parsed in [("CSV (.csv)", df),
                             ("Template MCF (.tmcf)", tmcf_text),
                             ("StatisticalVariable (.mcf)", stat_var_text)]:
  if parsed is None:
    print(f"Missing upload: no {missing_kind} file was selected.")

In [None]:
# Run every check against the uploaded files.
validate_prechecks(df, tmcf_text, stat_var_text)