# Set-up

## Select platform (Colab, HPC or laptop)

In [32]:
# Execution environment for this notebook: colab, HPC or laptop.
# Later cells branch on this value for dependency set-up, command line
# argument parsing and the choice of FILE_DIRECTORY.
PLATFORM = 'colab' # colab or HPC or laptop

## Import dependencies

In [33]:
import altair as alt
import argparse
import ast
import bz2
import IPython.display
import json
import jsonschema
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import random
import re
import time
import urllib

In [34]:
# Colab-only set-up: install dependencies and work from the project folder on Google Drive.
if PLATFORM == 'colab':

    # Install the Hugging Face transformers library by running pip through a shell command.
    # NOTE(review): the exit status of os.system is not checked, so a failed install goes unnoticed.
    import os
    os.system("pip install transformers")

    # Mount Google Drive, then change the working directory to the project folder
    # (both are done through Python calls, not shell commands).
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir("/content/drive/MyDrive/Data science jobs/2. Portfolio/3. NL2VIS/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Top level functions

In [35]:
def save_object(fname, data):
    """Serialise ``data`` with pickle and write it, bz2-compressed, to ``fname``.

    Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e

    Args:
        fname: Path of the file to create (overwritten if it exists).
        data: Any picklable Python object.
    """
    # BZ2File in binary write mode is what bz2.open(fname, "wb") returns.
    with bz2.BZ2File(fname, mode="wb") as sink:
        pickle.dump(data, sink)


def load_object(fname):
    """Read a bz2-compressed pickle file and return the object stored in it.

    Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e

    Args:
        fname: Path of a file previously written as a bz2-compressed pickle.

    Returns:
        The unpickled Python object.
    """
    # BZ2File in binary read mode is what bz2.open(fname, "rb") returns.
    with bz2.BZ2File(fname, mode="rb") as source:
        return pickle.load(source)


def parse_raw_output(output, input_filename=None):
    """Extract the final Vega-Lite specification from a decoded output sequence.

    The text must consist of four consecutive sections. Each section starts with a
    ``Task:`` line and ends with a ``Specification:`` line holding a ``{...}``
    dictionary on a single line. The first three sections are matched but not
    captured; the dictionary of the fourth section is returned. A match only occurs
    if the end of each dictionary is followed by nothing but whitespace up to the
    end of its line.

    Args:
        output: Decoded generated sequence (string).
        input_filename: Name used to choose the pattern. When it contains ``"OOD"``
            every section must also have a ``Dataset schema:`` line between the
            ``Task:`` and ``Specification:`` lines. Defaults to the notebook-level
            ``INPUT_FILENAME``, so existing one-argument calls behave as before.

    Returns:
        The captured specification as a string, or the string ``"Attribute error"``
        when the pattern does not match.
    """
    if input_filename is None:
        # Fall back to the global only when the caller does not say which
        # prompt format is in use.
        input_filename = INPUT_FILENAME

    # Text that opens every section, up to and including the whitespace
    # character that precedes the dictionary.
    if "OOD" in input_filename:
        section = r'^Task:.*?\nDataset schema:.*?\nSpecification:\s'
    else:
        section = r'^Task:.*?\nSpecification:\s'

    # Three uncaptured sections followed by the one whose dictionary is captured.
    pattern = (section + r'{.*?}\s*$\n') * 3 + section + r'({.*?})\s*$'

    # re.M lets ^ and $ anchor at line boundaries inside the multi-line output.
    match = re.compile(pattern, re.M).search(output)
    if match is None:
        return "Attribute error"
    return match.group(1)


def parse_json_string(spec_string):
    """Convert a captured specification string into a Python object.

    The string is evaluated with ``ast.literal_eval``
    (https://docs.python.org/3/library/ast.html), so it must be a Python literal
    (string, number, collection such as a dictionary, ``True``/``False``/``None``).
    JSON-only spellings such as ``true``, ``false`` and ``null`` are not Python
    literals and are therefore reported as errors.

    All failures are recorded as syntax errors. They include:
    1. SyntaxError: spec_string is a string but is not valid Python syntax.
    2. ValueError: malformed node or string (e.g. it contains a bare name).
    3. TypeError: an operation was applied to an object of inappropriate type.
    4. MemoryError / RecursionError: the input is too large or too deeply nested
       for the parser; ``ast.literal_eval`` documents both as possible outcomes.

    Args:
        spec_string: Text expected to hold a Python dictionary literal.

    Returns:
        The evaluated object, or the string ``'Syntax error'`` on any of the
        failures listed above.
    """
    try:
        return ast.literal_eval(spec_string)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # One malformed model output must not abort the evaluation of all others.
        return 'Syntax error'


def validate_json_schema(spec, vega_lite_schema):
    """Check a specification against the downloaded Vega-Lite schema.

    For details, see https://python-jsonschema.readthedocs.io/en/latest/validate/

    Args:
        spec: Parsed specification to validate.
        vega_lite_schema: Schema passed to ``jsonschema.validate``.

    Returns:
        ``spec`` unchanged when validation succeeds, otherwise the string
        ``"Validation error"`` (used for both an invalid instance and an
        invalid schema).
    """
    try:
        jsonschema.validate(spec, vega_lite_schema)
    except (jsonschema.exceptions.ValidationError,
            jsonschema.exceptions.SchemaError):
        return "Validation error"
    return spec

# Parameters

## Command line arguments

In [36]:
# Create parser for command line arguments
# Source: https://docs.python.org/3/library/argparse.html
# Every option takes at most one value (nargs='?') and falls back to its default
# when the flag is omitted.
parser = argparse.ArgumentParser(description="Parse command line arguments")

# NOTE(review): the default 'hpc' is not one of the values named in the help
# text ('Cuda or cpu') - confirm the intended default device.
parser.add_argument('-d',
                    '--device',
                    metavar='device',
                    type=str,
                    nargs='?',
                    default='hpc',
                    help='Cuda or cpu.')

parser.add_argument('-ds',
                    '--decode',
                    metavar='Decoding strategy',
                    type=str,
                    nargs='?',
                    default='top_p',
                    help='Top-P or Top-K')

parser.add_argument('-i',
                    '--input',
                    metavar='input filename',
                    type=str,
                    nargs='?',
                    default='0shot_10fold_FS_test',
                    help='Input filename.')

# The trailing semicolon stops the notebook from echoing the returned Action
# object as the cell output.
parser.add_argument('-m',
                    '--model',
                    metavar='model',
                    type=str,
                    nargs='?',
                    default='gpt-neo-1.3B',
                    help='Pre-trained transformer model.');

In [37]:
# Parse arguments via the parse_args() method.
# Interactive sessions (Colab / laptop) have no command line of their own, so the
# argument list is spelled out here; on the HPC the real command line is parsed.
# PLATFORM is compared case-insensitively so that both 'HPC' and 'hpc' are accepted.
if PLATFORM.lower() in ('colab', 'laptop'):
    # args = parser.parse_args([])
    args = parser.parse_args(['--decode', 'top_p',
                              '--device', 'cpu',
                              '--input', 'OOD_3shot_test_set-gpt-neo-125M-20230830-133808_results',
                              '--model', 'gpt-neo-125M'])
elif PLATFORM.lower() == 'hpc':
    args = parser.parse_args()
else:
    # Fail here with a clear message instead of a NameError on `args` further down.
    raise ValueError(f"Unsupported PLATFORM {PLATFORM!r}; expected 'colab', 'hpc' or 'laptop'.")

In [38]:
# Files: the results to validate and the cached Vega-Lite schema.
INPUT_FILENAME = args.input
SCHEMA_FILENAME = "VegaLiteSchema"

# Settings of the generation run, taken from the parsed arguments.
MODEL_TYPE = args.model
DEVICE_TYPE = args.device
DECODE_STRATEGY = args.decode

## File parameters

In [39]:
# Timestamp of this run (local time, YYYYMMDD-HHMMSS).
TIME_STRING = time.strftime("%Y%m%d-%H%M%S")
# The validated results are saved next to the input under a derived name.
OUTPUT_FILENAME = INPUT_FILENAME + '_validated'

# Folder that holds the input, schema and output files on each platform.
# PLATFORM is compared case-insensitively so that both 'HPC' and 'hpc' are accepted.
if PLATFORM.lower() == 'colab':
    FILE_DIRECTORY = '/content/drive/MyDrive/Data science jobs/2. Portfolio/3. NL2VIS/'

elif PLATFORM.lower() == 'hpc':
    FILE_DIRECTORY = "/mnt/scratch/users/adbz866/"

elif PLATFORM.lower() == 'laptop':
    FILE_DIRECTORY = 'C:/Users/billy/OneDrive/Documents/Python Scripts/1. Portfolio/1. NL2VIS/'

else:
    # Fail here with a clear message instead of a NameError on FILE_DIRECTORY further down.
    raise ValueError(f"Unsupported PLATFORM {PLATFORM!r}; expected 'colab', 'hpc' or 'laptop'.")

In [40]:
# Summary of the run configuration. Each "\n" item sits between two separators
# and so produces the same two blank lines as a separate print("\n") call.
print(
    f"Time: {TIME_STRING}",
    f"File directory: {FILE_DIRECTORY}",
    f"Input file name: {INPUT_FILENAME}",
    "\n",
    f"Platform: {PLATFORM}",
    f"Device: {DEVICE_TYPE}",
    f"Model: {MODEL_TYPE}",
    f"Decoding strategy: {DECODE_STRATEGY}",
    f"Vega-lite schema file name: {SCHEMA_FILENAME}",
    "\n",
    f"Output file name: {OUTPUT_FILENAME}",
    sep="\n",
)

Time: 20230830-142326
File directory: /content/drive/MyDrive/Data science jobs/2. Portfolio/3. NL2VIS/
Input file name: OOD_3shot_test_set-gpt-neo-125M-20230830-133808_results


Platform: colab
Device: cpu
Model: gpt-neo-125M
Decoding strategy: top_p
Vega-lite schema file name: VegaLiteSchema


Output file name: OOD_3shot_test_set-gpt-neo-125M-20230830-133808_results_validated


# Import data

**Decoded generated sequences**

In [41]:
# Load the bz2-compressed pickle named by INPUT_FILENAME from FILE_DIRECTORY.
# Later cells read its 'Decoded sequences' entry and add new entries to it.
model_output = load_object(FILE_DIRECTORY + INPUT_FILENAME)

**Relevant vega-lite schema**

In [42]:
# One-off step, kept for reference: download the Vega-Lite v4.17.0 JSON schema and
# cache it as a compressed pickle. Uncomment to rebuild the cached file.
# NOTE(review): only `import urllib` is visible in this notebook; `urllib.request`
# may need to be imported explicitly before the download line will run.
# vega_lite_schema = json.load(urllib.request.urlopen('https://vega.github.io/schema/vega-lite/v4.17.0.json'))
# save_object(FILE_DIRECTORY + "VegaLiteSchema" + ".pickle", vega_lite_schema)
# del vega_lite_schema

# Load the cached schema used for validation.
vega_lite_schema = load_object(FILE_DIRECTORY + SCHEMA_FILENAME + '.pickle')

# Evaluation

## Methodology

![image.png](attachment:a9bfb8fc-3efe-4d54-920d-1ae920551d98.png)

A series of unit tests is used to assess model outputs. When a problem fails a unit test, a unique error is raised and no further tests are conducted. The first test involves capturing the completed Vega-Lite specification using the relevant regular expression pattern. An attribute error is raised when no match is found. Next, the ast module (https://docs.python.org/3/library/ast.html) is used to check whether parsed outputs are valid Python dictionaries. An invalid dictionary raises a syntax error. Afterwards, the jsonschema module (https://python-jsonschema.readthedocs.io/) is used to check compliance with the Vega-Lite schema. A validation error is raised when an invalid instance is encountered.

Note: All logical tests are conducted in a separate notebook.

## Regular expression pattern matches?

![image.png](attachment:7ecb1778-f690-4967-bf65-f3bb44036e50.png)

* The regular expression patterns used to capture completed Vega-Lite specifications for A) out-of-domain
problems and B) in-domain problems. The capturing group is highlighted in orange.
* An attribute error is returned when no match is found.

Note: Multiple outputs are generated for each problem. A nested for loop is therefore required below.

In [43]:
# Run the regular expression over every decoded sequence of every problem.
# The outer list has one entry per problem, the inner list one per generated output.
model_output['Parsed spec'] = [
    [parse_raw_output(sequence) for sequence in sequences]
    for sequences in model_output['Decoded sequences']
]

## Valid python dictionary?
* The ast module is used to check whether captured specifications are valid Python dictionaries.
* An invalid dictionary raises a syntax error.

Note: Multiple outputs are generated for each problem. A nested for loop is therefore required below.

In [44]:
# Turn each captured string into a Python object. Outputs that already failed the
# regular expression test keep their "Attribute error" marker and are not parsed.
model_output['Parsed spec'] = [
    [candidate if candidate == "Attribute error" else parse_json_string(candidate)
     for candidate in candidates]
    for candidates in model_output['Parsed spec']
]

## Valid Vega-Lite specification?
* The jsonschema module is used to check compliance with the Vega-Lite schema.
* A validation error is raised when an invalid instance is encountered.

Note: Multiple outputs are generated for each problem. A nested for loop is therefore required below.

In [45]:
# Markers left by the two earlier tests; outputs carrying one are passed through untouched.
errors = ['Attribute error', 'Syntax error']

# Validate every remaining specification against the Vega-Lite schema.
model_output["Validated spec"] = [
    [parsed if parsed in errors else validate_json_schema(parsed, vega_lite_schema)
     for parsed in parsed_group]
    for parsed_group in model_output['Parsed spec']
]

# Save data

In [46]:
# Write the results, now including the 'Parsed spec' and 'Validated spec' entries,
# to OUTPUT_FILENAME in FILE_DIRECTORY as a bz2-compressed pickle.
save_object(FILE_DIRECTORY + OUTPUT_FILENAME, model_output)