# Create Dataframe from JSON
A notebook for creating OpenRefine procedures that change R5.1 JSONs into dataframes for use in testing.

In [10]:
from pathlib import Path

In [None]:
def rename_fields():
    """Return the OpenRefine operations that rename the always-present fields.

    The returned text is a fragment of an OpenRefine operation history: a
    sequence of JSON objects, each followed by a comma, without the enclosing
    array brackets. It renames the platform, data type and access method
    columns and fills the ``platform`` column down.

    Returns:
        str: The JSON fragment, ending with a trailing comma.
    """
    # The rename target must be the lowercase ``platform``: the fill-down
    # operation that follows looks the column up by exactly that name.
    return """
        {
            "op": "core/column-rename",
            "oldColumnName": "_ - Platform",
            "newColumnName": "platform",
            "description": "Rename column _ - Platform to platform"
        },
        {
            "op": "core/fill-down",
            "engineConfig": {
                "facets": [],
                "mode": "record-based"
            },
            "columnName": "platform",
            "description": "Fill down cells in column platform"
        },
        {
            "op": "core/column-rename",
            "oldColumnName": "_ - Attribute_Performance - _ - Data_Type",
            "newColumnName": "data_type",
            "description": "Rename column _ - Attribute_Performance - _ - Data_Type to data_type"
        },
        {
            "op": "core/column-rename",
            "oldColumnName": "_ - Attribute_Performance - _ - Access_Method",
            "newColumnName": "access_method",
            "description": "Rename column _ - Attribute_Performance - _ - Access_Method to access_method"
        },
    """

def pivot_performance_fields(first_field):
    """Return the OpenRefine operation that transposes the performance columns into rows.

    Every column from ``first_field`` to the last column (``columnCount`` of
    -1) is transposed into a ``Fields`` key column and a ``usage_count``
    value column.

    Args:
        first_field: Name of the first column to transpose.

    Returns:
        str: A JSON object followed by a comma, without enclosing array
        brackets.
    """
    # Imported locally so this definition does not depend on another cell.
    import json

    # json.dumps supplies the surrounding quotes and escapes any quote,
    # backslash or control character in the column name, so the fragment
    # stays valid JSON; ensure_ascii=False leaves other characters as typed.
    start_column = json.dumps(str(first_field), ensure_ascii=False)
    return f"""
        {{
            "op": "core/transpose-columns-into-rows",
            "startColumnName": {start_column},
            "columnCount": -1,
            "ignoreBlankCells": true,
            "fillDown": true,
            "separator": null,
            "keyColumnName": "Fields",
            "valueColumnName": "usage_count",
            "description": "Pivot `Performance` fields"
        }},
    """

def split_metric_and_date():
    """Return the OpenRefine operations that turn ``Fields`` into metric and date columns.

    Four operations are emitted in order: strip the performance prefix from
    ``Fields``, split ``Fields`` on `` - ``, then rename the two resulting
    columns to ``metric_type`` and ``usage_date``.

    Returns:
        str: JSON objects, each followed by a comma, without enclosing array
        brackets.
    """
    strip_prefix_op = """
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "record-based"
            },
            "columnName": "Fields",
            "expression": "grel:value.replace('_ - Attribute_Performance - _ - Performance - ','')",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Remove '_ - Attribute_Performance - _ - Performance - ' from column Fields"
        },"""
    split_column_op = """
        {
            "op": "core/column-split",
            "engineConfig": {
                "facets": [],
                "mode": "record-based"
            },
            "columnName": "Fields",
            "guessCellType": true,
            "removeOriginalColumn": true,
            "mode": "separator",
            "separator": " - ",
            "regex": false,
            "maxColumns": 0,
            "description": "Split column Fields into metric and date"
        },"""
    rename_metric_op = """
        {
            "op": "core/column-rename",
            "oldColumnName": "Fields 1",
            "newColumnName": "metric_type",
            "description": "Rename Fields 1 to metric_type"
        },"""
    rename_date_op = """
        {
            "op": "core/column-rename",
            "oldColumnName": "Fields 2",
            "newColumnName": "usage_date",
            "description": "Rename Fields 2 to usage_date"
        },"""
    # The final piece is the newline and indent that close the fragment.
    pieces = [strip_prefix_op, split_column_op, rename_metric_op, rename_date_op, "\n    "]
    return "".join(pieces)

In [None]:
def make_final_field_adjustments():
    """Return the OpenRefine operations that clean the fields and build the ``df`` column.

    In order, the operations:

    * replace empty or single-space values with null in ``DOI``,
      ``proprietary_ID``, ``online_ISSN`` and ``print_ISSN``;
    * apply a newline-removing ``replace`` to ``resource_name``;
    * apply ``reinterpret('utf-8').unescape('html')`` to ``resource_name``,
      ``publisher`` and ``platform``;
    * reorder the columns;
    * add a ``df`` column at index 0 that joins every field of the row into
      a bracketed, comma-separated list.

    Returns:
        str: JSON objects, each followed by a comma, without enclosing array
        brackets.
    """
    # NOTE(review): this is a regular (non-raw) string, so Python halves each
    # run of backslashes before the text is written (``\\\"`` is emitted as
    # ``\"``). Confirm that the GREL in the ``df`` operation still escapes
    # embedded double quotes as intended once OpenRefine parses the JSON.
    return """
        
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "DOI",
            "expression": "grel:if(or(value=='',value==' '),null,value)",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Change whitespace values to null in column ``DOI``"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "proprietary_ID",
            "expression": "grel:if(or(value=='',value==' '),null,value)",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Change whitespace values to null in column ``proprietary_ID``"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "online_ISSN",
            "expression": "grel:if(or(value=='',value==' '),null,value)",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Change whitespace values to null in column ``online_ISSN``"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "print_ISSN",
            "expression": "grel:if(or(value=='',value==' '),null,value)",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Change whitespace values to null in column ``print_ISSN``"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "resource_name",
            "expression": "grel:value.replace(/\\n/,'')",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Remove errant newlines from ``resource_name`` values"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "resource_name",
            "expression": "grel:value.reinterpret('utf-8').unescape('html')",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Ensure values in column ``resource_name`` are encoded with UTF-8"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "publisher",
            "expression": "grel:value.reinterpret('utf-8').unescape('html')",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Ensure values in column ``publisher`` are encoded with UTF-8"
        },
        {
            "op": "core/text-transform",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "columnName": "platform",
            "expression": "grel:value.reinterpret('utf-8').unescape('html')",
            "onError": "keep-original",
            "repeat": false,
            "repeatCount": 10,
            "description": "Ensure values in column ``platform`` are encoded with UTF-8"
        },
        {
            "op": "core/column-reorder",
            "columnNames": [
                "statistics_source_ID",
                "report_type",
                "resource_name",
                "publisher",
                "publisher_ID",
                "platform",
                "authors",
                "publication_date",
                "article_version",
                "DOI",
                "proprietary_ID",
                "ISBN",
                "print_ISSN",
                "online_ISSN",
                "URI",
                "data_type",
                "section_type",
                "YOP",
                "access_type",
                "access_method",
                "parent_title",
                "parent_authors",
                "parent_publication_date",
                "parent_article_version",
                "parent_data_type",
                "parent_DOI",
                "parent_proprietary_ID",
                "parent_ISBN",
                "parent_print_ISSN",
                "parent_online_ISSN",
                "parent_URI",
                "metric_type",
                "usage_date",
                "usage_count"
            ],
            "description": "Reorder columns"
        },
        {
            "op": "core/column-addition",
            "engineConfig": {
                "facets": [],
                "mode": "row-based"
            },
            "baseColumnName": "statistics_source_ID",
            "expression": "grel:'['+forEach(row.columnNames,field,if(isBlank(cells[field].value),'None',if(cells[field].value.type()=='date','\\\"'+cells[field].value.toString('yyyy-MM-dd')+'\\\"',if(cells[field].value.type()=='string','\\\"'+cells[field].value.replace('\\\"','\\\\\\\"')+'\\\"',cells[field].value.round())))).join(', ')+'],'",
            "onError": "set-to-blank",
            "newColumnName": "df",
            "columnInsertIndex": 0,
            "description": "Create column ``df`` by combining all the other fields and formatting them for insertion into a dataframe"
        },
    """

## 1. Create Path to JSON

In [12]:
# File that collects the generated OpenRefine operations, relative to the
# notebook's working directory.
JSON_file_path = Path("for_OpenRefine.json")

## 2. Change Field Names and Fill Down as Needed
`Platform`, `Data_Type`, and `Access_Method` are always present and need to be renamed.

`Platform` always needs to be filled down.

In [13]:
# Mode 'w' starts the operations file from scratch. The encoding is given
# explicitly so the output does not depend on the platform's locale default.
with open(JSON_file_path, 'w', encoding='utf-8') as file:
    file.write(rename_fields())

## 3. Pivot `Performance` Fields
Set `first_performance_field` to the name of the first field representing the `Performance` section of the JSON.

In [None]:
# Name of the first column in the `Performance` section; the transpose
# operation starts at this column.
first_performance_field = "_ - Attribute_Performance - _ - Performance - Unique_Item_Requests - 2024-09"
# Mode 'a' appends to the existing operations file. The encoding is given
# explicitly so the output does not depend on the platform's locale default.
with open(JSON_file_path, 'a', encoding='utf-8') as file:
    file.write(pivot_performance_fields(first_performance_field))
    file.write(split_metric_and_date())

## 4. Create Dataframe

In [None]:
# Mode 'a' appends the final clean-up operations to the existing file. The
# encoding is given explicitly so the output does not depend on the
# platform's locale default.
with open(JSON_file_path, 'a', encoding='utf-8') as file:
    file.write(make_final_field_adjustments())