# RScript-Bridge

> Bridge between Stactics AICore framework and RScript prediction scripts

In [None]:
#| default_exp rscript

### Some things to set up first

This notebook uses nbdev tooling, and `addroot` makes importing from
the repo directory more convenient.

In [None]:
from nbdev.showdoc import *
import addroot

In [None]:
#| export

import json, os
import subprocess
import hashlib

from functools import reduce

AICore uses an `assets` dir from which we can read files, like scripts,
and a `save` dir where modules can write and read files.

In [None]:

#| export
# Both directories are resolved against the working directory at the moment
# this cell (or the exported module) is executed.
assets_dir = os.path.join(os.path.abspath(os.getcwd()), 'assets', 'rscript')
save_dir = os.path.join(os.path.abspath(os.getcwd()), 'saves', 'rscript')

def get_asset_path(script_name:str)->str:
    """
    Return the path of a file inside the `assets/rscript` directory.

    Parameters
    ----------
    script_name : str
        file name (or relative path) below `assets_dir`
    """
    return os.path.join(assets_dir, script_name)

def get_save_path(datafile_name:str)->str:
    """
    Return the path of a file inside the `saves/rscript` directory.

    Parameters
    ----------
    datafile_name : str
        file name (or relative path) below `save_dir`
    """
    return os.path.join(save_dir, datafile_name)


In [None]:
#| hide
from IPython.display import display
from IPython.display import Markdown

from tabulate import tabulate


In [None]:
#| hide
def display_table(feature_dict):
    """
    Render a sequence of row dictionaries as a GitHub-style Markdown table.

    Parameters
    ----------
    feature_dict : list of dict
        One dict per table row. The keys of the first row are used as the
        column headers; each row contributes its values in its own key order.
        An empty sequence displays nothing.
    """
    rows = list(feature_dict)
    if not rows:
        # Without a first row there are no headers to take; indexing it
        # would raise IndexError.
        return

    display(Markdown(tabulate(
        [list(row.values()) for row in rows],
        headers=list(rows[0].keys()),
        tablefmt='github'
    )))
# Registry of the scripts' data flow; filled in by the cells further down.
data_file_flow = {}

def display_flow_table(flow_table):
    """
    Display a flow table as Markdown, one row per entry.

    Parameters
    ----------
    flow_table : dict
        Maps a row label (shown in the `file` column) to a dict of column
        values. Columns missing from an entry are shown as a single space.
    """
    # Take the columns from the table that was passed in (not from the global
    # registry) and keep them in order of first appearance so the layout is
    # the same on every run.
    columns = list(dict.fromkeys(
        column
        for record in flow_table.values()
        for column in record
    ))
    display_table(
        [
            dict(file=fn, **{c: rd.get(c, ' ') for c in columns})
            for fn, rd in flow_table.items()
        ]
    )


## Running R code

Scripts written in R can be run from a Python program using `subprocess` and `Rscript`.



### `Rscript`

A script can be run from the commandline with

    Rscript ascript.R



### `subprocess`

[Python's `subprocess` module](https://docs.python.org/3.11/library/subprocess.html#) has the tools to execute external programs like `Rscript`.


In [None]:

# Run the hello-world script and show what it wrote to stdout.
subprocess.run(
    ['Rscript', get_asset_path('hello.R')],
    capture_output=True,
).stdout.decode('UTF-8')

'[1] "hello world"\n'

## Sapflow prediction scripts



### `Data_preparation.R`

#### Libraries

* lubridate
* stringr
* zoo

#### Input

* `Data/Meta_data.csv`
* `Data/Sapflux_Tilia_train.csv`
* `Data/Weather_Tilia_train.csv`
* `Data/Weather_Tilia_pred.csv`

#### Output

* `Modelling_data.RData`
* `Prediction_data.RData`


In [None]:
# Files read and written by Data_preparation.R and the R libraries it needs.
data_file_flow['Data_preparation.R'] = {
    'in': [
        'Data/Meta_data.csv',
        'Data/Sapflux_Tilia_train.csv',
        'Data/Weather_Tilia_train.csv',
        'Data/Weather_Tilia_pred.csv',
    ],
    'out': [
        'Modelling_data.RData',
        'Prediction_data.RData',
    ],
    'libs': ['lubridate', 'stringr', 'zoo'],
}


### `Prediction_part1.R`

#### Libraries

* lubridate
* stringr
* mgcv

#### Input

* `Modelling_data.RData`

#### Output

* `Fitted_models.RData`
* `Weights.RData`



In [None]:
# Files read and written by Prediction_part1.R and the R libraries it needs.
data_file_flow['Prediction_part1.R'] = {
    'in': ['Modelling_data.RData'],
    'out': [
        'Fitted_models.RData',
        'Weights.RData',
    ],
    'libs': ['lubridate', 'stringr', 'mgcv'],
}



### `Prediction_part2.R`

#### Libraries

* lubridate
* stringr
* mgcv

#### Input

* `Fitted_models.RData`
* `Weights.RData`
* `Modelling_data.RData`
* `Prediction_data.RData`

#### Output

* `Predicted_sapflux.RData`

In [None]:
# Files read and written by Prediction_part2.R and the R libraries it needs.
data_file_flow['Prediction_part2.R'] = {
    'in': [
        'Fitted_models.RData',
        'Weights.RData',
        'Modelling_data.RData',
        'Prediction_data.RData',
    ],
    'out': ['Predicted_sapflux.RData'],
    'libs': ['lubridate', 'stringr', 'mgcv'],
}


### `Prediction_part3.R`

#### Libraries

* lubridate
* stringr

#### Input

* `Predicted_sapflux.RData`

#### Output

* `Predicted_water_usage.RData`


In [None]:
# Files read and written by Prediction_part3.R and the R libraries it needs.
data_file_flow['Prediction_part3.R'] = {
    'in': [
        'Predicted_sapflux.RData',
    ],
    'out': [
        'Predicted_water_usage.RData',
    ],
    'libs': [
        'lubridate',
        'stringr',
    ],
}

In [None]:
#| hide

# Map each script name to its position in the processing chain.
# Entries that were already re-keyed carry their script name under 'name', so
# running this cell a second time keeps the names instead of replacing them
# with the integer keys.
script_order = {
    spec.get('name', key): position
    for position, (key, spec) in enumerate(data_file_flow.items())
}

# Re-key the flow objects by position and add the script name to each object.
data_file_flow = {
    position: {**spec, 'name': spec.get('name', key)}
    for position, (key, spec) in enumerate(data_file_flow.items())
}


# Every input and output file exactly once, in order of first appearance.
data_files = list(dict.fromkeys(
    file_name
    for flow in data_file_flow.values()
    for direction in ('in', 'out')
    for file_name in flow[direction]
))
# One column per script, one row per data file; each cell tells whether the
# script reads ('in') or writes ('out') the file, or does not touch it ('--').
script_names = [flow['name'] for flow in data_file_flow.values()]
usage_rows = [
    [path.rsplit('/', 1)[-1]] + [
        'in' if path in flow['in'] else ('out' if path in flow['out'] else '--')
        for flow in data_file_flow.values()
    ]
    for path in data_files
]
display(Markdown(tabulate(
    usage_rows,
    headers=['data-file / script'] + script_names,
    tablefmt='github'
)))

| data-file / script          | Data_preparation.R   | Prediction_part1.R   | Prediction_part2.R   | Prediction_part3.R   |
|-----------------------------|----------------------|----------------------|----------------------|----------------------|
| Meta_data.csv               | in                   | --                   | --                   | --                   |
| Sapflux_Tilia_train.csv     | in                   | --                   | --                   | --                   |
| Weather_Tilia_train.csv     | in                   | --                   | --                   | --                   |
| Weather_Tilia_pred.csv      | in                   | --                   | --                   | --                   |
| Modelling_data.RData        | out                  | in                   | in                   | --                   |
| Prediction_data.RData       | out                  | --                   | in                   | --                   |
| Fitted_models.RData         | --                   | out                  | in                   | --                   |
| Weights.RData               | --                   | out                  | in                   | --                   |
| Predicted_sapflux.RData     | --                   | --                   | out                  | in                   |
| Predicted_water_usage.RData | --                   | --                   | --                   | out                  |

In [None]:
# Unique input/output files of all scripts, in order of first appearance.
data_files

['Data/Meta_data.csv',
 'Data/Sapflux_Tilia_train.csv',
 'Data/Weather_Tilia_train.csv',
 'Data/Weather_Tilia_pred.csv',
 'Modelling_data.RData',
 'Prediction_data.RData',
 'Fitted_models.RData',
 'Weights.RData',
 'Predicted_sapflux.RData',
 'Predicted_water_usage.RData']

## Import R libraries

Installing libraries can be done from the commandline with

    Rscript -e 'install.packages("drat", repos="https://cloud.r-project.org")'

In [None]:
# Ask Rscript for its version and print what it reports on stdout.
print(
    subprocess.run(['Rscript', '--version'], capture_output=True)
    .stdout
    .decode('UTF-8')
)


Rscript (R) version 4.2.2 (2022-10-31)



In [None]:
# Show the raw CompletedProcess: besides stdout it carries the return code and stderr.
subprocess.run(['Rscript','--version', ], capture_output=True)

CompletedProcess(args=['Rscript', '--version'], returncode=0, stdout=b'Rscript (R) version 4.2.2 (2022-10-31)\n', stderr=b'')

In [None]:
# All R libraries named by any of the scripts, without duplicates.
list({lib for flow in data_file_flow.values() for lib in flow['libs']})

['lubridate', 'stringr', 'zoo', 'mgcv']

In [None]:
# Try to load mgcv in a fresh R session; the package start-up messages are
# written to stderr, so that is the stream printed here.
run_script_result = subprocess.run(['Rscript','-e', "library(mgcv)"], capture_output=True)
print(run_script_result.stderr.decode('UTF-8'))

Loading required package: nlme
This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.



In [None]:
#| exports

def install_R_package(pkg:str|list):
    """
    Checks and if neccesary installs an R package

    Parameters
    ----------
    pkg : str|list
        name(s) of the package(s)
    """

    if isinstance(pkg, str):
        pkg = [pkg]

    for pkg_i in pkg:
        run_script_result = subprocess.run(['Rscript','-e', f"library({pkg_i})"], capture_output=True)
        if run_script_result.returncode != 0:
            print(f"Installing {pkg_i}")
            run_script_result = subprocess.run(['Rscript','-e', f"install.packages({pkg_i}, repos='https://cloud.r-project.org')"], capture_output=True)
        else:
            print(f"Library {pkg_i} already installed")
            
        print(run_script_result.stderr.decode('UTF-8'))



In [None]:
# Make sure every library used by any of the scripts is available.
required_libs = {lib for flow in data_file_flow.values() for lib in flow['libs']}
install_R_package(list(required_libs))

Library lubridate already installed

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union


Library stringr already installed

Library zoo already installed

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric


Library mgcv already installed
Loading required package: nlme
This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.



## Running the scripts

### Checksum calculation

Each script has its own set of input files and should be run to
update its output when either its inputs have changed or its 
expected output does not exist.

We can check for filechanges using a hashing algorithm, for 
instance MD5 or SHA-256. These are available either in Python
or from the commandline.

Let's look at the commandline version of MD5, on Linux this is
`md5sum`, with the input files for the preparation stage:

In [None]:
# Inputs of the first flow object, pretty-printed as JSON.
first_flow_key = list(data_file_flow)[0]
print(json.dumps(data_file_flow[first_flow_key]['in'], indent=3))

[
   "Data/Meta_data.csv",
   "Data/Sapflux_Tilia_train.csv",
   "Data/Weather_Tilia_train.csv",
   "Data/Weather_Tilia_pred.csv"
]


`md5sum` will output the hashes to stdout, which `subprocess.run` captures for us.

In [None]:
# Position of the flow object that the following cells work with; the later
# checksum cells reuse both `flow_object_index` and `input_files`.
flow_object_index = 0
input_files = data_file_flow[flow_object_index]['in']

print(json.dumps(input_files, indent=3))

[
   "Data/Meta_data.csv",
   "Data/Sapflux_Tilia_train.csv",
   "Data/Weather_Tilia_train.csv",
   "Data/Weather_Tilia_pred.csv"
]


In [None]:

# Hash the input files in binary mode; paths are relative to the save directory.
md5_encode_result = subprocess.run(
    ['md5sum', '-b', *input_files],
    cwd=save_dir,
    capture_output=True,
)
print(md5_encode_result.stdout.decode('UTF-8'))

4bed61a77505bfd52032591d5c3a6050 *Data/Meta_data.csv
6d705d98caa6618a4a990c3742c16564 *Data/Sapflux_Tilia_train.csv
1232592f9488ce4fbb4ae11ba5be0349 *Data/Weather_Tilia_train.csv
366dac1bf64003d1d08fca6121c036bd *Data/Weather_Tilia_pred.csv



If we want to check the files we run it with the `-c` option and a file with the previously calculated checksums

In [None]:
# Store the checksums in a file named after the script (without extension).
script_name = data_file_flow[flow_object_index]['name']
script_stem = script_name.split('.')[0]
checksum_file = get_save_path(f"input-checksum-{script_stem}")

with open(checksum_file, 'wt') as cf:
    cf.write(md5_encode_result.stdout.decode('UTF-8'))

In [None]:
# Verify the input files against the checksums stored in `checksum_file`.
md5_check_result = subprocess.run(
    ['md5sum', '-c', checksum_file],
    cwd=save_dir,
    capture_output=True,
)
check_stdout = md5_check_result.stdout.decode('UTF-8')
print(check_stdout)
print(f"Run returned code {md5_check_result.returncode}")
# stderr is only of interest when the check did not succeed.
if md5_check_result.returncode != 0:
    print(md5_check_result.stderr.decode('UTF-8'))

Data/Meta_data.csv: OK
Data/Sapflux_Tilia_train.csv: OK
Data/Weather_Tilia_train.csv: OK
Data/Weather_Tilia_pred.csv: OK

Run returned code 0


Had there been a change to a file it would have looked like

In [None]:
md5_check_result = subprocess.run(
    ['md5sum', '-c', checksum_file+'-modified'], 
    cwd=save_dir,
    capture_output=True)
print(md5_check_result.stdout.decode('UTF-8'))
print(f"Run returned code {md5_check_result.returncode}")

Data/Meta_data.csv: OK
Data/Sapflux_Tilia_train.csv: FAILED
Data/Weather_Tilia_train.csv: OK
Data/Weather_Tilia_pred.csv: OK

Run returned code 1


We don't really need specifics, only the return code will
do for our purpose.

### Checking files


#### Generating names

In [None]:
#| exports

def calc_hash_from_flowobject(flow_object:dict)->str:
    """
    Return the hexadecimal MD5 digest of the `repr` of a flow object.

    The digest is derived from the textual representation, so it changes
    whenever that representation changes (for instance a different key order).
    """
    digest = hashlib.md5()
    digest.update(repr(flow_object).encode('UTF-8'))
    return digest.hexdigest()

In [None]:
# Hash of the flow object selected by `flow_object_index`.
calc_hash_from_flowobject(data_file_flow[flow_object_index])

'da4b2413f6a22c19a8a7823e6564e746'

#### Inputs

In [None]:
#| exports
def check_script_inputs(flow_object:dict)->bool:
    """ 
    Check if the input files for a script are up-to-date, returns True if up-to-date.

    The stored checksums are read from the file `input-checksum-<hash>` in the
    save directory, where `<hash>` is `calc_hash_from_flowobject(flow_object)`,
    and verified with `md5sum -c` run from the save directory.

    Parameters
    ----------
    flow_object : dict
        flow object of the script

    Returns
    -------
    bool
        True if `md5sum -c` exits with code 0, False if it does not or if no
        checksum file exists for this flow object.
    """

    checksum_file = get_save_path(f"input-checksum-{calc_hash_from_flowobject(flow_object)}")

    # Without recorded checksums there is nothing to compare against. Returning
    # early also avoids starting md5sum with a `cwd` that may not exist yet,
    # which would raise instead of answering "not up-to-date".
    if not os.path.isfile(checksum_file):
        return False

    md5_check_result = subprocess.run(
        ['md5sum', '-c', checksum_file], 
        cwd=save_dir,
        capture_output=True)
    
    return md5_check_result.returncode == 0

In [None]:
# Are the inputs of the first script in the flow unchanged?
check_script_inputs(data_file_flow[0])

False

#### Outputs
The output is easily checked for existence with `isfile`.

In [None]:
#| exports
def check_script_output(flow_object:dict)->bool:
    """
    Tell whether every output file of a script is present in the save directory.

    Returns True when each name listed under `flow_object['out']` is an
    existing file, and False as soon as one of them is missing.
    """
    for output_name in flow_object['out']:
        if not os.path.isfile(get_save_path(output_name)):
            return False
    return True

In [None]:
# Do all outputs of the first script in the flow exist already?
check_script_output(data_file_flow[0])

False

### Running a script

We need to run a script when either any of its inputs have changed or any 
of its outputs do not exist. Return True if a follow-up script should be 
executed, False if nothing changed or executing the script failed.


In [None]:
#| exports

def _store_input_checksums(flow_object):
    """ Record the md5 checksums of a script's input files.

        The checksums are written to `input-checksum-<hash>` in the save
        directory, the file that `check_script_inputs` verifies.

        args:
            flow_object: dict of flow object
        returns:
            bool: True if the checksum file was written, False otherwise
    """

    # md5sum without file arguments would read from stdin; nothing to record.
    if not flow_object['in']:
        return False

    md5_encode_result = subprocess.run(
        ['md5sum', '-b', *flow_object['in']],
        cwd=save_dir,
        capture_output=True
    )
    if md5_encode_result.returncode:
        print(f"Could not calculate input checksums for {flow_object['name']}")
        print('STDERR------------\n', md5_encode_result.stderr.decode('UTF-8'))
        return False

    checksum_file = get_save_path(f"input-checksum-{calc_hash_from_flowobject(flow_object)}")
    with open(checksum_file, 'wb') as cf:
        cf.write(md5_encode_result.stdout)
    return True


def run_script(flow_object):
    """ Run a script in R 
        args:
            flow_object: dict of flow object
        returns:
            bool: False if nothing has changed, or an update failed,
                    and True if a follow-up script might need to be run

        The script is executed with `Rscript --vanilla` from the save
        directory. After a successful run the checksums of the inputs are
        stored, so the next call can detect that nothing has changed.
    """

    # Nothing to do when all outputs exist and the inputs still match the
    # stored checksums.
    if check_script_output(flow_object) and check_script_inputs(flow_object):
        return False
    
    # Run script
    run_script_result = subprocess.run(
        ['Rscript', '--vanilla', get_asset_path(flow_object['name'])],
        cwd=save_dir,
        capture_output=True
    )
    
    # check the return code
    if run_script_result.returncode:
        print(f"Run returned code {run_script_result.returncode}")
        print('STDOUT------------\n', run_script_result.stdout.decode('UTF-8'))
        print('STDERR------------\n', run_script_result.stderr.decode('UTF-8'))
        return False
    
    # check the output
    if not check_script_output(flow_object):
        print(f"Output not found for {flow_object['name']}")
        return False
    
    # Remember the state of the inputs this output was built from. A failure
    # here only means the script will be run again next time, so the result
    # of the run itself is still reported as successful.
    _store_input_checksums(flow_object)

    return True
    

### References

In [None]:
#| hide
# Write the exported cells of the notebooks to their Python modules.
import nbdev
nbdev.nbdev_export()