# RScript-Bridge

> Bridge between Stactics AICore framework and RScript prediction scripts

In [None]:
#| default_exp rscript

In [None]:
from nbdev.showdoc import *
import addroot


In [None]:
#| export
import json, os
import subprocess
import warnings

from functools import reduce



In [None]:
#| hide
#warnings.simplefilter(action='ignore', category=FutureWarning)
#warnings.simplefilter(action='ignore', category=DeprecationWarning)

# Both directories live below the working directory the notebook runs in:
# R scripts under assets/rscript, data files under saves/rscript.
_working_dir = os.path.abspath(os.getcwd())
assets_dir = os.path.join(_working_dir, 'assets', 'rscript')
save_dir = os.path.join(_working_dir, 'saves', 'rscript')


def get_asset_path(script_name):
    """Return the path of `script_name` inside the assets directory."""
    return os.path.join(assets_dir, script_name)


def get_save_path(datafile_name):
    """Return the path of `datafile_name` inside the save directory."""
    return os.path.join(save_dir, datafile_name)


In [None]:
#| hide
from IPython.display import display
from IPython.display import Markdown

from tabulate import tabulate



In [None]:
#| hide
def display_table(feature_dict):
    """Render a list of row dictionaries as a GitHub-flavoured Markdown table.

    Parameters
    ----------
    feature_dict : list of dict
        One dictionary per table row. The keys of the first row are used as
        the column headers; each row contributes its values in its own key
        order.

    Nothing is displayed when `feature_dict` is empty, because there is no
    first row to take the headers from.
    """
    rows = list(feature_dict)
    if not rows:
        # Guard against the IndexError that rows[0] would raise below.
        return
    display(Markdown(tabulate(
        [list(row.values()) for row in rows],
        headers=list(rows[0].keys()),
        tablefmt='github'
    )))
# Registry of the R scripts, filled in further down: maps a script file name
# to a dict with its input files ('in'), output files ('out') and the R
# libraries it needs ('libs').
data_file_flow = {}

def display_flow_table(flow_table):
    """Display a script/file flow mapping as a Markdown table.

    Parameters
    ----------
    flow_table : dict
        Maps a script file name to a dict of properties (for example
        ``'in'``, ``'out'`` and ``'libs'``). One row is shown per script and
        one column per property; a property a script does not have is shown
        as a single space.

    Nothing is displayed when `flow_table` is empty.
    """
    # Take the columns from the table that was passed in rather than from the
    # module-level `data_file_flow`, and keep them in first-seen order so the
    # layout is the same on every run.
    columns = list(dict.fromkeys(
        column
        for properties in flow_table.values()
        for column in properties
    ))
    rows = [
        dict(file=script, **{column: properties.get(column, ' ') for column in columns})
        for script, properties in flow_table.items()
    ]
    if rows:
        display_table(rows)


## Running R code

Scripts written in R can be run from a Python program using `subprocess` and `Rscript`.



### `Rscript`

A script can be run from the commandline with

    Rscript ascript.R



### `subprocess`

[Python's `subprocess` module](https://docs.python.org/3.11/library/subprocess.html#) has the tools to execute external programs like `Rscript`.


In [None]:

# Run the hello-world R script and show what it wrote to stdout.
hello_result = subprocess.run(
    ['Rscript', get_asset_path('hello.R')],
    capture_output=True,
)
hello_result.stdout.decode('UTF-8')

'[1] "hello world"\n'

## The scripts



### `Data_preparation.R`

#### Libraries

* lubridate
* stringr
* zoo

#### Input

* `Data/Meta_data.csv`
* `Data/Sapflux_Tilia_train.csv`
* `Data/Weather_Tilia_train.csv`
* `Data/Weather_Tilia_pred.csv`

#### Output

* `Modelling_data.RData`
* `Prediction_data.RData`


In [None]:
# Data preparation: reads the CSV files and writes the two RData sets.
data_file_flow['Data_preparation.R'] = {
    'in': [
        'Data/Meta_data.csv',
        'Data/Sapflux_Tilia_train.csv',
        'Data/Weather_Tilia_train.csv',
        'Data/Weather_Tilia_pred.csv',
    ],
    'out': [
        'Modelling_data.RData',
        'Prediction_data.RData',
    ],
    'libs': ['lubridate', 'stringr', 'zoo'],
}


### `Prediction_part1.R`

#### Libraries

* lubridate
* stringr
* mgcv

#### Input

* `Modelling_data.RData`

#### Output

* `Fitted_models.RData`
* `Weights.RData`



In [None]:
# Prediction part 1: reads the modelling data, writes fitted models and weights.
data_file_flow['Prediction_part1.R'] = {
    'in': [
        'Modelling_data.RData',
    ],
    'out': [
        'Fitted_models.RData',
        'Weights.RData',
    ],
    'libs': ['lubridate', 'stringr', 'mgcv'],
}



### `Prediction_part2.R`

#### Libraries

* lubridate
* stringr
* mgcv

#### Input

* `Fitted_models.RData`
* `Weights.RData`
* `Modelling_data.RData`
* `Prediction_data.RData`

#### Output

* `Predicted_sapflux.RData`

In [None]:
# Prediction part 2: reads models, weights and both data sets, writes the
# predicted sapflux.
data_file_flow['Prediction_part2.R'] = {
    'in': [
        'Fitted_models.RData',
        'Weights.RData',
        'Modelling_data.RData',
        'Prediction_data.RData',
    ],
    'out': [
        'Predicted_sapflux.RData',
    ],
    'libs': ['lubridate', 'stringr', 'mgcv'],
}


### `Prediction_part3.R`

#### Libraries

* lubridate
* stringr

#### Input

* `Predicted_sapflux.RData`

#### Output

* `Predicted_water_usage.RData`


In [None]:
# Prediction part 3: reads the predicted sapflux, writes the predicted water usage.
data_file_flow['Prediction_part3.R'] = {
    'in': [
        'Predicted_sapflux.RData',
    ],
    'out': [
        'Predicted_water_usage.RData',
    ],
    'libs': ['lubridate', 'stringr'],
}

In [None]:
#| hide
# The directive above has to be the first line of the cell: nbdev only reads
# directives from the leading `#|` lines, so a plain comment in front of it
# would make it an ordinary comment.

# Position of every script in the processing chain, in registration order.
script_order = {script: index for index, script in enumerate(data_file_flow)}

# Every input and output file, de-duplicated while keeping the order in
# which the files are first mentioned ('libs' entries are skipped).
data_files = list(dict.fromkeys(
    file_name
    for properties in data_file_flow.values()
    for direction, file_names in properties.items()
    if direction in ('in', 'out')
    for file_name in file_names
))

# One row per data file, one column per script: 'in' when the script reads
# the file, 'out' when it writes it, '--' when it does not touch it.
display(Markdown(tabulate(
    [
        # Row label: file name without directory and extension, underscores
        # shown as spaces. The directory is stripped first so a dot in a
        # directory name cannot cut the label short.
        [file_name.split('/')[-1].split('.')[0].replace('_', ' ')] + [
            'in' if file_name in properties.get('in', ())
            else 'out' if file_name in properties.get('out', ())
            else '--'
            for properties in data_file_flow.values()
        ]
        for file_name in data_files
    ],
    headers=['data-file / script'] + [
        script.replace('_', ' ') for script in data_file_flow
    ],
    tablefmt='github'
)))

| data-file / script    | Data preparation.R   | Prediction part1.R   | Prediction part2.R   | Prediction part3.R   |
|-----------------------|----------------------|----------------------|----------------------|----------------------|
| Meta data             | in                   | --                   | --                   | --                   |
| Sapflux Tilia train   | in                   | --                   | --                   | --                   |
| Weather Tilia train   | in                   | --                   | --                   | --                   |
| Weather Tilia pred    | in                   | --                   | --                   | --                   |
| Modelling data        | out                  | in                   | in                   | --                   |
| Prediction data       | out                  | --                   | in                   | --                   |
| Fitted models         | --                   | out                  | in                   | --                   |
| Weights               | --                   | out                  | in                   | --                   |
| Predicted sapflux     | --                   | --                   | out                  | in                   |
| Predicted water usage | --                   | --                   | --                   | out                  |

In [None]:
# Unique input and output files of all scripts, in order of first appearance.
data_files

['Data/Meta_data.csv',
 'Data/Sapflux_Tilia_train.csv',
 'Data/Weather_Tilia_train.csv',
 'Data/Weather_Tilia_pred.csv',
 'Modelling_data.RData',
 'Prediction_data.RData',
 'Fitted_models.RData',
 'Weights.RData',
 'Predicted_sapflux.RData',
 'Predicted_water_usage.RData']

## Import R libraries

R libraries can be installed from the command line with

    Rscript -e 'install.packages("drat", repos="https://cloud.r-project.org")'

In [None]:
# Ask Rscript for its version and print the captured stdout as text.
version_result = subprocess.run(['Rscript', '--version'], capture_output=True)
print(version_result.stdout.decode('UTF-8'))


Rscript (R) version 4.2.2 (2022-10-31)



In [None]:
# The same call once more, this time showing the complete result object
# (arguments, return code, stdout and stderr).
subprocess.run(
    ['Rscript', '--version'],
    capture_output=True,
)

CompletedProcess(args=['Rscript', '--version'], returncode=0, stdout=b'Rscript (R) version 4.2.2 (2022-10-31)\n', stderr=b'')

In [None]:
# Every R library required by at least one script. Sorted, because the
# iteration order of a set of strings differs between interpreter sessions
# and the listing should be the same on every run.
sorted({
    library
    for properties in data_file_flow.values()
    for library in properties['libs']
})

['stringr', 'mgcv', 'zoo', 'lubridate']

In [None]:
# Install the mgcv package from the CRAN cloud mirror by passing an R
# expression to Rscript; stdout and stderr are captured on the result.
install_expression = "install.packages('mgcv', repos='https://cloud.r-project.org')"
run_script_result = subprocess.run(
    ['Rscript', '-e', install_expression],
    capture_output=True,
)


In [None]:
# Print the captured stderr of the install run as UTF-8 text; as the output
# shows, the installation log ends up there.
print(run_script_result.stderr.decode('UTF-8'))

Installing package into ‘/home/fenke/R/x86_64-suse-linux-gnu-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cloud.r-project.org/src/contrib/mgcv_1.9-1.tar.gz'
Content type 'application/x-gzip' length 1083217 bytes (1.0 MB)
downloaded 1.0 MB

* installing *source* package ‘mgcv’ ...
** package ‘mgcv’ successfully unpacked and MD5 sums checked
** using staged installation
** libs
discrete.c: In function ‘CXWyd’:
   int n,*k,*ks,*m,*p,*ts,*dt,*qc,*nthreads,*cs,*rs,nx,nt,ncs,nrs,*ar_stop,*ar_row,*cy;
                                                              ^~~
   int n,*k,*ks,*m,*p,*ts,*dt,*qc,*nthreads,*cs,*rs,nx,nt,ncs,nrs,*ar_stop,*ar_row,*cy;
                                                 ^~
   int n,*k,*ks,*m,*p,*ts,*dt,*qc,*nthreads,*cs,*rs,nx,nt,ncs,nrs,*ar_stop,*ar_row,*cy;
                                   ^~~~~~~~
installing to /home/fenke/R/x86_64-suse-linux-gnu-library/4.2/00LOCK-mgcv/00new/mgcv/libs
** R
** data
** inst
** byte-compile and prepare package f

## Running the scripts

### Checksum

Each script has its own set of input files and should be run to
update its output when either its inputs have changed or its 
expected output does not exist.

We can check for filechanges using a hashing algorithm, for 
instance MD5 or SHA-256. These are available either in Python
or from the commandline.

Let's look at the commandline version of MD5, on Linux this is
`md5sum`, with the input files for the preparation stage:

In [None]:
# Input files of the first registered script, pretty-printed as JSON.
first_script = list(data_file_flow)[0]
print(json.dumps(data_file_flow[first_script]['in'], indent=3))

[
   "Data/Meta_data.csv",
   "Data/Sapflux_Tilia_train.csv",
   "Data/Weather_Tilia_train.csv",
   "Data/Weather_Tilia_pred.csv"
]


md5sum will output hashes to stdout, which `subprocess.run` captures for us

In [None]:
# Work with the first registered script and the files it reads.
script_name = list(data_file_flow)[0]
input_files = data_file_flow[script_name]['in']

# Show the script name without its extension next to its input files.
(script_name.split('.')[:-1], input_files)

(['Data_preparation'],
 ['Data/Meta_data.csv',
  'Data/Sapflux_Tilia_train.csv',
  'Data/Weather_Tilia_train.csv',
  'Data/Weather_Tilia_pred.csv'])

In [None]:

# Hash every input file with md5sum in binary mode; the file names are
# resolved relative to the save directory.
md5_command = ['md5sum', '-b', *input_files]
md5_encode_result = subprocess.run(md5_command, cwd=save_dir, capture_output=True)
print(md5_encode_result.stdout.decode('UTF-8'))

4bed61a77505bfd52032591d5c3a6050 *Data/Meta_data.csv
6d705d98caa6618a4a990c3742c16564 *Data/Sapflux_Tilia_train.csv
1232592f9488ce4fbb4ae11ba5be0349 *Data/Weather_Tilia_train.csv
366dac1bf64003d1d08fca6121c036bd *Data/Weather_Tilia_pred.csv



If we want to check the files we run it with the `-c` option and a file with the previously calculated checksums

In [None]:
checksum_file = os.path.join(save_dir, f"input-checksum-{script_name.split('.')[0]}")

# Only store checksums from a run in which md5sum succeeded; after a failed
# run (for example a missing input file) stdout holds an incomplete list.
# Raises subprocess.CalledProcessError when the return code is non-zero.
md5_encode_result.check_returncode()

# Write md5sum's output back byte-for-byte, so the stored file does not
# depend on the platform's default text encoding.
with open(checksum_file, 'wb') as cf:
    cf.write(md5_encode_result.stdout)

In [None]:
# Verify the input files against the stored checksums; md5sum reports one
# line per file on stdout.
check_command = ['md5sum', '-c', checksum_file]
md5_check_result = subprocess.run(check_command, cwd=save_dir, capture_output=True)

check_report = md5_check_result.stdout.decode('UTF-8')
print(check_report)
print("Run returned code {}".format(md5_check_result.returncode))

Data/Meta_data.csv: OK
Data/Sapflux_Tilia_train.csv: OK
Data/Weather_Tilia_train.csv: OK
Data/Weather_Tilia_pred.csv: OK

Run returned code 0


Had there been a change to a file it would have looked like

In [None]:
# Simulate a changed input file: make a copy of the checksum file in which
# the first recorded hash is replaced by zeros. The copy is created here so
# the demonstration does not depend on a file prepared outside the notebook.
modified_checksum_file = checksum_file + '-modified'
with open(checksum_file, 'rb') as cf:
    checksum_lines = cf.read().splitlines()
if checksum_lines:
    # Each line reads "<hash> *<file name>"; only the hash is altered.
    digest, separator, remainder = checksum_lines[0].partition(b' ')
    checksum_lines[0] = b'0' * len(digest) + separator + remainder
with open(modified_checksum_file, 'wb') as cf:
    cf.write(b'\n'.join(checksum_lines) + b'\n')

md5_check_result = subprocess.run(
    ['md5sum', '-c', modified_checksum_file],
    cwd=save_dir,
    capture_output=True)
print(md5_check_result.stdout.decode('UTF-8'))
# Anything md5sum writes to stderr is shown as well.
print(md5_check_result.stderr.decode('UTF-8'))
print(f"Run returned code {md5_check_result.returncode}")


Run returned code 1


We don't really need specifics, only the return code will
do.

### Running a script

For each of the scripts we run the following code

In [None]:
# Continue with the second registered script.
script_name = list(data_file_flow)[1]
print(script_name)

Prediction_part1.R


In [None]:
# Files the selected script reads, and the file that stores their checksums
# (named after the script without its extension).
input_files = data_file_flow[script_name]['in']
script_stem = script_name.split('.')[0]
checksum_file = os.path.join(save_dir, "input-checksum-" + script_stem)


In [None]:

# Verify the inputs of the selected script against its stored checksums.
check_command = ['md5sum', '-c', checksum_file]
md5_check_result = subprocess.run(check_command, cwd=save_dir, capture_output=True)

# Show stdout first, then stderr.
for captured in (md5_check_result.stdout, md5_check_result.stderr):
    print(captured.decode('UTF-8'))

print("Run returned code {}".format(md5_check_result.returncode))


md5sum: /home/fenke/repos/corebridge/nbs/saves/rscript/input-checksum-Prediction_part1: No such file or directory

Run returned code 1


In [None]:

# A non-zero return code from the checksum verification triggers a run of
# the R script, executed from within the save directory.
if md5_check_result.returncode != 0:
    rscript_command = ['Rscript', '--vanilla', get_asset_path(script_name)]
    run_script_result = subprocess.run(rscript_command, cwd=save_dir, capture_output=True)

    print(f"Run returned code {run_script_result.returncode}")
    captured_streams = (
        ('STDOUT', run_script_result.stdout),
        ('STDERR', run_script_result.stderr),
    )
    for label, captured in captured_streams:
        print(f'{label}------------\n', captured.decode('UTF-8'))


Run returned code 1
STDOUT------------
 
STDERR------------
 
Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union

Loading required package: nlme
This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
Error in readChar(con, 5L, useBytes = TRUE) : cannot open the connection
Calls: load -> readChar
In readChar(con, 5L, useBytes = TRUE) :
  cannot open compressed file 'Modelling_data.RData', probable reason 'No such file or directory'
Execution halted



In [None]:
# Try to load an R library by evaluating `library(<name>)` with Rscript.
libname = 'mgcv'
load_expression = f"library({libname})"
test = subprocess.run(
    ['Rscript', '-e', load_expression],
    capture_output=True,
)


In [None]:
# Print the captured stderr of the library test as UTF-8 text; as the output
# shows, the package start-up messages end up there.
print(test.stderr.decode('UTF-8'))

Loading required package: nlme
This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.



### References

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()