# Integration of the existing workflow systems data source

This notebook loads the reStructuredText file from the [existing workflow systems](https://github.com/common-workflow-language/common-workflow-language/wiki/Existing-Workflow-systems) data source (EWS) and extracts all listed tools and tool information. This also includes the tool URL, publication URL, repository URL and the description.

The EWS data source was not used to create the final candidate tool landscape; the existing workflow revisited data source was used instead, because the EWS data is incomplete.

## Imports

In [None]:
import yaml
import requests
import pandas as pd
from bs4 import BeautifulSoup
import markdown
import re

## Helper functions to extract and transform raw data

In [None]:
def extract_tool_relevant_line_from_string(raw_string):
    """Return the text between the list marker and the first URL of a tool line.

    Parameters
    ----------
    raw_string : str
        One line of the raw data source, expected to start with the
        reStructuredText auto-enumeration marker ``#.``.

    Returns
    -------
    str
        Everything after the marker (and the whitespace following it) up to,
        but not including, the first occurrence of ``http``. Trailing
        whitespace is kept. An empty string is returned when the line does not
        start with ``#.`` or contains no ``http``.
    """
    # The dot is escaped so that only the literal "#." marker is accepted;
    # an unescaped "." would also accept lines such as "## heading http...".
    m = re.search(r'^#\.\s+(.+?)http', raw_string)
    return m.group(1) if m else ""

def find_urls_in_string(raw_string):
    """Collect every http/https URL found in ``raw_string``.

    A URL starts at ``http://`` or ``https://`` and runs up to the next
    whitespace character, so punctuation directly attached to a URL is part
    of the returned value. URLs are returned in order of appearance; the
    list is empty when nothing matches.
    """
    url_pattern = re.compile(r'(https?://[^\s]+)')
    return [found.group(1) for found in url_pattern.finditer(raw_string)]

def string_in_string_list_element(str_name: str, list_of_strings: list[str]):
    """Tell whether any entry of ``list_of_strings`` occurs inside ``str_name``.

    Parameters
    ----------
    str_name : str
        The text that is searched (e.g. a URL).
    list_of_strings : list[str]
        Candidate substrings (e.g. host names).

    Returns
    -------
    bool
        ``True`` as soon as one candidate is a substring of ``str_name``,
        ``False`` otherwise (including for an empty candidate list).
    """
    return any(candidate in str_name for candidate in list_of_strings)

def get_tool_and_description_part(raw_string):
    """Split a tool line into a name part and a description part.

    The split happens at the first separator found: a ``.`` or ``:``
    anywhere, or whitespace followed by ``-`` or ``(``. The separator itself
    stays at the beginning of the description part.

    Returns
    -------
    tuple[str, str]
        ``(tool_name, tool_description)``, both stripped of surrounding
        whitespace. When no separator is present the whole stripped input is
        the name and the description is an empty string.
    """
    separator = re.search(r'[.:]|\s+[-(]\s*', raw_string)
    if separator is None:
        return raw_string.strip(), ""

    cut = separator.start()
    name_part, description_part = raw_string[:cut], raw_string[cut:]
    return name_part.strip(), description_part.strip()

def sanetize_from_rst_syntax(raw_string):
    """Remove ``|xx|`` markers from ``raw_string``.

    Only markers with exactly two word characters between the pipes are
    removed; surrounding whitespace is left untouched.
    """
    two_char_marker = re.compile(r"\|\w{2}\|")
    return two_char_marker.sub("", raw_string)

def sanitize_raw_tool_line(raw_tool_line):
    """Turn the raw text of a tool line into ``(tool_name, description)``.

    Steps: strip the ``|xx|`` markers, trim surrounding whitespace, split into
    name and description, then remove one leading ``-``, ``.`` or ``:``
    (with the whitespace around it) from the description.
    """
    # remove restructured text markup before splitting
    cleaned_line = sanetize_from_rst_syntax(raw_tool_line).strip()
    tool_part_name, description = get_tool_and_description_part(cleaned_line)

    # the split leaves the separator at the start of the description; drop it
    leading_separator = re.compile(r"^\s*[-\.:]\s*")
    return tool_part_name, leading_separator.sub("", description)

## Raw/Intermediate Stage - Load raw data source from the GitHub wiki and extract and transform relevant tool information

In [2]:
# Raw (unrendered) source of the "Existing Workflow systems" page of the
# common-workflow-language GitHub wiki; this is the file downloaded below.
RAW_DATA_SOURCE_URL = "https://raw.githubusercontent.com/wiki/common-workflow-language/common-workflow-language/Existing-Workflow-systems.rest"

In [3]:
# Download the raw wiki source.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook right here on an HTTP error instead of
# leaving `readme_text` undefined (or stale from an earlier run) for later cells.
response = requests.get(RAW_DATA_SOURCE_URL, timeout=30)
response.raise_for_status()
readme_text = response.text

In [5]:
# keep only the lines of the tool list, i.e. those starting with the "#." marker
lines = readme_text.split("\n")
wfl_tool_lines = [line for line in lines if line[:2] == "#."]

### Define variables to identify whether a URL points to (1) a publication or (2) a code hosting repository

In [9]:
# Host names: a URL containing one of these is treated as a publication URL.
article_host_url_identifiers = [
    "doi.org",
    "ieee.org",
    "arxiv.org",
    "jmlr.org",
    "plos.org",
    "ncbi.nlm.nih.gov",
    "biomedcentral.com",
    "oxfordjournals.org",
    "academic.oup.com",
    "semanticscholar.org",
    "openproceedings.org",
]

# Publication URLs (or fragments of them) that are not covered by the host
# names in `article_host_url_identifiers`; matched as substrings as well.
article_special_cases = [
    "10.21105/joss.00830",
    "https://hal.archives-ouvertes.fr/hal-01166298/file/openalea-PradalCohen-Boulakia.pdf",
    "http://bioinformatics.hsanmartino.it/bits_library/library/00079.pdf",
    "http://bioinformatics.hsanmartino.it/bits_library/library/00568.pdf",
    "https://www.harrisgeospatial.com/Learn/Whitepapers/TabId/2359/ArtMID/10212/ArticleID/17299/Workflow-Tools-in-ENVI.aspx",
    "ccl.cse.nd.edu/research/papers/jx-escience-2018.pdf",
    "http://www.i3s.unice.fr/~johan/publis/MOTEUR-poster-A4.pdf",
]

# Host names: a URL containing one of these is treated as a code repository URL.
repository_url_identifiers = [
    "bitbucket.com",
    "bitbucket.org",
    "gitlab.com",
    "github.com",
]

### Extract relevant tool data from raw source and convert into a pandas DataFrame

In [12]:
# substrings marking a URL as a publication: known hosts plus the special cases
publication_identifiers = article_host_url_identifiers + article_special_cases

# one entry per tool line in each of these parallel lists
all_repos, all_pubs, all_projects = [], [], []
all_tool_name_strs, all_tool_description_strs = [], []

for raw_line in wfl_tool_lines:
    name_and_description = extract_tool_relevant_line_from_string(raw_line)
    tool_name, tool_description = sanitize_raw_tool_line(name_and_description)

    repository_urls, publication_urls, project_urls = [], [], []
    for url in find_urls_in_string(raw_line):
        # precedence: repository first, then publication, everything else is a project URL
        if string_in_string_list_element(url, repository_url_identifiers):
            bucket = repository_urls
        elif string_in_string_list_element(url, publication_identifiers):
            bucket = publication_urls
        else:
            bucket = project_urls
        bucket.append(url)

    all_repos.append(repository_urls)
    all_pubs.append(publication_urls)
    all_projects.append(project_urls)
    all_tool_name_strs.append(tool_name)
    all_tool_description_strs.append(tool_description)

In [13]:
# Each of the parallel lists is passed as one row; transposing (.T) turns them
# into columns, so that every row of `df` corresponds to one tool line.
# Columns are still numbered 0..5 at this point.
df = pd.DataFrame([all_tool_name_strs,all_tool_description_strs, all_projects,all_repos,all_pubs,wfl_tool_lines]).T


### Set column names of DataFrame

In [14]:
# Names follow the order in which the lists were passed when building `df`;
# the three *_url columns hold lists of URLs, "raw" holds the unprocessed line.
df.columns=["tool","tool_description","project_url","repository_url","publication_url","raw"]

In [15]:
# Number of rows per column that hold a non-empty value (non-empty string or
# non-empty URL list): per-cell length, clipped to 0/1, summed per column.
# The length is taken cell by cell explicitly via Series.map; `df.agg([len])`
# only works elementwise through a fallback of Series.agg that pandas 2.1
# deprecated, after which `len` would be applied to the whole column instead.
df.apply(lambda column: column.map(len)).clip(0, 1).sum()

tool              len    336
tool_description  len     92
project_url       len    206
repository_url    len    154
publication_url   len     72
raw               len    336
dtype: int64

### Special handling required for Seqware

The extraction functions that split the tool name from the description do not always work, because different tools are represented differently in the raw data source.

In [16]:
# rows whose extracted tool name contains "seqware" (case-sensitive match)
is_seqware = df.tool.str.contains("seqware")

# keep the extracted name text by prepending it to the description,
# then replace the name with the fixed value "SeqWare"
df.loc[is_seqware, "tool_description"] = (
    df.loc[is_seqware, "tool"] + df.loc[is_seqware, "tool_description"]
)
df.loc[is_seqware, "tool"] = "SeqWare"

### Save result of raw/intermediate stage

In [18]:
# raw stage: tab-separated values (despite the .csv extension), index not written
df.to_csv(
    "data/01_raw/ews.csv",
    sep="\t",
    index=False,
)

In [19]:
# intermediate stage: same frame, default comma separator, index not written
df.to_csv(
    "data/02_intermediate/ews.csv",
    index=False,
)