## Important Info

- No official data encoding was provided. ISO-8859-1 is the best match (found through testing).
- Necessary data cleaning has been done (e.g., removed special chars like '\\', updated possible problem chars like 'Ö' using ISO-8859-1 encoding, etc.).

Some names will have diacritics or accents while others will not. These differences should be resolved.


## Python info

*Contains extraneous packages.*  All packages displayed for completeness.

The important packages, listed alphabetically, are dask, jupyter, neo4j, numpy, pandas, pyarrow.

In [None]:
!python --version
!pip freeze

Python 3.11.2
aiobotocore==2.5.4
aiohttp==3.8.5
aioitertools==0.11.0
aiosignal==1.3.1
anyio==3.7.1
appnope==0.1.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.12.2
bleach==6.0.0
boto3==1.28.60
botocore==1.31.17
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
cloudpickle==2.2.1
comm==0.1.4
dask==2023.9.2
debugpy==1.6.7.post1
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.7
executing==1.2.0
fastjsonschema==2.18.0
fqdn==1.5.1
frozenlist==1.4.0
fsspec==2023.9.2
graphviz==0.20.1
idna==3.4
importlib-metadata==6.8.0
ipykernel==6.25.1
ipython==8.14.0
isoduration==20.11.0
jedi==0.19.0
Jinja2==3.1.2
jmespath==1.0.1
json5==0.9.14
jsonpointer==2.4
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
jupyter-events==0.7.0
jupyter-lsp==2.2.0
jupyter_client==8.3.0
jupyter_core==5.3.1
jupyter_server==2.7.2
jupyter_server_terminals==0.4.4
jupyterla

## Important globals

In [None]:
import os
from pathlib import Path

# Base directories for the SemMedDB exports -- adjust to the local layout as needed.
root = os.path.join(Path.cwd(), "csv")
zipped = os.path.join(root, "cleaned_and_zipped")

# Plain CSV files, located directly under `root`.
PREDICATION_FILE = os.path.join(root, "semmedVER43_2023_R_PREDICATION.csv")
SENTENCE_FILE = os.path.join(root, "semmedVER43_2023_R_SENTENCE.csv")
ENTITY_FILE = os.path.join(root, "semmedVER43_2023_R_ENTITY.csv")

# `.csv.gz` counterparts, located under `zipped`.
PREDICATION_ZIP = os.path.join(zipped, "semmedVER43_2023_R_PREDICATION.csv.gz")
SENTENCE_ZIP = os.path.join(zipped, "semmedVER43_2023_R_SENTENCE.csv.gz")
ENTITY_ZIP = os.path.join(zipped, "semmedVER43_2023_R_ENTITY.csv.gz")

# Column labels for indexing, as listed in the schema description
# (https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html).
# Each schema maps every label to `str`.
PREDICATION_COLUMNS = dict.fromkeys(
    (
        "PREDICATION_ID",   # Auto-generated primary key for each unique predication
        "SENTENCE_ID",      # Foreign key to the SENTENCE table
        "PMID",             # The PubMed identifier of the citation to which the predication belongs
        "PREDICATE",        # The string representation of each predicate (for example TREATS, PROCESS_OF)
        "SUBJECT_CUI",      # The CUI of the subject of the predication
        "SUBJECT_NAME",     # The preferred name of the subject of the predication
        "SUBJECT_SEMTYPE",  # The semantic type of the subject of the predication
        "SUBJECT_NOVELTY",  # The novelty of the subject of the predication
        "OBJECT_CUI",       # The CUI of the object of the predication
        "OBJECT_NAME",      # The preferred name of the object of the predication
        "OBJECT_SEMTYPE",   # The semantic type of the object of the predication
        "OBJECT_NOVELTY",   # The novelty of the object of the predication
    ),
    str,
)

SENTENCE_COLUMNS = dict.fromkeys(
    (
        "SENTENCE_ID",                # Auto-generated primary key for each sentence
        "PMID",                       # The PubMed identifier of the citation to which the sentence belongs
        "TYPE",                       # 'ti' for the title of the citation, 'ab' for the abstract
        "NUMBER",                     # The location of the sentence within the title or abstract
        "SENT_START_INDEX",           # Character position (within the MEDLINE citation text) of the sentence's first character  NEW
        "SENT_END_INDEX",             # Character position (within the MEDLINE citation text) of the sentence's last character  NEW
        "SECTION_HEADER",             # Section header name of structured abstract (from Version 3.1)
        "NORMALIZED_SECTION_HEADER",  # Normalized section header name of structured abstract (from Version 3.1)
        "SENTENCE",                   # The actual string or text of the sentence
    ),
    str,
)

ENTITY_COLUMNS = dict.fromkeys(
    (
        "ENTITY_ID",    # Auto-generated primary key for each unique entity
        "SENTENCE_ID",  # The foreign key to SENTENCE table
        "CUI",          # The CUI of the entity
        "NAME",         # The preferred name of the entity
        "TYPE",         # The semantic type of the entity
        "GENE_ID",      # The EntrezGene ID of the entity
        "GENE_NAME",    # The EntrezGene name of the entity
        "TEXT",         # The text in the utterance that maps to the entity
        "START_INDEX",  # The first character position (in document) of the text denoting the entity
        "END_INDEX",    # The last character position (in document) of the text denoting the entity
        "SCORE",        # The confidence score
    ),
    str,
)

# Row counts from the downloads page
# (https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/SemMedDB_download.html).
# The ENTITY file is listed there with 1,892,228,683 rows, but `wc -l` on the
# unmodified file detects 1,887,317,669 newline characters (i.e. n-1 for n rows).
# That figure is consistent with the data read in after cleaning, so it is the
# one used here.
EXPECTED_ROWS = {
    "PREDICATION": 126_268_045,
    "SENTENCE": 253_029_872,
    # "ENTITY": 1_892_228_683,  # official amount from linked website.
    "ENTITY": 1_887_317_669 + 1,  # found amount.
}

## File Utils

In [None]:
import dask.dataframe as dd
import numpy as np
import math, time
from dask.diagnostics import ProgressBar


def read_csv(filename: str, col_labels: list[str] | None = None, types: dict | None = None) -> dd.DataFrame:
    """ Reads in CSV file.

    Every column is read as a string unless `types` overrides it. The file is
    decoded as ISO-8859-1, empty fields are treated as missing values, and
    malformed lines produce a warning.

    Args:
        filename: the path to file to load.
        col_labels: the expected labels for each column. Must be a list;
            any other non-empty value is reported and ignored. Default None.
        types: column labels and their accompanying types. Must be a dict;
            any other non-empty value is reported and ignored. Default None.
    Returns:
        A Dask Dataframe object with loaded data.
    """
    def print_values(bad_line: list[str]):
        # Debug hook for the commented-out `on_bad_lines` option below:
        # prints every field of a malformed row.
        for i, item in enumerate(bad_line):
            print(f"Item {i}: {item}")

    opts = {
        "blocksize":        50_000_000,   # 50 MB chunks (an int, as dask documents)
        "dtype":            np.str_,      # treat everything as string for now
        "on_bad_lines":     "warn",
        "encoding":         "iso-8859-1",
        "na_values":        [""],         # representation of empty values
        # "engine":       "python",       # uncomment next two lines if debugging / cleaning data
        # "on_bad_lines": print_values,   # warning, will be VERY slow.
    }

    # Type checking. isinstance() is used so that list/dict subclasses are
    # accepted; badly typed arguments are reported and the defaults kept.
    if col_labels:
        if isinstance(col_labels, list):
            opts["names"] = col_labels
        else:
            print(f"Column labels not in proper format. Expected list, got {type(col_labels)}.")
    if types:
        if isinstance(types, dict):
            opts["dtype"] = types
        else:
            print(f"Types are not in proper format. Expected dict, got {type(types)}.")

    return dd.read_csv(filename, **opts)

### Predications

In [None]:
# Build the Dask dataframe for the PREDICATION table.
try:
    predications = read_csv(PREDICATION_FILE, list(PREDICATION_COLUMNS), PREDICATION_COLUMNS)
except Exception as e:
    print(f"Something seriously messed up happened.\n{e}")
    # Re-raise so the cell fails visibly: otherwise `predications` would stay
    # undefined (or stale from an earlier run of this kernel) for later cells.
    raise

In [None]:
# Count the rows that were actually read, one partition at a time.
with ProgressBar():
    total_rows = predications.map_partitions(len).compute().sum()

expected_rows = EXPECTED_ROWS['PREDICATION']
lost_rows = expected_rows - total_rows
# Scale the lost fraction by 100 so the value matches its "Percent lost" label.
print(f"Lost Rows: {lost_rows}, Percent lost: {round(100 * lost_rows / expected_rows, 3)}")

[########################################] | 100% Completed | 116.05 s
Lost Rows: 0, Percent lost: 0.0


### Sentence

In [None]:
# Build the Dask dataframe for the SENTENCE table.
try:
    sentences = read_csv(SENTENCE_FILE, list(SENTENCE_COLUMNS), SENTENCE_COLUMNS)
except Exception as e:
    print(f"Something seriously messed up happened.\n{e}")
    # Re-raise so the cell fails visibly: otherwise `sentences` would stay
    # undefined (or stale from an earlier run of this kernel) for later cells.
    raise

In [None]:
# Count the rows that were actually read, one partition at a time.
with ProgressBar():
    total_rows = sentences.map_partitions(len).compute().sum()

expected_rows = EXPECTED_ROWS['SENTENCE']
lost_rows = expected_rows - total_rows
# Scale the lost fraction by 100 so the value matches its "Percent lost" label.
print(f"Lost Rows: {lost_rows}, Percent lost: {round(100 * lost_rows / expected_rows, 3)}")

[########################################] | 100% Completed | 417.76 s
Lost Rows: 0, Percent lost: 0.0


### Entities

In [None]:
# Build the Dask dataframe for the ENTITY table.
try:
    entities = read_csv(ENTITY_FILE, list(ENTITY_COLUMNS), ENTITY_COLUMNS)
except Exception as e:
    print(f"Something seriously messed up happened.\n{e}")
    # Re-raise so the cell fails visibly: otherwise `entities` would stay
    # undefined (or stale from an earlier run of this kernel) for later cells.
    raise

In [None]:
# Count the rows that were actually read, one partition at a time.
with ProgressBar():
    total_rows = entities.map_partitions(len).compute().sum()

expected_rows = EXPECTED_ROWS['ENTITY']
lost_rows = expected_rows - total_rows
# Scale the lost fraction by 100 so the value matches its "Percent lost" label.
print(f"Lost Rows: {lost_rows}, Percent lost: {round(100 * lost_rows / expected_rows, 3)}")

[########################################] | 100% Completed | 28m 53s
Lost Rows: 0, Percent lost: 0.0


## Uploading to AWS

In [None]:
from dotenv import dotenv_values
import boto3
import progressbar
import threading

# Load settings from the local `.env` file. The AWS credential keys read from it
# in this notebook are aws_access_key_id, aws_secret_access_key and
# aws_session_token.
env_v = dotenv_values(".env")


class Uploader:
    """Uploads local files to the `semdb-data` S3 bucket with a console progress bar.

    AWS credentials are taken from the module-level `env_v` mapping.
    """

    def __init__(self):
        self.storage_opts = {
            "aws_access_key_id": env_v["aws_access_key_id"],
            "aws_secret_access_key": env_v["aws_secret_access_key"],
            "aws_session_token": env_v["aws_session_token"],
        }

        self.s3 = boto3.client('s3', **self.storage_opts)

        # Progress bar of the upload currently in flight (set by upload_file).
        self.pg = None
        # boto3 may invoke the transfer callback from several worker threads,
        # so the read-modify-write in update_pb is serialised.
        self._lock = threading.Lock()

    def update_pb(self, size):
        """Transfer callback: advance the progress bar by `size` bytes."""
        with self._lock:
            self.pg.update(self.pg.currval + size)

    def upload_file(self, file):
        """Upload `file` to the bucket, keyed by its base filename.

        Args:
            file: path of the local file to upload.
        """
        self.pg = progressbar.progressbar.ProgressBar(
            maxval=os.stat(file).st_size)
        self.pg.start()

        self.s3.upload_file(file, 'semdb-data', os.path.basename(file), Callback=self.update_pb)

        # Only reached when the upload succeeded: draw the final 100% state
        # instead of leaving the bar at its last intermediate refresh.
        self.pg.finish()

# Shared uploader instance used by the CSV upload cells.
uploader = Uploader()

# Credentials handed to `to_parquet(..., storage_options=...)` in the parquet
# upload cells, plus the common S3 prefix those cells write under.
storage_opts = dict(
    key=env_v["aws_access_key_id"],
    secret=env_v["aws_secret_access_key"],
    token=env_v["aws_session_token"],
)
parquet_base_url = 's3://semdb-data/parquet/'

## CSV Uploads

*Note: implemented during early testing.*

In [None]:
# Upload the gzipped PREDICATION CSV; `Uploader.upload_file` stores it in the
# `semdb-data` bucket under the file's base name.
uploader.upload_file(PREDICATION_ZIP)

 99% |######################################################################## |

In [None]:
# Upload the gzipped SENTENCE CSV; `Uploader.upload_file` stores it in the
# `semdb-data` bucket under the file's base name.
uploader.upload_file(SENTENCE_ZIP)

 99% |######################################################################################################################################################### |

In [None]:
# Upload the gzipped ENTITY CSV; `Uploader.upload_file` stores it in the
# `semdb-data` bucket under the file's base name.
uploader.upload_file(ENTITY_ZIP)

## Parquet Uploads

In [None]:
# Write the PREDICATION dataframe to S3 as gzip-compressed Parquet.
s3_url = f"{parquet_base_url}predication"
with ProgressBar():
    predications.to_parquet(s3_url, storage_options=storage_opts, compression='gzip')

[########################################] | 100% Completed | 30m 35s


In [None]:
# Write the SENTENCE dataframe to S3 as gzip-compressed Parquet.
s3_url = f"{parquet_base_url}sentence"
with ProgressBar():
    sentences.to_parquet(s3_url, storage_options=storage_opts, compression='gzip')

[########################################] | 100% Completed | 3hr 1ms


In [None]:
# Write the ENTITY dataframe to S3 as gzip-compressed Parquet.
s3_url = f"{parquet_base_url}entity"
with ProgressBar():
    entities.to_parquet(s3_url, storage_options=storage_opts, compression='gzip')

[########################################] | 100% Completed | 5hr 57m


In [1]:
# S3 locations of the uploaded datasets, keyed by "<table>_<format>".
# NOTE(review): `Uploader.upload_file` in this notebook uses the bare filename
# as the S3 key, while the CSV URLs below carry a `CSV/` prefix -- confirm where
# the CSV objects actually live.
urls = dict(
    predication_parquet='s3://semdb-data/parquet/predication/',
    sentence_parquet='s3://semdb-data/parquet/sentence/',
    entity_parquet='s3://semdb-data/parquet/entity/',
    predication_csv='s3://semdb-data/CSV/semmedVER43_2023_R_PREDICATION.csv.gz',
    sentence_csv='s3://semdb-data/CSV/semmedVER43_2023_R_SENTENCE.csv.gz',
    # entity_csv: not uploaded.
)