| <div> <img src="https://storage.googleapis.com/open-ff-common/openFF_logo.png" width="100"/></div>|      |<h1>Adding New Disclosures to the Open-FF Data:<br><br>Download, Curate, Assemble, Test, and Archive<br></h1>|
|---|---|---|

In [None]:
!git clone https://github.com/gwallison/intg_support.git &>/dev/null;
!pip install itables  &>/dev/null;
!pip install geopandas  &>/dev/null;

In [None]:
# preamble code
import os, shutil
import pandas as pd
from IPython.display import HTML, display
from IPython.display import Markdown as md
import requests
import datetime
from intg_support.file_handlers import store_df_as_csv, get_csv, save_df, get_df

# Switch between interactive itables rendering and the plain notebook display.
use_itables = True

if use_itables:
    from itables import init_notebook_mode
    init_notebook_mode(all_interactive=True)
    from itables import show as iShow
    import itables.options as opt
    # module-level itables options applied to every table shown in this notebook
    opt.classes="display compact cell-border"
    opt.maxBytes = 0
    opt.maxColumns = 0
else:
    def iShow(df,maxBytes=0,classes=None,**kwargs):
        """Fallback for itables' show(): display `df` with the standard notebook renderer.

        `maxBytes`, `classes` and any other keyword (for example the
        `columnDefs` / `scrollX` options some cells pass) are accepted and
        ignored so the same calls work whether or not itables is in use.
        """
        display(df)
       
        
def clr_cell(txt='Cell Completed', color = '#cfc'):
    """Display a timestamped status banner in the cell output.

    txt   -- heading text shown inside the banner
    color -- CSS background color of the banner
    """
    stamp = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    pieces = [
        f"""<div style="background-color: {color}; padding: 10px; border: 1px solid green;">""",
        f'<h3> {txt} </h3> {stamp}',
        "</div>",
    ]
    display(md(''.join(pieces)))

def completed(status=True,txt=''):
    """Show the end-of-cell banner: default color when `status` is truthy, pink otherwise.

    When `txt` is the empty string a default message matching `status` is used.
    """
    if txt == '':
        txt = ('This step completed normally.' if status
               else 'Problems encountered in this cell! Resolve before continuing.')
    if not status:
        clr_cell(txt,color='pink')
        return
    clr_cell(txt)

def get_raw_df(cols=None):
    """Read raw_flat.parquet from work_dir; with `cols=None` every column is returned."""
    raw_fn = os.path.join(work_dir,'raw_flat.parquet')
    return pd.read_parquet(raw_fn, columns=cols)
    

# Workspace layout. With root_dir == '' every path below is relative to the
# current working directory.
root_dir = ''
orig_dir = os.path.join(root_dir,'orig_dir')    # previous repository / reference files
work_dir = os.path.join(root_dir,'work_dir')    # files generated during this run
final_dir = os.path.join(root_dir,'final')      # assembled data set and new repository
ext_dir = os.path.join(root_dir,'ext')          # external (non-FracFocus) data files


# Set up
Construct a workspace and collect the resources needed.

## Create directories and fetch previous repository as a reference
Before we start downloading new FracFocus data, we set up a working directory structure and collect the resources we need.

**Directories constructed**
| directory name | description |
| ---: | :--- |
|**sources**|founding repo, older archives|
|**orig_dir**| expanded zip files, downloaded external files, etc: files used as a model for the next round, but not to be directly saved|
|**work_dir**| This is the working directory where new curation files created by these routines are kept. These 'generated' files are saved at the end of the process into either the repository or other archives.|
|**ext**| non-FracFocus data files used in constructing the Open-FF data set |
|**code**| Python files used in this process.  (User Jupyter files, such as this one, are in the root directory.) |
|**final**| the place for final files, archives and repositories. |

In [None]:
# Control download: typically set to True
#    set to False if you can skip the downloading part of the repo and the external data, for example, during testing.
# Each flag is read by one of the following cells.

download_repo = False    # download cloud_repo.zip into orig_dir
download_ext = False     # download and unpack openff_ext_files.zip into ext_dir
download_FF = False      # fetch new FracFocus bulk data; when False, testData.zip in work_dir is used
unpack_to_orig = False   # unpack cloud_repo.zip into orig_dir
create_raw = True        # run the Read_FF import of the downloaded zip

In [None]:
import urllib.request

# Sub-directories that must exist inside particular top-level directories.
required_subdirs = {
    final_dir: ['pickles','curation_files','CAS_ref_files','CompTox_ref_files'],
    work_dir: ['new_CAS_REF','new_COMPTOX_REF'],
}

dirs = [orig_dir,work_dir,final_dir,ext_dir]
for d in dirs:
    # the top-level directory is listed first so its sub-directories can be created inside it
    targets = [d] + [os.path.join(d,oth) for oth in required_subdirs.get(d,[])]
    for target in targets:
        if os.path.isdir(target):
            print(f'Directory exists: {target}')
        else:
            print(f'Creating directory: {target}')
            os.mkdir(target)

s_repo_name = os.path.join(orig_dir,'cloud_repo.zip')

# Tracks whether a requested download failed, so the final status banner is accurate.
download_failed = False
if download_repo:
    url = 'https://storage.googleapis.com/open-ff-common/repos/cloud_repo.zip'
    print(url)
    try:
        urllib.request.urlretrieve(url, s_repo_name)
    except Exception as err:
        # report the cause instead of swallowing it; execution continues with any existing archive
        download_failed = True
        print(f'{type(err).__name__}: {err}')

if download_failed or not download_repo:
    # only announce this when no fresh copy was actually obtained
    print('Continuing without downloading fresh copy of repository')

if unpack_to_orig:
    print(' -- Unpacking existing repository into "orig" directory')
    shutil.unpack_archive(s_repo_name,orig_dir)

if download_failed:
    completed(False,'Problem downloading repository!')
else:
    completed()

## Download external files used to assemble final data set

In [None]:


# Download the zip of external reference files and unpack it into ext_dir.
ext_name = os.path.join(ext_dir,'openff_ext_files.zip')
if download_ext:
    print("This step may take several minutes. There are big files to transfer...")
    url = 'https://storage.googleapis.com/open-ff-common/openff_ext_files.zip'
    try:
        print(f'Downloading external files from {url}')
        urllib.request.urlretrieve(url, ext_name)
        print('Unpacking zip into "ext" directory')
        shutil.unpack_archive(ext_name,ext_dir)
    except Exception as err:
        # show what actually went wrong (network, disk, bad archive) before flagging the cell
        print(f'{type(err).__name__}: {err}')
        completed(False,'Problem downloading external files!')
    else:
        completed()
else:
    completed(True,'Completed without new external download')


## Download raw files from FracFocus

In [None]:
import intg_support.fetch_new_bulk_data as fnbd
# Either fetch a new FracFocus bulk download, or make sure the test data set
# (testData.zip) is present in work_dir.
if download_FF:
    # the value returned by store_FF_bulk is used as the cell's success status
    completed(fnbd.store_FF_bulk(newdir = work_dir,sources=orig_dir, archive=True, warn=True))
else:
    fn = os.path.join(work_dir,'testData.zip')
    if os.path.isfile(fn):
        completed(True,'Completed using existing FF download')
    else:
        # no local copy yet: retrieve the stored test data set
        url = 'https://storage.googleapis.com/open-ff-common/repos/testData.zip'
        urllib.request.urlretrieve(url, fn)
        completed(True,'Copied previously downloaded FF dataset')

## Create master raw FracFocus set as file

In [None]:
import intg_support.Bulk_data_reader as bdr
# Import the downloaded zip into a raw data frame, then sanity-check its size
# against the record list of the previous repository.
if create_raw:
    rff = bdr.Read_FF(in_name='testData.zip', 
                      zipdir=work_dir,workdir = work_dir,
                      origdir=orig_dir,
                      flat_pickle = 'raw_flat.parquet')
    raw_df = rff.import_raw()
    # get number of records from old, repository data set
    # oldrecs = pd.read_pickle(os.path.join(orig_dir,'pickles','chemrecs.pkl'))
    oldrecs = get_df(os.path.join(orig_dir,'pickles','chemrecs.parquet'),
                    cols=['UploadKey'])
    # a download with fewer rows than the previous repository is flagged as a problem
    if len(oldrecs)>len(raw_df):
        completed(False,'The old repository has MORE records than current download. Bad download??')
    else:
        # an empty import is also reported as a failure
        completed(len(raw_df)>0)
else:
    completed(True,'No action taken; new FF download skipped')

#### Add new disclosures to UploadKey file

In [None]:
# Append the disclosures (UploadKeys) not yet listed in upload_dates.parquet,
# stamped with today's date, and save the combined list into work_dir.
today = datetime.datetime.today()
datefn= os.path.join(orig_dir,'curation_files','upload_dates.parquet')
outdf = get_df(datefn)
uklst = outdf.UploadKey.unique()

df = get_raw_df()
ndf = df[~df.UploadKey.isin(uklst)].copy() # just the new ones

# one row per new UploadKey; count() tallies the non-null OperatorName values in each group
gb = ndf.groupby('UploadKey',as_index=False)['OperatorName'].count()
gb['date_added'] = today.strftime("%Y-%m-%d")
gb.rename({'OperatorName':'num_records'}, inplace=True,axis=1)
print(f'Number of new disclosures added to list: {len(gb)}')

# Use item assignment: attribute assignment (outdf.weekly_report = ...) only
# updates a column that already exists and cannot create a new one.
outdf['weekly_report'] = 'DONE'
gb['weekly_report'] = 'NO'
outdf = pd.concat([outdf,gb],sort=True)
# outdf.to_csv(os.path.join(work_dir,'upload_dates.csv'),index=False)
save_df(outdf,os.path.join(work_dir,'upload_dates.parquet'))
 

# Curation steps
These steps are a mix of automated and hand-performed curation tasks. The hand performed tasks require the user to examine database values in spreadsheets and then to make and record decisions on those values about individual records.



## `CASNumber` and `IngredientName` curation tasks

Open-FF uses both raw input fields `CASNumber` and `IngredientName` to clarify chemical identity in each record.  These two fields **should** agree on the identity, but often only one field provides unambiguous identification (usually `CASNumber`) and sometimes the two are conflicting.  Our target is an accurate `bgCAS`, which is our "best guess" at a CAS Registration Number for the material reported in the disclosure.
That is,
> Unique `CASNumber` | `IngredientName` pair  $\rightarrow$ `bgCAS`

There are currently over 28,000 unique pairs.

The curation process outlined below gets our identification as close as possible to our target.  It requires using several sources of information and part of the process includes collecting that information.  Some steps are partially automated whereas other steps require our judgement and are therefore manual.  

This process is also incremental - we only need to curate the *new* chemical identifiers in the most recent download.  However, this process can also be used to examine the whole curated set to refine identification performed earlier.  

Resources needed to create the CAS-Ing list:
- CAS_curated: a list of `CASNumber` values and the tentative `bgCAS` number they imply.
- `IngredientName` synonym list: list of synonyms (and associated CAS number) to weigh against `IngredientName`. This is created from a collection of CAS and CompTox references.
- `TradeName` values associated with CAS-Ing pairs -- this aspect is still in development, though curators may manually examine TradeNames to make decisions.

### step A - use previous list to find any new `CASNumber` values
1. compare list of `CASNumber` values in rawdf to list of `CASNumber` in *olddir/curation_files/cas_curated.csv*
1. make and display list of those new ones.

#### Next steps for YOU:
>Next steps for **new** `CASNumber` (see note below): 
>- if the implied chemical is not in the CAS references, go to SciFinder and make new entry (manual!)
>- otherwise, can skip the SciFinder steps, but go to the CAS_curate step. 
>
>If there are no **new** `CASNumber`, 
>- skip all the way to moving the current CAS_curate.csv to *newdir/curation_files/CAS_curate.csv*

Note: these "new" `CASNumber` values can be completely new chemicals or just a new version of an already used material (for example, we might find '00000050-00-0' for the authoritative CASRN '50-00-0' that is already documented in Open-FF.).   They may also be something that will not resolve into a valid CASRN, for example: 'proprietary by operator'. You will assign appropriate 'bgCAS' values in the curation step.

In [None]:
# STEP A
# List the tentative new CASNumber values found in the raw data and tell the
# user which step to go to next.
# exit_early() # uncomment this line to stop execution at this  cell.
import intg_support.CAS_master_list as casmaster

newcas = casmaster.get_new_tentative_CAS_list(get_raw_df(),orig_dir=orig_dir,work_dir=work_dir)
if len(newcas)>0:
    iShow(newcas)
    # rows with tent_is_in_ref == False are the ones sent to the SciFinder step
    if len(newcas[newcas.tent_is_in_ref==False])>0:
        display(md('## Go to STEP B: Use SciFinder for `CASNumber`s not in reference already'))
    else:
        display(md('## Nothing to look up in SciFinder, but some curation necessary.  Skip to **Step XX**'))
else:
    display(md('### No new CAS numbers to curate.  Skip to **Step E**'))
completed() 

### Step B - Add chem info from SciFinder of new tentative CAS numbers into CAS_ref files
If a chemical on this list hasn't been seen before in FF, we need to add some information into the CAS_ref files.
To do that, we currently use SciFinder, which is a product of the Chemical Abstract Service, the naming authority for materials.

Save the file in the `new_CAS_REF` directory of **work_dir**.

Once you have saved the file, run the following cell to **verify** that the new `CASNumber`s that you've found to be valid and created a reference for, actually made it into the SciFinder reference.



In [None]:
# Step B
import intg_support.make_CAS_ref_files as mcrf
# This first part creates a new reference file that includes the new SciFinder data.
#   (we will run this again after we collect the CompTox data)
maker = mcrf.CAS_list_maker(orig_dir,work_dir)
maker.make_partial_set()

# Next we make a list of CAS records that need to be curated
# (relies on `casmaster`, imported in the Step A cell)
newcas = casmaster.get_new_tentative_CAS_list(get_raw_df(),orig_dir=orig_dir,work_dir=work_dir)
casmaster.make_CAS_to_curate_file(newcas,ref_dir=orig_dir,work_dir=work_dir)

### Step C - Curate the CAS_to_curate file
In this step you will manually edit the *work_dir/CAS_curated_TO_EDIT.csv* file to curate the new `CASNumber` values. There are typically only a handful of lines in this file that you need to curate, just those newly discovered in the latest FF download. 

Your task is to assign a `curatedCAS` value for each new line, using the clues there.

Once you have completed the editing, save the file back to *work_dir/CAS_curated_modified.csv*.

#### Step C.1 - Make sure all `CASNumber` values have been curated

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.

# Report whether CASNumber curation is finished, using the result of
# casmaster.is_new_complete (`casmaster` is imported in the Step A cell).
flag = casmaster.is_new_complete(work_dir)
if flag:
    completed()
else:
    completed(False,"More CASNumbers remain to be curated. Don't proceed until corrected")


---
### Step D - Update CompTox data
The metadata from EPA's CompTox system informs OpenFF in a number of ways. Now that we have a full CAS list, we need to update the CompTox data. 
**If no new CAS numbers were added to CAS_curated.csv, you can skip this step and jump to the CAS | Ing processing**

1. Open [CompTox batch search](https://comptox.epa.gov/dashboard/batch-search)
1. Under "Select Input Type(s)", check "CASRN"
1. Open the  *work_dir/comptox_search_list.csv* file in something like Excel or OpenOffice. (If you can't find this file, you may not need to run this step - no new CAS numbers?).
1. Copy and paste all CAS numbers in the `curatedCAS` column into the CompTox webpage "Enter Identifiers to Search" box.  You can skip the non_CAS numbers like 'proprietary'. They mean nothing to CompTox.
1. On the Comptox page, click the "Choose Export Options" button.
1. Under "Choose Export Format," select "Excel."
1. In the "Chemical Identifiers" section, make sure that the following are checked (but no more than these):
- Chemical Name
- DTXSID
- IUPAC Name
8. Under the "Presence in Lists" table, click the check box in the header that selects ALL lists.  This will be used to map what lists each chemical is part of.
1. Under "Enhanced Data Sheets," select "Synonyms and Identifiers" (**This is currently broken on the EPA site, see below**)
1. Finally, click "Download Export File". This can take several minutes, or even stall if EPA's servers have heavy use.
1. Once the file has been downloaded to your machine, RENAME it "comptox_batch_results.xlsx" (don't open it, just rename it!) 
    - **NOTE that recently this "Download Export File" process does not complete but hangs indefinitely.**  The current work-around is to 
        - Deselect the "Synonyms and Identifiers" checkbox and click "Download Export File" again. You won't be able to get the synonym data, but will still be able to update the name data.
1. Move that file to *work_dir*

### Next, we fetch a fresh version of the CompTox Lists metadata:

- Go to the [EPA CompTox **list** page](https://comptox.epa.gov/dashboard/chemical-lists)
- Click on the "Export" button in the upper right, and select the "Excel" option.  This will download a file to your computer.
- Rename that file (without opening it) as "comptox-chemical-lists-meta.xlsx" and move it to work_dir.

1. Run the following cell.


In [None]:
import intg_support.make_CAS_ref_files as mcrf
# Run the full reference-file build, now that the CompTox files are in work_dir.
maker = mcrf.CAS_list_maker(orig_dir,work_dir)
maker.make_full_package()
# get_df(r"C:\MyDocs\OpenFF\src\openFF-cloud\work_dir\comptox_lists_table.parquet")
completed()

---
## Start CAS|Ing processing

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.
import intg_support.CAS_2_build_casing as cas2
import intg_support.IngName_curator as IngNc
# Build the CAS|Ing table; rows without a first_date are treated as new pairs
# that still need curation.
new_casing = cas2.make_casing(get_raw_df(),ref_dir=orig_dir,work_dir=work_dir) 
t = new_casing[new_casing.first_date.isna()].copy()
if len(t)>0:
    refdic = IngNc.build_refdic(ref_dir=work_dir)
    refdic = IngNc.summarize_refs(refdic)
    fsdf = IngNc.full_scan(t,refdic,pd.DataFrame(),work_dir)
    # print(fsdf.columns)
    fsdf = IngNc.analyze_fullscan(fsdf)
    # print(fsdf.columns)
    # this CSV is the file the curator edits in the next step
    store_df_as_csv(fsdf,os.path.join(work_dir,'casing_TO_CURATE.csv'))
    fsdf = fsdf.reset_index()
    iShow(fsdf[['CASNumber', 'curatedCAS', 'IngredientName', 'recog_syn', 'synCAS',
           'match_ratio', 'n_close_match', 'source', 'bgCAS', 'rrank', 'picked']],
          maxBytes=0,classes="display compact cell-border")
    completed()
else:
    # if no new, copy original casing_curated.parquet to work_dir
    shutil.copy(os.path.join(orig_dir,'curation_files','casing_curated.parquet'),work_dir)
    completed(True,'No new CAS|Ing to process; skip next step')

### step 9 - curate the new CAS|Ing pairs 

- make any desired changes to casing_TO_CURATE.csv.
- save the file as *casing_modified.csv* in **work_dir**
- then run the following code.  This step will keep only those lines where 'picked'=='xxx' and it will add today's date as the first seen date.

It will then add these lines to the master casing_curated file that will be used in subsequent steps.


In [None]:

# Merge the hand-curated CAS|Ing lines (picked == 'xxx') into the master
# casing_curated file and save the result in work_dir.  If no
# casing_modified.csv exists, the repository version is used unchanged.
Today = datetime.datetime.today().strftime('%Y-%m-%d')
repo_casing_fn = os.path.join(orig_dir,'curation_files','casing_curated.parquet')
work_casing_fn = os.path.join(work_dir,'casing_curated.parquet')

try:
    modified = pd.read_csv(os.path.join(work_dir,'casing_modified.csv'))
except FileNotFoundError:
    # Only a missing file triggers the fallback; any other problem with the
    # curated file is raised so the curator's edits are never silently dropped.
    display(md("#### casing_modified.csv not found in work_dir.<br>Assuming you mean to use repo version of casing_curated"))
    shutil.copy(repo_casing_fn, work_casing_fn)
    together = get_df(work_casing_fn)
else:
    modified['first_date'] = f'D:{Today}'
    # print(modified.columns)
    oldcasing = get_df(repo_casing_fn)
    try: # works only on casing generated in non-cloud env.
        oldcasing['synCAS'] = oldcasing.prospect_CAS_fromIng
        oldcasing['source'] = oldcasing.bgSource
    except AttributeError:
        # those legacy columns are absent; keep the columns already present
        pass
    new_cols = ['CASNumber','IngredientName','curatedCAS','recog_syn','synCAS',
                'bgCAS','source','first_date','n_close_match']
    old_cols = ['CASNumber','IngredientName','curatedCAS','recog_syn','synCAS','bgCAS','source',
                'first_date','change_date','change_comment']
    together = pd.concat([modified[modified.picked=='xxx'][new_cols],
                          oldcasing[old_cols]],sort=True)
    # fix the column order of the saved file
    together = together[['CASNumber','IngredientName','curatedCAS','recog_syn','synCAS','n_close_match',
                         'bgCAS','source','first_date','change_date','change_comment']]
    save_df(together,work_casing_fn)

completed()
iShow(together,maxBytes=0,classes="display compact cell-border")

### step 10 - Verify that all CAS/Ing pairs are curated

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.
import intg_support.CAS_2_build_casing as cas2
# The value returned by is_casing_complete decides whether the banner reports success.
completed(cas2.is_casing_complete(get_raw_df(),work_dir))

## Company Name curation tasks

The company names used in FracFocus are not standardized; searching for all records of a company using the raw FracFocus data can be a tedious and frustrating task.   Open-FF uses a translation table to take raw company names (`OperatorName` and `Supplier`) and cluster them into categories that refer to the same company.  

The cell below first finds new company names that need curation attention and stores them in a file called *company_xlateNEW.csv*.  Typically, for about 1000 new disclosures, there are about 50 new names to curate, with many being slight variations on already curated names or brand new companies.  The user's job is to do that curation (it usually takes just a few minutes).  The user saves that curated file and that will be used to build a new data set.

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.
import intg_support.CompanyNames_make_list as complist

# Build the company-name table from the raw data and the previous translation
# files, then display it.
companies = complist.add_new_to_Xlate(get_raw_df(),ref_dir=orig_dir,out_dir=work_dir)

completed()
# columnDefs and scrollX are table-display options passed through to iShow
iShow(companies.reset_index(drop=True),maxBytes=0,columnDefs=[{"width": "100px", "targets": 0}],
     classes="display compact cell-border", scrollX=True)

### Now curate the new company names
Edit the *company_xlateNEW.csv* file so that `xlateName` is acceptable, the `first_date` is filled out, and the `status` is set to **curated**. 

Save those changes as *work_dir/company_xlate_modified.csv*. 

Run the following cell and verify that you have no company names to curate.

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.
import intg_support.CompanyNames_make_list as complist
# The value returned by is_company_complete decides whether the banner reports success.
completed(complist.is_company_complete(work_dir))

## Location curation tasks

Like the other text fields in FracFocus, state and county names are not required to be standardized.  We try to create curated, standardized versions where we can to help with location error detection.  Typically, very few new locations are added and so curation is often not even required with this data set.



In [None]:
# exit_early() # uncomment this line to stop execution at this cell.

import intg_support.Location_cleanup as loc_clean
# Run the location clean-up on the raw data; the result is kept in `newloc`.
locobj = loc_clean.Location_ID(get_raw_df(),ref_dir=orig_dir,out_dir=work_dir,ext_dir=ext_dir)
newloc = locobj.clean_location()
completed()

### Curate results
**If there are new locations**, curate the *work_dir/location_curatedNEW.csv* file and save to *work_dir/location_curated_modified.csv*

Then run the location check again to make sure you curated all the new locations: 

In [None]:
import intg_support.Location_cleanup as loc_clean
# NOTE(review): unlike the clean-up cell, no ext_dir is passed here - confirm the
# Location_ID default is appropriate for this check.
locobj = loc_clean.Location_ID(get_raw_df(),ref_dir=orig_dir,out_dir=work_dir)
# The value returned by is_location_complete decides whether the banner reports success.
completed(locobj.is_location_complete())


## Water carrier detection
To perform accurate calculation of mass, it is critical that the water carrier records in disclosures are identified.

In this current version of Open-FF, all water carrier determinations are performed with code.  No hand-curation is used. We came to the conclusion that, in the irregular disclosures that would be a target for hand curation, there are too many moving parts to make consistent decisions over the whole set especially with new disclosures being added all the time.  By using only coded algorithms to detect the carriers, we can apply consistent rules over the entire set.   

The current set of algorithms rejects about 54,000 disclosures as being clearly ineligible for carrier detection (43,000 simply because they lack ingredient data).  Of the remaining 150,000, about 1% are not caught by the detection algorithms.  Data on those are available in a saved file here for user examination. While calculated masses will not be performed on that small set, `MassIngredient` may still be explored. 

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.
import intg_support.Carrier_1_identify_in_new as car1

# The value returned by create_full_carrier_set decides whether the banner reports success.
carobj = car1.Carrier_ID(get_raw_df(),ref_dir=orig_dir,out_dir=work_dir)
completed(carobj.create_full_carrier_set())

# Build and save Open-FF data set

**Start these steps only after all curation steps have been completed successfully!**

This step takes all of the files created in the curation steps and applies them together to the raw data.  Additionally, hooks to external data sources are used to create fields that better identify chemicals, locations etc.  The result of this step is a set of tables that can be used to further build a flat data set (such as a CSV file) or even a relational database.



In [None]:
# get all the CAS and CompTox ref files
# (existing reference files from orig_dir are copied first)
cdir = os.path.join(orig_dir,'CAS_ref_files')
fdir = os.path.join(final_dir,"CAS_ref_files")
shutil.copytree(src=cdir,dst=fdir,dirs_exist_ok=True)

cdir = os.path.join(orig_dir,'CompTox_ref_files')
fdir = os.path.join(final_dir,"CompTox_ref_files")
shutil.copytree(src=cdir,dst=fdir,dirs_exist_ok=True)

# files from this run's new_CAS_REF are copied last, into the same destination
# NOTE(review): work_dir/new_COMPTOX_REF is not copied anywhere in this cell - confirm that is intended.
cdir = os.path.join(work_dir,'new_CAS_REF')
fdir = os.path.join(final_dir,"CAS_ref_files")
shutil.copytree(src=cdir,dst=fdir,dirs_exist_ok=True)

# get files from orig_dir
# (curation files carried over unchanged from the previous repository)
files = [
         'missing_values.csv',
         'new_state_county_ref.csv',
         'IngName_non-specific_list.parquet'
 ]
for fn in files:
    shutil.copy(os.path.join(orig_dir,'curation_files',fn),
                os.path.join(final_dir,'curation_files',fn))
# get the curation files from the working dir
# (shutil.copy raises if any listed file is missing from work_dir)
files = [
         'carrier_list_auto.parquet',
         'carrier_list_prob.parquet',
         'casing_curated.parquet',
         'CAS_curated.parquet',
         'CT_syn_backup.parquet',
         'comptox_list_meta.parquet',
         'comptox_lists_table.parquet',
         'comptox-chemical-lists-meta.xlsx',
         'master_cas_number_list.parquet',
         'master_synonym_list.parquet',
         'CAS_deprecated.parquet',
         'company_xlate.parquet',
         'location_curated.parquet',
         'uploadKey_ref.parquet', 
         'upload_dates.parquet']
for fn in files:
    shutil.copy(os.path.join(work_dir,fn),
                os.path.join(final_dir,'curation_files',fn))

In [None]:
#exit_early() # uncomment this line to stop execution at this cell.
import intg_support.Data_set_constructor as dsc

# final_dir is passed for both the second and third arguments, followed by ext_dir.
dataobj = dsc.Data_set_constructor(get_raw_df(),final_dir,final_dir,ext_dir)
out = dataobj.create_full_set()
completed()

## Create flat data set and test it
This step uses the set of tables created earlier to build a single 'flat' data file as well as to run some basic tests on the new data set.  Note that because the full data set is very large (too big for excel) and CSV files are cumbersome at this size, we are using the **parquet** format which is much faster and takes up far less space.  To create an equivalent CSV file, see this XXXXXXX

In [None]:
# exit_early() # uncomment this line to stop execution at this cell.

import intg_support.Analysis_set as a_set
# Build the flat data set from the files in final_dir; `df` is used by the test cell that follows.
ana_set = a_set.Full_set(sources=final_dir,outdir=final_dir)
df = ana_set.get_set(verbose=False)
completed()

In [None]:
# now perform the tests
import intg_support.Tests_of_final as tof
tests = tof.final_test(df)
tests.run_all_tests()
completed()

## Make repository
Once the data set has been created, saved and tested, we construct a "repository."  Once created, this repository is intended to be **read only**, that is, no changes should be made to it.  The idea is that when using a given repository, analysts can depend on it being frozen in time.


In [None]:
# import core.make_data_repo as mdr
# Assemble the repository directory inside final_dir from the files gathered
# there, then zip it.
import shutil
repo_name = 'cloud_repo_new'
repodir = os.path.join(final_dir,repo_name)
pklsource = os.path.join(final_dir,'pickles')
pickledir = os.path.join(repodir,'pickles')
curdir = os.path.join(repodir,'curation_files')

# Create the repository skeleton.  Only "already exists" is tolerated; any
# other failure (permissions, missing parent) is raised rather than mislabelled.
for newdir in (repodir, pickledir, curdir):
    try:
        os.mkdir(newdir)
    except FileExistsError:
        print(f'{newdir} already exists?')

# Copy the CAS and CompTox reference files, the curation files and the pickles.
for sub in ('CAS_ref_files','CompTox_ref_files','curation_files','pickles'):
    cdir = os.path.join(repodir,sub)
    sdir = os.path.join(final_dir,sub)
    shutil.copytree(sdir,cdir,dirs_exist_ok=True)

# Other files to copy
shutil.copy(os.path.join(final_dir,'full_df.parquet'),repodir)

print('Making archive...')
# make_archive returns the archive's file name, which is passed on as the (truthy) status
completed(shutil.make_archive(repodir, 'zip', repodir))