# Open-FF data set customizer
Use this notebook to convert Open-FF's full dataframe (that is in "parquet" format) into a different format and, if desired, filter the data to a smaller subset.

In [2]:
# Fetch the Open-FF code repository and run this notebook's support script.
# COLAB (remote) use: leave the two command lines below uncommented.
# Local use: comment out every line in this cell and use the local cell instead.
#
# The clone's output is discarded (&>/dev/null), so a failed clone (for example,
# because the openFF directory already exists) is not reported here.
# NOTE(review): later cells use names this notebook never defines (df, os,
#   tmp_dir, md); presumably the support script creates them -- confirm.

!git clone https://github.com/gwallison/openFF.git &>/dev/null;
%run openFF/notebooks/Data_set_customizer_support.py

In [1]:
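# Local alternative: when running locally, uncomment the lines below (and comment out the COLAB cell above); leave them commented out when running in COLAB (remote)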
# import sys
# sys.path.insert(0,'c:/MyDocs/integrated/openFF/') # adjust to your setup
# %run Data_set_customizer_support.py

running Data_set_customizer_support
sandbox already exists
Full size on remote disk: 598,858,745 bytes
Open-FF file already downloaded
Creating full dataframe...
The full Open-FF data frame shape (rows,cols): (6051691, 102)


# Filter by state
Select the state(s) that you want in the output file.  The selection box contains only those states that appear in the Open-FF data.  Use `Shift`-click or `Ctrl`-click to select more than one state.

In [None]:
import ipywidgets as widgets

# Option list: every state name present in the data, alphabetized, with the
# catch-all 'All states' entry pinned to the top (it is also the default).
st_lst = ['All states', *sorted(df.bgStateName.unique().tolist())]

states = widgets.SelectMultiple(
    description='Select State(s)',
    options=st_lst,
    value=['All states'],
    disabled=False,
)
states

In [None]:
# Restrict the working frame to the state(s) picked in the widget above.
# The frame is left unfiltered when the first selected entry is 'All states'
# or when nothing is selected at all (an empty selection would otherwise
# raise IndexError on the [0] lookup).
selected_states = list(states.value)
if selected_states and selected_states[0] != 'All states':
    df = df[df.bgStateName.isin(selected_states)]
print(f"The current filtered data frame's shape (rows,cols): {df.shape}")

# Include chemical data?
If you are interested in chemical records at all, select the "include chemical data" option.  However, if you are only interested in variables like location, operator name, total base water volume, and/or date, deselecting this option will greatly reduce the output file size.

If you select to include chemical data, you will be given the option to filter which chemicals are included. The options are:
- all (which will include non-chemical categories like "proprietary" and "ambiguousID"),
- specific sets, or
- a custom list that you enter by hand.

In [None]:
# Checkbox (ticked by default) that records whether chemical records are wanted;
# the cells below read include_chem.value.
include_chem = widgets.Checkbox(
    description='Include chemical data',
    value=True,
    indent=True,
    disabled=False,
)
include_chem

In [None]:
# Offer the chemical-set dropdown only when chemical data was requested;
# otherwise show a notice and leave chem_set as None.
if include_chem.value:
    # Each option is a (label shown to the user, value read by later cells) pair.
    chem_set = widgets.Dropdown(
        description='Chemical Set:',
        options=[
            ('All', 'all'),
            ('Custom', 'custom'),
            ('Clean Water Act', 'cwa'),
            ('Safe Drinking Water Act', 'dwsha'),
            ('UVCB', 'uvcb'),
            ('Sand and water', 'sand'),
            ('Trade secrets', 'proprietary'),
        ],
        value='all',
        disabled=False,
    )
else:
    display(md('## No chemical records to be included. Skip to "Select columns"'))
    chem_set = None
chem_set

In [None]:
# Build the hand-pick chemical list, shown only when the 'Custom' set was chosen.
# chem_set is None when chemical data was deselected, so check it before
# reading .value.
if chem_set is not None and chem_set.value == 'custom':
    # One row per bgCAS with a display name. Missing names are filled on this
    # summary table only, so the ' -- ' placeholder never leaks into df (and
    # therefore never into the saved output). This also avoids a chained
    # in-place fillna on a column of df, which pandas does not guarantee to
    # apply to the parent frame.
    gb = df.groupby('bgCAS', as_index=False)['epa_pref_name'].first()
    gb['epa_pref_name'] = gb['epa_pref_name'].fillna(' -- ')

    # (label, value) pairs: label is "CAS - name", value is the bare bgCAS.
    caslst = [(f'{cas} - {name}', cas)
              for cas, name in zip(gb['bgCAS'], gb['epa_pref_name'])]

    cus_chem = widgets.SelectMultiple(
        options=caslst,
        # preselect the first chemical; stays empty if the frame has no chemicals
        value=[cas for _, cas in caslst[:1]],
        rows=25,
        description='Select Chemical(s)',
        disabled=False
    )
else:
    cus_chem = None
cus_chem

In [None]:
# Apply the chemical-set choice to the working frame.
# Nothing is filtered when the set is 'all' or when chemical data was
# deselected (chem_set is None in that case, so it has no .value to read).
if chem_set is not None and chem_set.value != 'all':
    chosen_set = chem_set.value

    # Sets defined by a boolean flag column in the data.
    flag_column = {'uvcb': 'is_on_UVCB',
                   'cwa': 'is_on_CWA',
                   'dwsha': 'is_on_DWSHA'}
    # Sets defined by a fixed list of bgCAS values.
    fixed_cas = {'sand': ['14808-60-7', '7732-18-5'],
                 'proprietary': ['proprietary']}

    if chosen_set in flag_column:
        caslst = df[df[flag_column[chosen_set]]].bgCAS.unique().tolist()
    elif chosen_set in fixed_cas:
        caslst = fixed_cas[chosen_set]
    elif chosen_set == 'custom':
        # the values picked by hand in the custom chemical widget
        caslst = list(cus_chem.value)
    df = df[df.bgCAS.isin(caslst)]
print(f'Number of chemicals selected: {len(df.bgCAS.unique())}')
print(f"The current filtered data frame's shape (rows,cols): {df.shape}")

# Select columns to include
There are over 100 different columns in the Open-FF full data set, which includes both the original FracFocus columns and columns that Open-FF generates.  In most cases you will not need most of those and selecting a subset will keep your final custom data set smaller.

Choosing the Standard set will reduce the columns to a smaller but typical set and it will also remove disclosures and records that have been flagged as duplicates.  The Full set keeps all records (duplicates can be filtered later using the `in_std_filtered` flag).

In [None]:
# Columns kept when the 'Standard' column set is chosen (order is preserved).
std_set = [
    'StateName',
    'CountyName',
    'Latitude',
    'Longitude',
    'OperatorName',
    'WellName',
    'UploadKey',
    'date',
    'APINumber',
    'bgStateName',
    'bgCountyName',
    'bgLatitude',
    'bgLongitude',
    'TotalBaseWaterVolume',
    'TotalBaseNonWaterVolume',
    'TVD',
    'bgOperatorName',
    'primarySupplier',
    'carrier_status',
    'no_chem_recs',
    'CASNumber',
    'IngredientName',
    'Supplier',
    'bgCAS',
    'calcMass',
    'categoryCAS',
    'PercentHFJob',
    'Purpose',
    'TradeName',
    'bgSupplier',
    'is_valid_cas',
    'bgIngredientName',
]

# Dropdown for the column set; 'Standard' is the default.
col_set = widgets.Dropdown(
    description='Column Set:',
    options=['Full', 'Standard'],
    value='Standard',
    disabled=False,
)
col_set

In [None]:
# 'Standard' keeps only rows flagged in_std_filtered and only the std_set
# columns; any std_set name absent from the frame is skipped by filter().
if col_set.value == 'Standard':
    in_standard = df.in_std_filtered
    df = df[in_standard].filter(items=std_set, axis=1)
print(f"The current filtered data frame's shape (rows,cols): {df.shape}")

# Select the output format

The current formats available are:
- **"parquet"** - a compressed, structured format for large files (recommended if you have the ability to use it).
- **"CSV"** - a traditional, text-based format that is the standard input for spreadsheets. It can require 10x the storage and processing time of "parquet." This format does not keep Open-FF formatting, so you may need to specify, for example, which columns are text and which are numeric (e.g. APINumber should be text because of leading zeros).
- **"Excel"** - similar to CSV but it will keep some formatting.  However, the maximum sheet size is 1,048,576 rows by 16,384 columns, which will not hold all Open-FF records. Filtered files may fit.

In [None]:
import ipywidgets as widgets

# Toggle buttons for the output file format; the save cell reads
# format_type.value.
format_type = widgets.ToggleButtons(
    description='Output type:',
    options=['parquet', 'CSV', 'Excel'],
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)
format_type

In [None]:
# Write the filtered frame to tmp_dir in the chosen format, then report the
# size of the file that was written.

# format name -> (output file name, DataFrame writer method)
output_writers = {
    'CSV': ("my_output.csv", df.to_csv),
    'Excel': ("my_output.xlsx", df.to_excel),
    'parquet': ("my_output.parquet", df.to_parquet),
}

chosen_format = format_type.value
if chosen_format in output_writers:
    out_name, write_frame = output_writers[chosen_format]
    outfn = os.path.join(tmp_dir, out_name)
    write_frame(outfn)

file_size = os.path.getsize(outfn)
print("File Size is :", file_size, "bytes")
print(f'Output saved at: {outfn}, size: {file_size:,} bytes')
