In [None]:
# Make the project code importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; every user must edit it.
import sys
sys.path.insert(0,'c:/MyDocs/integrated/') # adjust to your setup

# Run the shared support script in this notebook's namespace.
# NOTE(review): names used below without a visible import (os, pd, np, fh, hndl,
# th, iShow, showHeader) presumably come from this script -- confirm.
%run "catalog_support.py" 
showHeader('Chemical Synonym Table')

In [None]:
# fetch data set
# Reads 'workdf.parquet' from the sandbox directory via the project's file helper.
# The 'bgCAS' and 'IngredientName' columns of this frame are used for the counts below.
df_cas = fh.get_df(os.path.join(hndl.sandbox_dir,'workdf.parquet'))

In [None]:
# Row counts from the working data set, relabelled so they can be joined to the
# synonym list: one frame keyed by CAS number, one keyed by ingredient name.
# size() counts every row in each group.
cascnt = (
    df_cas.groupby('bgCAS', as_index=False)
          .size()
          .set_axis(['cas_number', 'record_cnt'], axis=1)
)
syncnt = (
    df_cas.groupby('IngredientName', as_index=False)
          .size()
          .set_axis(['synonym', 'syn_count'], axis=1)
)

The interactive table below lists synonyms for chemicals published in FracFocus. This is not an exhaustive set. The table is sortable by any column (click a second time for reverse sort); use the **Search** bar to limit what is shown in the table. 

   
Use the search function to find
- alternative names for a given CAS number.  Some of these may be product names. Some CAS numbers have thousands of synonyms.
- possible CAS numbers for a given synonym

These synonyms are the ones provided by the CAS reference site, [SciFinder](http://scifinder.cas.org), a subscription service, and the synonyms from EPA's CompTox database.   It can also be helpful to use
[PubChem](https://pubchem.ncbi.nlm.nih.gov/), a free, public site.  

<!--- An additional source is the Table H-3 in the appendix of EPA document [Hydraulic Fracturing for Oil and Gas: Impacts from the Hydraulic Fracturing Water Cycle on Drinking Water Resources in the United States (Final Report)](https://cfpub.epa.gov/ncea/hfstudy/recordisplay.cfm?deid=332990).  This table provides a list of **generic** names commonly used that are not specific enough to resolve to a single CAS number.  The CAS number given in the table for these names is "estab_non_spec".  These ingredient names appear frequently in FracFocus. -->

|Explanation of columns in the table|
| :---: |

| Column      | Description |
| :----: | :-------- |
|*cas_number*| is the CAS registry number of a chemical in the FracFocus data set. **Click on this link** to view the report for this chemical within FracFocus| 
|*synonym*| is one of the names given by SciFinder and/or CompTox for the material identified by the CAS number|


## Synonyms for chemicals reported in FracFocus

In [None]:
import itables.options as opt
# CSS classes applied to the itables tables rendered after this point.
opt.classes="display compact cell-border"


# Curated synonym list; it is joined below on its 'synonym' and 'cas_number' columns.
master_df = fh.get_df(os.path.join(hndl.curr_repo_dir,'curation_files','master_synonym_list.parquet'))

# Attach the FracFocus counts: syn_count per synonym, record_cnt per CAS number.
# Left joins keep every row of the master list, matched or not.
syn_df = pd.merge(master_df,syncnt,on='synonym',how='left')
syn_df = pd.merge(syn_df,cascnt,on='cas_number',how='left').reset_index(drop=True)
syn_df = syn_df[~(syn_df.cas_number==syn_df.synonym)] # remove uninformative synonym

# Rows with no match in the counts get 0 instead of NaN.
# The filled columns are assigned back rather than using
# `syn_df.<col>.fillna(0, inplace=True)`: that form updates a temporary Series,
# which pandas warns about and which leaves syn_df unchanged under Copy-on-Write.
syn_df = syn_df.assign(syn_count=syn_df.syn_count.fillna(0),
                       record_cnt=syn_df.record_cnt.fillna(0))
syn_df = syn_df.drop_duplicates().reset_index(drop=True)

# Display label for the CAS number: a bold link (built by th.getCatLink) when the
# chemical has at least one record in the data set, otherwise the plain CAS number.
syn_df['CAS Number'] = np.where(syn_df.record_cnt>0,
                             '<b>'+syn_df.cas_number.map(lambda x: th.getCatLink(x,x))+'</b>',
                             syn_df.cas_number)
# NOTE(review): th.xlate_to_str is the support-module helper, not the function
# defined later in this notebook -- confirm both behave the same.
syn_df['synon'] = syn_df.synonym.map(lambda x: th.xlate_to_str(x,sep='<br>'))

In [None]:
def xlate_to_str(inp,sep='; ',trunc=False,tlen=20,totallen = 5000,sort=True,
                maxlen=100000,maxMessage='Too many items to display'):
    """Translate a list (or a single string) into one string for display.

    Each item is converted with ``str`` and followed by ``sep``. When the
    running length of the items on the current line exceeds 100 characters,
    a newline is used in place of ``sep`` and the running length restarts.
    The delimiter after the final item is dropped.

    Parameters
    ----------
    inp : iterable or str
        Items to display. A bare string is treated as a single item instead
        of being split into characters.
    sep : str
        Delimiter placed between items on the same line.
    trunc, tlen
        Accepted for backward compatibility; not used by this function.
    totallen : int
        Maximum length of the result; anything longer is cut at this length
        and ' ...' is appended.
    sort : bool
        Sort the items before joining.
    maxlen : int
        If there are more items than this, ``maxMessage`` is returned instead.
    maxMessage : str
        Text returned when the item count exceeds ``maxlen``.

    Returns
    -------
    str
        The joined string, or '' when the input cannot be processed (for
        example a non-iterable value, or items that cannot be sorted
        against each other).
    """
    try:
        if isinstance(inp,str):
            inp = [inp]
        items = list(inp)
        if sort:
            items.sort()
        if len(items)>maxlen:
            return maxMessage

        pieces = []
        line_len = 0
        for item in items:
            s = str(item)
            line_len += len(s)
            if line_len > 100:
                delim = '\n'
                line_len = 0
            else:
                delim = sep
            pieces.append(s)
            pieces.append(delim)
        # Drop exactly the delimiter that follows the last item. Slicing off
        # len(sep) characters instead would cut into the last item whenever that
        # delimiter is the one-character newline, and would return '' for sep=''.
        out = ''.join(pieces[:-1])
    except Exception:
        # Best effort for display: unusable input yields an empty string.
        return ''
    if len(out)>totallen:
        out = out[:totallen]+' ...'
    return out

In [None]:
# Collapse the synonym rows to one row per displayed CAS label.
gb1 = syn_df.groupby('CAS Number', as_index=False)['synonym'].apply(list)
# Join each list into a single string; line breaks inserted by xlate_to_str are
# turned back into a visible divider so every entry stays on one table row.
gb1 = gb1.assign(
    synonyms=gb1['synonym']
    .map(lambda names: xlate_to_str(names, sep='   |   '))
    .str.replace('\n', '  |  ')
)[['CAS Number', 'synonyms']]

# Summed record counts per label; used below only to test for presence in the data set.
gb2 = syn_df.groupby('CAS Number', as_index=False)['record_cnt'].sum()
mg = gb1.merge(gb2, on='CAS Number', how='left')

# Show only the chemicals that have at least one record.
iShow(mg.loc[mg['record_cnt'] > 0, ['CAS Number', 'synonyms']].reset_index(drop=True))