In [None]:
import sys
sys.path.insert(0,'c:/MyDocs/integrated/') # adjust to your setup

%run "catalog_support.py" 

In [None]:
# Load the global reference values and key them by variable name so that
# individual entries can be looked up with ref_df.at[<varname>, 'value'].
ref_path = os.path.join(hndl.sandbox_dir,'ref.csv')
ref_df = pd.read_csv(ref_path).set_index('varname')

In [None]:
# Data for this cas chemical
# alldf holds every record read from data.parquet; df (below) is the subset
# flagged in_std_filtered and is what most of the report works from.
alldf = pd.read_parquet(os.path.join(hndl.sandbox_dir,'data.parquet'))
alldf['has_mass'] = alldf.mass>0
# Re-punctuate APINumber with dashes: chars 0-1, 2-4, 5-9, then the remainder.
alldf.APINumber = alldf.APINumber.str[:2] + '-' + alldf.APINumber.str[2:5] + '-' + alldf.APINumber.str[5:10] + '-' + alldf.APINumber.str[10:] 
# MI_comp: 'n/a' where massComp is null, otherwise 'verified' ...
alldf['MI_comp'] = np.where(alldf.massComp.isna(),'n/a','verified')
# ... then overridden to 'suspect' wherever massCompFlag is True.
alldf.MI_comp = np.where(alldf.massCompFlag==True,'suspect',alldf.MI_comp)
# Independent copy so later column assignments on df do not touch alldf.
df = alldf[alldf.in_std_filtered].copy()
# print(df.FF_disc.head())
# dataset-wide flags
have_data = len(df)>0       # at least one filtered record exists
have_mass = df.mass.max()>0 # at least one filtered record has a positive mass

# With no filtered records nothing below can be computed, so stop execution here.
if not have_data:
    display(md('## THERE ARE NO RECORDS IN THE FILTERED DATA FOR THIS CHEMICAL\n\nNo analysis is possible\n\n'))
    raise SystemExit("Stop right there!")

In [None]:
# Resolve the identifiers and display names for this chemical from the first
# filtered record (all records in df are expected to share the same bgCAS --
# TODO confirm against the upstream data build).
cas = df.bgCAS.iloc[0]
cas_orig = df.bgIngredientName.iloc[0]
# NaN never compares equal to anything (including itself), so a missing name
# has to be detected with pd.isna rather than `== np.NaN`.
if pd.isna(cas_orig):
    cas_orig = '(not identified by CAS number)'
cas_epa = df.epa_pref_name.iloc[0]
cas_iupac = df.iupac_name.iloc[0]
try:
    # most frequent raw IngredientName among the filtered records
    comm_name = df.IngredientName.value_counts().index[0]
except (IndexError, AttributeError):
    # no non-null IngredientName values (or the column is absent): leave blank
    comm_name = ''

# bgCAS codes that are placeholders rather than real chemical identifiers
cas_ignore = ['proprietary','ambiguousID','sysAppMeta','conflictingID']

# Prefer the EPA name for the header; fall back to cas_orig when the EPA name
# is missing (None/NaN) or is one of the placeholder strings.
ing_name = cas_epa
if pd.isna(ing_name) or ing_name in ('--','nan'):
    ing_name = cas_orig
if cas in cas_ignore:
    ing_name = ''

In [None]:
# print( th.getMoleculeImg(cas,size=300))
# Render the report header: the CAS identifier and the chosen ingredient name
# separated by an HTML line break, with an image link obtained from
# th.getMoleculeImg for this CAS.
showHeader(cas+'<br>'+ing_name,#subt='Chemical Report',
           link_up_level=1, # for the header links
           imglnk = th.getMoleculeImg(cas,size=300,link_up_level=1))

### Open-FF Chemical Report
This is a script-generated report about a specific chemical used in the hydraulic fracturing ("fracking") industry. The data come from the industry-sponsored website [FracFocus](https://fracfocus.org/), but they are analyzed by the independent project [Open-FF](https://frackingchemicaldisclosure.wordpress.com/).

Molecule images courtesy of [CompTox](https://comptox.epa.gov/dashboard/). Disclosures before 2011 are not included.


In [None]:
# Tell the reader up front when the report will be partly or wholly empty.
if not have_data:
    display(md('## THERE ARE NO RECORDS IN THE FILTERED DATA FOR THIS CHEMICAL\n\nOnly limited analysis is possible\n\n'))
elif not have_mass:
    display(md('## There are no records for this chemical for which mass is reportable.'))
    display(md('### Much of the analysis in this report will be left blank'))
    


In [None]:
# Parse dates and derive the disclosure year.
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
# FF includes some old data but before 2011 are very sparse.
# .copy() makes df an independent frame: later cells add columns to it, which
# would otherwise be assignments on a slice (SettingWithCopyWarning).
df = df[df['year'] > 2010].copy()
#display(md('Disclosures before 2011 are not included in these analyses.'))
df['bgStateName'] = df['bgStateName'].str.title()
df['bgCountyName'] = df['bgCountyName'].str.title()

# The flags were computed before the year filter; refresh them so the
# `if have_data:` / `if have_mass:` guards below describe the frame that is
# actually plotted (e.g. a chemical whose only mass records predate 2011).
have_data = len(df) > 0
have_mass = df.mass.max() > 0

### Table of Contents
- [General information about the chemical](#info)
- [General usage](#frequency) within FracFocus & where those records occur
- [Detailed usage for all records](#detailedAbundance)
  - Uses by percentage of the fracking fluid
  - Uses by mass
- [Companies](#companies) supplying and using this chemical
- [Patterns of use](#patterns)
- [Disclosures list of biggest 100 uses](#biggest100) with map links
- [Tables of raw](#raw) Names, CAS Numbers and Trade names associated with this chemical

<a id='info'></a>

In [None]:
# Markdown table of the names known for this chemical. Placeholder CAS codes
# (those listed in cas_ignore) only get the most-frequent FracFocus name.
display(md(f'## Names for "{cas}":'))
name_rows = ['|Source|Name|', '| --- | --- |']
if cas not in cas_ignore:
    name_rows += [f'|SciFinder primary name:|**{cas_orig}**|',
                  f'|EPA - preferred name:|**{cas_epa}**|',
                  f'|IUPAC name:|**{cas_iupac}**|']
name_rows.append(f'|Most frequently used name in FF:|**{comm_name}**|')
display(md('\n'.join(name_rows)))
display(md("\nSee table of raw names for other names used for this material"))   

## Summary from EPA's ChemInformatics
For more detailed information, see the [ChemInformatics web site](https://www.epa.gov/chemical-research/cheminformatics)

In [None]:
# Load the ChemInformatics summary table from the repo's curation_files
# directory; the file name comes from hndl.ci_summ_fn.
ci_df = pd.read_parquet(os.path.join(hndl.curr_repo_dir,'curation_files',hndl.ci_summ_fn))
# The helpers below build the pandas Styler version of this table.
def add_style(v,props=''):
    """Return the CSS for one hazard-table cell.

    v is the cell value; the codes V/H/M/L/I/ND each map to a centered cell
    with a distinct background colour. Any other value gets no styling ('').
    props is accepted but not used.
    """
    colour_by_code = (('V','red'),
                      ('H','orange'),
                      ('M','yellow'),
                      ('L','lightgreen'),
                      ('I','lightgrey'),
                      ('ND','darkgrey'))
    for code, colour in colour_by_code:
        if v==code:
            return 'text-align: center;background-color:' + colour
    return ''

def make_style(df,casrn = '50-00-0'): # just one   
    """Display the colour-coded hazard summary for a single CAS number.

    df is the ChemInformatics summary frame and casrn the CAS number to show.
    Columns whose name contains 'authority', plus 'Name' and 'DTXSID', are
    dropped; missing values are shown as 'ND'. The remaining row is transposed
    (CAS becomes the column header) and styled cell-by-cell with add_style.

    Returns True when a table was displayed, False when df has no row for
    casrn (a cautionary note is displayed instead).
    """
    # currently, just ignore the authority data
    unwanted = [col for col in df.columns.tolist() if 'authority' in col]
    unwanted += ['Name','DTXSID']
    subset = df[df.CAS==casrn].drop(unwanted,axis=1)

    if len(subset)==0:
        display(md('### No ChemInformatics analysis available'))
        display(md("""**Important:** Please do NOT infer that the absence of an EPA analysis here implies this chemical is not hazardous. 
    Many fracking materials are not quantified for toxicity because they are still poorly studied or are a mix of unknown or 
    variable composition (even though the constituents are likely to be toxic). """))
        return False

    filled = subset.fillna('ND').reset_index(drop=True)
    t = filled.set_index('CAS').T
    display(t.style.applymap(add_style))
    return True

# The hazard table is shown through a pandas Styler, so interactive-table
# rendering is switched off just for this display and re-enabled afterwards.
init_notebook_mode(all_interactive=False)  # turn off itables briefly  
# flag records whether a hazard table was shown; the legend cell depends on it.
flag = make_style(ci_df,cas)
init_notebook_mode(all_interactive=True)

In [None]:
# Only show the colour-code legend and EPA's disclaimer when make_style
# actually displayed a hazard table (flag is its return value).
if flag:
    display(md("""### Legend
|Code| Toxicity |-|Code|Toxicity|
| :---: | :---: |---| :---: | :---: |
| **V** | Very High  |-|**H** |  High  |
| **M** | Moderate  |-| **L** | Low  |
| **I** | Inconclusive  |-| **ND** | No Data  |
"""))
    display(md("""### EPA's disclaimer for the Hazard data
> The Hazard Comparison Dashboard is a prototype tool and a compilation of information sourced from many sites, databases and sources including U.S. Federal and state sources and international bodies that saves the user time by providing information in one location. The data are not reviewed by USEPA – the user must apply judgment in use of the information. The results do not indicate EPA’s position on the use or regulation of these chemicals. """))


<a id='frequency'></a>
# General frequency of use

In [None]:
# Data-set-wide disclosure counts from the reference table: unfiltered and
# filtered totals, each with and without the FFV1 disclosures.
total_Upload, total_Upload_w_chem, fil_Upload, fil_Upload_w_chem = (
    int(ref_df.at[varname,'value'])
    for varname in ('tot_num_disc',
                    'tot_num_disc_less_FFV1',
                    'tot_num_disc_fil',
                    'tot_num_disc_fil_less_FFV1')
)

In [None]:
# Distinct disclosures reporting this chemical: in all data, in the filtered
# data, and in the filtered data with a positive mass.
totpres = len(alldf.DisclosureId.unique())
pres = len(df.DisclosureId.unique())
massPres = len(df[df.mass>0].DisclosureId.unique())

# Marker opacity for the scatter plots: the more disclosures, the more
# transparent the points so dense regions stay readable.
alpha = .2
for upper_bound, opacity in ((300,1),(2000,.6),(20000,.35)):
    if totpres<upper_bound:
        alpha = opacity
        break


In [None]:
if (have_data):
    # don't show pie chart if presence is very small
    if (pres/total_Upload_w_chem)>0.01:
        labels = [f'{cas} present','not reported']
        x = [pres,fil_Upload_w_chem-pres]
        title = (f'Percent of all FracFocus disclosures that report the use of {cas}:'
                 f'\n {th.round_sig(pres/fil_Upload_w_chem*100,3)}% ')
        plt.figure(figsize=(5, 5))
        plt.pie(x,explode=[0.15,0])
        plt.legend(labels=labels)
        plt.title(title,fontsize=16);
        plt.show()

    # Summary table: disclosure counts and their share of the matching totals.
    table_rows = [
        '||Unfiltered data set|Filtered data set|with calculated mass|',
        '|---|---|---|---|',
        f'|Num disclosures|{totpres:,}|{pres:,}|{massPres:,}*|',
        f'|% of disclosures|{th.round_sig(totpres/total_Upload_w_chem*100,3)}%|{th.round_sig(pres/fil_Upload_w_chem*100,3)}%|{th.round_sig(massPres/fil_Upload_w_chem*100,3)}%|',
    ]
    display(md('\n'.join(table_rows)))

    # Footnote for the asterisk above, only needed when some disclosures lack mass.
    if massPres<pres:
        display(md('> *See [below](#massCompanies) for list of companies not reporting enough info for mass calculations'))

In [None]:
# Section heading and caveat for the county map that follows.
map_caveat = 'FracFocus records that have conflicting or non-standard location data are not included in this map. This includes many records in **Alaska**.'
display(md('# Where this material has been reported'))
display(md(map_caveat))

In [None]:
def ChemicalMap(df,casrn):
    """Build per-county summaries for one chemical and hand them to the county choropleth.

    df must provide bgStateName, bgCountyName, bgCAS, mass, loc_within_state
    and loc_within_county; casrn is only used in the legend title.
    Only rows whose two location checks are both 'YES' are counted.
    """
    #start_loc = get_state_center(statename)
    #print(statename,start_loc)
    cond = (df.loc_within_state=='YES')&(df.loc_within_county=='YES')
    # value: number of (non-null bgCAS) records per state/county
    gb = df[cond].groupby(['bgStateName','bgCountyName'],as_index=False)['bgCAS'].count()
    gb = gb.rename({'bgStateName':'StateName','bgCountyName':'CountyName','bgCAS':'value'}, axis=1)
    
    # cnt_with_mass: same count restricted to records with mass>0; the left
    # merge leaves it missing for counties that have no such records
    gb1 = df[cond&(df.mass>0)].groupby(['bgStateName','bgCountyName'],as_index=False)['bgCAS'].count()
    gb1 = gb1.rename({'bgStateName':'StateName','bgCountyName':'CountyName','bgCAS':'cnt_with_mass'}, axis=1)
    gb = pd.merge(gb,gb1,on=['StateName','CountyName'],how='left')
    
    # mass_sum: total mass per county, passed through th.round_sig(x,3)
    gb1 = df[cond].groupby(['bgStateName','bgCountyName'],as_index=False)['mass'].sum()
    gb1 = gb1.rename({'bgStateName':'StateName','bgCountyName':'CountyName','mass':'mass_sum'}, axis=1)
    gb = pd.merge(gb,gb1,on=['StateName','CountyName'],how='left')
    gb.mass_sum = gb.mass_sum.map(lambda x: th.round_sig(x,3))
    
    
    # Pick the colour scale from the busiest county: a log plot with small
    # integer bins above 1000 records, otherwise linear bins of raw counts.
    #print(gb.value.max())
    if gb.value.max() > 1000:
        logflag = True
        if gb.value.max() > 10000:
            bins = [0,1,2,3,4,5]
        else:
            bins = [0,1,2,3,4]
    else:
        logflag = False
        if gb.value.max()>50:
            bins = [0,20,50,100,500,1000]
        else:
            bins = [0,1,5,10,20,50]
        
    # NOTE(review): 'orig_value' is not created in this function; presumably
    # create_county_choropleth adds it -- confirm in the mapping module.
    mapping.create_county_choropleth(gb,plotlog=logflag,custom_scale=bins,
                             start_zoom=3,
                             legend_name=f'Number of uses of {casrn}',
                             fields=['StateName','CountyName','orig_value','cnt_with_mass','mass_sum'],
                             aliases=['State: ','County: ','# chemical records: ',
                                     '# records with reportable mass: ','Sum mass (lbs): '])
    


# Working frame for the map: just the columns ChemicalMap reads.
# .copy() makes it independent of df, so lower-casing the place names below is
# not an assignment on a slice and cannot affect df itself.
t = df[['bgStateName','bgCountyName','bgCAS','date','loc_within_state','loc_within_county','mass']].copy()
# ChemicalMap is given lower-case state/county names.
t['bgStateName'] = t['bgStateName'].str.lower()
t['bgCountyName'] = t['bgCountyName'].str.lower()
ChemicalMap(t,cas)

<a id='detailedAbundance'></a>
# Detailed Uses
Each "use" below represents a single **record** in a disclosure, that is, a single line.  A given disclosure may have more than one record of a particular chemical.  

Using all **filtered** data for this chemical.

### Uses by percent of the fracking job
This measure roughly shows a comparison of the concentration of the chemical in the whole fracking job (including the base fluid which is typically over 80%).  Only the disclosures where the sum of PercentHFJob is within 5% of 100% are considered "valid." 

In [None]:
# df[df.within_total_tolerance].PercentHFJob.max()
# df.within_total_tolerance.sum()

In [None]:
if have_data:
    # "Valid" percentage records: positive PercentHFJob on a disclosure whose
    # total is within tolerance.
    cond = (df.PercentHFJob>0) & (df.within_total_tolerance)

    # Records without a valid percentage are drawn as tick marks slightly
    # below the smallest valid value (5% of the valid range).
    df['not_present'] = 0
    if df.PercentHFJob.max()>0:
        valid_pct = df[cond].PercentHFJob
        df['not_present'] = valid_pct.min() - (valid_pct.max()-valid_pct.min())*0.05
    #print(df.PercentHFJob.median())

    ax = df[cond].plot('date','PercentHFJob', style='o', alpha=alpha,
                figsize=(16,6),legend=False)
    df[~cond].plot('date','not_present', style='|', alpha=1,color='orange',ms=20, ax=ax,legend=False)
    plt.ylabel('Percent of Job',fontsize=16);
    plt.title(f'Percent of job that is {cas} - linear version',fontsize=16);
    ax.grid()
    for axis_name in ("y","x"):
        ax.tick_params(axis=axis_name, labelsize=14)
    

In [None]:
# Caption for the linear percentage plot: how many records fall on each side
# of `cond` (the valid-percentage mask built in the plotting cell above).
if have_mass:
    display(md(f'##### Number of records with valid percentage data (shown by blue circles): {len(df[cond]):,}\n'))
    display(md(f'##### Number of records without valid percentage data (shown by orange bars): {len(df[~cond]):,}\n'))
    display(md('---'))

In [None]:
# Log-scale version of the percentage plot, with quartile lines.
# Relies on `cond` (valid-percentage mask) from the linear-plot cell.
if have_mass:
    ax = df[cond].plot('date','PercentHFJob', style='o', alpha=alpha,
                figsize=(16,6))
    plt.ylabel('Percent of Job',fontsize=16);
    plt.title(f'Percent of Job that is {cas} - log version',fontsize=16);
    ax.set(yscale='log')
    ax.tick_params(axis="y", labelsize=14)
    ax.tick_params(axis="x", labelsize=14)
    #locmaj = matplotlib.ticker.LogLocator(base=10,numticks=7,subs='all') 
    if len(df[cond]) < 5000: # provide more detailed grid (too many points swamps it out)
        locmaj = matplotlib.ticker.LogLocator(base=10,subs='all') 
    else:
        locmaj = matplotlib.ticker.LogLocator(base=10) #,subs='all') 
    ax.yaxis.set_major_locator(locmaj)
    # lower limit is floored at 1e-6 so a zero minimum cannot break the log axis
    ax.set(ylim=(max(0.000001,df.PercentHFJob.min()),
                 df.PercentHFJob.max()*1.1));
    ax.grid()

    # 25th / 50th / 75th percentiles of the valid records, drawn as horizontal lines
    lns = list(np.percentile(df[cond].PercentHFJob,[25,50,75]))
    #ax.set_ylim(-0.7,len(sn)-0.3)
    for l in lns:
        plt.hlines(l,df[cond].date.min(),
                   df[cond].date.max(),
                   color='black')
    # plain-text summary of the same percentiles plus the maximum
    s = 'PERCENTILES:\n'
    s+= ' -- 25%:  {:.6} percent\n'.format(float(lns[0]))
    s+= ' -- 50%:  {:.6} percent\n'.format(float(lns[1]))
    s+= ' -- 75%:  {:.6} percent\n'.format(float(lns[2]))
    s+= ' -- max:  {:.6} percent\n'.format(float(df[cond].PercentHFJob.max()))
    print(s)


<a id='by_mass'></a>
## Uses by mass
This measure shows a comparison of the absolute quantity of a chemical used in a fracking job. 


In [None]:
if (have_mass):
    # Records without a mass are drawn as tick marks slightly below the
    # smallest mass (5% of the mass range); records with a mass get NaN in
    # noMass so they never appear in that series.
    df['noMass'] = 0
    if df.mass.max()>0:
        df['noMass'] = df.mass.min() - (df.mass.max()-df.mass.min())*0.05
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    df['noMass'] = np.where(df.has_mass,np.nan,df.noMass)

    ax = df[df.mass>0].plot('date','mass', style='o', alpha=alpha,
                figsize=(16,6),legend=False)
    df[~df.has_mass].plot('date','noMass', style='|', alpha=1, ms=20, color='orange', legend=False,
                          ax=ax)
    plt.ylabel('Mass in pounds',fontsize=16);
    plt.title(f'Mass of {cas} - linear version',fontsize=16);
    ax.grid()
    ax.tick_params(axis="y", labelsize=14)
    ax.tick_params(axis="x", labelsize=14);
    # Thousands separators on the y axis. Uses the axes already in hand and the
    # `matplotlib.ticker` spelling used elsewhere in this notebook, and no
    # longer rebinds `ax` to the formatter call's None return value.
    ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'));


In [None]:
# Caption for the linear mass plot. The mass points are drawn with the same
# default marker colour as the percentage plot, whose caption calls them
# "blue circles", so the legend text here is aligned with that.
if have_mass:
    display(md(f'Number of records with a mass (blue circles): **{len(df[df.has_mass]):,}**'))    
    display(md(f'Number of records without a calculable mass (orange bars): **{len(df[~df.has_mass]):,}**'))
    display(md('---'))

In [None]:
# Log-scale mass plot with percentile lines and a printed percentile summary.
if have_mass:
    c = df.MI_comp=='n/a'
    with_mass = df[df.mass>0]
    ax = with_mass.plot('date','mass', style='o', alpha=alpha, legend=False,
                 figsize=(16,6))
    plt.ylabel('Mass in pounds',fontsize=16);
    plt.title(f'Mass of {cas} - log version',fontsize=16);
    ax.set(yscale='log')
    for axis_name in ("y","x"):
        ax.tick_params(axis=axis_name, labelsize=14)

    # provide more detailed grid only for smaller sets (too many points swamps it out)
    locator_opts = {'subs':'all'} if len(df) < 5000 else {}
    locmaj = matplotlib.ticker.LogLocator(base=10,**locator_opts)
    ax.yaxis.set_major_locator(locmaj)

    y_floor = max(0.01,df.mass.min())
    y_ceiling = max(df.mass.max(),df.cleanMI.max())*1.2
    ax.set(ylim=(y_floor,y_ceiling));
    ax.grid()

    # Percentiles of the positive masses, drawn across the plotted date span.
    pct_levels = [25,50,75,95,99]
    lns = list(np.percentile(with_mass.mass,pct_levels))
    first_date = with_mass.date.min()
    last_date = with_mass.date.max()
    for l in lns:
        plt.hlines(l,first_date,last_date,color='black')

    s = 'mass PERCENTILES:\n (represented by lines in the figure)\n'
    for level, pounds in zip(pct_levels,lns):
        s += ' -- {}%:  {:,} pounds\n'.format(level,int(pounds))
    s += ' -- max:  {:,} pounds\n'.format(int(df.mass.max()))
    print(s)

<a id='companies'></a>
# Companies
Top companies using or supplying this material

In [None]:
# Horizontal bar chart: the ten operators with the most records of this chemical.
if have_data:
    # figure height scales with the number of operators (capped at ten)
    ngroups = min(len(df.bgOperatorName.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)
    record_counts = df.groupby('bgOperatorName')['DisclosureId'].count()
    top_ten = record_counts.sort_values(ascending=False)[:10]
    ax = top_ten.plot.barh(figsize=fsiz)
    for axis_name in ("y","x"):
        ax.tick_params(axis=axis_name, labelsize=14)
    plt.xlabel('Number of records',fontsize=16);
    plt.title(f'Number of chemical records using {cas} by operator',fontsize=16);



## Which suppliers are most frequently named for this chemical?
The field reported below is bgSupplier.

In [None]:
# Horizontal bar chart: the ten bgSupplier values with the most records.
if have_data:
    # figure height scales with the number of suppliers (capped at ten)
    ngroups = min(len(df.bgSupplier.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)
    record_counts = df.groupby('bgSupplier')['DisclosureId'].count()
    top_ten = record_counts.sort_values(ascending=False)[:10]
    ax = top_ten.plot.barh(figsize=fsiz)
    for axis_name in ("y","x"):
        ax.tick_params(axis=axis_name, labelsize=14)
    plt.xlabel('Number of records',fontsize=16);
    plt.title(f'Number of chemical records using {cas} by Supplier',fontsize=16);


## Which primarySuppliers are most frequently associated with this chemical?
The primarySupplier is a disclosure-level value and is not necessarily related to a given chemical record. Nevertheless, it indicates the supplier with the most records in a fracking job.

In [None]:
# Horizontal bar chart: the ten primarySupplier values with the most records.
if have_data:
    # figure height scales with the number of primarySuppliers (capped at ten)
    ngroups = min(len(df.primarySupplier.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)
    record_counts = df.groupby('primarySupplier')['DisclosureId'].count()
    top_ten = record_counts.sort_values(ascending=False)[:10]
    ax = top_ten.plot.barh(figsize=fsiz);
    for axis_name in ("y","x"):
        ax.tick_params(axis=axis_name, labelsize=14)
    plt.xlabel('Number of records',fontsize=16);
    plt.title(f'Number of chemical records using {cas} associated with primarySupplier',fontsize=16);


## Total mass reported by operator: Who uses the biggest quantity?
The field reported below is bgOperatorName.

In [None]:

# Horizontal bar chart: the ten operators with the largest total reported mass.
if have_mass:
    with_mass = df[df.mass>0]
    # figure height scales with the number of operators reporting mass (capped at ten)
    ngroups = min(len(with_mass.bgOperatorName.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)
    mass_totals = with_mass.groupby('bgOperatorName')['mass'].sum()
    top_ten = mass_totals.sort_values(ascending=False)[:10]
    ax = top_ten.plot.barh(figsize=fsiz);
    ax.tick_params(axis="y", labelsize=14)
    # mass totals can be long numbers, so the x tick labels are rotated
    ax.tick_params(axis="x", labelsize=14,rotation=75)
    plt.xlabel('Mass (pounds)',fontsize=16);
    plt.title(f'Total mass of {cas} used by operator',fontsize=16);

## Total mass reported by primarySupplier - biggest associations

In [None]:
# Horizontal bar chart: the ten primarySuppliers associated with the largest
# total reported mass.
if have_mass:
    mass_by_supplier = df.groupby('primarySupplier')['mass'].sum()\
         .sort_values(ascending=False)[:10]
    # Size the figure from the bars actually drawn. This was previously
    # computed from bgOperatorName (copied from the operator chart), so the
    # figure height did not match the number of supplier bars.
    ngroups = len(mass_by_supplier)
    fsiz = (8,(ngroups*0.4)+1)
    ax = mass_by_supplier.plot.barh(figsize=fsiz);
    ax.tick_params(axis="y", labelsize=14)
    # mass totals can be long numbers, so the x tick labels are rotated
    ax.tick_params(axis="x", labelsize=14,rotation=75)
    plt.xlabel('Mass (pounds)',fontsize=16);
    plt.title(f'Total mass of {cas} associated with primarySupplier',fontsize=16);

<a id='patterns'></a>

In [None]:
# Strip plot of per-record mass (log x axis) for the six operators with the
# largest total mass, to show how much each operator's usage varies.
if have_mass:
    display(md('--- \n ---'))
    display(md('## Patterns of use'))
    display(md('### Across some of the larger operators'))

    import seaborn as sns
    sns.set(style="whitegrid")
    # the (up to) six operators with the largest summed mass
    ops =df.groupby('bgOperatorName',as_index=False)['mass'].sum().sort_values(by='mass',
                                                                                 ascending=False)[:6].bgOperatorName.tolist()
    ngroups = len(ops)
    t = df[df.bgOperatorName.isin(ops)]
    #print(len(t))
    #t = t[t.record_flags.str.contains('M')]
    # one unit of figure height per operator row
    fig = plt.figure(figsize=(10,ngroups+1))
    ax = sns.stripplot(x=t.mass,y=t.bgOperatorName,jitter=.2,alpha=.7)
    plt.xlabel(f'mass in pounds: {cas}',fontsize=14);
    plt.title(f'Variability in use of {cas} across some operators',fontsize=16)
    ax.set(xscale='log')
    # lower limit floored at 0.1 so a zero minimum cannot break the log axis
    ax.set(xlim=(max(0.1,t.mass.min()),t.mass.max()*1.1))
    ax.tick_params(axis="x", labelsize=14)
    ax.tick_params(axis="y", labelsize=14)
    if len(t) < 5000: # provide more detailed grid (too many points swamps it out)
        locmaj = matplotlib.ticker.LogLocator(base=10,subs='all') 
    else:
        locmaj = matplotlib.ticker.LogLocator(base=10) #,subs='all') 
    ax.xaxis.set_major_locator(locmaj)


In [None]:
# Strip plot of per-record mass (log x axis) for the six primarySuppliers
# associated with the largest total mass.
if have_mass:
    import seaborn as sns
    display(md('### Across some of the primarySuppliers'))

    sns.set(style="whitegrid")
    # the (up to) six primarySuppliers with the largest summed mass
    ops =df.groupby('primarySupplier',as_index=False)['mass'].sum().sort_values(by='mass',
                                                                                 ascending=False)[:6].primarySupplier.tolist()
    ngroups = len(ops)
    t = df[df.primarySupplier.isin(ops)]
    #print(len(t))
    #t = t[t.record_flags.str.contains('M')]
    # one unit of figure height per supplier row
    fig = plt.figure(figsize=(10,ngroups+1))
    ax = sns.stripplot(x=t.mass,y=t.primarySupplier,jitter=.2,alpha=.7)
    plt.xlabel(f'mass in pounds: {cas}',fontsize=14);
    plt.title(f'Variability in use of {cas} across some primarySuppliers',fontsize=16)
    ax.set(xscale='log')
    # lower limit floored at 0.1 so a zero minimum cannot break the log axis
    ax.set(xlim=(max(0.1,t.mass.min()),t.mass.max()*1.1))
    ax.tick_params(axis="x", labelsize=14)
    ax.tick_params(axis="y", labelsize=14)
    if len(t) < 5000: # provide more detailed grid (too many points swamps it out)
        locmaj = matplotlib.ticker.LogLocator(base=10,subs='all') 
    else:
        locmaj = matplotlib.ticker.LogLocator(base=10) #,subs='all') 
    ax.xaxis.set_major_locator(locmaj)


<a id='biggest100'></a>

In [None]:
# Heading for the largest-uses table: with fewer than 100 disclosures carrying
# a mass, the table below simply lists all of them.
if have_mass:
    heading = ('## All disclosures with mass' if massPres < 100
               else '--- \n ---\n## Disclosures with the 100 largest uses')
    display(md(heading))
    # display(md('**mass** is in pounds. **map_link** provides a google map of site.  Note that the Google satellite map may be older than the well pad.'))

In [None]:
# Interactive table of the (up to) 100 records with the largest mass.
if have_mass:
    mdf = df[df.mass>0].sort_values('mass',ascending=False).reset_index(drop=True)
    shown_cols = ['mass','massSource','FF_disc','bgStateName','bgCountyName',#'map_link',
                  'bgOperatorName','TotalBaseWaterVolume',
                  'date','primarySupplier','bgSupplier','TradeName_trunc']
    largest = mdf[shown_cols].rename({'FF_disc':'APINumber (FF link)'},axis=1).head(100)
    iShow(largest,
         classes="display compact cell-border")


<a id='raw'></a>

---
---
# Raw fields for this chemical
These are the fields in the raw FracFocus data as they are entered for the records positively identified as this chemical.  These raw fields often have typos, variations on a chemical name, etc.  In some cases, two variations may seem identical, but probably differ by non-printing characters.

In [None]:

# Table of the raw CASNumber spellings mapped to this chemical, with record
# counts and the operators / primarySuppliers that used each spelling.
if have_data:
    display(md('## Raw CASNumber variations encountered for this chemical'))

    out = df.groupby('CASNumber',as_index=False)['bgCAS'].count().reset_index(drop=True)
    by_raw_cas = df.groupby('CASNumber')
    for source_col, label in (('bgOperatorName','Operators'),
                              ('primarySupplier','primarySupplier')):
        # distinct values per spelling, rendered as one string by th.xlate_to_str
        who = by_raw_cas[source_col].apply(set).reset_index()
        who.columns = ['CASNumber',label]
        who[label] = who[label].map(lambda x: th.xlate_to_str(x,maxlen=10))
        out = pd.merge(out,who,on='CASNumber',how='left')
    out = out.reset_index(drop=True)

    out.columns = ['CASNumber','Number of records','Operators','primarySuppliers']
    iShow(out.sort_values('Number of records',ascending=False),
          maxBytes=0, classes="display compact cell-border")

In [None]:
# Table of the raw IngredientName spellings mapped to this chemical, with
# record counts and the operators / primarySuppliers that used each spelling.
if have_data:
    display(md('## Raw IngredientName variations encountered for this chemical'))

    out = df.groupby('IngredientName',as_index=False)['bgCAS'].count().reset_index(drop=True)
    by_raw_name = df.groupby('IngredientName')
    for source_col, label in (('bgOperatorName','Operators'),
                              ('primarySupplier','primarySupplier')):
        # distinct values per spelling, rendered as one string by th.xlate_to_str
        who = by_raw_name[source_col].apply(set).reset_index()
        who.columns = ['IngredientName',label]
        who[label] = who[label].map(lambda x: th.xlate_to_str(x,maxlen=10))
        out = pd.merge(out,who,on='IngredientName',how='left')
    out = out.reset_index(drop=True)

    out.columns = ['IngredientName','Number of records','Operators','primarySuppliers']
    iShow(out.sort_values('Number of records',ascending=False),
          maxBytes=0,classes="display compact cell-border")

In [None]:
# Table of the (truncated) raw TradeName values recorded for this chemical,
# with record counts and the operators / suppliers that used each one.
if have_data:
    display(md('## Raw TradeName variations encountered for this chemical\n Names truncated to 30 characters before processing'))

    df.TradeName = np.where(df.TradeName.isna(),'MISSING',df.TradeName)
    # The table groups on TradeName_trunc, and groupby drops null keys, so the
    # 'MISSING' label must be applied to that column too; filling TradeName
    # alone left records without a trade name out of the table.
    df['TradeName_trunc'] = np.where(df.TradeName_trunc.isna(),'MISSING',df.TradeName_trunc)

    # number of records per trade name
    out = df.groupby('TradeName_trunc',as_index=False)['bgCAS'].count().reset_index(drop=True)
    # distinct operators per trade name, rendered as one string by th.xlate_to_str
    gb1 = df.groupby('TradeName_trunc')['bgOperatorName'].apply(set).reset_index()
    gb1.columns = ['TradeName_trunc','Operators']
    gb1.Operators = gb1.Operators.map(lambda x: th.xlate_to_str(x,maxlen=10))
    out = pd.merge(out,gb1,on='TradeName_trunc',how='left')
    # distinct primarySuppliers per trade name
    gb2 = df.groupby('TradeName_trunc')['primarySupplier'].apply(set).reset_index()
    gb2.columns = ['TradeName_trunc','primarySupplier']
    gb2.primarySupplier = gb2.primarySupplier.map(lambda x: th.xlate_to_str(x,maxlen=10))
    out = pd.merge(out,gb2,on='TradeName_trunc',how='left').reset_index(drop=True)
    # distinct bgSuppliers per trade name
    gb3 = df.groupby('TradeName_trunc')['bgSupplier'].apply(set).reset_index()
    gb3.columns = ['TradeName_trunc','bgSupplier']
    gb3.bgSupplier = gb3.bgSupplier.map(lambda x: th.xlate_to_str(x,maxlen=10))
    out = pd.merge(out,gb3,on='TradeName_trunc',how='left').reset_index(drop=True)

    out.columns = ['TradeName_trunc','Number of records','Operators','primarySuppliers','bgSupplier']
    iShow(out.sort_values('Number of records',ascending=False),
          maxBytes=0,classes="display compact cell-border")

In [None]:
# Table of the (truncated) raw Purpose values recorded for this chemical,
# with record counts and the operators / suppliers that used each one.
if have_data:
    display(md('## Raw Purpose variations encountered for this chemical\n Values truncated to 30 characters before processing'))
    # label missing purposes, then cut long ones to 30 characters plus an ellipsis
    df.Purpose = np.where(df.Purpose.isna(),'MISSING',df.Purpose)
    is_long = df.Purpose.str.len()>30
    df['Purp_trunc'] = np.where(is_long,
                                df.Purpose.str[:30]+'...',
                                df.Purpose)

    out = df.groupby('Purp_trunc',as_index=False)['bgCAS'].count().reset_index(drop=True)
    by_purpose = df.groupby('Purp_trunc')
    for source_col, label in (('bgOperatorName','Operators'),
                              ('primarySupplier','primarySupplier'),
                              ('bgSupplier','bgSupplier')):
        # distinct values per purpose, rendered as one string by th.xlate_to_str
        who = by_purpose[source_col].apply(set).reset_index()
        who.columns = ['Purp_trunc',label]
        who[label] = who[label].map(lambda x: th.xlate_to_str(x,maxlen=10))
        out = pd.merge(out,who,on='Purp_trunc',how='left')
    out = out.reset_index(drop=True)

    out.columns = ['Purp_trunc','Number of records','Operators','primarySuppliers','bgSupplier']
    iShow(out.sort_values('Number of records',ascending=False),
          maxBytes=0, classes="display compact cell-border")

<a id='massCompanies'></a>

In [None]:
# Per-operator breakdown of records lacking a mass: how many there are and
# what percentage of them show each kind of missing or problematic input.
if have_data and massPres < pres:
    display(md('---\n ---\n## Operators not reporting enough data for mass calculations for this chemical'))
    display(md('Shown are operators and how many records without mass, as well as the types of information missing or in error for each company. In addition, not all disclosures are curated by Open-FF yet.'))

    # one boolean column per possible reason
    df['not_curated'] = df.carrier_status=='unknown'
    df['noPerc'] = ~(df.PercentHFJob>0)
    df['noTBWV'] = ~df.has_TBWV
    df['noWaterCarrier'] = ~df.has_water_carrier
    df['outOfTolerance'] = ~df.within_total_tolerance
    reason_cols = ['not_curated','noTBWV','noWaterCarrier','outOfTolerance','noPerc']

    massless_by_operator = df[~(df.mass>0)].groupby('bgOperatorName',as_index=False)
    tmp = massless_by_operator['DisclosureId'].count()
    gb = massless_by_operator[reason_cols].sum()
    tmp = pd.merge(tmp,gb,on='bgOperatorName',how='left').reset_index(drop=True)

    # convert each reason count to a whole-number percentage of the operator's massless records
    for reason in reason_cols:
        tmp[reason] = ((tmp[reason]/tmp.DisclosureId)*100).round(0)

    tmp.columns = ['Operator','records without mass','% carrier not curated','% without TBWV','% no water carrier record',
                   '% total percent out of tolerance','% no PercentHFJob']
    iShow(tmp.sort_values('records without mass',ascending=False),
          maxBytes=0, classes="display compact cell-border")    