In [None]:
%run "../catalog_common.py"
ID_header('Open-FF:  Detailed Chemical Report', incl_links=True,
          link_up_level=True)
set_page_param()


In [None]:
#preamble to analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib.ticker
from IPython.display import Markdown as md
from IPython.display import HTML, display
from time import sleep

from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
from itables import show as iShow
import itables.options as opt

In [None]:
# NOTE(review): this threshold is not referenced anywhere else in the notebook
# as shown here — confirm whether it is still needed.
min_num_for_big = 29

This is a script-generated report about a specific chemical used in the Hydraulic Fracking industry. The data are supplied by the industry-sponsored website [FracFocus](https://fracfocus.org/), but analyzed by the independent project [Open-FF](https://frackingchemicaldisclosure.wordpress.com/).
Structure images courtesy of [ChemID](https://chem.nlm.nih.gov/chemidplus/).

In [None]:
# Read every record for the target chemical and mark the ones with a positive
# calculated mass.
alldf = pd.read_csv('data.csv', low_memory=False)
alldf['has_mass'] = alldf['calcMass'] > 0

# The analyses below work on an independent copy of the rows flagged
# in_std_filtered; alldf is kept for the unfiltered counts.
in_filtered = alldf['in_std_filtered']
df = alldf[in_filtered].copy()

# Dataset-wide flags used by later cells to decide which sections to render.
have_data = len(df) > 0
have_mass = df['calcMass'].max() > 0



In [None]:
# Report-wide reference values, looked up later as ref_df.at[<varname>, 'value'].
ref_df = pd.read_csv('ref.csv').set_index('varname')


In [None]:
# md(f'This report (created on {ref_df.at["today","value"].split()[0]}) was generated from FracFocus bulk data downloaded on {ref_df.at["data_date","value"]}.')

In [None]:
# Name lookup table (EPA and other names per bgCAS). Missing cells become the
# sentinel '_empty_' so later cells can compare against a plain string.
names = pd.read_csv('bgCAS.csv').fillna('_empty_')

In [None]:
# Resolve the display names for the target chemical.
cas = ref_df.at['target_cas','value'].strip()

# Look the CAS up once; fail with a clear message instead of an opaque
# IndexError when the name table has no row for it.
cas_rows = names[names.bgCAS==cas]
if cas_rows.empty:
    raise ValueError(f'target CAS {cas!r} was not found in bgCAS.csv')
cas_row = cas_rows.iloc[0]
epa_sub = cas_row['epa_Substance_Name']
cas_orig = cas_row['bgIngredientName']
#eh_name = cas_row['eh_IngredientName']
epa_reg = cas_row['epa_Registry_Name']

# Most frequent raw IngredientName in the filtered records. Only the expected
# failures fall back to '': no rows to count (IndexError) or no such column
# (AttributeError). Anything else should surface.
try:
    comm_name = df.IngredientName.value_counts().index[0]
except (IndexError, AttributeError):
    comm_name = ''

#print(f'{epa_sub}\n{epa_reg}\n{cas_orig}\n{eh_name}')

# Substance name: use the EPA name, or bgIngredientName when the EPA name is
# the '_empty_' sentinel.
if epa_sub=='_empty_':
    subname = cas_orig
    subtitle = 'Substance name (SciFinder)'
else:
    subname = epa_sub
    subtitle = 'Substance name (EPA list)'

# Registry name: only shown when the EPA list has one.
if epa_reg != '_empty_':
    regname = epa_reg
    regtitle = 'Registry name (EPA list)'
else:
    regname = ''; regtitle = ''

# if eh_name !='_empty_':
#     ehname = eh_name
#     ehtitle = 'Name used in Elsner/Hoelzer'
# else:
#     ehname = ''; ehtitle = ''

# Most common raw name: only shown when one was found.
if comm_name:
    commname = comm_name
    commtitle = 'Most frequently used name \nin FF IngredientName'
else:
    commname = ''; commtitle = ''

In [None]:
from html import escape  # stdlib; imported here because only this cell builds raw HTML


def _name_rows(title, value):
    """Return two table rows: a small caption row and a large value row.

    Both strings are HTML-escaped because the values come from data files and
    may contain characters such as '&' or '<'.
    """
    return f"""<tr>
                        <td><p style="text-align: center; font-size:100%">{escape(str(title))}</p></td>
                    </tr>
                    <tr>
                        <td><p style="text-align: center; font-size:200%">{escape(str(value))}</p></td>
                    </tr>"""


# The four non-numeric identifiers are rendered at half the normal font size
# (presumably so the longer labels fit the column).
chemid_font = 200 if cas in ['conflictingID','proprietary','ambiguousID','sysAppMeta'] else 400
chemid = f'<p style="text-align: center; font-size:{chemid_font}%">{escape(cas)}</p>'

# Optional rows: omitted entirely when the corresponding name is empty.
regcode = _name_rows(regtitle, regname) if regname != '' else ''
commcode = _name_rows(commtitle, commname) if commname != '' else ''
# (The Elsner/Hoelzer name row is disabled; its source column is commented out
# in the name-resolution cell.)

# Two-column banner: CAS number + structure image on the left, names on the
# right. Every <td>/<tr> is explicitly closed.
display(HTML(f"""<style>
                </style>
                <table style='margin: 0 auto' >
                <tr>
                <td width=25%>
                    <table >
                    <tr>
                        <td><p style="text-align: center; font-size:100%">CAS Registration Number</p></td>
                    </tr>
                    <tr>
                        <td>{chemid}</td>
                    </tr>
                    <tr>
                        <td  align="left">
                            <img src="https://chem.nlm.nih.gov/chemidplus/structure/{escape(cas)}" alt="no image available from ChemID" />
                        </td>
                    </tr>
                    </table>
                </td>
                <td width=75%>
                    <table >
                    {_name_rows(subtitle, subname)}
                    {regcode}
                    {commcode}
                    </table>
                </td>
                </tr>
            </table>"""))

In [None]:
# Warn the reader up front when the filtered data is empty, or when it has
# records but none with a calculated mass.
if not have_data:
    display(md('## THERE ARE NO RECORDS IN THE FILTERED DATA FOR THIS CHEMICAL\n\nOnly limited analysis is possible\n\n'))
elif not have_mass:
    for notice in ('## There are no records for this chemical for which mass is calculable.',
                   '### Much of the analysis in this report will be left blank'):
        display(md(notice))
    


In [None]:
if have_data:
    # Parse dates and derive the disclosure year.
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    # FF includes some old data but before 2011 are very sparse.
    # .copy() makes the filtered frame independent, so the column assignments
    # in later cells do not trigger SettingWithCopyWarning.
    df = df[df['year']>2010].copy()
    display(md('Disclosures before 2011 are not included in these analyses.'))

    # Normalise place names to title case for display.
    df['bgStateName'] = df['bgStateName'].str.title()
    df['bgCountyName'] = df['bgCountyName'].str.title()

    # The year filter may have removed the remaining records (or the only ones
    # with mass), so refresh the flags that guard every later section.
    have_data = len(df)>0
    have_mass = df.calcMass.max()>0

### Table of Contents
- [General abundance](#frequency) within FracFocus
- [Detailed abundance for all records](#detailedAbundance)
  - Uses by percentage of the fracking fluid
  - Uses by mass
- [Companies](#companies) supplying and using this chemical
- [Patterns of use](#patterns)
- [Disclosures list of biggest 100 uses](#biggest100) with map links
- [Tables of raw](#raw) Names, CAS Numbers and Trade names associated with this chemical

<a id='frequency'></a>
# General abundance

In [None]:
# Dataset-wide disclosure totals used as denominators below, read from ref.csv
# under the keys 'tot_num_disc_less_FFV1' and 'tot_num_disc_fil'.
total_Upload, fil_Upload = (
    int(ref_df.at[key, 'value'])
    for key in ('tot_num_disc_less_FFV1', 'tot_num_disc_fil')
)

In [None]:
# Distinct disclosures (UploadKey) reporting this chemical:
#   pres     - in the filtered data
#   massPres - in the filtered data with a positive calculated mass
#   totpres  - in the unfiltered data
# dropna=False keeps the count identical to len(<series>.unique()).
pres = df.UploadKey.nunique(dropna=False)
massPres = df.loc[df.calcMass>0, 'UploadKey'].nunique(dropna=False)
totpres = alldf.UploadKey.nunique(dropna=False)

# Marker opacity for the scatter plots: the more disclosures, the more
# transparent each marker.
for upper_bound, opacity in ((300, 1), (2000, .6), (20000, .35)):
    if totpres<upper_bound:
        alpha = opacity
        break
else:
    alpha = .2


In [None]:

if have_data:
    # `pres` is counted in the filtered data, so the filtered total is the
    # matching denominator. The same fraction now gates the pie chart and
    # labels it (the gate previously divided by the unfiltered total).
    frac_filtered = pres/fil_Upload
    if frac_filtered>0.01: # don't show pie chart if presence is very small
        x = [pres,fil_Upload-pres]
        labels = [f'{cas} present','not reported']
        plt.figure(figsize=(5, 5))
        plt.pie(x,explode=[0.15,0])
        plt.legend(labels=labels)
        title = f'Percent of all FracFocus disclosures that report the use of {cas}:'
        title += f'\n {round_sig(frac_filtered*100,3)}% '
        plt.title(title,fontsize=16);
        plt.show()

    # Counts and percentages for the unfiltered set, the filtered set, and the
    # filtered records that have a calculated mass.
    display(md(f"""||Unfiltered data set|Filtered data set|with calculated mass|
|---|---|---|---|
|Num disclosures|{totpres:,}|{pres:,}|{massPres:,}*|
|% of disclosures|{round_sig(totpres/total_Upload*100,3)}%|{round_sig(frac_filtered*100,3)}%|{round_sig(massPres/fil_Upload*100,3)}%|"""))

    # Explain the asterisk only when some disclosures lack a calculated mass.
    if massPres<pres:
        display(md('> *See [below](#massCompanies) for list of companies not reporting enough info for mass calculations'))

<a id='detailedAbundance'></a>
# Detailed abundance


Using all **filtered** data for this chemical.

### Uses by percent of the fracking job
This measure roughly shows a comparison of the concentration of the chemical in the whole fracking job (including the base fluid which is typically over 80%).  Only the disclosures where the sum of PercentHFJob is within 5% of 100% are considered "valid." 

In [None]:
# df[df.within_total_tolerance].PercentHFJob.max()
# df.within_total_tolerance.sum()

In [None]:
if have_data:
    # Records with a positive percentage on a disclosure whose total is within
    # tolerance. `cond` is reused by the following cells.
    cond = (df.PercentHFJob>0) & (df.within_total_tolerance)
    valid = df[cond]

    # y-position for the "no valid percentage" ticks: 5% of the valid range
    # below the smallest valid value, or 0 when no percentage is positive.
    if df.PercentHFJob.max()>0:
        lowest, highest = valid.PercentHFJob.min(), valid.PercentHFJob.max()
        df['not_present'] = lowest - (highest-lowest)*0.05
    else:
        df['not_present'] = 0

    # Circles for valid records, orange ticks for the rest, on one axis.
    ax = valid.plot(x='date', y='PercentHFJob', style='o', alpha=alpha,
                    figsize=(16,6), legend=False)
    df[~cond].plot(x='date', y='not_present', style='|', alpha=1,
                   color='orange', ms=20, ax=ax, legend=False)
    ax.set_ylabel('Percent of Job', fontsize=16)
    ax.set_title(f'Percent of job that is {cas} - linear version', fontsize=16)
    ax.grid()
    for axis_name in ('y', 'x'):
        ax.tick_params(axis=axis_name, labelsize=14)
    

In [None]:
if have_mass:
    # Legend for the plot above: how many records fall in each marker class.
    n_valid = len(df[cond])
    n_invalid = len(df[~cond])
    display(md(f'##### Number of records with valid percentage data (shown by blue circles): {n_valid:,}\n'))
    display(md(f'##### Number of records without valid percentage data (shown by orange bars): {n_invalid:,}\n'))
    display(md('---'))

In [None]:
if have_mass:
    # Same valid-percentage records as the linear plot, on a log y-axis.
    valid = df[cond]
    ax = valid.plot(x='date', y='PercentHFJob', style='o', alpha=alpha,
                    figsize=(16,6))
    ax.set_ylabel('Percent of Job', fontsize=16)
    ax.set_title(f'Percent of Job that is {cas} - log version', fontsize=16)
    ax.set_yscale('log')
    for axis_name in ('y', 'x'):
        ax.tick_params(axis=axis_name, labelsize=14)
    ax.yaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10, subs='all'))
    # The lower limit is clamped to 1e-6, which keeps it positive for the log axis.
    y_low = max(0.000001, df.PercentHFJob.min())
    y_high = df.PercentHFJob.max()*1.1
    ax.set_ylim((y_low, y_high))
    ax.grid()

    # Quartile guide lines across the full date span of the valid records.
    lns = list(np.percentile(valid.PercentHFJob, [25,50,75]))
    first_date, last_date = valid.date.min(), valid.date.max()
    for level in lns:
        plt.hlines(level, first_date, last_date, color='black')

    # Text summary of the same quartiles plus the maximum.
    summary_values = lns + [valid.PercentHFJob.max()]
    s = 'PERCENTILES:\n' + ''.join(
        f' -- {tag}:  {float(value):.6} percent\n'
        for tag, value in zip(('25%', '50%', '75%', 'max'), summary_values)
    )
    print(s)


<a id='by_mass'></a>
### Uses by mass
This measure shows a comparison of the absolute quantity of a chemical used in a fracking job. Masses are calculated from standard information provided in many chemical records, but a substantial number of disclosures do not provide sufficient data.

In [None]:
if have_mass:
    # y-position for the "no mass" ticks: 5% of the mass range below the
    # smallest mass (0 if no mass is positive).
    baseline = 0
    if df.calcMass.max()>0:
        baseline = df.calcMass.min() - (df.calcMass.max()-df.calcMass.min())*0.05
    # Only records without a mass get the baseline; the rest are NaN.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df['noMass'] = np.where(df.has_mass, np.nan, baseline)

    # Circles for records with a mass, orange ticks for those without.
    ax = df[df.calcMass>0].plot('date','calcMass', style='o', alpha=alpha,
                figsize=(16,6),legend=False)
    df[~df.has_mass].plot('date','noMass', style='|', alpha=1, ms=20, color='orange', legend=False,
                          ax=ax)
    plt.ylabel('Mass in pounds',fontsize=16);
    plt.title(f'Mass of {cas} - linear version',fontsize=16);
    ax.grid()
    ax.tick_params(axis="y", labelsize=14)
    ax.tick_params(axis="x", labelsize=14)

In [None]:
if have_mass:
    # Legend for the plot above: how many records fall in each marker class.
    n_with_mass = len(df[df.has_mass])
    n_without_mass = len(df[~df.has_mass])
    display(md(f'##### Number of records with a calculable mass (shown by blue circles):   {n_with_mass:,}\n'))
    display(md(f'##### Number of records without a calculable mass (shown by orange bars): {n_without_mass:,}\n'))
    display(md('---'))

In [None]:
if have_mass:
    # Records with a positive calculated mass, on a log y-axis.
    with_mass = df[df.calcMass>0]
    ax = with_mass.plot(x='date', y='calcMass', style='o', alpha=alpha,
                        legend=False, figsize=(16,6))
    ax.set_ylabel('Mass in pounds', fontsize=16)
    ax.set_title(f'Mass of {cas} - log version', fontsize=16)
    ax.set_yscale('log')
    for axis_name in ('y', 'x'):
        ax.tick_params(axis=axis_name, labelsize=14)
    ax.yaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10, subs='all'))
    # The lower limit is clamped to 0.01, which keeps it positive for the log axis.
    y_low = max(0.01, df.calcMass.min())
    y_high = df.calcMass.max()*1.2
    ax.set_ylim((y_low, y_high))
    ax.grid()

    # Quartile guide lines across the full date span of the plotted records.
    lns = list(np.percentile(with_mass.calcMass, [25,50,75]))
    first_date, last_date = with_mass.date.min(), with_mass.date.max()
    for level in lns:
        plt.hlines(level, first_date, last_date, color='black')

    # Text summary of the same quartiles plus the maximum, as whole pounds.
    summary_values = lns + [df.calcMass.max()]
    s = 'PERCENTILES:\n' + ''.join(
        f' -- {tag}:  {int(value):,} pounds\n'
        for tag, value in zip(('25%', '50%', '75%', 'max'), summary_values)
    )
    print(s)

<a id='companies'></a>

---
---
# Companies

## Which operators are the most frequent users
The field reported below is bgOperatorName.

In [None]:

if have_data:
    # Figure height grows with the number of operators (at most ten).
    ngroups = min(len(df.bgOperatorName.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)

    # Ten operators with the most records for this chemical.
    record_counts = df.groupby('bgOperatorName')['UploadKey'].count()
    top_ten = record_counts.sort_values(ascending=False).head(10)
    ax = top_ten.plot.barh(figsize=fsiz)
    ax.tick_params(axis="both", labelsize=14)
    ax.set_xlabel('Number of records', fontsize=16)
    ax.set_title(f'Number of chemical records using {cas} by operator', fontsize=16)

## Which suppliers are most frequently named for this chemical?
The field reported below is bgSupplier.

In [None]:
if have_data:
    # Figure height grows with the number of suppliers (at most ten).
    ngroups = min(len(df.bgSupplier.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)

    # Ten suppliers named on the most records for this chemical.
    record_counts = df.groupby('bgSupplier')['UploadKey'].count()
    top_ten = record_counts.sort_values(ascending=False).head(10)
    ax = top_ten.plot.barh(figsize=fsiz)
    ax.tick_params(axis="both", labelsize=14)
    ax.set_xlabel('Number of records', fontsize=16)
    ax.set_title(f'Number of chemical records using {cas} by Supplier', fontsize=16)


## Which primarySuppliers are most frequently associated with this chemical?
The primarySupplier is a disclosure-level value and is not necessarily related to a given chemical record. Nevertheless, it indicates the supplier with the most records in a fracking job.

In [None]:
if have_data:
    # Figure height grows with the number of primarySuppliers (at most ten).
    ngroups = min(len(df.primarySupplier.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)

    # Ten primarySuppliers associated with the most records for this chemical.
    record_counts = df.groupby('primarySupplier')['UploadKey'].count()
    top_ten = record_counts.sort_values(ascending=False).head(10)
    ax = top_ten.plot.barh(figsize=fsiz)
    ax.tick_params(axis="both", labelsize=14)
    ax.set_xlabel('Number of records', fontsize=16)
    ax.set_title(f'Number of chemical records using {cas} associated with primarySupplier', fontsize=16)


## Total mass reported by operator: Who uses the biggest quantity?
The field reported below is bgOperatorName.

In [None]:

if have_mass:
    # Only records with a positive calculated mass contribute.
    with_mass = df[df.calcMass>0]

    # Figure height grows with the number of operators (at most ten).
    ngroups = min(len(with_mass.bgOperatorName.unique()),10)
    fsiz = (8,(ngroups*0.4)+1)

    # Ten operators with the largest summed mass.
    mass_totals = with_mass.groupby('bgOperatorName')['calcMass'].sum()
    top_ten = mass_totals.sort_values(ascending=False).head(10)
    ax = top_ten.plot.barh(figsize=fsiz)
    ax.tick_params(axis="y", labelsize=14)
    ax.tick_params(axis="x", labelsize=14, rotation=75)
    ax.set_xlabel('Mass (pounds)', fontsize=16)
    ax.set_title(f'Total mass of {cas} used by operator', fontsize=16)

## Total mass reported by primarySupplier - biggest associations

In [None]:
if have_mass:
    # Ten primarySuppliers with the largest summed mass.
    supplier_mass = df.groupby('primarySupplier')['calcMass'].sum()\
         .sort_values(ascending=False)[:10]

    # Size the figure from the bars actually drawn. The height was previously
    # derived from the number of operators, a copy-paste slip from the
    # operator chart.
    ngroups = len(supplier_mass)
    fsiz = (8,(ngroups*0.4)+1)

    ax = supplier_mass.plot.barh(figsize=fsiz);
    ax.tick_params(axis="y", labelsize=14)
    ax.tick_params(axis="x", labelsize=14,rotation=75)
    plt.xlabel('Mass (pounds)',fontsize=16);
    plt.title(f'Total mass of {cas} associated with primarySupplier',fontsize=16);

<a id='patterns'></a>

In [None]:
if have_mass:
    for heading in ('--- \n ---', '## Patterns of use', '### Across some of the larger operators'):
        display(md(heading))

    sns.set(style="whitegrid")

    # The six operators with the largest summed mass, and all of their records.
    mass_by_operator = df.groupby('bgOperatorName',as_index=False)['calcMass'].sum()
    ops = (mass_by_operator.sort_values(by='calcMass', ascending=False)
                           .head(6).bgOperatorName.tolist())
    ngroups = len(ops)
    t = df[df.bgOperatorName.isin(ops)]

    # One jittered strip of per-record masses per operator, on a log x-axis.
    fig = plt.figure(figsize=(10,ngroups+1))
    ax = sns.stripplot(x=t.calcMass, y=t.bgOperatorName, jitter=.2, alpha=.7)
    ax.set_xlabel(f'mass in pounds: {cas}', fontsize=14)
    ax.set_title(f'Variability in use of {cas} across some operators', fontsize=16)
    ax.set_xscale('log')
    # The lower limit is clamped to 0.1, which keeps it positive for the log axis.
    ax.set_xlim((max(0.1,t.calcMass.min()), t.calcMass.max()*1.1))
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=14)
    ax.xaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10, subs='all'))


In [None]:
if have_mass:
    display(md('### Across some of the primarySuppliers'))

    sns.set(style="whitegrid")

    # The six primarySuppliers with the largest summed mass, and all of their records.
    mass_by_supplier = df.groupby('primarySupplier',as_index=False)['calcMass'].sum()
    ops = (mass_by_supplier.sort_values(by='calcMass', ascending=False)
                           .head(6).primarySupplier.tolist())
    ngroups = len(ops)
    t = df[df.primarySupplier.isin(ops)]

    # One jittered strip of per-record masses per primarySupplier, on a log x-axis.
    fig = plt.figure(figsize=(10,ngroups+1))
    ax = sns.stripplot(x=t.calcMass, y=t.primarySupplier, jitter=.2, alpha=.7)
    ax.set_xlabel(f'mass in pounds: {cas}', fontsize=14)
    ax.set_title(f'Variability in use of {cas} across some primarySuppliers', fontsize=16)
    ax.set_xscale('log')
    # The lower limit is clamped to 0.1, which keeps it positive for the log axis.
    ax.set_xlim((max(0.1,t.calcMass.min()), t.calcMass.max()*1.1))
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=14)
    ax.xaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10, subs='all'))


<a id='biggest100'></a>

In [None]:
if have_mass:
    # With fewer than 100 disclosures carrying a mass, the table below lists
    # them all; otherwise it is a top-100 list.
    heading = ('## All disclosures with mass' if massPres < 100
               else '--- \n ---\n## Disclosures with the 100 largest uses')
    display(md(heading))
    display(md('**calcMass** is in pounds. **map_link** provides a google map of site.  Note that the Google satellite map may be older than the well pad.'))

In [None]:
if have_mass:
    # Columns shown in the interactive table, in display order.
    shown_columns = ['calcMass','bgStateName','bgCountyName','map_link','bgOperatorName',
                     'APINumber','TotalBaseWaterVolume','date','primarySupplier',
                     'bgSupplier','TradeName_trunc']

    # Records with a positive mass, largest first; show at most the first 100.
    mdf = (df[df.calcMass>0]
           .sort_values('calcMass', ascending=False)
           .reset_index(drop=True))
    iShow(mdf[shown_columns].head(100))


<a id='raw'></a>

---
---
# Raw fields for this chemical
These are the fields in the raw FracFocus data as they are entered for the records positively identified as this chemical.  These raw fields often have typos, variations on a chemical name, etc.  In some cases, two variations may seem identical, but probably differ by non-printing characters.

In [None]:

if have_data:
    display(md('## Raw CASNumber variations encountered for this chemical'))

    key = 'CASNumber'
    # Record count per raw CASNumber value.
    out = df.groupby(key,as_index=False)['bgCAS'].count().reset_index(drop=True)

    # Attach, per value, the set of operators and primarySuppliers that used
    # it, converted to text by xlate_to_str.
    for source_col in ('bgOperatorName', 'primarySupplier'):
        users = df.groupby(key)[source_col].apply(set).reset_index()
        users[source_col] = users[source_col].map(lambda x: xlate_to_str(x,maxlen=10))
        out = pd.merge(out,users,on=key,how='left')
    out = out.reset_index(drop=True)

    out.columns = ['CASNumber','Number of records','Operators','primarySuppliers']
    iShow(out,maxBytes=0)

In [None]:
if have_data:
    display(md('## Raw IngredientName variations encountered for this chemical'))

    key = 'IngredientName'
    # Record count per raw IngredientName value.
    out = df.groupby(key,as_index=False)['bgCAS'].count().reset_index(drop=True)

    # Attach, per value, the set of operators and primarySuppliers that used
    # it, converted to text by xlate_to_str.
    for source_col in ('bgOperatorName', 'primarySupplier'):
        users = df.groupby(key)[source_col].apply(set).reset_index()
        users[source_col] = users[source_col].map(lambda x: xlate_to_str(x,maxlen=10))
        out = pd.merge(out,users,on=key,how='left')
    out = out.reset_index(drop=True)

    out.columns = ['IngredientName','Number of records','Operators','primarySuppliers']
    iShow(out,maxBytes=0)

In [None]:
if have_data:
    display(md('## Raw TradeName variations encountered for this chemical\n Names truncated to 30 characters before processing'))

    df.TradeName = np.where(df.TradeName.isna(),'MISSING',df.TradeName)
    # The table groups on TradeName_trunc, and groupby drops NaN keys, so the
    # fill above never reached the table. Label missing truncated names too,
    # so those records are counted under 'MISSING'.
    df['TradeName_trunc'] = df['TradeName_trunc'].fillna('MISSING')

    key = 'TradeName_trunc'
    # Record count per truncated trade name.
    out = df.groupby(key,as_index=False)['bgCAS'].count().reset_index(drop=True)

    # Attach, per trade name, the set of operators, primarySuppliers and
    # suppliers that used it, converted to text by xlate_to_str.
    for source_col in ('bgOperatorName', 'primarySupplier', 'bgSupplier'):
        users = df.groupby(key)[source_col].apply(set).reset_index()
        users[source_col] = users[source_col].map(lambda x: xlate_to_str(x,maxlen=10))
        out = pd.merge(out,users,on=key,how='left')
    out = out.reset_index(drop=True)

    out.columns = ['TradeName_trunc','Number of records','Operators','primarySuppliers','bgSupplier']
    iShow(out,maxBytes=0)

In [None]:
if have_data:
    display(md('## Raw Purpose variations encountered for this chemical\n Values truncated to 30 characters before processing'))

    # Label missing purposes, then cut long ones to 30 characters plus '...'.
    df.Purpose = np.where(df.Purpose.isna(),'MISSING',df.Purpose)
    is_long = df.Purpose.str.len()>30
    df['Purp_trunc'] = np.where(is_long, df.Purpose.str[:30]+'...', df.Purpose)

    key = 'Purp_trunc'
    # Record count per truncated purpose.
    out = df.groupby(key,as_index=False)['bgCAS'].count().reset_index(drop=True)

    # Attach, per purpose, the set of operators, primarySuppliers and
    # suppliers that used it, converted to text by xlate_to_str.
    for source_col in ('bgOperatorName', 'primarySupplier', 'bgSupplier'):
        users = df.groupby(key)[source_col].apply(set).reset_index()
        users[source_col] = users[source_col].map(lambda x: xlate_to_str(x,maxlen=10))
        out = pd.merge(out,users,on=key,how='left')
    out = out.reset_index(drop=True)

    out.columns = ['Purp_trunc','Number of records','Operators','primarySuppliers','bgSupplier']
    iShow(out,maxBytes=0)

<a id='massCompanies'></a>

In [None]:
if have_data and massPres < pres:
    display(md('---\n ---\n## Operators not reporting enough data for mass calculations for this chemical'))
    display(md('Shown are operators and how many records without mass, as well as the types of information missing or in error for each company. In addition, not all disclosures are curated by Open-FF yet.'))

    # Per-record flags, one per reason reported in the table.
    df['not_curated'] = df.carrier_status=='unknown'
    df['noPerc'] = ~(df.PercentHFJob>0)
    df['noTBWV'] = ~df.has_TBWV
    df['noWaterCarrier'] = ~df.has_water_carrier
    df['outOfTolerance'] = ~df.within_total_tolerance
    reason_cols = ['not_curated','noTBWV','noWaterCarrier','outOfTolerance','noPerc']

    # Per operator: number of records without a positive mass, and how many
    # of those carry each flag.
    without_mass = df[~(df.calcMass>0)]
    tmp = without_mass.groupby('bgOperatorName',as_index=False)['UploadKey'].count()
    gb = without_mass.groupby('bgOperatorName',as_index=False)[reason_cols].sum()
    tmp = pd.merge(tmp,gb,on='bgOperatorName',how='left').reset_index(drop=True)

    # Turn each flag count into a whole-number percentage of that operator's
    # records without mass.
    for reason in reason_cols:
        tmp[reason] = ((tmp[reason]/tmp.UploadKey)*100).round(0)

    tmp.columns = ['Operator','records without mass','% carrier not curated','% without TBWV','% no water carrier record',
                   '% total percent out of tolerance','% no PercentHFJob']
    iShow(tmp,maxBytes=0)

In [None]:
# from IPython.core.display import HTML
# def css_styling():
#     styles = open("./styles/custom.css", "r").read()
#     return HTML(styles)
# css_styling()