In [None]:
# Notebook bootstrap: put the local project directory first on the import path,
# run the shared support script, then render the report header.
import sys
sys.path.insert(0,'c:/MyDocs/integrated/') # adjust to your setup

# NOTE(review): later cells use names that are never imported in this notebook
# (pd, np, os, fh, hndl, th, md, iShow, mapping, c_plots, showHeader) --
# presumably catalog_support.py defines them in this namespace; confirm.
%run "catalog_support.py" 
showHeader('Recent FracFocus Disclosures')

In [None]:
# Read the full data set from the current repository directory and keep only
# the rows whose in_std_filtered flag is set.
full_df_path = os.path.join(hndl.curr_repo_dir, 'full_df.parquet')
df = fh.get_df(full_df_path)
df = df.loc[df['in_std_filtered']]


In [None]:
def _most_common_name(names):
    """Return the most frequent non-null value of ``names``, or None if every value is null."""
    counts = names.value_counts()  # value_counts() drops nulls and sorts by frequency
    # An all-null group yields an empty result; indexing [0] on it would raise IndexError.
    return counts.index[0] if len(counts) else None


# Re-running this cell would otherwise merge a second 'comm_name' column and
# leave comm_name_x / comm_name_y behind; start from a frame without it.
if 'comm_name' in df.columns:
    df = df.drop(columns='comm_name')

# One display name per bgCAS: the IngredientName that occurs most often for it.
gb2 = df.groupby('bgCAS')['IngredientName'].agg(_most_common_name)
gb2 = gb2.reset_index()
gb2.columns = ['bgCAS','comm_name']
df = pd.merge(df,gb2,on='bgCAS',how='left')

# Percentile rank (0-100) of each record's calcMass among all records with the same bgCAS.
df['perc_rank'] = df.groupby('bgCAS')['calcMass'].rank(pct=True)*100

In [None]:
#preamble to analysis
# import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
from pylab import gca, mpl
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns
import matplotlib.ticker
from math import ceil

from time import sleep
from datetime import datetime, timedelta
# Reference timestamps for this run.
now = datetime.now()
one_year_ago = now - timedelta(days=365)

# Run date as YYYY-MM-DD, plus a long-form version for display text.
today = datetime.today().date().isoformat()
todaytxt = "{:%B %d, %Y}".format(datetime.today())


In [None]:
# Upload log: one row per disclosure with the date it was added and whether it
# has already been covered by a weekly report ('DONE') or not ('NO').
upload_dates_path = os.path.join(hndl.curr_repo_dir, 'curation_files', 'upload_dates.parquet')
updates = fh.get_df(upload_dates_path)
updates['dt_added'] = pd.to_datetime(updates['date_added'])

not_yet_reported = updates['weekly_report'] == 'NO'
already_reported = updates['weekly_report'] == 'DONE'

# Disclosures this report has to cover.
new_dId = updates.loc[not_yet_reported, 'DisclosureId'].tolist()

# Most recent add-date among reported rows, truncated to the calendar day and
# then formatted for display.
latest_done = updates.loc[already_reported, 'dt_added'].max()
last_report_dt = pd.to_datetime(str(latest_done).split()[0])
last_report = last_report_dt.strftime("%B %d, %Y")

In [None]:
# updates[updates.weekly_report!='DONE']

In [None]:
# Report title, spanning last_report through hndl.bulkdata_date.
report_title = f"""## FracFocus disclosures published between {last_report} and {hndl.bulkdata_date}"""
display(md(report_title))

This report summarizes the fracking jobs published recently at [FracFocus](https://fracfocus.org/), the industry-funded chemical disclosure instrument. It includes the Who, What and Where of recent fracking activity: the operating companies, the chemicals they use (including quantities), and where these jobs are located.

This report is produced by [Open-FF](https://frackingchemicaldisclosure.wordpress.com/), an open source project, sponsored by [The FracTracker Alliance](https://www.fractracker.org/), to make the FracFocus data more usable. The nature of the fracking chemical data is complicated and can be difficult to make sense of; Open-FF aims to make those data more digestible. In addition, FracFocus disclosures are plagued by inconsistencies, ambiguous and missing values, and many obvious data errors; Open-FF flags and filters many of those problems. Our hope is that these reports give readers both big-picture perspectives of industry activities as well as enough detail to dig deeply into specifics such as individual chemicals, fracking jobs, or companies.

   
## In this report:

### Overview of all FracFocus disclosures published this period by
- [State](#state), [County](#county), [Operators](#operators), [Land type](#fed) 
- [Water use](#water)
- [Recognized Chemicals of Concern](#chems)
- [Proprietary claims](#proprietary) 

### [Detailed list](#detailed) of all disclosures including:
- Location, company name, API number, water use, chemicals of concern, and Federal/Indian well indicator
- Link to satellite image and Google map of fracking site

#### [Disclosures with masses of Chemicals of Concern](#bigcomp) in the top 10% of FracFocus   

# Overview

In [None]:
# Headline count of disclosures not yet covered by a report.
for md_text in (
    f'### Since {last_report}, the number of new disclosures added to FracFocus is: ',
    f'> # {len(new_dId)}',
):
    display(md(md_text))

In [None]:
# df.columns

# Records belonging to the not-yet-reported disclosures, as an independent copy.
is_new = df['DisclosureId'].isin(new_dId)
newdf = df.loc[is_new].copy()

# Link columns are built row by row; with no rows, create them as empty strings
# so the columns still exist.
if len(newdf) == 0:
    newdf['map_link'] = ''
    newdf['FF_disc'] = ''
else:
    newdf['FF_disc'] = newdf.apply(th.getFFLink, axis=1)
    newdf['map_link'] = newdf.apply(lambda row: th.getMapLink(row, 'map'), axis=1)

In [None]:
# Section heading for the location breakdowns.
where_heading = f'### Where these {len(new_dId)} fracking jobs occurred:'
display(md(where_heading))

<a id='state'></a>
###   ...  by state

In [None]:
# One row per (state, county, disclosure); the count column is only a vehicle
# for collapsing the records and is dropped again.
disclosure_keys = ['StateName', 'CountyName', 'DisclosureId']
gb = newdf.groupby(disclosure_keys, as_index=False)['CASNumber'].count()
gb = gb.drop(columns='CASNumber')
#len(gb)

In [None]:
#print(gb.head())
# Disclosures per state, sorted ascending by count, as a horizontal bar chart.
state_counts = gb.groupby(['StateName'])['DisclosureId'].count().sort_values()
state_counts.plot.barh(
    ylabel='',
    title="Number of this week's disclosures, by state",
    fontsize=12,
);

<a id='county'></a>
###   ...  by county
To see just one state, click on the state name in the sunburst chart.

In [None]:
import plotly.express as px
import plotly.offline as pyo
# Collapse to one row per disclosure, then count disclosures per (state, county).
gbb = (
    newdf.groupby(['StateName', 'CountyName', 'DisclosureId'], as_index=False).size()
         .groupby(['StateName', 'CountyName'], as_index=False)['DisclosureId'].count()
         .rename(columns={'DisclosureId': 'disclosures'})
)

# Set notebook mode to work offline.
pyo.init_notebook_mode()

sunburst_options = dict(
    path=['StateName', 'CountyName'],
    values='disclosures',
    width=700,
    height=700,
    title='States and Counties, share',
)
fig = px.sunburst(gbb, **sunburst_options)

fig.show()


### ... by Map
After zooming in to see individual well markers, click on the marker to see details about the well and disclosure.

Wells are not mapped if a disclosure's geolocation data is inconsistent with its state and county location. (Most of Alaska's disclosures are not represented by acceptable county names, and so are excluded.)

The satellite image is probably older than the construction of the well pad; therefore the pad may not be in the image.

In [None]:
def PointMap(df):
    """Build the point map for the disclosures in ``df``.

    Only rows with ``loc_within_county == 'YES'`` are used. They are collapsed
    to one row per DisclosureId (first non-null value of each column), the
    popup fields are converted to display strings, and the result is handed
    to ``mapping.create_integrated_point_map``.

    Parameters
    ----------
    df : DataFrame
        Must contain DisclosureId, loc_within_county, bgLatitude, bgLongitude,
        APINumber, TotalBaseWaterVolume, date, OperatorName and WellName.

    Returns
    -------
    Whatever ``mapping.create_integrated_point_map`` returns.
    """
    popup_fields = ['APINumber', 'TotalBaseWaterVolume', 'date', 'OperatorName', 'WellName']
    popup_aliases = ['API Number', 'Water Volume', 'Job End Date', 'Operator', 'Well Name']

    located = df[df.loc_within_county == 'YES']
    gb = located.groupby('DisclosureId', as_index=False)[['bgLatitude', 'bgLongitude'] + popup_fields].first()

    gb['date'] = gb['date'].astype('str')
    # Rounded to 3 significant figures; '??' is passed to th.round_sig as guarantee_str.
    rounded_volume = gb['TotalBaseWaterVolume'].map(lambda vol: th.round_sig(vol, 3, guarantee_str='??'))
    gb['TotalBaseWaterVolume'] = rounded_volume + ' gallons'
    gb['APINumber'] = gb['APINumber'].astype('str')

    return mapping.create_integrated_point_map(gb, fields=popup_fields,
                                               aliases=popup_aliases,
                                               use_remote=True)

PointMap(newdf)

<a id='operators'></a>
### ...  by top Operators

In [None]:
# One row per (operator, disclosure); reused by the operator table in the next cell.
gb = newdf.groupby(['OperatorName', 'DisclosureId'], as_index=False)['CASNumber'].count()

# The 15 operators with the most disclosures (ascending, so the largest bar is on top).
top_operators = gb.groupby(['OperatorName'])['DisclosureId'].count().sort_values().tail(15)
top_operators.plot.barh(figsize=(6, 6), title='number of disclosures');

### Operator list for all disclosures

In [None]:
#gbs = newdf.groupby(['OperatorName','DisclosureId'],as_index=False)['StateName'].count()
# Set of states each operator appears in.
gb2 = newdf.groupby(['OperatorName'])['StateName'].apply(set)

# Disclosure count per operator (from the per-disclosure frame `gb` built in the
# previous cell), largest first.
operator_counts = gb.groupby(['OperatorName'], as_index=False)['DisclosureId'].count()
gb1 = operator_counts.sort_values(['DisclosureId', 'OperatorName'], ascending=False)

gb1 = gb1.merge(gb2, on='OperatorName', how='left')
gb1['StateName'] = gb1['StateName'].map(lambda states: th.xlate_to_str(states, sep=', '))
gb1.columns = ['Operator', 'Num disclosures', 'in States']

operator_table = gb1.reset_index(drop=True)
iShow(operator_table, columnDefs=[{'width': '100px', 'targets': "_all"}])



---
<a id='water'></a>
## Water use for this report's disclosures

Thick vertical lines in the graph indicate 25, 50 and 75% percentiles for this report's disclosures.

In [None]:
import seaborn as sns
# Water-use strip plot: one point per disclosure on a log-scaled volume axis,
# one row per state (labelled "<state>__<number of disclosures>"), followed by
# a printed summary of the volume distribution.
sns.set(style="whitegrid")
wdf = newdf.groupby('DisclosureId',as_index=False)[['TotalBaseWaterVolume','StateName',
                                                    'ws_perc_total','perc_pw',
                                                    'perc_sw_high_TDS','perc_sw_low_TDS',
                                                    'perc_gw_high_TDS','perc_gw_low_TDS',
                                                    'perc_other_high_TDS','perc_other_low_TDS']].first()

# Disclosure count per state, stored as strings because it is concatenated
# into the y-axis label below.
sn = wdf.groupby('StateName',as_index=False)['DisclosureId'].count().astype('str')
sn.columns = ['StateName', 'cnt']
wdf = pd.merge(wdf,sn,on='StateName',how='left')
wdf['State_Num'] = wdf.StateName+'__'+wdf.cnt

# 'cnt' holds strings, so a plain sort would be lexicographic ('9' ahead of
# '10'). Compare the numeric values so the states with the most disclosures
# really come first; the row order decides the order of the y-axis categories.
wdf = wdf.sort_values('cnt',ascending=False,
                      key=lambda counts: pd.to_numeric(counts, errors='coerce'))

fig = plt.figure(figsize=(15,8))
ax = sns.stripplot(x=wdf.TotalBaseWaterVolume,y=wdf.State_Num,jitter=.2,alpha=.4,size=10)
plt.xlabel('water volume (gallons):',fontsize=14);
plt.title('Water Use for disclosures published since the last report',fontsize=16)
ax.set(xscale='log')
# Lower limit is never below 10,000 gallons; upper limit leaves 20% headroom.
ax.set(xlim=(max(10000,wdf.TotalBaseWaterVolume.min()),wdf.TotalBaseWaterVolume.max()*1.2))
ax.grid(axis='y')
ax.tick_params(axis="x", labelsize=14)
ax.tick_params(axis="y", labelsize=14)
locmaj = matplotlib.ticker.LogLocator(base=10,subs='all') 
ax.xaxis.set_major_locator(locmaj)

# Quartiles are computed over disclosures with a positive volume only.
lns = list(np.percentile(wdf[wdf.TotalBaseWaterVolume>0].TotalBaseWaterVolume,[25,50,75]))
ax.set_ylim(-0.7,len(sn)-0.3)
for quartile in lns:
    plt.vlines(quartile,-0.7,len(sn),color='black')
totgal = wdf.TotalBaseWaterVolume.sum()

s = f' -- number of empty TBWV: {(~(wdf.TotalBaseWaterVolume>0)).sum()}\n'
s+= ' -- 25%:  {:,} gallons\n'.format(int(lns[0]))
s+= ' -- 50%:  {:,} gallons\n'.format(int(lns[1]))
s+= ' -- 75%:  {:,} gallons\n'.format(int(lns[2]))
s+= ' -- max:  {:,} gallons\n'.format(int(wdf.TotalBaseWaterVolume.max()))
s+= ' -- TOTAL {:,} gallons\n'.format(int(totgal))
print(s)


## Reported Water Sources

In [None]:
# Reported water sources by state. Only disclosures whose water-source
# percentages total exactly 100 are included.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments below are not writes into a slice
# (SettingWithCopyWarning) and are guaranteed to land in wdf.
wdf = wdf[wdf.ws_perc_total==100].copy()

# Combine the high- and low-TDS percentages of each source type.
wdf['perc_sw'] = (wdf.perc_sw_low_TDS + wdf.perc_sw_high_TDS)
wdf['perc_gw'] = (wdf.perc_gw_low_TDS + wdf.perc_gw_high_TDS)
wdf['perc_other'] = (wdf.perc_other_low_TDS + wdf.perc_other_high_TDS)

gb = wdf.groupby('StateName', as_index=False)[['ws_perc_total','perc_pw',
                                                    'perc_sw',
                                                    'perc_gw',
                                                    'perc_other']].sum()

# Every included disclosure contributes exactly 100 to ws_perc_total, so
# dividing the summed percentages by it gives the state's average share and
# ws_perc_total/100 is the number of disclosures.
for source_col in ['perc_pw','perc_sw','perc_gw','perc_other']:
    gb[source_col] = gb[source_col]/gb.ws_perc_total*100
gb['num_disclosures'] = gb.ws_perc_total/100
gb = gb.sort_values('num_disclosures', ascending=False)
gb[['StateName','num_disclosures','perc_sw','perc_gw','perc_pw','perc_other']].rename({'perc_sw':'% Surface Water',
                                                                                      'perc_gw':'% Ground Water',
                                                                                      'perc_pw':'% Produced Water',
                                                                                      'perc_other':'% Other'},axis=1)


<a id='chems'></a>
## Recognized Chemicals of Concern used in these disclosures
The following table is based on lists of chemicals that have health or environmental effects: SDWA, CWA, Proposition 65, PFAS and TEDX. 

In [None]:
# Chemicals-of-concern table.
# Each record gets an 'on_list' string naming every reference list it is
# flagged on; the labels are appended in the order given here.
list_flags = [
    ('is_on_CWA', 'CWA; '),
    ('is_on_DWSHA', 'DWSHA; '),
    ('is_on_TEDX', 'TEDX; '),
    ('is_on_prop65', 'prop65; '),
    ('is_on_PFAS_list', 'PFAS; '),
    # ('is_on_volatile_list', 'Volatile; '),  # currently disabled
]
newdf['on_list'] = ''
for flag_col, list_label in list_flags:
    newdf['on_list'] = np.where(newdf[flag_col], newdf['on_list'] + list_label, newdf['on_list'])

# Records on at least one list, grouped by chemical.
concern_by_cas = newdf[newdf['on_list'] != ''].groupby('bgCAS', as_index=False)

chemconcern = concern_by_cas[['comm_name']].first().rename(columns={'comm_name': 'Chemical Name'})
gb1 = concern_by_cas[['IngredientName']].count().rename(columns={'IngredientName': 'record count'})
gb2 = concern_by_cas[['on_list']].first()
gb3 = concern_by_cas[['calcMass']].max().rename(columns={'calcMass': 'Largest Mass'})
gb4 = concern_by_cas[['perc_rank']].max().rename(columns={'perc_rank': '% Rank of Largest Mass'})

for per_cas_summary in (gb1, gb2, gb3, gb4):
    chemconcern = pd.merge(chemconcern, per_cas_summary, on='bgCAS', how='left')

# Sequential chemical IDs: c01, c02, ...
l = ['c' + str(position + 1).zfill(2) for position in range(len(chemconcern))]
chemconcern['CID'] = pd.Series(l)
chemconcern['CAS Number'] = chemconcern['bgCAS'].map(lambda cas: th.getCatLink(cas, cas, use_remote=True))

print(f'Total number of records on Chemical of Concern list for these disclosures: {chemconcern["record count"].sum()}')

shown_columns = ['CID', 'CAS Number', 'Chemical Name', 'record count',
                 'Largest Mass', '% Rank of Largest Mass',
                 'on_list']
iShow(chemconcern[shown_columns].reset_index(drop=True),
      index=False, maxBytes=0, classes="display compact cell-border")

|Explanation of columns in the table above|
| :---: |


| Column      | Description |
| :----: | :-------- |
|*CID*| is the chemical ID used in the disclosure table | 
|*CAS Number*| is the CAS registration number of the chemical. Click on the number for a summary of this chemical for all of FracFocus.|  
|*Chemical Name*| is one of the common names for the material|
|*record count* |indicates the number of records in this week's disclosures. A given chemical may appear more than once in a disclosure|
|*Largest mass*| is the single greatest mass of the chemical in this week's disclosures (in pounds)|
|*% Rank of largest Mass*|how the largest mass from this week compares to the rest of FracFocus data (through last update). Ex. 95.0 means that this mass was in the top 5% of all uses ever recorded in FracFocus. If there are records with values above 90%, a list of all such disclosures is in a [table](#bigcomp) later in this report.|
|*on_list*|indicates which of the following lists the chemical is on.  Type the name of the list into the Search box to limit to those chemicals.<br> - **CWA**: indicates that the chemical is on the [Clean Water Act list](https://comptox.epa.gov/dashboard/chemical_lists/CWA311HS) as compiled in EPA's CompTox<br>- **DWSHA**: indicates that the chemical is on the EPA's [Drinking Water Safety and Health Advisory](https://comptox.epa.gov/dashboard/chemical_lists/EPADWS) list<br>- **TEDX**: indicates that the chemical is on [The Endocrine Disruption Exchange](https://endocrinedisruption.org) list<br>- **prop65**: indicates that the chemical is on [California's Proposition 65](https://oehha.ca.gov/proposition-65/proposition-65-list) list<br>- **PFAS**: indicates that the chemical is on EPA's comprehensive list of PFAS related compounds|



In [None]:
# Flag records whose identity is withheld.
newdf['proprietary'] = newdf['bgCAS'].eq('proprietary')

# Sum of PercentHFJob per disclosure, with a flag for sums strictly between 90 and 110.
gb1 = newdf.groupby('DisclosureId', as_index=False)['PercentHFJob'].sum()
gb1['in_perc_tolerance'] = (gb1['PercentHFJob'] > 90) & (gb1['PercentHFJob'] < 110)

# Total calcMass of proprietary records per disclosure.
gb2 = (
    newdf[newdf['proprietary']]
    .groupby('DisclosureId', as_index=False)['calcMass'].sum()
    .rename(columns={'calcMass': 'prop_mass'})
)

# One row per disclosure carrying its descriptive fields (first non-null value of each).
disclosure_fields = ['CountyName', 'StateName',
                     'OperatorName', 'APINumber',
                     'TotalBaseWaterVolume', 'ws_perc_total', 'TVD',
                     # 'FederalWell','IndianWell',
                     'date', 'map_link', 'FF_disc']
gb = newdf.groupby(['DisclosureId'], as_index=False)[disclosure_fields].first()
gb = (
    gb.merge(gb1[['DisclosureId', 'PercentHFJob']], on='DisclosureId', how='left')
      .merge(gb2, on='DisclosureId', how='left')
)


In [None]:
# Attach each record's chemical-of-concern ID; records of other chemicals get NaN.
cid_lookup = chemconcern[['bgCAS', 'CID']]
newdf = newdf.merge(cid_lookup, on='bgCAS', how='left')

In [None]:
def sort_id(st):
    """Return the items of ``st`` as a new list in ascending order."""
    return sorted(st)
# Sorted list of chemical-of-concern IDs present in each disclosure.
concern_cas = chemconcern['bgCAS'].tolist()
concern_records = newdf[newdf['bgCAS'].isin(concern_cas)]
weekchem = concern_records.groupby(['DisclosureId'])['CID'].apply(set).reset_index()
weekchem['CID'] = weekchem['CID'].map(sort_id)

# The CID column on newdf was only needed for the grouping above.
newdf = newdf.drop(columns='CID')

# Per-disclosure summary (gb) with its CID list attached.
disc = gb.merge(weekchem, on='DisclosureId', how='left')

<a id='proprietary'></a>
# Proprietary labeling 
FracFocus allows disclosures to hide the identity of chemicals that companies claim are business secrets.  This practice has been controversial since the beginning of FracFocus.  Some changes in format (the 'system approach') were made to purportedly reduce the use of proprietary claims.  Nevertheless, they are still commonly employed.  The following summarizes how much the past week's disclosures hid chemical identity with these claims. A detailed disclosure listing later in this report specifies the percentage of chemical records that are hidden by these claims for every disclosure.  They range from zero percent to 70% or even more.   

As of version 10, Open-FF detects proprietary claims by both the CASNumber and IngredientName fields.  


In [None]:
# Count of records flagged proprietary, shown as a heading followed by a rule.
numprop = newdf['proprietary'].sum()
for md_text in (
    f'## Total number of proprietary claims since {last_report}: {numprop:,} records',
    '---',
):
    display(md(md_text))

In [None]:
# def proprietary_bars(df,plot_title='TEST'):
#     df = df.copy()
#     df['year'] = df.date.dt.year
#     prop = df.bgCAS=='proprietary'
#     gb = df[prop].groupby('DisclosureId',as_index=False)['bgCAS'].count().rename({'bgCAS':'numprop'},axis=1)
#     gb1 = df[df.is_valid_cas].groupby('DisclosureId',as_index=False)['bgCAS'].count().rename({'bgCAS':'numvalid'},axis=1)
#     gb2 = df.groupby('DisclosureId',as_index=False)['date'].first()
#     mg = pd.merge(gb2,gb,on='DisclosureId',how='left')
#     mg = pd.merge(mg,gb1,on='DisclosureId',how='left')
#     mg.fillna(0,inplace=True) # there will be disclosures with 0 proprietary; need to fillb
#     mg['percProp'] = (mg.numprop / mg.numvalid) * 100

#     mg['propCut'] = pd.cut(mg.percProp,right=False,bins=[0,0.0001,10,25,50,101],
#                           labels=['no proprietary designations','up to 10% of records\nare proprietary designations',
#                                   'between 10 and 25% of records\nare proprietary designations',
#                                   'between 25 and 50% of records\nare proprietary designations',
#                                   'greater than 50% of records\nare proprietary designations'])
    
#     # mg.propCut.value_counts(sort=False).plot(kind='barh',colormap='Reds')
    
#     import seaborn as sns
#     t = mg.propCut.value_counts(sort=False).reset_index()
#     totcnt = t.propCut.sum()
#     t['prop_perc'] = t.propCut/totcnt *100
#     ax = sns.barplot(data=t,y='index',x='propCut',palette='Reds',orient="h")
#     ax.set_xlabel("Number of disclosures")
#     ax.set_ylabel("")
#     ax.set_title(plot_title)
#     # ax.set_xlim(right=58000)
#     ax.invert_yaxis()
    
#     perc_lst = t.prop_perc.tolist()
#     for i,p in enumerate(ax.patches):
#         width = p.get_width()
#         #nw = f'  {round_sig(width,8)}'
#         nw = f'  {float(round(perc_lst[i],1))}%'
#         plt.text(p.get_width(), p.get_y()+0.55*p.get_height(),
#                  nw,
#                  ha='left', va='center',fontsize=12)

# Per-disclosure counts of proprietary records and of records with a valid CAS,
# combined into a percentage and passed to the shared bar-plot helper.
prop = newdf['bgCAS'] == 'proprietary'
gb = (
    newdf.loc[prop]
    .groupby('DisclosureId', as_index=False)['bgCAS'].count()
    .rename(columns={'bgCAS': 'numprop'})
)
gb1 = (
    newdf.loc[newdf['is_valid_cas']]
    .groupby('DisclosureId', as_index=False)['bgCAS'].count()
    .rename(columns={'bgCAS': 'numvalid'})
)
gb2 = newdf.groupby('DisclosureId', as_index=False)['date'].first()

mg = gb2.merge(gb, on='DisclosureId', how='left').merge(gb1, on='DisclosureId', how='left')
# Disclosures without any proprietary record have no row in gb; their missing count becomes 0.
mg = mg.fillna(0)
mg['perc_proprietary'] = mg['numprop'].div(mg['numvalid']).mul(100)


c_plots.proprietary_bars(mg, plot_title='How heavily are Trade Secrets used in these new disclosures?')

In [None]:
# proprietary summaries
# A record counts as 'clean' when its CAS is valid and it is not proprietary.
newdf['clean_cas'] = newdf['is_valid_cas'] & newdf['bgCAS'].ne('proprietary')

# Per disclosure: proprietary records as a share of proprietary + clean records.
pgb = newdf.groupby('DisclosureId', as_index=False)[['proprietary', 'clean_cas']].sum()
identified_or_hidden = pgb['clean_cas'] + pgb['proprietary']
pgb['fraction_prop'] = pgb['proprietary'] / identified_or_hidden
pgb['percent_proprietary'] = pgb['fraction_prop'] * 100
#print(pgb.percent_proprietary)
# ax = pgb.percent_proprietary.hist()
# ax.set_ylabel('Number of disclosures')
# ax.set_xlabel('Percent of records that are proprietary')
# ax.set_title('Percent of records in a disclosure that are claimed as "proprietary"',fontsize=15);

In [None]:
# # proprietary summaries by state
# sts = newdf.StateName.unique().tolist()
# sts.sort()
# newdf['cas_stat'] = np.where(newdf.proprietary,'proprietary','other')
# newdf['cas_stat'] = np.where(newdf.clean_cas,'chem identified',newdf.cas_stat)
# if len(sts)>4:
#     griddim = (ceil(len(sts)/4),4)
#     emp = griddim[0]*griddim[1] - len(sts)

#     fig, axes = plt.subplots(ceil(len(sts)/4), 4, figsize=(15, 10))
#     for i,st in enumerate(sts):
#         ax = axes[i // 4, i % 4]
#         cond = (newdf.StateName==st)&(~(newdf.cas_stat=='other'))
#         newdf[cond].groupby(['cas_stat'])['DisclosureId'].count().plot.pie(ylabel='',fontsize=10,
#                                                                       ax = ax, title=st)
#     if emp>0:
#         for i in range(emp):
#             fig.delaxes(axes.flatten()[-(i+1)])    
# else:
#     fig,axes = plt.subplots(1,len(sts),figsize=(10,10))
#     for i,st in enumerate(sts):
#         ax = axes[i]
#         cond = (newdf.StateName==st)&(~(newdf.cas_stat=='other'))
#         newdf[cond].groupby(['cas_stat'])['DisclosureId'].count().plot.pie(ylabel='',fontsize=10,
#                                                                       ax = ax, title=st)


In [None]:
# <a id='proprietary_names'></a>
# ## Ingredient names used for proprietary records for this report's disclosures
# Because these are proprietary claims, the reported ingredient name will not be specific, usually just a general class.

# In some cases, companies do not report the PercentHFJob or report 0.0% for proprietary records. We use 0.0 pounds for those records in the following table.

In [None]:
# newdf['ing'] = newdf.IngredientName.str.strip().str.lower()
# tmp = pd.DataFrame(newdf[newdf.proprietary].value_counts('ing'))
# promass = newdf[newdf.proprietary].groupby('ing',as_index=False)['calcMass'].sum()
# promass.calcMass = promass.calcMass.astype('int')
# tmp = pd.merge(tmp,promass,on='ing',how='left')
# tmp = tmp.reset_index(drop=True)
# propops = newdf[newdf.proprietary].groupby('ing')['bgOperatorName'].apply(set)
# tmp = pd.merge(tmp,propops,on='ing',how='left')
# tmp.bgOperatorName = tmp.bgOperatorName.map(lambda x: th.xlate_to_str(x,'; ',trunc=True,tlen=20,maxlen=10))
# tmp = tmp.sort_values([0,'calcMass'],ascending=False)
# tmp.columns=['Reported Ingredient Name','Num of records','total proprietary mass (pounds)','Operators using this label']
# #tmp.style.hide_index().format({'total proprietary mass (pounds)':"{:,}"})
# tmp = tmp.reset_index(drop=True)

# opt.classes = ['display','compact']
# opt.columnDefs=[{"width": "1200px", "targets": "Num of records"}]
# iShow(tmp,maxBytes=0)

---
# Detailed disclosure listings

### Notes:

**Listing of chemicals in individual disclosures:**  In the table below, individual fracking disclosures are identified by APINumber.  If you are interested in seeing the details of the raw data, use that APINumber at the FracFocus ["Find a Well" site](http://fracfocusdata.org/DisclosureSearch/Search.aspx).  That search site will serve pdf files of individual fracking events to your computer, with most of the same raw data used here. However, the mass of chemicals is not available from these pdfs.

**Maps:**  In the table below, a link is provided to a Google map/satellite view of the location provided in the disclosure (click on the APINumber).   Many recently published fracking sites are **newer** than the satellite image that Google uses, so the well pad may not be visible.  However, you can still view the geographic context of the drilling site.

<a id='detailed'></a>
## Listing of all new disclosures published since last report

|Column descriptions and some issues to look for |
| :--- |

- **FID** - Fracking job ID number (used in following table to identify heavy use of chemical)
- **Water volume** - volume in gallons of the water base fluid used in a fracking job
  - Especially large fracking carrier (>30 million gallons)
  - No data or a report of ZERO gallons of water. This prevents the calculation of chemical mass, though direct reporting may still be available
- **TVD** - True vertical depth (feet); Some companies may be reporting both vertical and horizontal lengths in this number.
  - No data or a report of ZERO feet.
  - Deeper than deepest oil well (35,050 ft.): likely error
- **Percent proprietary** - proportion of chemical records for which identity is hidden
- **End date** - Last day of job
  - many states require disclosure within 30 to 90 days. 
  - occasionally a reported end date is in the future, indicating mis-representation
- **Total percentage ("perc sum")** - the sum of all records in a disclosure should be 100% (within a tolerance)
  - if less than 90%, disclosure is probably incomplete
  - greater than 110% - often caused by a "system approach"  entry error: duplicates of some chemical records or hidden percentages in the tradename/purpose/supplier section. 
- **CIDs in disclosure** - see the Chemicals of Concern table above for more information
- **APINumber** - click on this link to view FracFocus's online disclosure and PDF.
- **map_link.** - Google map view of the reported location. While location data can be wrong, be aware that the well pad may be newer than the Google satellite view and therefore not in the image.

In [None]:
#gb['Water_vol_gallons'] = gb.TotalBaseWaterVolume.map(lambda x: round_sig(x,3))
# disc['Type_well'] = np.where(disc.FederalWell=='True','Federal','')
# disc.Type_well= np.where(disc.IndianWell=='True','Indian',disc.Type_well)
# disc.Type_well= np.where((disc.IndianWell=='True')&(disc.FederalWell=='True'),'Fed & Indian',disc.Type_well)
# gb = gb.drop(['TotalBaseWaterVolume','FederalWell','IndianWell'],axis=1)
# Detailed listing: one row per new disclosure, sorted by state, county and
# operator, each labelled with a sequential fracking-job ID (FID).
gbn = disc.copy()

gbn['datestr'] = gbn.date.apply(lambda x: x.strftime('%Y-%m-%d'))
gbn = pd.merge(gbn,pgb[['DisclosureId','fraction_prop']],on='DisclosureId',how='left')
gbn = gbn.sort_values(['StateName','CountyName','OperatorName']).reset_index()
# True when any water-source percentage was reported for the disclosure.
gbn['ws'] = gbn.ws_perc_total>0
# F001, F002, ... in the sorted order; gbn carries a fresh 0..n-1 index here.
gbn['FID'] = ['F'+str(i+1).zfill(3) for i in range(len(gbn))]
if len(gbn)>0:
    t = gbn[['FID','StateName','CountyName','OperatorName','FF_disc','map_link','datestr',
            'TotalBaseWaterVolume','ws','TVD','PercentHFJob',
            'fraction_prop','CID']].copy()
    t['CID'] = t.CID.map(lambda x: th.xlate_to_str(x))
    # Missing volumes must become 0 before the integer cast. Assign the result
    # back to the column: calling fillna(inplace=True) on t.TotalBaseWaterVolume
    # is a chained in-place update that does not modify t under pandas
    # copy-on-write, which leaves NaN in place and makes astype('int') fail.
    t['TotalBaseWaterVolume'] = t.TotalBaseWaterVolume.fillna(0).astype('int')
    # NOTE(review): astype('int') raises if TVD contains NaN -- confirm TVD is always populated.
    t['TVD'] = t.TVD.astype('int')
    t['PercentHFJob'] = t.PercentHFJob.astype('int')

    # Keys that are not columns of t (e.g. 'pub_delay') are ignored by rename.
    t = t.rename({'StateName':'State',
                  'CountyName':'County',
                  'OperatorName':'Operator',
                  #'APINumber':'API Number',
                  'FF_disc':'API Number (link to FF)',
                  'datestr':'end date',
                  'pub_delay':'pub delay days',
                  'TotalBaseWaterVolume':'Water volume (gal)',
                  'TVD':'TVD (feet)',
                  'PercentHFJob':'perc sum',
                  'map_link':'Google map',
                  'ws':'water source reported',
                  # 'Type_well':'Type Land',
                  'fraction_prop':'fraction proprietary',
                  'CID':'CIDs in disclosure'},
                  #'prop_mass':'proprietary mass pounds'},
                  axis=1)
    iShow(t,maxBytes=0,classes="display compact cell-border")

In [None]:
# Does any chemical of concern have a record ranked above the 90th percentile
# of all FracFocus uses of that chemical?
top_rank = chemconcern['% Rank of Largest Mass'].max()
have_big_concerns = bool(top_rank > 90)

if have_big_concerns:
    # Records of chemicals of concern whose own percentile rank exceeds 90.
    ccbgcas = chemconcern.bgCAS.unique().tolist()
    is_concern_chem = newdf['bgCAS'].isin(ccbgcas)
    is_top_decile = newdf['perc_rank'] > 90
    bigcomp_columns = ['APINumber', 'bgCAS', 'comm_name', 'calcMass', 'perc_rank', 'bgOperatorName', 'FF_disc']
    bigcomp = newdf.loc[is_concern_chem & is_top_decile, bigcomp_columns].copy()


<a id='bigcomp'></a>
## Disclosures with chemicals-of-concern masses in the top 10% of all FracFocus

In [None]:
if not have_big_concerns:
    display(md('## No disclosures with chemicals-of-concern masses in the top 10% of FracFocus'))
else:
    display(md('A disclosure may have more than one chemical record in the top 10% for a given CASNumber. This could be due to more than one Trade-named product using that chemical.'))

    # Attach the FID and operator from the detailed listing.
    # NOTE(review): the join key is APINumber; if one API number occurs in more
    # than one of the new disclosures the rows are repeated per match -- confirm.
    fid_lookup = gbn[['FID', 'APINumber', 'OperatorName']]
    mg = pd.merge(bigcomp, fid_lookup, on='APINumber', how='left')
    bigc = mg.reset_index(drop=True)

    # Positional rename: the seven bigcomp columns followed by FID and OperatorName.
    bigc.columns = ['APINumber_raw', 'CAS Number', 'Name', 'Mass of chemical',
                    'Percent ranking in FracFocus', 'bgOperatorName', 'API Number',
                    'FID', 'Operator']
    shown_columns = ['FID', 'API Number', 'Operator', 'CAS Number', 'Name',
                     'Mass of chemical', 'Percent ranking in FracFocus']
    iShow(bigc[shown_columns],
          maxBytes=0, classes="display compact cell-border")