In [None]:
import sys
sys.path.insert(0,'c:/MyDocs/integrated/') # adjust to your setup

%run "catalog_support.py" 
# Page title (spelling corrected from 'Auxillary').
# showHeader is not defined in this notebook; presumably it is provided by the
# %run of catalog_support.py above — TODO confirm.
showHeader('Auxiliary Data Sets')

In [None]:
# fetch data set
# Load 'workdf.parquet' from the sandbox directory into `df`.
# `fh`, `hndl` and `os` are not defined in this notebook; presumably they are
# provided by the %run of catalog_support.py in the first cell — TODO confirm.
df = fh.get_df(os.path.join(hndl.sandbox_dir,'workdf.parquet'))

In [None]:
# Keep only records dated after 2010 and no later than the current calendar year.
df['year'] = df['date'].dt.year
df = df[df['year'].gt(2010) & df['year'].le(datetime.datetime.now().year)]

# One row per disclosure, carrying the fields used later to match other data sets.
disclosure_cols = ['APINumber', 'year', 'bgStateName', 'ingKeyPresent']
gb = df.groupby('DisclosureId', as_index=False)[disclosure_cols].first()
# api10: the first ten characters of the API number.
gb['api10'] = gb['APINumber'].str.slice(stop=10)

# Number of bulk-data records for each bgCAS value.
bulkgb = (
    df.groupby('bgCAS', as_index=False)
    .size()
    .rename(columns={'size': 'bulk_cnt'})
)

In [None]:
def make_cas_table(bulkgb,altdf):
    """Build a per-chemical summary of an alternate data set.

    Parameters
    ----------
    bulkgb : DataFrame
        Must contain 'bgCAS' and 'bulk_cnt' columns (record counts per bgCAS
        in the bulk data).
    altdf : DataFrame
        Records of the alternate data set; must contain 'bgCAS',
        'bgIngredientName' and 'calcMass' columns.

    Returns
    -------
    DataFrame
        One row per bgCAS found in ``altdf`` with columns:
        bgCAS, bgIngredientName (first value in the group), alt_cnt (number of
        records in ``altdf``), calcMass (group sum passed through
        ``round_sig(x, 3)``), bulk_cnt (from ``bulkgb``; missing when the
        bgCAS is not in ``bulkgb``) and present_in ('alt data only' when the
        bgCAS is absent from ``bulkgb``, otherwise the merge indicator value).
    """
    per_cas = altdf.groupby('bgCAS', as_index=False)

    counts = per_cas.size().rename(columns={'size': 'alt_cnt'})
    names = per_cas['bgIngredientName'].first()
    masses = per_cas['calcMass'].sum()
    masses['calcMass'] = masses['calcMass'].map(lambda mass: round_sig(mass, 3))

    # Left merges throughout: the row set is always the bgCAS values of altdf.
    summary = (
        counts.merge(names, on='bgCAS', how='left')
        .merge(masses, on='bgCAS', how='left')
        .merge(bulkgb, on='bgCAS', how='left', indicator=True)
    )
    summary['present_in'] = np.where(
        summary['_merge'] == 'left_only', 'alt data only', summary['_merge']
    )

    out_cols = ['bgCAS', 'bgIngredientName', 'alt_cnt', 'calcMass', 'bulk_cnt', 'present_in']
    return summary[out_cols].reset_index(drop=True)
    

## Description
This page summarizes data sets that Open-FF has compiled or imported from other sources but that are not directly connected to the FracFocus bulk download and so are not integrated with those data.  

**These data sets often come with caveats; please read and consider them carefully.**  For example, we know that companies can change already published data without justification or notification.  The older auxiliary data sets may therefore contain data that the industry considers out of date.  Nevertheless, those data sets may still be useful for some purposes.

## Access
We currently don't keep copies of these data online. Please contact us if you are interested.  

   
#  Data Sets:
**[FracFocus version 1](#FFV1)**
Scrape of early FracFocus disclosures.  Downloaded in April 2021.

**[SkyTruth archive](#ST)**
Data compiled by SkyTruth in 2011-2013 by scraping early FracFocus disclosures.

**[New Mexico](#NM)**
Scrape of state-held, HTML disclosures.  Downloaded in May 2022. Includes links to the state-held disclosures that are similar to the FracFocus "Find-A-Well" PDFs.

**[Ohio Drilling Chemicals](Ohio_Drilling_Chemicals.html)**
Ohio requires Operators to disclose chemicals added during the *drilling* phase. This separate page provides a window into these data, including a link to all PDF disclosures and a catalog of (scrape-able) CAS numbers reported. Based on work in June 2022.


---
---
<a id='FFV1'></a>
# FracFocus version 1, scraped from PDF files 
## downloaded from FracFocus, April 2021



The FracFocus bulk download data contains about 45,000 disclosures for 2011-2013 that have only "header" data - location, Operator, date, water volume, etc., but not chemical records. These are the so-called FFversion 1.0 disclosures. Even though the PDF versions of these disclosures (served by Find-a-Well) have chemical records, the bulk download versions do not.  Early versions of Open-FF tried to use the SkyTruth archive (elsewhere on this page) to fill that gap, but when we learned that the archive contained outdated values in some disclosures, we removed it.  

This data set is an improvement: In April 2021, at least eight years after the release of FFversion 2, we downloaded all PDFs from the FFv1 period.  We then attempted to scrape those data into a data set.  Unfortunately, a sizable fraction of the FFV1 PDFs are poorly formatted and too difficult to scrape reliably.  Nevertheless, we managed to pull together about 33,000 disclosures.

In [None]:
repo_name = 'FFV1_scrape_2022_09_11' # use this to override catalog_common.py
master_df = ana_set_old.Full_set(repo=repo_name, outdir='../common/').get_set(verbose=False)

# Keep records dated after 2010 and no later than the current year,
# then keep only the rows flagged in_std_filtered.
record_year = master_df['date'].dt.year
master_df = master_df[record_year.gt(2010) & record_year.le(datetime.datetime.now().year)]
master_df = master_df[master_df['in_std_filtered']]

master_df['year'] = master_df['date'].dt.year

# One row per scraped disclosure; api10 is the first ten characters of APINumber.
ffv1gb = master_df.groupby('DisclosureId', as_index=False)[['APINumber', 'year']].first()
ffv1gb['api10'] = ffv1gb['APINumber'].str.slice(stop=10)

## Summary

In [None]:
# Number of scraped disclosures per year (ffv1gb holds one row per DisclosureId).
ffv1gb.year.value_counts()

## Overlap with bulk data

In [None]:
# Outer-join scraped and bulk disclosures on the 10-character API number and year;
# the `_merge` indicator column records which side(s) each row came from.
mg = pd.merge(ffv1gb,gb,on=['api10','year'],how='outer',indicator=True)

# Rows that exist only in the scrape have no ingKeyPresent value; treat them as True.
# The result is assigned back to the column because the chained form
# `mg.ingKeyPresent.fillna(True, inplace=True)` acts on a temporary object under
# pandas Copy-on-Write and would leave `mg` unchanged. The final cast guarantees a
# plain bool column so that `~mg.ingKeyPresent` is a logical NOT.
mg['ingKeyPresent'] = mg['ingKeyPresent'].astype('boolean').fillna(True).astype(bool)

In [None]:
# Label every merged row by where the disclosure was found.
# Conditions are listed in priority order (first match wins); anything not
# matched — bulk-only rows with ingKeyPresent set — gets the default label.
in_both = mg['_merge'] == 'both'
has_chem = mg['ingKeyPresent']
mg['source'] = np.select(
    [
        mg['_merge'] == 'left_only',
        in_both & has_chem,
        in_both & ~has_chem,
        (mg['_merge'] == 'right_only') & ~has_chem,
    ],
    [
        'Scrape only',
        'Scrape & FF with chem',
        'Scrape & FF no chem',
        'FF only, no chem',
    ],
    default='FF with chem',
)

In [None]:
# Stacked bars: number of merged rows per year, split by source label.
source_by_year = mg.groupby(['year', 'source']).size().unstack()
ax = source_by_year.plot.bar(
    stacked=True,
    figsize=(12, 6),
    title='Overlap of scraped disclosures and FF bulk download',
)
ax.set_ylabel('Number of disclosures', fontsize=14);


In [None]:
# One row per disclosure among the records flagged ingKeyPresent.
alldf = (
    master_df.loc[master_df['ingKeyPresent']]
    .groupby('DisclosureId', as_index=False)[['date', 'TotalBaseWaterVolume', 'APINumber']]
    .first()
)

# Disclosures per date, then summed into weekly bins.
gb1 = alldf.groupby('date').size()
allwk_sum = gb1.resample('W').sum()

ax = allwk_sum.plot(figsize=(12, 5), ylabel='Number of disclosures')
ax.set_title(
    'Weekly number of disclosures by fracture date in FFV1 scraped data',
    fontsize=15,
);

## Where are they?

In [None]:

# Count disclosures per (state, county), using only records whose
# loc_within_state value is 'YES'.
located = master_df[master_df['loc_within_state'] == 'YES']
gb1 = located.groupby(
    ['bgStateName', 'bgCountyName', 'DisclosureId'], as_index=False
)['bgCAS'].count()
gb1 = (
    gb1.groupby(['bgStateName', 'bgCountyName'], as_index=False)['DisclosureId']
    .count()
    .rename(columns={'bgStateName': 'StateName',
                     'bgCountyName': 'CountyName',
                     'DisclosureId': 'value'})
)

# NOTE(review): this cell calls the helper as mapping.create_county_choropleth,
# while the SkyTruth and New Mexico map cells call create_county_choropleth
# unqualified — confirm that both spellings are in scope.
mapping.create_county_choropleth(
    gb1,
    plotlog=True,
    custom_scale=[0, 1, 2, 3, 4, 5],
    legend_name='Number of FracFocus disclosures',
    start_zoom=3,
    fields=['StateName', 'CountyName', 'orig_value'],
    aliases=['State: ', 'County: ', 'Number of FF disclosures: '],
)


## Chemical list for this data set
For each chemical, the table includes the number of times it appears in this set, the sum of the masses of its records (when calculable), and whether it is also present in the bulk set.

In [None]:
# Interactive table of the chemicals in the current master_df.
# make_cas_table already returns a frame with a fresh 0..n-1 index.
cas_table = make_cas_table(bulkgb, master_df)
iShow(
    cas_table,
    maxBytes=0,
    columnDefs=[{"width": "100px", "targets": 0}],
    classes="display compact cell-border",
    scrollX=True,
)

---
---
<a id='ST'></a>
# SkyTruth archive, scraped from PDF files 
## downloaded from FracFocus in 2011 - May 2013



Between 2011 and May 2013, the NGO [SkyTruth](https://skytruth.org/) attempted to create a usable data set from the PDFs that were being published on the FracFocus website.  The bulk download option was not available yet.  This archive is interesting in a number of respects:
- the group worked hard to scrape all of the PDFs into a data set even though many PDFs were poorly formed. 
- the data are a snapshot of the PDFs at the time.  We know that the industry has changed some of the data and therefore the current PDFs are different than this archive in some ways.  As far as we know, no one has yet documented what changes were made. However, because of those differences, **it is likely that the industry would consider these data as flawed.**

See the [FFV1 data set](#FFV1) as an alternative or for comparison.

In [None]:
repo_name = 'SkyTruth_2022_09_11' # use this to override catalog_common.py
master_df = ana_set_old.Full_set(repo=repo_name, outdir='../common/').get_set(verbose=False)

# Keep records dated after 2010 and no later than the current year,
# then keep only the rows flagged in_std_filtered.
record_year = master_df['date'].dt.year
master_df = master_df[record_year.gt(2010) & record_year.le(datetime.datetime.now().year)]
master_df = master_df[master_df['in_std_filtered']]

master_df['year'] = master_df['date'].dt.year

# One row per SkyTruth disclosure; api10 is the first ten characters of APINumber.
stgb = master_df.groupby('DisclosureId', as_index=False)[['APINumber', 'year']].first()
stgb['api10'] = stgb['APINumber'].str.slice(stop=10)

## Overlap with bulk data

In [None]:
# Outer-join SkyTruth and bulk disclosures on the 10-character API number and year;
# the `_merge` indicator column records which side(s) each row came from.
mg = pd.merge(stgb,gb,on=['api10','year'],how='outer',indicator=True)

# Rows that exist only in the SkyTruth set have no ingKeyPresent value; treat them
# as True. The result is assigned back to the column because the chained form
# `mg.ingKeyPresent.fillna(True, inplace=True)` acts on a temporary object under
# pandas Copy-on-Write and would leave `mg` unchanged. The final cast guarantees a
# plain bool column so that `~mg.ingKeyPresent` is a logical NOT.
mg['ingKeyPresent'] = mg['ingKeyPresent'].astype('boolean').fillna(True).astype(bool)

In [None]:
# Label every merged row by where the disclosure was found.
# Conditions are listed in priority order (first match wins); anything not
# matched — bulk-only rows with ingKeyPresent set — gets the default label.
in_both = mg['_merge'] == 'both'
has_chem = mg['ingKeyPresent']
mg['source'] = np.select(
    [
        mg['_merge'] == 'left_only',
        in_both & has_chem,
        in_both & ~has_chem,
        (mg['_merge'] == 'right_only') & ~has_chem,
    ],
    [
        'ST only',
        'ST & FF with chem',
        'ST & FF no chem',
        'FF only, no chem',
    ],
    default='FF with chem',
)

In [None]:
# Stacked bars: number of merged rows per year, split by source label.
source_by_year = mg.groupby(['year', 'source']).size().unstack()
ax = source_by_year.plot.bar(
    stacked=True,
    figsize=(12, 6),
    title='Overlap of SkyTruth disclosures and FF bulk download',
)
ax.set_ylabel('Number of disclosures', fontsize=14);


In [None]:
# One row per disclosure among the records flagged ingKeyPresent.
alldf = (
    master_df.loc[master_df['ingKeyPresent']]
    .groupby('DisclosureId', as_index=False)[['date', 'TotalBaseWaterVolume', 'APINumber']]
    .first()
)

# Disclosures per date, then summed into weekly bins.
gb1 = alldf.groupby('date').size()
allwk_sum = gb1.resample('W').sum()

ax = allwk_sum.plot(figsize=(12, 5), ylabel='Number of disclosures')
ax.set_title(
    'Weekly number of disclosures by fracture date in SkyTruth scraped data',
    fontsize=15,
);

## Where are they?

In [None]:

# Count disclosures per (state, county).
# NOTE(review): unlike the FFV1 and New Mexico map cells, this one does not
# restrict master_df to loc_within_state == 'YES' — confirm that is intended.
gb1 = master_df.groupby(
    ['bgStateName', 'bgCountyName', 'DisclosureId'], as_index=False
)['bgCAS'].count()
gb1 = (
    gb1.groupby(['bgStateName', 'bgCountyName'], as_index=False)['DisclosureId']
    .count()
    .rename(columns={'bgStateName': 'StateName',
                     'bgCountyName': 'CountyName',
                     'DisclosureId': 'value'})
)

create_county_choropleth(
    gb1,
    plotlog=True,
    custom_scale=[0, 1, 2, 3, 4, 5],
    legend_name='Number of FracFocus disclosures',
    start_zoom=3,
    fields=['StateName', 'CountyName', 'orig_value'],
    aliases=['State: ', 'County: ', 'Number of FF disclosures: '],
)


## Chemical list for this data set
For each chemical, the table includes the number of times it appears in this set, the sum of the masses of its records (when calculable), and whether it is also present in the bulk set.

In [None]:
# Interactive table of the chemicals in the current master_df.
# make_cas_table already returns a frame with a fresh 0..n-1 index.
cas_table = make_cas_table(bulkgb, master_df)
iShow(
    cas_table,
    maxBytes=0,
    columnDefs=[{"width": "100px", "targets": 0}],
    classes="display compact cell-border",
    scrollX=True,
)

---
---
<a id='NM'></a>
# Scrape of New Mexico state chemical disclosures

New Mexico allowed companies to submit disclosures to either FracFocus or directly to the state where they would be published only on the state websites.  This practice apparently ended in 2018; no state disclosures we could find were newer than early 2018.

These state-held disclosures exist in (at least) two forms. The most public version is available [at an index on an OCD permitting page](https://wwwapps.emnrd.nm.gov/ocd/ocdpermitting/OperatorData/PermitStatusResults.aspx?Type=HFFD).  Clicking on the permit number will take you to a place to download a PDF file of the disclosure.  Unfortunately, almost all of the PDFs there are **image** based (instead of text-based), which makes them very hard and error-prone to scrape. 

Fortunately, another set of these disclosures exists at a different address; these are **HTML** based and so are easier to scrape into a full data set. That is what we've used for this data set. Find links to individual disclosures at the bottom of this section.

In [None]:
repo_name = 'NM_scrape_2022_09_11' # use this to override catalog_common.py
master_df = ana_set_old.Full_set(repo=repo_name, outdir='../common/').get_set(verbose=False)

# Keep records dated after 2010 and no later than the current year,
# then keep only the rows flagged in_std_filtered.
record_year = master_df['date'].dt.year
master_df = master_df[record_year.gt(2010) & record_year.le(datetime.datetime.now().year)]
master_df = master_df[master_df['in_std_filtered']]

master_df['year'] = master_df['date'].dt.year

# One row per New Mexico disclosure; api10 is the first ten characters of APINumber.
nmgb = master_df.groupby('DisclosureId', as_index=False)[['APINumber', 'year']].first()
nmgb['api10'] = nmgb['APINumber'].str.slice(stop=10)


## Overlap with bulk data


In [None]:
# Outer-join the state-held disclosures with the New Mexico rows of the bulk data
# on the 10-character API number and year; the `_merge` indicator column records
# which side(s) each row came from.
mg = pd.merge(nmgb,gb[gb.bgStateName=='new mexico'],on=['api10','year'],how='outer',indicator=True)

# Rows that exist only in the state data have no ingKeyPresent value; treat them
# as True. The result is assigned back to the column because the chained form
# `mg.ingKeyPresent.fillna(True, inplace=True)` acts on a temporary object under
# pandas Copy-on-Write and would leave `mg` unchanged. The final cast guarantees a
# plain bool column so that `~mg.ingKeyPresent` is a logical NOT.
mg['ingKeyPresent'] = mg['ingKeyPresent'].astype('boolean').fillna(True).astype(bool)

In [None]:
# Label every merged row by where the disclosure was found.
# Conditions are listed in priority order (first match wins); anything not
# matched — bulk-only rows with ingKeyPresent set — gets the default label.
mg['source'] = np.select(
    [
        mg['_merge'] == 'left_only',
        mg['_merge'] == 'both',
        (mg['_merge'] == 'right_only') & ~mg['ingKeyPresent'],
    ],
    [
        'state data only',
        'both',
        'FF only, no chem',
    ],
    default='FF bulk only',
)

In [None]:
# Stacked bars: number of merged rows per year, split by source label.
source_by_year = mg.groupby(['year', 'source']).size().unstack()
ax = source_by_year.plot.bar(
    stacked=True,
    figsize=(12, 6),
    title='Overlap of NM state-held disclosures and FF bulk download',
)
ax.set_ylabel('Number of disclosures', fontsize=14);


In [None]:
# One row per disclosure among the records flagged ingKeyPresent.
alldf = (
    master_df.loc[master_df['ingKeyPresent']]
    .groupby('DisclosureId', as_index=False)[['date', 'TotalBaseWaterVolume', 'APINumber']]
    .first()
)

# Disclosures per date, then summed into weekly bins.
gb1 = alldf.groupby('date').size()
allwk_sum = gb1.resample('W').sum()

ax = allwk_sum.plot(figsize=(12, 5), ylabel='Number of disclosures')
ax.set_title(
    'Weekly number of disclosures by fracture date in New Mexico scraped data',
    fontsize=15,
);

## Where are they?

In [None]:

# Count disclosures per (state, county), using only records whose
# loc_within_state value is 'YES'.
located = master_df[master_df['loc_within_state'] == 'YES']
gb1 = located.groupby(
    ['bgStateName', 'bgCountyName', 'DisclosureId'], as_index=False
)['bgCAS'].count()
gb1 = (
    gb1.groupby(['bgStateName', 'bgCountyName'], as_index=False)['DisclosureId']
    .count()
    .rename(columns={'bgStateName': 'StateName',
                     'bgCountyName': 'CountyName',
                     'DisclosureId': 'value'})
)

# Start the map at the location returned for New Mexico, zoomed in further
# than the nationwide maps above (start_zoom 6 rather than 3).
# NOTE(review): the legend and tooltip say "FracFocus"/"FF" disclosures, but
# master_df here holds the scraped state-held disclosures — confirm the wording.
start_loc = get_state_center('new mexico')
create_county_choropleth(
    gb1,
    plotlog=True,
    custom_scale=[0, 1, 2, 3, 4, 5],
    start_loc=start_loc,
    legend_name='Number of FracFocus disclosures',
    start_zoom=6,
    fields=['StateName', 'CountyName', 'orig_value'],
    aliases=['State: ', 'County: ', 'Number of FF disclosures: '],
)


## Chemical list for this data set
For each chemical, the table includes the number of times it appears in this set, the sum of the masses of its records (when calculable), and whether it is also present in the bulk set.

In [None]:
# Interactive table of the chemicals in the current master_df.
# make_cas_table already returns a frame with a fresh 0..n-1 index.
cas_table = make_cas_table(bulkgb, master_df)
iShow(
    cas_table,
    maxBytes=0,
    columnDefs=[{"width": "100px", "targets": 0}],
    classes="display compact cell-border",
    scrollX=True,
)

## Links to the New Mexico state-held disclosure forms



In [None]:
# Table of links to the individual state-held disclosure records.
# NOTE(review): the CSV is read from an absolute, machine-specific path; this
# cell will fail on any other setup unless the path is adjusted.
nmtab = pd.read_csv(
    r"C:\MyDocs\OpenFF\src\openFF-catalog\NM_meta_links.csv",
    quotechar='$',
    encoding='utf-8',
)

# Every '_' in the stored link text is replaced by ','
# (presumably commas were encoded as underscores when the CSV was written — TODO confirm).
nmtab['link'] = nmtab['link'].str.replace('_', ',')
nmtab['Link'] = nmtab['link'].map(lambda url: wrapLink(url, 'Link to NM record'))
nmtab['date'] = pd.to_datetime(nmtab['date'])

display_cols = ['APINumber', 'CountyName', 'date', 'OperatorName', 'Link']
iShow(
    nmtab[display_cols],
    maxBytes=0,
    classes="display compact cell-border",
)