In [None]:
# Run the shared setup script inside this kernel's namespace.
# NOTE(review): ID_header, set_page_param, and names used further down such as
# ana_set and repo_name are not defined anywhere in this notebook, so they
# presumably come from catalog_common.py — confirm.
%run "catalog_common.py" 
# Emit the page header; the title contains an HTML <br> line break.
# NOTE(review): incl_links=True presumably adds navigation links — confirm against catalog_common.py.
ID_header('FracFocus Scope and<br>Aggregate Stats', incl_links=True)
set_page_param()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import gca, mpl

from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
from itables import show as iShow
import itables.options as opt

# Load the full data set.
# NOTE(review): ana_set and repo_name are not defined in this notebook; they are
# presumably provided by the %run of catalog_common.py in the first cell — confirm.
master_df = ana_set.Full_set(repo=repo_name, outdir='../common/').get_set(verbose=False)

# Keep records dated from 2011 through the current calendar year.  The current
# year comes from pandas so this cell does not depend on a `datetime` name
# being injected from elsewhere.
record_year = master_df.date.dt.year
master_df = master_df[(record_year > 2010) & (record_year <= pd.Timestamp.now().year)]

# Keep only rows flagged as part of the standard filtered set.
master_df = master_df[master_df.in_std_filtered]


def _most_common_name(names):
    """Return the most frequent non-null value in `names`, or NaN if there is none.

    value_counts() drops nulls, so a group whose names are all null yields an
    empty result; indexing it with [0] would raise IndexError.
    """
    counts = names.value_counts()
    return counts.index[0] if len(counts) else np.nan


# comm_name: the most frequently reported IngredientName for each bgCAS.
gb2 = master_df.groupby('bgCAS')['IngredientName'].agg(_most_common_name)
gb2 = gb2.reset_index()
gb2.columns = ['bgCAS', 'comm_name']
master_df = pd.merge(master_df, gb2, on='bgCAS', how='left')


---
# General Stats

The figures below use filtered data (no duplicate disclosures or records) from Jan 1, 2011 to the most recently published data.  Because of publishing delays, the most recent months are probably underrepresented.

**In the early data, roughly 2011 - May 2013, the bulk download does not include chemical records.**  However, the PDF files for that period are still served through ["Find_A-Well"](http://fracfocusdata.org/DisclosureSearch/Search.aspx) and document the reported chemicals.  Some projects have attempted to scrape those chemical records into an organized data set, but the results come with caveats: they may contain records that the industry no longer considers valid, and they may be incomplete because many of the PDFs are poorly formatted and therefore difficult to scrape.

In [None]:
# Weekly number of disclosures, split by the ingKeyPresent flag, which the
# chart title describes as "with" vs "without" chemical records.
# Each disclosure (UploadKey) is reduced to one row before counting per date.
alldf = master_df[master_df.ingKeyPresent].groupby('UploadKey', as_index=False)[['date', 'TotalBaseWaterVolume']].first()
gb = alldf.groupby('date').size()
allwk_sum = gb.resample("W").sum()

alldfv1 = master_df[~master_df.ingKeyPresent].groupby('UploadKey', as_index=False)[['date', 'TotalBaseWaterVolume']].first()
gbv1 = alldfv1.groupby('date').size()
allwk_sumv1 = gbv1.resample("W").sum()

# Colours are set explicitly so the colour key in the title stays correct
# regardless of the active style or colour cycle; a legend repeats the key.
fig, ax = plt.subplots(figsize=(12, 5))
allwk_sum.plot(ax=ax, color='tab:blue', label='with chemical records')
allwk_sumv1.plot(ax=ax, color='tab:orange', label='without chemical records')
ax.set_ylabel('Number of disclosures')
ax.set_title('Orange: Without chemical records; Blue: with chemical records', fontsize=10)
ax.legend()
fig.suptitle('Weekly number of disclosures by end date in bulk download data', fontsize=15);

# TODO: an earlier stacked-area version of this chart (outer merge of the two
# weekly series, then DataFrame.plot.area) was not working and has been removed.


---
# Water use

## Links to data
| Data Set with link | Description |
| :--: | :-- |
| [Water and Sand use](scope/water_sand.csv)|- all locations from 2011 to the last major update in FracFocus<br>- Total base water volume (in gallons)<br>- sand (CASRN: 14808-60-7) mass, for disclosures for which mass is calculable.<br> - "OperatorName" is the field as given in FracFocus.<br>- "bgOperatorName" is a generated field to standardize multiple names for the same company.<br>- "APINumber" is a 14-digit number (as a text string) from FracFocus; early disclosures with only 10 digits are padded with 'XXXX'<br>- "api10" is a simple 10-digit version of the APINumber (as a text string)|


## Gallons used, recorded as TotalBaseWaterVolume
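Y-axis values in the plots below are written out in full with thousands separators.  If a plot instead shows a multiplier in its upper left corner, "1e6" means "multiply y-axis values by 1,000,000" and "1e9" means "multiply by 1,000,000,000".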

In [None]:
# One row per disclosure (UploadKey): its date and reported water volume.
# `alldf` is reused by the water-use cells that follow.
alldf = master_df.groupby('UploadKey', as_index=False)[['date', 'TotalBaseWaterVolume']].first()

# Median across all disclosures falling in each week.  Taking the weekly max of
# daily medians would not match the "median per week" labels on this chart.
allwk_tbwv = alldf.set_index('date')['TotalBaseWaterVolume'].resample("W").median()

ax = allwk_tbwv.plot(figsize=(12, 5), ylabel='Median Water Volume Used By Week', style='o')
ax.set_title('Median water use (gallons) per week', fontsize=18)
# Format on the axes in hand; set_major_formatter returns None, so its result
# must not be assigned back to `ax`.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));


In [None]:
# Total reported water volume per week: sum per date, then sum the dates in each week.
# `alldf` (one row per disclosure) is built in the preceding water-use cell.
gb3 = alldf.groupby('date')['TotalBaseWaterVolume'].sum()
allwk_tbwv_sum = gb3.resample("W").sum()

ax = allwk_tbwv_sum.plot(figsize=(12, 5), ylabel='Total Water Volume Used By Week', style='o')
ax.set_title('Total weekly water use across FracFocus', fontsize=18)
plt.suptitle('Data for most recent months probably reflects publication delays.', fontsize=10)
# Format on the axes in hand; set_major_formatter returns None, so its result
# must not be assigned back to `ax`.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));



In [None]:
# Unfiltered per-date maximum; kept because the following cell reads `gb1`
# to report the implausibly large values.
gb1 = alldf.groupby('date')['TotalBaseWaterVolume'].max()

# Drop implausible volumes per disclosure *before* aggregating.  Filtering the
# daily maxima instead would discard every valid disclosure on a date that
# happens to contain one outlier.  `<=` keeps exactly 100,000,000, matching the
# "over 100,000,000 ... excluded" caption.
plausible = alldf[alldf.TotalBaseWaterVolume <= 100000000]
gb2 = plausible.groupby('date')['TotalBaseWaterVolume'].max()
allwk_tbwv = gb2.resample("W").max()

ax = allwk_tbwv.plot(figsize=(12, 5), ylabel='Max Water Volume Used By Week', style='o')
ax.set_title('Single maximum water use (gallons) per week', fontsize=18)
plt.suptitle('All values over 100,000,000 gallons excluded - they are probably typos, but cannot be sure; see table below.', fontsize=10)
# Format on the axes in hand; set_major_formatter returns None, so its result
# must not be assigned back to `ax`.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));


In [None]:
# List every disclosure above the 100,000,000-gallon cut-off.  Reading from the
# per-disclosure frame (`alldf`, built in the first water-use cell) rather than
# from per-date maxima shows each disclosure's UploadKey and does not hide a
# second oversized disclosure filed on the same date.
oversized = alldf[alldf.TotalBaseWaterVolume > 100000000].sort_values('date')
print('Disclosures over 100 million gallons')
print(oversized[['date', 'UploadKey', 'TotalBaseWaterVolume']].to_string(index=False))

---
# Proppants
Remember, most disclosures from 2011 to mid-2013 do not have chemical records.  Proppants are, therefore, missing for those years in the figures below.

In [None]:
# Weekly total calcMass for records with bgCAS 14808-60-7 (labelled as sand,
# in lbs, on the chart): sum per date, then sum the dates in each week.
gb3 = master_df[master_df.bgCAS == '14808-60-7'].groupby('date')[['calcMass']].sum()
allwk_sand_sum = gb3.resample("W").sum()

# CASRNs grouped as "other proppants"; the next cell reads this list.
otherprop = ['66402-68-4', '1302-93-8', '1302-76-7', '1344-28-1', '1318-16-7', '308075-07-2', '14464-46-1', '1302-74-5']

ax = allwk_sand_sum.plot(figsize=(12, 5), ylabel='Total 14808-60-7 Mass (lbs) By Week', style='o')
ax.set_title('Total weekly sand (CASRN: 14808-60-7) use across FracFocus', fontsize=18)
# Format on the axes in hand; set_major_formatter returns None, so its result
# must not be assigned back to `ax`.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));


In [None]:
# Records whose bgCAS is in the `otherprop` list defined in the sand cell above.
# Selected once and reused for both the chart and the printed list.
other_prop_records = master_df[master_df.bgCAS.isin(otherprop)]

# Weekly total calcMass: sum per date, then sum the dates in each week.
gb3 = other_prop_records.groupby('date')[['calcMass']].sum()
allwk_other_sum = gb3.resample("W").sum()

ax = allwk_other_sum.plot(figsize=(12, 5), ylabel='Total other proppants Mass (lbs) By Week', style='o')
ax.set_title('Total weekly use of other proppants across FracFocus', fontsize=18)
# Format on the axes in hand; set_major_formatter returns None, so its result
# must not be assigned back to `ax`.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

# One row per bgCAS actually present in the data, with its first epa_pref_name.
props = other_prop_records.groupby('bgCAS', as_index=False)['epa_pref_name'].first()
print(f'List of "other proppants" graphed: \n{props}\n')