In [None]:
import sys
sys.path.insert(0,'c:/MyDocs/integrated/') # adjust to your setup

%run "catalog_support.py" 

# Length of the reporting window, in days; used below to compute the earliest
# archive date that is included in this report.
Numdays = 90

# Render the page header (showHeader is presumably provided by catalog_support.py
# via the %run above -- confirm).
showHeader('Raw Disclosures',line2=f'{Numdays} days of FracFocus changes',use_remote=True)

In [None]:
import datetime

# Date of the last Open-FF data set generation; hard-coded, so it must be
# edited by hand whenever a new data set is generated.
last_repo = datetime.datetime(year=2024,month=7,day=18)
today = datetime.datetime.today()
# Earliest download date that is still shown (today minus Numdays days).
edate = today - datetime.timedelta(days=Numdays)
# print('earliest shown',edate)

In [None]:
import os
import pickle
import openFF.common.text_handlers as th
arc_dir = r"C:\MyDocs\integrated\openFF_archive\diff_dicts"

# Gather every archived "diff dictionary" whose download date falls inside the
# reporting window (tdate >= edate) and collect:
#   added / changed / removed : lists of DataFrames, one per archive file, each
#                               tagged with 'date_changed' and 'change_type'
#   casing                    : set of (CASNumber, normalised IngredientName) tuples
#   operator                  : set of operator names seen in the diffs
# os.listdir returns names in arbitrary order; sort so the run is reproducible.
diff_fns = sorted(os.listdir(arc_dir))
download_dates = []
added = []
changed = []
removed = []
casing = set()
operator = set()

# (key in the pickled dict, label written to 'change_type', list that collects it)
disc_groups = [('removed_disc', 'removed', removed),
               ('added_disc', 'added', added),
               ('changed_disc', 'modified', changed)]

for fn in diff_fns:
    # The date is sliced positionally out of the file name:
    # fn[10:14] = year, fn[15:17] = month, fn[18:20] = day.
    try:
        tdate = datetime.datetime(int(fn[10:14]),int(fn[15:17]),int(fn[18:20]))
    except ValueError:
        # Not a dated diff file (e.g. an OS-generated file such as desktop.ini);
        # skip it instead of aborting the whole notebook.
        continue
    if tdate < edate:
        continue
    download_dates.append(fn[10:20])
    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # because the archive is expected to be produced locally by trusted code.
    with open(os.path.join(arc_dir,fn),'rb') as f:
        diff_dic = pickle.load(f)
    for key, label, collector in disc_groups:
        if len(diff_dic[key])>0:
            t = diff_dic[key].copy()
            t['date_changed'] = tdate
            t['change_type'] = label
            collector.append(t)
    if len(diff_dic['casing'])>0:
        for item in diff_dic['casing']:
            # A missing ingredient name becomes ''; others are stripped and lower-cased.
            ig = '' if item[1] is None else item[1].strip().lower()
            casing.add((item[0],ig))
    if len(diff_dic['OperatorName'])>0:
        operator.update(diff_dic['OperatorName'])

alllists = added + changed + removed
if not alllists:
    # pd.concat would raise an opaque "No objects to concatenate" ValueError here.
    raise ValueError(f'No added, modified or removed disclosures found in {arc_dir} '
                     f'since {edate:%Y-%m-%d}')
wholeset = pd.concat(alllists,sort=True)


In [None]:
def _daily_change_counts(change_type):
    """Count rows of `wholeset` with the given change_type per download date.

    Returns a pair: the raw per-date counts, and the same counts resampled to
    calendar days with 0.001 added to every day so that a day with zero changes
    can be told apart from a day with no data at all.
    """
    per_download = wholeset[wholeset.change_type==change_type].groupby('date_changed').size()
    per_day = per_download.resample("D").sum() + 0.001
    return per_download, per_day


gb_added, added_sum = _daily_change_counts('added')
gb_removed, removed_sum = _daily_change_counts('removed')
gb_changed, changed_sum = _daily_change_counts('modified')

## Downloads from FracFocus


In [None]:
# Keep only the first whitespace-separated token of JobEndDate and parse it
# as month/day/year.
wholeset['job_end_date'] = pd.to_datetime(wholeset.JobEndDate.str.split().str[0],
                                          format="%m/%d/%Y")

# Link built from the whole row by the text_handlers helper.
wholeset['FF_disc'] = wholeset.apply(th.getFFLink, axis=1)


def _openff_link(row):
    """Build the Open-FF disclosure link for one row of wholeset."""
    return th.getDisclosureLink(APINumber=row.APINumber,
                                disclosureid=row.discID,
                                text_to_show='Open-FF disclosure',
                                use_remote=True,
                                check_if_exists=False)


wholeset['disc_link'] = wholeset.apply(_openff_link, axis=1)

# Blank the Open-FF link for non-removed rows dated on/after last_repo
# (presumably because those disclosures are not in the generated data set yet -- confirm).
cond = (wholeset.date_changed>=last_repo)&(wholeset.change_type!='removed')
wholeset['disc_link'] = np.where(cond,' ',wholeset.disc_link)

# Total base water volume as float, rounded to 5 significant figures for display.
wholeset['TBWV'] = (wholeset.TotalBaseWaterVolume
                    .astype('float64')
                    .map(lambda vol: th.round_sig(vol,5)))


In [None]:
# Quick overview: number of rows per change type ('added', 'modified', 'removed').
wholeset.change_type.value_counts()


In [None]:
# Mark every row whose APINumber occurs more than once in wholeset
# (keep=False flags all occurrences, not just the repeats).
wholeset['has more than one'] = np.where(wholeset.APINumber.duplicated(keep=False),'APINumber dupe','')

In [None]:
# for curated disclosures, get detected flaws

# Disclosure ids present in the change set.
DiDs = wholeset.discID.unique().tolist()
# Record-level flags and the reckey -> DisclosureId lookup, both read from the
# current repository (hndl is presumably provided by catalog_support.py -- confirm).
rec_iss = pd.read_parquet(os.path.join(hndl.curr_repo_dir,'record_issues.parquet'),columns=['r_flags','reckey'])
reck = pd.read_parquet(os.path.join(hndl.curr_repo_pkl_dir,'chemrecs.parquet'),columns=['DisclosureId','reckey'])
# Attach the DisclosureId to every flagged record.
mg = pd.merge(rec_iss,reck,on='reckey',how='left')
# For the disclosures in the change set, collapse the record flags into one set per disclosure.
gb = mg[mg.DisclosureId.isin(DiDs)].groupby('DisclosureId',as_index=False)['r_flags'].apply(set)

def str_from_set(x):
    """Join the string items of `x` into one space-terminated string.

    Items are emitted in sorted order so that the result is the same on every
    run (plain set iteration order varies between interpreter sessions).
    Each item is followed by a single space, so a non-empty result ends with
    a trailing space and an empty collection yields ''.

    Parameters
    ----------
    x : iterable of str
        Typically a set of flag identifiers.

    Returns
    -------
    str
    """
    return ''.join(item + ' ' for item in sorted(x))

# One display string of record-level flags per disclosure.
gb['recstr'] = gb['r_flags'].map(str_from_set)

In [None]:
# Disclosure-level flags for the disclosures in the change set.
dis_iss = pd.read_parquet(os.path.join(hndl.curr_repo_dir,'disclosure_issues.parquet'),columns=['d_flags','DisclosureId'])
dis_iss = dis_iss[dis_iss.DisclosureId.isin(DiDs)]
# Outer merge keeps disclosures that have only record-level or only
# disclosure-level flags; the missing side is then filled with ''.
gb = gb.merge(dis_iss,on='DisclosureId',how='outer')
gb = gb.fillna('')
# Combined flag string; because of the ' ' separator a disclosure with no flags
# of one kind still gets at least one space in 'issues'.
gb['issues'] = gb.d_flags +' '+gb.recstr
# NOTE(review): re-running this cell merges 'issues' a second time and would
# produce suffixed columns -- run it once per kernel.
wholeset = wholeset.merge(gb[['DisclosureId','issues']],left_on='discID',right_on='DisclosureId',how='left')
# Disclosures without any detected issue get an empty string.
wholeset.issues = wholeset.issues.fillna('')

## watch list summary
See bottom of page for whole list

In [None]:
# Load the watch list from the FF_issues GitHub repository (needs network access).
# The commented-out lines are the earlier CSV-based loader and its date parsing.
# url = 'https://raw.githubusercontent.com/gwallison/FF_issues/master/watch_list.csv'
# wdf = pd.read_csv(url,dtype = {'APINumber':'str'})
url = 'https://raw.githubusercontent.com/gwallison/FF_issues/master/watch_list_master.parquet'
wdf = pd.read_parquet(url)
# Rename the watch list's own id column before merging with wholeset
# (presumably to avoid a clash with wholeset's DisclosureId column -- confirm).
wdf = wdf.rename({'DisclosureId':'wl_DisclosureId'},axis=1)
# wdf.date_entered = pd.to_datetime(wdf.date_entered,format='%m/%d/%y')
# wdf.FF_report_date = pd.to_datetime(wdf.FF_report_date,format='%m/%d/%y')
# wdf.Blog_date = pd.to_datetime(wdf.Blog_date,format='%m/%d/%y')
# wdf.FF_updates = pd.to_datetime(wdf.FF_updates,format='%m/%d/%y')

# NOTE(review): apis is not used anywhere else in the visible notebook.
apis = wdf.APINumber.unique().tolist()

# Inner join: keep only changed disclosures whose APINumber is on the watch list.
watchlist_found = pd.merge(wdf,wholeset,on='APINumber',how='inner')
# Summary: number of matches per watch-list name and change type.
watchlist_found[['wl_name','change_type']].value_counts()

## Pattern of new disclosure additions
These disclosures are detected as new because their `DisclosureId` number hasn't been in the database before.  Note that it is possible that they are a new version of a previously published disclosure; sometimes operators change disclosures by removing the old one from FracFocus and creating a new one.

In [None]:
# print(download_dates)
import warnings
warnings.filterwarnings("ignore")
import calplot
calplot.calplot(added_sum,  cmap='Spectral_r');


- **Blue line** = Cumulative new disclosures added (may include replacements for removed disclosures)
- **Orange line** = New disclosures with detected issues
- **Vertical dashed line** = date of last Open-FF data set generation

In [None]:
# Rows for newly added disclosures only.
t = wholeset.loc[wholeset.change_type=='added']

# First line: running total of all added disclosures per download date.
gb = t.groupby('date_changed',as_index=False).size()
gb = gb.assign(cs=gb['size'].cumsum())[['date_changed','cs']].set_index('date_changed')
ax = gb['cs'].plot(title='Number of new disclosures',
                   ylabel='cumulative disclosures', xlabel='date changed')

# Second line: running total of added disclosures that carry at least one issue
# (an 'issues' string longer than one character).
flagged = t.loc[t.issues.str.len()>1]
gb = flagged.groupby('date_changed',as_index=False).size()
gb = gb.assign(with_issues=gb['size'].cumsum())[['date_changed','with_issues']].set_index('date_changed')
ax = gb['with_issues'].plot(ax=ax)

# Vertical marker at the date of the last Open-FF data set generation.
ax.axvline(last_repo, color="green", linestyle="dashed");


In [None]:
# Table of added disclosures with reader-friendly column names.
added_renames = {'job_end_date':'job end date','date_changed':'date added',
                 'change_type':'change type'}
added_columns = ['FF_disc','disc_link','job end date','StateName','CountyName','OperatorName',
                 'TBWV','date added','has more than one','issues']
show_whole = wholeset[wholeset.change_type=='added'].rename(columns=added_renames)
show_whole[added_columns].reset_index(drop=True)

### Removed disclosures

In [None]:
# removed_sum carries a 0.001 baseline per day, so compare against 0.5 rather
# than 0 to decide whether any disclosure was actually removed.
if removed_sum.sum()>0.5:
    calplot.calplot(removed_sum, cmap='Spectral_r');
else:
    display(md('#### No removed disclosures found'))

In [None]:
# Interactive table of removed disclosures; skipped when nothing was removed
# (threshold 0.5 because removed_sum includes a 0.001 per-day baseline).
if removed_sum.sum() > 0.5:
    removed_renames = {'job_end_date':'job end date','date_changed':'date removed',
                       'change_type':'change type'}
    removed_columns = ['FF_disc','disc_link','job end date','StateName','CountyName','OperatorName',
                       'TBWV','date removed','has more than one','issues']
    show_whole = wholeset[wholeset.change_type=='removed'].rename(columns=removed_renames)
    iShow(show_whole[removed_columns].reset_index(drop=True))

### Modified disclosures

In [None]:
# Calendar heat map of modified disclosures, or a short notice when there are none.
if changed_sum.sum()>0.5:  #0.5 to account for the 0.001 baseline
    calplot.calplot(changed_sum, cmap='Spectral_r');
else:
    display(md('#### No modified disclosures detected'))

In [None]:
# Interactive table of modified disclosures; skipped when nothing was modified
# (threshold 0.5 because changed_sum includes a 0.001 per-day baseline).
if changed_sum.sum() > 0.5:
    modified_renames = {'job_end_date':'job end date','date_changed':'date modified',
                        'change_type':'change type'}
    modified_columns = ['FF_disc','disc_link','job end date','StateName','CountyName','OperatorName',
                        'TBWV','date modified','has more than one','issues']
    show_whole = wholeset[wholeset.change_type=='modified'].rename(columns=modified_renames)
    iShow(show_whole[modified_columns].reset_index(drop=True))

In [None]:
# get list of *reported* issues
def add_to_set(s, iset):
    """Add every whitespace-separated token of `s` to `iset`.

    The set is updated in place and also returned, so the call can be used
    either for its side effect or as an expression.
    """
    iset.update(s.split())
    return iset

# Every distinct issue token that occurs in any row's 'issues' string.
iset = set()
for issue_text in wholeset['issues']:
    iset = add_to_set(issue_text, iset)

### Issues list

In [None]:
import FF_issues.process_master_files as pmf
# Build the master table of issue definitions; the cells below read its
# Flag_id, Title and Warning_level columns.
pobj = pmf.Process_Master_Files()
df = pobj.process_obj()


#### Disclosure-level Issues

In [None]:
# Disclosure-level issue definitions (Flag_id starting with 'd') that were
# actually reported for a disclosure in the change set (Flag_id in iset).
c = df.Flag_id.str[0]=='d'
c1 = df.Flag_id.isin(iset)
t = df[c&c1].copy()
# Link to the description of each flaw.
t['flaw_link'] = t.Flag_id.map(lambda x: th.getFlawLink(x))
t[['Title','flaw_link','Warning_level']]

#### Record-level Issues

In [None]:
# Record-level issue definitions (Flag_id starting with 'r') that were
# actually reported for a disclosure in the change set (Flag_id in iset).
c = df.Flag_id.str[0]=='r'
# Computed here rather than reused from another cell, so this cell does not
# depend on hidden kernel state.
c1 = df.Flag_id.isin(iset)
t = df[c&c1].copy()
# Link to the description of each flaw.
t['flaw_link'] = t.Flag_id.map(lambda x: th.getFlawLink(x))
t[['Title','flaw_link','Warning_level']]

## New CASNumber : IngredientName pairs

In [None]:
# Are there new casing?
# Compare the (CASNumber, IngredientName) pairs seen in the diffs with the pairs
# already known to the repository (fh is presumably provided by catalog_support.py -- confirm).
repo_casing = fh.get_casing_df()
changed_casing = pd.DataFrame(casing,columns=['CASNumber','IngredientName'])
# indicator=True adds a _merge column; 'left_only' marks pairs absent from the repository.
# NOTE(review): IngredientName in `casing` was stripped and lower-cased when collected;
# verify that repo_casing uses the same normalisation, otherwise known pairs show up as new.
mg = pd.merge(changed_casing,repo_casing,on=['CASNumber','IngredientName'], how='left',indicator=True)
mg[mg._merge=='left_only'][['CASNumber','IngredientName']]

In [None]:
# Are there new Operators?
# An operator name is "new" when it is not among the raw names already known
# to the repository.
repo_companies = fh.get_company_df()
complst = repo_companies.rawName.tolist()
newcomp = sorted(op for op in operator if op not in complst)
if newcomp:
    display(md('## New Operator names detected'))
    for item in newcomp:
        display(md(f'##### {item}'))
else:
    display(md('### No new operator names detected'))

___
## Watch list
Wells with changed disclosures that were previously detected with problems

In [None]:
# Full watch-list match table (built in the watch list summary cell above).
watchlist_found