In [None]:
# Put the local project directory at the front of the import path.
import sys
sys.path.insert(0,'c:/MyDocs/integrated/') # adjust to your setup

# Execute the shared support script with the IPython %run magic.
# NOTE(review): later cells use names that are never defined in this notebook
# (pd, np, plt, fh, md, showHeader) -- presumably this script provides them;
# confirm against catalog_support.py.
%run "catalog_support.py" 

# Length, in days, of the look-back window this notebook reports on.
Numdays = 60

# Presumably renders the page banner; the second line states the window length.
showHeader('Raw Disclosures',line2=f'{Numdays} days of FracFocus changes',use_remote=True)

In [None]:
import datetime

# Date of the last Open-FF dataset generation (drawn as the vertical line in
# the plot below). Hard-coded, so it has to be edited by hand after each build.
last_repo = datetime.datetime(year=2024,month=4,day=12)
today = datetime.datetime.today()
# Start of the reporting window. It keeps the current time of day, while the
# archive dates it is compared with are midnight values, so the window covers
# today plus the previous Numdays-1 calendar dates.
edate = today - datetime.timedelta(days=Numdays)
print('earliest shown',edate)

In [None]:
import os
import pickle
import openFF.common.text_handlers as th
arc_dir = r"C:\MyDocs\integrated\openFF_archive\diff_dicts"  # adjust to your setup

# os.listdir returns names in arbitrary order; sort them so download_dates and
# the concatenated rows come out in a stable order.
diff_fns = sorted(os.listdir(arc_dir))
download_dates = []
added = []
changed = []
removed = []
casing = set()    # (CASNumber, normalized IngredientName) pairs seen in the window
operator = set()  # OperatorName values seen in the window

# (key in the pickled diff dict, change_type label, list collecting the frames)
disclosure_groups = (
    ('removed_disc', 'removed', removed),
    ('added_disc', 'added', added),
    ('changed_disc', 'modified', changed),
)

for fn in diff_fns:
    # Characters 10-19 of the file name hold the download date (year at 10-13,
    # month at 15-16, day at 18-19).
    try:
        tdate = datetime.datetime(int(fn[10:14]),int(fn[15:17]),int(fn[18:20]))
    except ValueError:
        # Stray entries (e.g. OS metadata files) carry no parsable date.
        print(f'skipping {fn}: no date found in file name')
        continue
    if tdate < edate:
        continue  # older than the reporting window

    download_dates.append(fn[10:20])
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only point arc_dir
    # at archives you generated yourself.
    with open(os.path.join(arc_dir,fn),'rb') as f:
        diff_dic = pickle.load(f)

    # Tag every non-empty disclosure frame with the download date and the kind
    # of change, then file it under the matching list.
    for key, label, collected in disclosure_groups:
        if len(diff_dic[key])>0:
            frame = diff_dic[key].copy()
            frame['date_changed'] = tdate
            frame['change_type'] = label
            collected.append(frame)

    # Normalize the ingredient name (None -> '', otherwise stripped and
    # lower-cased) so equivalent pairs collapse into one set entry.
    for item in diff_dic['casing']:
        ig = '' if item[1] is None else item[1].strip().lower()
        casing.add((item[0],ig))

    operator.update(diff_dic['OperatorName'])

alllists = added + changed + removed
if not alllists:
    # pd.concat would raise a bare "No objects to concatenate"; say why instead.
    raise ValueError(f'no added, changed or removed disclosures found in {arc_dir} since {edate}')
wholeset = pd.concat(alllists,sort=True)


## Download dates used

In [None]:
# Show the date string taken from each archive file name that fell inside the reporting window.
print(download_dates)

In [None]:
# Job end date: keep the first whitespace-separated token of JobEndDate and
# parse it as month/day/year.
end_date_text = wholeset['JobEndDate'].str.split().str[0]
wholeset['job_end_date'] = pd.to_datetime(end_date_text, format="%m/%d/%Y")


def _open_ff_link(row):
    """Build the 'Open-FF disclosure' link for one row of ``wholeset``."""
    return th.getDisclosureLink(
        APINumber=row.APINumber,
        disclosureid=row.discID,
        text_to_show='Open-FF disclosure',
        use_remote=True,
        check_if_exists=False,
    )


# Per-row links, built by the text_handlers helpers.
wholeset['FF_disc'] = wholeset.apply(th.getFFLink, axis=1)
wholeset['disc_link'] = wholeset.apply(_open_ff_link, axis=1)

# Blank the Open-FF link for rows changed on or after the last dataset
# generation (presumably those pages do not exist yet -- TODO confirm).
changed_since_repo = wholeset['date_changed'] >= last_repo
wholeset['disc_link'] = np.where(changed_since_repo, ' ', wholeset['disc_link'])

# Total base water volume as a float, rounded to 5 significant figures.
water_volume = wholeset['TotalBaseWaterVolume'].astype('float64')
wholeset['TBWV'] = water_volume.map(lambda volume: th.round_sig(volume, 5))


In [None]:
# Tally the collected rows by kind of change.
wholeset['change_type'].value_counts()

In [None]:
# Columns shown in the summary table, in display order.
summary_columns = [
    'FF_disc',
    'disc_link',
    'job_end_date',
    'StateName',
    'CountyName',
    'OperatorName',
    'TBWV',
    'change_type',
    'date_changed',
]
wholeset[summary_columns]

## Pattern of new disclosure additions
These disclosures are detected as new because their `DisclosureId` has not appeared in the database before. Note that a "new" disclosure may actually be a new version of a previously published one: operators sometimes change a disclosure by removing the old one from FracFocus and creating a replacement.

In [None]:
# Count the added disclosures per download date, then accumulate the counts.
is_added = wholeset['change_type'] == 'added'
added_per_date = wholeset.loc[is_added].groupby('date_changed').size()
cumulative_added = added_per_date.cumsum()

ax = cumulative_added.plot(
    title='vertical line indicates last Open-FF dataset generation',
    ylabel='cumulative disclosures',
    xlabel='date changed',
)
# Mark the date of the last Open-FF dataset generation.
ax.axvline(last_repo, color="green", linestyle="dashed")
plt.suptitle(f'Number of new disclosures added in the past {Numdays} days');

## New CASNumber : IngredientName pairs

In [None]:
# Are there CASNumber/IngredientName pairs that the repository has not seen?
# NOTE(review): IngredientName in `casing` was stripped and lower-cased when it
# was collected; this assumes repo_casing uses the same normalization -- confirm.
repo_casing = fh.get_casing_df()
changed_casing = pd.DataFrame(list(casing),columns=['CASNumber','IngredientName'])
# indicator=True adds a `_merge` column; 'left_only' marks pairs that are
# absent from the repository table.
mg = pd.merge(changed_casing,repo_casing,on=['CASNumber','IngredientName'], how='left',indicator=True)
# `casing` is a set, so its iteration order differs between kernel sessions;
# sort the result so the displayed table is reproducible.
(mg.loc[mg['_merge']=='left_only', ['CASNumber','IngredientName']]
   .sort_values(['CASNumber','IngredientName'])
   .reset_index(drop=True))

In [None]:
# Are there operator names that the repository has not seen?
repo_companies = fh.get_company_df()
complst = repo_companies.rawName.tolist()
# Test membership against a set: a list lookup inside the loop rescans the
# whole company list for every operator.
known_names = set(complst)
newcomp = sorted(op for op in operator if op not in known_names)
if newcomp:
    display(md('## New Operator names detected'))
    for item in newcomp:
        display(md(f'##### {item}'))
else:
    display(md('### No new operator names detected'))