**Learn**

In [94]:
import config
from autoscraper import AutoScraper 
import datetime
import pickle
from deepdiff import DeepDiff
from tqdm import tqdm
from functools import reduce
import operator
from collections import defaultdict

In [2]:
# Learn the menu-scraping rules from every (url, wanted_list) example in config.DATA.
# update=True is required when training on several examples: AutoScraper.build()
# defaults to update=False, which drops all previously learned rules on each call,
# so only the last example would survive the loop.
menu_scraper = AutoScraper()
for url, wanted_list in config.DATA:
    menu_scraper.build(url=url, wanted_list=wanted_list, update=True)

In [3]:
# Apply the rules learned by menu_scraper to config.BASE_URL and keep the matches.
sub_urls = menu_scraper.get_result_similar(config.BASE_URL)

In [4]:
# Second scraper, trained on every (url, wanted_list) pair in config.LINKS_DATA.
# update=True is passed so each build() call adds to the rules already learned
# instead of replacing them.
links_scraper = AutoScraper()
for url,wanted_list in config.LINKS_DATA:
    links_scraper.build(url=url,wanted_list=wanted_list,update=True)

In [5]:
# Spot check: run the learned link rules against one newsroom page and display the result.
links_scraper.get_result_similar('https://www.usaid.gov/afghanistan/newsroom')

['/documents/initiative-strengthen-local-administrations-isla',
 '/documents/assistance-development-afghan-legal-access-and-transparency-adalat',
 '/documents/conflict-mitigation-assistance-civilians-comac',
 '/documents/electoral-support-project-esp',
 '/afghanistan/fact-sheets/strengthening-peace-building-conflict-resolution-and-governance-0',
 '/news-information/press-releases/apr-21-2022-administrator-power-meeting-harjit-sajjan-minister-development-canada',
 '/news-information/press-releases/mar-31-2022-united-states-provides-nearly-204-million-new-funding-address',
 '/news-information/press-releases/mar-14-2022-administrator-samantha-power-meeting-united-nations-office-project-services',
 '/news-information/press-releases/feb-25-2022-new-general-license-20-authorizing-certain-afghanistan-transactions',
 '/news-information/press-releases/jan-12-2022-samantha-power-meeting-un-world-food-program-beasley',
 'https://www.usaid.gov/documents/initiative-strengthen-local-administrations-

In [6]:
# NOTE(review): table_scraper is created here but never trained, used, or pickled
# in the visible cells -- confirm whether it is still needed.
table_scraper = AutoScraper()

In [None]:
# Persist both trained scrapers in one pickle as the list [menu_scraper, links_scraper].
with open('learnt_scrapers.pkl','wb') as f:
    pickle.dump([menu_scraper,links_scraper],f)

**Fetch**

In [126]:
# Crawl configuration: fetch_results() visits fetch_base_url + ext for each entry of fetch_exts.
fetch_base_url = 'https://www.usaid.gov/'
fetch_exts = ['afghanistan','bangladesh','burma','cambodia','india',
              'indonesia','laos','maldives','mongolia','nepal', 
              'sri-lanka','thailand','vietnam','asia-regional']

In [127]:
def fetch_results(fetch_base_url, fetch_exts, menu_scraper, links_scraper):
    """Crawl each extension and return ``{ext: {menu_url: [link, ...]}}``.

    For every ``ext`` the page ``fetch_base_url + ext`` is passed to
    ``menu_scraper.get_result_similar``; only results containing both
    ``'http'`` and ``'www'`` are kept as menu options. Each menu option is
    then passed to ``links_scraper.get_result_similar``. Purely numeric
    results are read as pager labels: label ``k`` triggers one extra request
    for ``option + '?page=<k-1>'`` (label ``1``, i.e. page 0, is the page
    already fetched and is skipped). Only results containing ``'http'`` end
    up in the returned lists.

    Args:
        fetch_base_url: URL prefix that each extension is appended to.
        fetch_exts: iterable of path segments to crawl.
        menu_scraper: object exposing ``get_result_similar(url)`` for menus.
        links_scraper: object exposing ``get_result_similar(url)`` for links.

    Returns:
        dict mapping each extension to a dict of menu-option URL -> list of links.
    """
    tree = {}

    for ext in tqdm(fetch_exts):
        tree[ext] = {}
        section_url = fetch_base_url + ext
        menu_options = [
            candidate
            for candidate in menu_scraper.get_result_similar(section_url)
            if 'http' in candidate and 'www' in candidate
        ]

        for option in menu_options:
            first_page = [
                item
                for item in links_scraper.get_result_similar(option)
                if 'http' in item or item.isdigit()
            ]
            # Numeric entries are pager labels; everything else on the page is a link.
            links = [item for item in first_page if not item.isdigit()]
            page_indices = [int(item) - 1 for item in first_page if item.isdigit()]

            # Fetch every additional page first, then merge their links in order.
            extra_pages = [
                links_scraper.get_result_similar(option + f'?page={page_index}')
                for page_index in page_indices
                if page_index != 0
            ]
            for page in extra_pages:
                links.extend(item for item in page if 'http' in item)

            tree[ext][option] = links
    return tree

In [128]:
# First crawl: only the first seven entries of fetch_exts.
results_tree = fetch_results(fetch_base_url,fetch_exts[:7],menu_scraper,links_scraper)

100%|█████████████████████████████████████████████| 7/7 [03:14<00:00, 27.79s/it]


In [129]:
def prune_results(results_tree):
    """Remove links that already appeared under an earlier menu option.

    Within each top-level key the menu options are visited in insertion
    order; a link is kept only under the first option that lists it.
    Duplicates inside a single option's own list are left untouched. The
    input tree is not modified.

    Args:
        results_tree: ``{ext: {menu_option: [link, ...]}}``; links must be
            hashable (they are URL strings in this notebook).

    Returns:
        A new dict with the same keys and the pruned link lists.
    """
    result_tree_pruned = dict()
    for ext, options in results_tree.items():
        # Links kept so far for this ext. A set gives O(1) membership tests
        # instead of re-flattening every earlier list for each option.
        seen = set()
        pruned_options = dict()
        for option, links in options.items():
            fresh = [link for link in links if link not in seen]
            pruned_options[option] = fresh
            # Update only after filtering so repeats within one option survive.
            seen.update(fresh)
        result_tree_pruned[ext] = pruned_options
    return result_tree_pruned

In [130]:
# Drop links that are already listed under an earlier menu option of the same extension.
results_tree_pruned = prune_results(results_tree)

In [131]:
# Snapshot the pruned crawl to disk so a later crawl can be diffed against it.
with open('last_saved_data.pkl','wb') as f:
    pickle.dump(results_tree_pruned,f)

In [132]:
# Reload the snapshot as the baseline for the diff below.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open('last_saved_data.pkl','rb') as f:
    last_results = pickle.load(f)

In [133]:
# Second crawl over every entry of fetch_exts; this rebinds results_tree and
# results_tree_pruned, replacing the values from the first (seven-extension) crawl.
results_tree = fetch_results(fetch_base_url,fetch_exts,menu_scraper,links_scraper)
results_tree_pruned = prune_results(results_tree)

100%|███████████████████████████████████████████| 14/14 [03:34<00:00, 15.30s/it]


In [137]:
# Diff the saved baseline against the fresh crawl; ignore_order=True so that
# reordered lists are not reported as changes.
diff_results = DeepDiff(last_results,results_tree_pruned,ignore_order=True)

In [138]:
# Small hand-made example: dict1 has two extra menus and one extra list item
# compared with dict2, so the diff contains both kinds of "added" entries.
dict1 = {'afg':{'menu_1':['val2','val1'],'menu_2':['val_1'],'menu_3':['val_1']}}
dict2 = {'afg':{'menu_1':['val1']}}
# dict2 = {'afg':{'menu_1':['val2','val1'],'menu2':['val2']}}
test_diff = DeepDiff(dict2,dict1,ignore_order=True)

In [139]:
# Display the diff between the baseline and the full crawl.
diff_results

{'dictionary_item_added': [root['maldives'], root['mongolia'], root['nepal'], root['sri-lanka'], root['thailand'], root['vietnam'], root['asia-regional']]}

In [140]:
def getFromDict(dataDict, mapList):
    """Return the value reached by indexing ``dataDict`` with each key of ``mapList`` in turn.

    An empty ``mapList`` returns ``dataDict`` itself. A missing key raises
    whatever the container's ``__getitem__`` raises (``KeyError`` for dicts).
    """
    node = dataDict
    for step in mapList:
        node = node[step]
    return node

def setInDict(dataDict, mapList, value):
    """Assign ``value`` at the nested path ``mapList`` inside ``dataDict``.

    Every container along ``mapList[:-1]`` must already exist; only the final
    key is written.
    """
    parent = getFromDict(dataDict, mapList[:-1])
    leaf_key = mapList[-1]
    parent[leaf_key] = value

In [141]:
def _diff_path_to_keys(path):
    """Split a DeepDiff path such as ``root['a']['b'][0]`` into ``['a', 'b', '0']``."""
    path = str(path)
    # Strip only the leading 'root' token; a blanket replace would also remove
    # 'root' from inside keys (e.g. a URL containing 'grassroots').
    if path.startswith('root'):
        path = path[len('root'):]
    if not (path.startswith('[') and path.endswith(']')):
        return []
    # Every component is returned as a string, including list indices.
    return [part.strip('\'"') for part in path[1:-1].split('][')]


def _ensure_parent(container, keys):
    """Return the dict that should hold ``keys[-1]``, creating missing levels."""
    for step in keys[:-1]:
        container = container.setdefault(step, {})
    return container


def convert(added_str, added_dict, results_dict, key, value=None):
    """Record one "added" entry of a DeepDiff result in ``added_dict``.

    Args:
        added_str: DeepDiff path of the added item, e.g. ``root['afg']['menu_2']``
            or ``root['afg']['menu_1'][0]``.
        added_dict: nested dict updated in place; missing intermediate levels
            are created, so both ``dict`` and ``defaultdict`` work.
        results_dict: the tree the new value is looked up in (used only for
            ``'dictionary_item_added'``).
        key: DeepDiff report type. Only ``'dictionary_item_added'`` and
            ``'iterable_item_added'`` are handled; anything else is a no-op.
        value: the added list item (used only for ``'iterable_item_added'``).

    Returns:
        None. ``added_dict`` is mutated.

    Raises:
        KeyError: if a ``'dictionary_item_added'`` path is absent from ``results_dict``.
        IndexError: if the path does not name any container to write into.
    """
    if key == 'dictionary_item_added':
        keys = _diff_path_to_keys(added_str)
        # Look the value up before touching added_dict so a bad path leaves it unchanged.
        new_value = reduce(operator.getitem, keys, results_dict)
        parent = _ensure_parent(added_dict, keys)
        parent[keys[-1]] = new_value

    elif key == 'iterable_item_added':
        # The last path component is the list index; the list lives one level up.
        list_keys = _diff_path_to_keys(added_str)[:-1]
        parent = _ensure_parent(added_dict, list_keys)
        # Append to the existing list or start a new one. list.append returns
        # None, so its result must never be stored back as the value.
        parent.setdefault(list_keys[-1], []).append(value)

In [142]:
# Gather everything DeepDiff reported as newly added into one nested dict.
added_keys = {'dictionary_item_added','iterable_item_added'}
added_dict = defaultdict(dict)
# sorted() gives a fixed processing order; iterating the set directly does not.
for key in sorted(added_keys):
    # Skip report types that are absent from this diff (explicit membership
    # test instead of a bare except that would hide unrelated errors).
    if key not in diff_results:
        continue
    if key == 'dictionary_item_added':
        # Entries are path strings only; convert() looks the value up in the new tree.
        for added_str in diff_results[key]:
            convert(added_str, added_dict, results_tree_pruned, key)
    elif key == 'iterable_item_added':
        # Entries map a path string to the list item that was added.
        for added_str, added_val in diff_results[key].items():
            convert(added_str, added_dict, results_tree_pruned, key, value=added_val)

In [144]:
# Display the collected additions as a plain dict rather than a defaultdict.
dict(added_dict)

{'maldives': {'https://www.usaid.gov/maldives/history': ['https://www.usaid.gov/maldives/press-releases/oct-27-2021-united-states-provides-additional-2-million-urgent-covid-19',
   'https://www.usaid.gov/maldives/press-releases/aug-9-2021-us-donates-128700-pfizer-biontech-covid-19-vaccines',
   'https://www.usaid.gov/maldives/press-releases/july-15-2021-us-donates-100000-rapid-diagnostic-tests-maldives-early',
   'https://www.usaid.gov/news-information/press-releases/jun-25-2021-administrator-samantha-power-call-maldives-foreign-minister',
   'https://www.usaid.gov/news-information/press-releases/jun-18-2021-usaid-sends-emergency-covid-19-aid-nepal-and-pakistan'],
  'https://www.usaid.gov/maldives/newsroom': ['https://www.usaid.gov/maldives/press-releases/mar-15-2022-united-states-launches-5-year-10-point-5m-climate-change',
   'https://www.usaid.gov/maldives/documents/fact-sheet-governance',
   'https://www.usaid.gov/maldives/vacancy-announcements/05-21-usaid-project-management-specia

In [91]:
# Display the diff of the hand-made example dicts.
test_diff

{'dictionary_item_added': [root['afg']['menu_2'], root['afg']['menu_3']],
 'iterable_item_added': {"root['afg']['menu_1'][0]": 'val2'}}