In [94]:
import os
import pandas as pd
import re
import urllib.request
from urllib.error import HTTPError

# Configuration

# Names of the CSV columns this notebook reads from and writes to.
url_column = 'full-html'
hrefs_column = 'references'

# Only hrefs that contain this site name are kept.
target_website = 'drive.google.com'

# Pattern capturing the http(s) address inside an <a href="..."> tag.
re_href = re.compile(r'<a href="(https?://.*?)".*?>')
# Alternative capturing any href, not only web links:
#re_href = re.compile(r'<a href="(.*?)".*?>')

# Base directory: the working directory itself when its path already mentions
# 'manuscript-object', otherwise the 'manuscript-object/' folder beneath it.
cwd = os.getcwd()
if 'manuscript-object' in cwd:
    m_path = cwd
else:
    m_path = f'{cwd}/manuscript-object/'

# Input CSV and output CSV, both under the base directory.
fieldnotes_path = f'{m_path}/fieldnotes/FA18+other-fieldnotes-list+links.csv'
out_path = f'{m_path}/fieldnotes/fieldnotes_hrefs.csv'

In [95]:
# Script:
def find_hrefs_from_url(url:str, regex, target='') -> str:
    """Download the page at `url` and return the hrefs on it that contain `target`.

    Parameters:
        url:    address of the page to fetch.
        regex:  compiled pattern whose `findall` yields one href string per match.
        target: substring an href must contain to be kept; '' keeps every href.

    Returns:
        The matching hrefs joined by commas, or one of these status strings:
        'none'                     - the page was read but no href matched,
        'url not found'            - the page could not be retrieved,
        'not a valid web address'  - `url` is not a usable address.
    """
    # A blank CSV cell arrives from pandas as NaN (a float); urlopen would fail
    # on it with an AttributeError, so report it like any other bad address.
    if not isinstance(url, (str, urllib.request.Request)):
        return 'not a valid web address'

    try:
        page = urllib.request.urlopen(url) # open url
    except urllib.error.URLError:
        # URLError is the base class of HTTPError, so this covers HTTP error
        # statuses as well as DNS failures, refused connections, etc.
        return 'url not found'
    except ValueError:
        return 'not a valid web address'

    # Close the response once it has been read so connections are not leaked.
    with page:
        # Undecodable bytes are replaced rather than aborting the whole run.
        page_text = page.read().decode('utf-8', errors='replace')

    matches = [href for href in regex.findall(page_text) if target in href] # hrefs linking to target

    if not matches:
        return 'none'     # target appears in no href on that page
    return ','.join(matches)  # each matching href, separated by commas

In [96]:
# Load the CSV at `fieldnotes_path` into a DataFrame.
fieldnotes = pd.read_csv(fieldnotes_path)

In [97]:
# Fetch each page listed in `url_column` and record, per row, the hrefs on it
# that point at `target_website` (or the status string returned instead).
fieldnotes[hrefs_column] = [
    find_hrefs_from_url(page_url, re_href, target=target_website)
    for page_url in fieldnotes[url_column]
]

In [98]:
# Length of each result string; used by the optional lookup below to find the
# row with the longest result.
temp = [len(url) for url in fieldnotes[hrefs_column]]
#fieldnotes[url_column][temp.index(max(temp))]
# Show every result other than the 'none' placeholder. Strings must be compared
# by value (`!=`): `is not` tests object identity, which only works by accident
# of string interning and raises a SyntaxWarning on Python 3.8+.
[url for url in fieldnotes[hrefs_column] if url != 'none']

['https://drive.google.com/file/d/0B29s4y6G-cwiekhobV8yOU0yZjA/view?usp=sharing',
 'https://drive.google.com/open?id=1TllEXdC3uXKB1QFMosmmfrRqMP2yoUFW,https://drive.google.com/open?id=1BSuFhOche01T1h7vQz62ukmxXPpfaXJC',
 'https://drive.google.com/open?id=1BSuFhOche01T1h7vQz62ukmxXPpfaXJC',
 'https://drive.google.com/open?id=1HJz475zJ7b7hL30ezmtajzxPbw5AU2atRkRM8vzvhGU,https://drive.google.com/open?id=1HJz475zJ7b7hL30ezmtajzxPbw5AU2atRkRM8vzvhGU',
 'https://drive.google.com/open?id=0BwJi-u8sfkVDUWRmYm85UkZCWlU',
 'not a valid web address',
 'url not found',
 'url not found',
 'url not found',
 'url not found',
 'url not found']

In [99]:
# Write `fieldnotes` to `out_path` as CSV, leaving out the index column.
fieldnotes.to_csv(out_path, index=False)