In [1]:
#date set in airflow run and updated with papermill
dt_0 = '2020-10-08'

In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from dateutil.parser import parse
import pandas as pd

pd.options.display.max_colwidth = 200

In [3]:
#URL to parse for updates 
url = "https://ustr.gov/issue-areas/enforcement/section-301-investigations/section-301-china/200-billion-trade-action"

In [4]:
def parse_url(url):
    """
    Function to scrape a website and parse tags and associated dates
    and create a pandas dataframe
    
    Parameters
    ----------
    url : str
        website url

    Returns
    -------
    pandas dataframe
        date and text columns
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    tags = soup.find_all('span')
    tags = [tag.text for tag in tags]
    tags = [tag.encode('utf-8', 'ignore') for tag in tags]

    date_tags = []
    for tag in tags:
        try:
            date = parse(tag, fuzzy=True, yearfirst=False)
            date = datetime.strftime(date, '%Y-%m-%d')
            tag = tag.decode()
            tag = tag.replace(u'\xa0', u' ')
            tag = " ".join(tag.split())
            date_tags.append([date, tag])
        except ValueError:
            pass

    df = pd.DataFrame(date_tags, columns = ['Date', 'Text'])
    df['Date'] = pd.to_datetime(df['Date'], errors = 'coerce')
    df = df.sort_values(by = 'Date', ascending=False) \
           .loc[df['Date'] < datetime.today()] \
           .reset_index(drop=True) \
           .drop_duplicates()
    
    return df


def email_text(df, dt_0):
    """
    Function to check dataframe with parsed website dates and text
    for recent updates    
    
    Parameters
    ----------
    df : dataframe
        date and text columns
    current_date : str
        execution date

    Returns
    -------
    prints dataframe or str
    """
    current_date = datetime.strptime(dt_0, '%Y-%m-%d') - timedelta(days=2)
    
    df = df.loc[df['Date'] >= current_date]
    if len(df) > 0:
        return df
    if len(df) == 0:
        print('No Updates')

In [5]:
# TODO fix UnknownTimezoneWarning error
import warnings; warnings.simplefilter('ignore')

df = parse_url(url)
email_text(df, dt_0)

Unnamed: 0,Date,Text
0,2021-05-10,Initial Additional Duty of 10 Percent
1,2021-05-03,Amendment to List 3 Exclusions to Cover Goods on the Water
2,2020-10-07,"Amendment to Exclusions Granted October 7, 2020"
3,2020-10-07,"Notice of Product Exclusion Extension Amendments - October 7, 2020"


In [6]:
# results example
df.head(20)

Unnamed: 0,Date,Text
0,2021-05-10,Initial Additional Duty of 10 Percent
1,2021-05-03,Amendment to List 3 Exclusions to Cover Goods on the Water
2,2020-10-07,"Amendment to Exclusions Granted October 7, 2020"
3,2020-10-07,"Notice of Product Exclusion Extension Amendments - October 7, 2020"
4,2020-09-16,"Notice of Product Exclusion Extension Amendment - September 16, 2020"
5,2020-08-24,"Amendments to Exclusions Granted August 24, 2020"
6,2020-07-10,"Amendments to Exclusions Granted July 10, 2020"
7,2020-06-19,"Exclusions Granted June 19, 2020"
8,2020-06-03,"- June 3, 2020"
11,2020-06-03,"Request for Comments Concerning the Extension of Particular Exclusions - June 3, 2020"
