In [3]:
import re

# Count how many lines of input.txt begin with '#'.
with open('input.txt') as f:
    # re.match anchors at the start of each line, so "^#" tests its first character
    starts_with_hash = sum(1 for line in f if re.match("^#", line))

print(f"The number of lines that start with hash is {starts_with_hash}")

The number of lines that start with hash is 2


In [4]:
# Sample e-mail addresses, one per line, written to email_addresses.txt.
sample_addresses = [
    "joelgrus@gmail.com\n",
    "joel@m.datasciencester.com\n",
    "joelgrus@m.datasciencester.com\n",
]
with open('email_addresses.txt', 'w') as f:
    f.writelines(sample_addresses)


In [5]:
from collections import Counter


def get_domain(email_address: str) -> str:
    """Lowercase the address and return whatever follows its last '@'.

    An address that contains no '@' comes back whole (lowercased).
    """
    _, _, domain = email_address.lower().rpartition("@")
    return domain


# a couple of tests: (address, expected domain)
domain_test_cases = [
    ('joelgrus@gmail.com', 'gmail.com'),
    ('joel@m.datasciencester.com', 'm.datasciencester.com'),
]
for address, expected_domain in domain_test_cases:
    print("Success!" if get_domain(address) == expected_domain else "Failed!")

# Tally the domain of every line in the file that contains an '@'.
domain_counts = Counter()
with open('email_addresses.txt', 'r') as f:
    for line in f:
        if "@" in line:
            domain_counts[get_domain(line.strip())] += 1


Success!
Success!


In [6]:
# Sample data: one "date<TAB>symbol<TAB>closing price" record per line.
tab_delimited_rows = [
    "6/20/2014\tAAPL\t90.91",
    "6/20/2014\tMSFT\t41.68",
    "6/20/2014\tFB\t64.5",
    "6/19/2014\tAAPL\t91.86",
    "6/19/2014\tMSFT\t41.51",
    "6/19/2014\tFB\t64.34",
]
with open('tab_delimited_stock_prices.txt', 'w') as f:
    f.writelines(row + "\n" for row in tab_delimited_rows)


In [7]:
def process(date: str, symbol: str, closing_price: float) -> None:
    """Placeholder handler for one stock-price record.

    The only thing it currently does is check that the closing price is
    strictly positive; `date` and `symbol` are accepted but unused.

    Raises:
        AssertionError: if `closing_price` is not greater than zero.
    """
    # Imagine that this function actually does something.
    assert 0.0 < closing_price


In [8]:
import csv
# Feed every record of the tab-delimited file to process().
# The csv docs say a file handed to csv.reader should be opened with
# newline='' so the csv module can do its own newline handling.
with open('tab_delimited_stock_prices.txt', newline='') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        # columns are: date, symbol, closing price (all read as strings)
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)


In [9]:
# Sample data with a header row; fields are separated by ':'.
colon_delimited_rows = [
    "date:symbol:closing_price",
    "6/20/2014:AAPL:90.91",
    "6/20/2014:MSFT:41.68",
    "6/20/2014:FB:64.5",
]
with open('colon_delimited_stock_prices.txt', 'w') as f:
    f.writelines(row + "\n" for row in colon_delimited_rows)

In [10]:
# Read the colon-delimited file by column name and feed each record to process().
# The csv docs say a file handed to a csv reader should be opened with
# newline='' so the csv module can do its own newline handling.
with open('colon_delimited_stock_prices.txt', newline='') as f:
    # DictReader takes the first row as the field names
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        process(date, symbol, closing_price)

In [11]:
# Write one "symbol,price" row per stock to a comma-delimited file.
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}
# newline='' is required by the csv docs: the writer emits its own "\r\n"
# row terminator, and without it Windows would translate that to "\r\r\n".
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    # each (stock, price) pair becomes one row
    csv_writer.writerows(todays_prices.items())


In [12]:
from bs4 import BeautifulSoup
import requests
# I put the relevant HTML file on GitHub. In order to fit
# the URL in the book I had to split it across two lines.
# Recall that whitespace-separated strings get concatenated.
url = ("https://raw.githubusercontent.com/"
       "joelgrus/data/master/getting-data.html")
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page below.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')


In [13]:
# Attribute access on the soup is shorthand for soup.find('p'): the first <p> tag.
first_paragraph = soup.p
print(first_paragraph)

<p id="p1">This is the first paragraph.</p>


In [14]:
# Text content of the first <p>, then that text split on whitespace.
first_paragraph_text = soup.p.get_text()
first_paragraph_words = first_paragraph_text.split()
for value in (first_paragraph_text, first_paragraph_words):
    print(value)

This is the first paragraph.
['This', 'is', 'the', 'first', 'paragraph.']


In [15]:
# Two ways to read an attribute from the first <p> tag.
first_paragraph_tag = soup.p
first_paragraph_id = first_paragraph_tag['id'] # raises KeyError if no 'id'
first_paragraph_id2 = first_paragraph_tag.get('id') # returns None if no 'id'
print(first_paragraph_id)
print(first_paragraph_id2)

p1
p1


In [16]:
all_paragraphs = soup('p') # shorthand for soup.find_all('p')
print(all_paragraphs)
# keep only the paragraphs that have a non-empty 'id' attribute
paragraphs_with_ids = [paragraph
                       for paragraph in soup.find_all('p')
                       if paragraph.get('id')]
print(paragraphs_with_ids)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]
[<p id="p1">This is the first paragraph.</p>]


In [17]:
# Three ways to select the <p> tags whose class list includes 'important'.
important_paragraphs = soup.find_all('p', {'class' : 'important'})
important_paragraphs2 = soup.find_all('p', 'important')
important_paragraphs3 = []
for paragraph in soup.find_all('p'):
    # a tag without a 'class' attribute falls back to the empty list
    if 'important' in paragraph.get('class', []):
        important_paragraphs3.append(paragraph)
for selection in (important_paragraphs,
                  important_paragraphs2,
                  important_paragraphs3):
    print(selection)

[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]


In [18]:
# Warning: will return the same <span> multiple times
# if it sits inside multiple <div>s.
# Be more clever if that's the case.
spans_inside_divs = []
for div in soup('div'): # for each <div> on the page
    spans_inside_divs.extend(div('span')) # collect each <span> inside it
print(spans_inside_divs)

[<span id="name">Joel</span>, <span id="twitter">@joelgrus</span>, <span id="email">joelgrus-at-gmail</span>]


In [19]:
from bs4 import BeautifulSoup
import requests
url = "https://www.house.gov/representatives"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of scraping an error page below.
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, "html5lib")
# keep only the anchors that actually carry an href attribute
all_urls = [a['href'] for a in soup('a') if a.has_attr('href')]
print(len(all_urls)) # 967 for me, way too many

967


In [21]:
import re
# Must start with http:// or https://
# Must end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"

# Let's write some tests!
should_match = [
    "http://joel.house.gov",
    "https://joel.house.gov",
    "http://joel.house.gov/",
    "https://joel.house.gov/",
]
should_not_match = [
    "joel.house.gov",
    "http://joel.house.com",
    "https://joel.house.gov/biography",
]
assert all(re.match(regex, candidate) for candidate in should_match)
assert not any(re.match(regex, candidate) for candidate in should_not_match)

# And now apply
good_urls = [candidate for candidate in all_urls if re.match(regex, candidate)]
print(len(good_urls)) # still 870 for me

870


In [22]:
# Drop duplicate URLs. dict.fromkeys keeps the first occurrence of each URL
# in its original order, whereas list(set(...)) yields an arbitrary order
# that can change from one kernel restart to the next.
good_urls = list(dict.fromkeys(good_urls))
print(len(good_urls)) # only 435 for me

435


In [23]:
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get('https://jayapal.house.gov', timeout=30)
# Fail here on an HTTP error instead of scraping an error page below.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')
# Use a set because the links might appear multiple times.
# Anchors without an href are skipped so a['href'] cannot raise KeyError.
links = {a['href'] for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}
print(links)  # e.g. {'/media/press-releases'}; the exact hrefs depend on the live site


{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


In [24]:
from typing import Dict, Set
# Map each representative's site to the set of hrefs whose link text
# mentions "press releases".
press_releases: Dict[str, Set[str]] = {}
for house_url in good_urls:
    try:
        # the timeout keeps one unresponsive site from stalling the whole crawl
        html = requests.get(house_url, timeout=30).text
    except requests.RequestException as error:
        # skip sites that cannot be fetched instead of aborting the loop
        print(f"{house_url}: skipped ({error})")
        continue
    soup = BeautifulSoup(html, 'html5lib')
    # anchors without an href are skipped so a['href'] cannot raise KeyError
    pr_links = {a['href'] for a in soup('a')
                if a.has_attr('href') and 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://posey.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=1487', '#tab-2'}
https://gwenmoore.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://schrier.house.gov: {'/media/press-releases'}
https://webster.house.gov/: {'/press-releases'}
https://lieu.house.gov/: {'/media-center/press-releases'}
https://adamsmith.house.gov/: {'#recentposts-pressreleases', '/press-releases'}
https://maloney.house.gov/: {'/news/press-releases'}
https://gosar.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://panetta.house.gov: {'/media/press-releases'}
https://huizenga.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2041'}
https://dankildee.house.gov: {'/media/press-releases'}
https://hankjohnson.house.gov/: {'/media-center/press-releases'}
https://franklin.house.gov: {'/media/press-releases'}
https://sablan.house.gov/: set()
https://aderholt.house.gov/: {'/media-center/press-releases'}
https://bilirakis.house.gov/: {'/media/press-releases'}
https://davidson.h

KeyboardInterrupt: 

In [25]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Returns True if a <p> inside the text mentions {keyword}.

    The HTML in `text` is parsed with html5lib and the match is a
    case-insensitive substring test on each paragraph's text.
    """
    needle = keyword.lower()
    for p_tag in BeautifulSoup(text, 'html5lib')('p'):
        if needle in p_tag.get_text().lower():
            return True
    return False

In [26]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""

# "twitter" is inside a <p>, so it should be found
twitter_found = paragraph_mentions(text, "twitter")
print("Success!" if twitter_found else "Failed!")

# "facebook" appears only in the <h1>, not inside a <p>
facebook_found = paragraph_mentions(text, "facebook")
print("Failed!" if facebook_found else "Success!")


Success!
Success!


In [27]:
from urllib.parse import urljoin

# Print each site that has a press-release page mentioning 'data' in a <p>.
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        # urljoin resolves the href against the site URL, so absolute hrefs,
        # leading slashes and a trailing slash on house_url all work;
        # f"{house_url}/{pr_link}" produced malformed URLs in those cases.
        url = urljoin(house_url, pr_link)
        try:
            # the timeout keeps one unresponsive page from stalling the scan
            text = requests.get(url, timeout=30).text
        except requests.RequestException as error:
            # skip pages that cannot be fetched and try the site's next link
            print(f"{url}: skipped ({error})")
            continue
        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break # done with this house_url

https://lieu.house.gov/
https://roy.house.gov


In [30]:
import json
serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2019,
                  "topics" : [ "data", "science", "data science"] }"""
# parse the JSON to create a Python dict
deserialized = json.loads(serialized)

# both checks should print "Success!"
print("Success!" if deserialized["publicationYear"] == 2019 else "Failed!")
print("Success!" if "data science" in deserialized["topics"] else "Failed!")



Success!
Success!


In [31]:
import requests, json
github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(endpoint, timeout=30)
# Fail here on an HTTP error (e.g. rate limiting); otherwise the JSON error
# body would be stored in `repos` and only break later cells.
response.raise_for_status()
# decode the JSON response body into Python objects
repos = response.json()

In [36]:
from collections import Counter
from dateutil.parser import parse
# Parse each repo's "created_at" timestamp, then tally creations by
# month number and by weekday number.
dates = [parse(repo_info["created_at"]) for repo_info in repos]
month_counts = Counter(created.month for created in dates)
weekday_counts = Counter(created.weekday() for created in dates)
for summary in (dates, month_counts, weekday_counts):
    print(summary)

[datetime.datetime(2017, 12, 2, 20, 13, 49, tzinfo=tzutc()), datetime.datetime(2018, 11, 30, 22, 41, 16, tzinfo=tzutc()), datetime.datetime(2019, 12, 1, 2, 57, 18, tzinfo=tzutc()), datetime.datetime(2020, 11, 21, 16, 21, 49, tzinfo=tzutc()), datetime.datetime(2021, 11, 24, 13, 53, 23, tzinfo=tzutc()), datetime.datetime(2018, 2, 23, 15, 51, 4, tzinfo=tzutc()), datetime.datetime(2017, 12, 19, 0, 12, 40, tzinfo=tzutc()), datetime.datetime(2018, 1, 31, 23, 51, 16, tzinfo=tzutc()), datetime.datetime(2018, 12, 19, 19, 44, 45, tzinfo=tzutc()), datetime.datetime(2018, 9, 5, 2, 43, 52, tzinfo=tzutc()), datetime.datetime(2019, 2, 1, 20, 25, 46, tzinfo=tzutc()), datetime.datetime(2013, 7, 5, 2, 2, 28, tzinfo=tzutc()), datetime.datetime(2017, 5, 10, 17, 22, 45, tzinfo=tzutc()), datetime.datetime(2013, 11, 15, 5, 33, 22, tzinfo=tzutc()), datetime.datetime(2012, 9, 18, 4, 20, 23, tzinfo=tzutc()), datetime.datetime(2016, 7, 19, 17, 34, 31, tzinfo=tzutc()), datetime.datetime(2015, 11, 11, 14, 15, 36, 

In [37]:
# Order the repos from most to least recently pushed, keep the first five,
# and list the "language" field of each.
repos_by_latest_push = sorted(
    repos, key=lambda repo_info: repo_info["pushed_at"], reverse=True)
last_5_repositories = repos_by_latest_push[:5]
last_5_languages = [repo_info["language"] for repo_info in last_5_repositories]
print(last_5_languages)

['JavaScript', 'Python', 'Python', 'Python', 'Python']
