# Errata.ipynb: pull errata and add those found

In [1]:
import requests
import pandas as pd
import bs4
# Display settings: allow up to 1000 rows to be shown, and up to 300
# characters per cell so long erratum text is not cut short.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 300)

Getting the page data: create a list of the `<p>` tags in the appropriate div

In [2]:
# Download the errata page and collect the <p> tags that hold the errata.
url = "https://www.statlearning.com/errata-python-edition"
# requests has no default timeout; set one so the cell cannot hang forever
# if the server stalls.
response = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of later with an IndexError
# when the expected div is missing from an error page.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, "html.parser")
ps = (
    soup.find_all("div", class_="sqs-html-content")[1] # 1 was manually determined 
        .find_all("p")
)

Formatting into DataFrame

In [8]:
from copy import copy

# Prefix that is stripped (case-insensitively) from the attribution text.
thanks_to = "Thanks to"

# Build one [erratum text, attribution, from-author-website flag] row per
# paragraph that starts with "In" or "On" (case-insensitive).
errata_list = []
for p in ps:
    if p.text[:2].lower() in ["in", "on"]: 
        # nb casefold does more than lower but we won't need it atm
        # Work on a copy so that extract() below does not alter the tags in `ps`.
        pp = copy(p)
        em = ""
        if pp.em:
            # The <em> element carries the attribution; extract() removes it
            # from the copy so pp.text is left with the erratum itself.
            em = pp.em.extract().text.strip() 
            # endswith() is safe on an empty string, unlike em[-1], which
            # raises IndexError when the <em> tag has no text.
            if em.endswith("."):
                em = em[:-1]
            if em[: len(thanks_to)].lower() == thanks_to.lower():
                # Strip the whitespace left behind after removing the prefix.
                em = em[len(thanks_to):].strip()
        errata_list.append([pp.text, em, True])
errata = pd.DataFrame(
    errata_list, 
    columns=["Errata", "Attribution", "From Author Website"]
    )


Pull out page number from the erratum

In [4]:
# Import re module
import re

def extract_digits(string):
    """Return the page number mentioned in an erratum string.

    The number that follows the word "page" or "pages" (case-insensitive)
    is preferred, so that an earlier number in the text (a section or
    exercise number, say) is not mistaken for the page. If there is no such
    mention, the first run of digits anywhere in the string is used.

    Parameters
    ----------
    string : str
        Text of the erratum.

    Returns
    -------
    int
        The extracted number, or 0 when the string contains no digits.
    """
    # Preferred: an explicit "page 44" / "pages 44" mention.
    match = re.search(r"\bpages?\s+(\d+)", string, flags=re.IGNORECASE)
    if match:
        return int(match.group(1))
    # Fallback: the first run of digits anywhere in the string.
    match = re.search(r"\d+", string)
    return int(match.group()) if match else 0

# Add a "Page Number" column by applying extract_digits to each erratum's text.
errata["Page Number"] = errata["Errata"].apply(extract_digits)

Viewing the data.
  - as of 18th September 2023, there are 18 entries

In [7]:
# Report how many errata were collected, then show the table with both the
# cell text and the column headers left-aligned.
print("number of rows:", len(errata))
cell_alignment = {"text-align": "left"}
header_alignment = dict(selector="th", props=[("text-align", "left")])
errata.style.set_properties(**cell_alignment).set_table_styles([header_alignment])

number of rows: 18


Unnamed: 0,Errata,Attribution,From Author Website,Page Number
0,"On page 44, “Out[22]:” should not be numbered.",The authors,True,44
1,"On page 49, the input block after “In[43]:” should be numbered (this will affect the numbering of downstream input blocks as well).",The authors,True,49
2,"On page 61, block 103, there should be a semi-colon in the last line to indicate that the output should be suppressed. Also, the semi-colon in the first line is superfluous, and should be removed.",Julien Gomes,True,61
3,"On page 66, there is an error in the code in Exercise 2(f): the linecollege['Elite'] = pd.cut(college['Top10perc'], [0,0.5,1], labels=['No', 'Yes'])should be replaced withcollege[“Elite”] = pd.cut(college[“Top10perc”]/100, [0, 0.5, 1], labels = [“No, “Yes”]).",Dylan Owens,True,66
4,"In the footnote on the bottom of page 76, the sentence ""Details of how to compute the 95% confidence interval precisely in R will be provided later in this chapter"" should mention Python instead of R.",Rush Kirubi,True,76
5,"On the bottom of page 81, the sentence “Any statistical software package can be used to compute these coefficient estimates, and later in this chapter we will show how this can be done in R.” should mention Python instead of R.",Jasmin Bogatinovski and Omar Mallick,True,81
6,"On the top of page 94: The sentence “It is estimated that those in the South will have $18.69 less debt than those in the East, and that those in the West will have $12.50 less debt than those in the East” should instead say “It is estimated that those in the West will have $18.69 less debt than those in the East, and that those in the South will have $12.50 less debt than those in the East.",Yongjun Zhu and Felipe Provezano Coutinho,True,94
7,"On page 131, exercise 11d: ""Show algebraically, and confirm numerically in R"" should read ""Show algebraically, and confirm numerically in Python"".",Julien Gomes,True,131
8,"On the bottom of page 184, the last sentence is missing two words. It should read: “In this case Purchase has only Yes and No values and the method returns how many values of each there are.”",Johannes Ruf,True,184
9,"On page 187, the printed text under “In[60]:” should not be in green.",The authors,True,187


Saving the errata DataFrame to disk

In [6]:
# Write the table to errata.csv in the current working directory. With the
# pandas default (index=True) the row index is saved as the first column.
errata.to_csv("errata.csv")

Optional: Saving Raw HTML (mainly for debugging)

In [9]:
from pathlib import Path
# Keep a copy of the downloaded page next to the notebook for debugging.
raw_response = Path("./errata.html")
# Pin the encoding: without it write_text uses the locale's default, which
# can fail or garble the non-ASCII characters (e.g. curly quotes) in the page.
raw_response.write_text(response.text, encoding="utf-8")

273711