In [2]:
pip install requests beautifulsoup4 pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.0 MB/s eta 0:00:01
[?25hCollecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 10.2 MB/s eta 0:00:01
[?25hCollecting pandas
  Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 815 kB/s eta 0:00:01    |███                             | 1.1 MB 1.3 MB/s eta 0:00:09
[?25hCollecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl (120 kB)
[K     |████████████████████████████████| 120 kB 4.6 MB/s eta 0:00:01
[?25hCollecting urllib3<3,>=1.21.1
  Downloading urllib3-2.2.2-py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 2.5 MB/s eta 0:00:01
[?25hCollecting certifi>=

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin



In [21]:
# Folders where the per-website datasets and the raw HTML files are saved
folder_dataset = os.path.expanduser("~/Downloads/test/dataset/")
folder_html_files = os.path.expanduser("~/Downloads/test/html_files/")
os.makedirs(folder_dataset, exist_ok=True)
os.makedirs(folder_html_files, exist_ok=True)

# Read the CSV file that lists the library websites into a DataFrame.
# The path is built from the home directory (like the folders above) instead
# of a hard-coded, wrong-case "/users/..." absolute path.
libraries_csv = os.path.expanduser("~/Documents/My Baby Thesis/libraries_in_london.csv")
df = pd.read_csv(libraries_csv)

# Take the first three entries of the 'urls' column as a list of start URLs
urls = df['urls'].iloc[0:3].tolist()

print(urls)

['https://www.vwml.org/', 'http://www.wellcomecollection.org', 'https://www.lambeth.gov.uk/libraries-0/carnegie-library']


In [22]:
# Crawl every start URL in `urls`, following only links that stay under that
# start URL, and save one dataset (CSV + pickle) per website.

REQUEST_TIMEOUT = 30  # seconds to wait for a server before giving up on a page
SKIPPED_LINK_PREFIXES = ('mailto:', 'tel:', '#', '/#')


def _url_to_slug(url):
    """Turn a URL into a flat name usable as a file name.

    The scheme is removed and every '/', '.', '#' and '-' becomes '_'.
    """
    slug = url.replace("http://", "").replace("https://", "")
    for char in "/.#-":
        slug = slug.replace(char, "_")
    return slug


def generate_filename(url):
    """Return the path inside `folder_html_files` where the HTML of `url` is stored."""
    return os.path.join(folder_html_files, _url_to_slug(url) + ".html")


def scrape_page(url):
    """Download `url` and return its HTML text, or None if it cannot be retrieved.

    None is returned both for a non-200 status code and for any network error
    (timeout, DNS failure, refused connection, ...), so one bad page cannot
    crash or hang the whole crawl.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def _extract_links(soup, page_url, start_url):
    """Return the absolute links found in `soup` that start with `start_url`.

    Relative links are resolved against `page_url` (the page they appear on).
    Duplicates are dropped while keeping the order of appearance, so the crawl
    order is the same on every run.
    """
    links = []
    for anchor in soup.find_all('a', href=True):
        link = anchor.get('href')
        if not link or link.startswith(SKIPPED_LINK_PREFIXES):
            continue
        if not link.startswith(('http://', 'https://')):
            link = urljoin(page_url, link)
        if link.startswith(start_url):
            links.append(link)
    return list(dict.fromkeys(links))


def crawl_site(start_url):
    """Crawl the website rooted at `start_url` and return its dataset.

    The returned DataFrame has one row per discovered URL with the columns
    url_relative, url_absolute, retrieved, page_html, page_text and filename.
    Pages that could not be downloaded stay in the dataset with
    retrieved == False; each URL is attempted at most once.
    """
    website = pd.DataFrame({
        'url_relative': ["__index__"],  # relative url from links
        'url_absolute': [start_url],    # absolute url
        'retrieved': [False],           # whether the page has been retrieved or not
        'page_html': [None],            # file where the HTML content has been saved
        'page_text': [None]             # text from the page paragraphs
    })
    website['filename'] = website['url_absolute'].apply(generate_filename)

    # URLs already tried (successfully or not); a failed page is skipped
    # instead of stopping the crawl of the whole site.
    attempted = set()

    while True:
        pending = website.loc[~website['url_absolute'].isin(attempted), 'url_absolute']
        if pending.empty:
            break
        current_url = pending.iloc[0]
        attempted.add(current_url)
        current_url_filename = generate_filename(current_url)

        # Keep track while retrieving pages
        done_count = int(website['retrieved'].eq(True).sum())
        print(f"{done_count} / {len(website)} done, working on {current_url}")

        html_content = scrape_page(current_url)
        if not html_content:
            print(f"    could not retrieve {current_url}, skipping")
            continue

        # Save HTML content to file
        with open(current_url_filename, 'w', encoding='utf-8') as file:
            file.write(html_content)

        # Extract the text of all paragraphs
        soup = BeautifulSoup(html_content, 'html.parser')
        page_text = "\n\n".join([p.get_text() for p in soup.find_all('p')])

        # Update the current page content in the dataset
        is_current = website['url_absolute'] == current_url
        website.loc[is_current, 'page_html'] = current_url_filename
        website.loc[is_current, 'page_text'] = page_text
        website.loc[is_current, 'retrieved'] = True

        # Add newly discovered links of this website to the dataset
        links = _extract_links(soup, current_url, start_url)
        if links:
            new_links = pd.DataFrame({'url_relative': links, 'url_absolute': links})
            new_links['retrieved'] = False
            new_links['page_html'] = None
            new_links['page_text'] = None
            new_links['filename'] = [generate_filename(link) for link in links]

            website = (
                pd.concat([website, new_links])
                .drop_duplicates(subset=['url_absolute'])
                .reset_index(drop=True)
            )

    return website


def save_dataset(website, start_url):
    """Save `website` to `folder_dataset` as CSV and pickle, named after `start_url`."""
    base_path = os.path.join(folder_dataset, _url_to_slug(start_url))
    website.to_csv(base_path + ".csv", index=False)
    website.to_pickle(base_path + ".pkl")


for start_url in urls:
    website = crawl_site(start_url)
    save_dataset(website, start_url)
    print("... DONE!")

0 / 1 done, working on https://www.vwml.org/
1 / 67 done, working on https://www.vwml.org/account
2 / 68 done, working on https://www.vwml.org/vwml-about-us/history-and-collections
3 / 70 done, working on https://www.vwml.org/vwml-catalogues-and-indexes/vwml-help/general-help
4 / 71 done, working on https://www.vwml.org/about-us/what-we-do/news/13732-new-vwml-website-set-to-launch
5 / 72 done, working on https://www.vwml.org/vwml-subject-guides/beginners-guide-music
6 / 73 done, working on https://www.vwml.org/vwml-digitised-resources/cecil-sharps-photographs
7 / 74 done, working on https://www.vwml.org/policies
8 / 190 done, working on https://www.vwml.org/vwml-events/past-events/past-special-conferences
9 / 191 done, working on https://www.vwml.org/vwml-catalogues-and-indexes/vwml-help/vwml-indexes-help
10 / 192 done, working on https://www.vwml.org/about-us/what-we-do/news/13637-exhibition-celebrates-cecil-sharps-singers
11 / 193 done, working on https://www.vwml.org/vwml-digitised-

In [1]:
import os

# Show the directory the kernel is running in, to check where relative paths resolve.
working_dir = os.getcwd()
print(working_dir)

/Users/victoriachen/Documents/My Baby Thesis/webscrapping


In [2]:
# Resolve the relative folder name against the current working directory and show it.
data_dir_abs = os.path.abspath("web_scraping_data")
print(data_dir_abs)

/Users/victoriachen/Documents/My Baby Thesis/webscrapping/web_scraping_data
