In [71]:
# get urls related to a theme
import requests

# search phrases sent to the Wikipedia search API (one query per entry)
search_terms = [
    'asthma',
    'asthma trigger*',
    'asthma symptoms',
    'asthma inhalers',
    'asthma treatments',
    'asthma attack',
    'asthma breath',
    'asthma warning',
    'asthma type',
    'asthma treatment',
    'asthma diagnosis',
    'asthma causes',
    'asthma prevention',
    'asthma information',
    'asthma facts',
    'signs of asthma',
    'how to control asthma',
    'what is asthma',
    'learn about asthma',
    'asthma health',
]

def search_wikipedia(search_term):
    """Search English Wikipedia and return article URLs for the hits.

    Parameters
    ----------
    search_term : str
        Free-text query sent to the MediaWiki search API as ``srsearch``.

    Returns
    -------
    list of str
        One ``https://en.wikipedia.org/wiki/<Title>`` URL per search hit, in
        the order the API returned them, with spaces in each title replaced
        by underscores.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    KeyError
        If the JSON payload has no ``query`` / ``search`` section.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    # Hand the query over as ``params`` so requests percent-encodes it; gluing
    # the raw term into the URL breaks on characters such as '&', '#' or '+'.
    params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "utf8": "",
        "format": "json",
    }

    # requests has no default timeout; set one so a stalled connection cannot
    # hang the notebook indefinitely.
    response = requests.get(api_url, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    result = response.json()

    return [
        "https://en.wikipedia.org/wiki/" + page['title'].replace(' ', '_')
        for page in result['query']['search']
    ]

# list to store urls over all search terms
urls = []

# query Wikipedia once per search term and pool all hits
for term in search_terms:
    urls.extend(search_wikipedia(str(term)))

# remove duplicate results; dict.fromkeys keeps first-seen order, whereas
# list(set(...)) yields an order that can change between kernel sessions
urls = list(dict.fromkeys(urls))
# remove "(disambiguation)" pages
urls = [url for url in urls if "(disambiguation)" not in url]

print(f"URLs found: {len(urls)}")

URLs found: 78


In [72]:
# confirm urls
# list to store confirmed URLs
confirmed_urls = []

# ask user to confirm each URL; only accepted URLs are kept
for url in urls:
    print(f"URL: {url}")
    # normalise the reply so "Yes", " yes " or "y" are not silently treated as "no"
    answer = input("Is this URL useful? (yes/no): ").strip().lower()
    if answer in ("yes", "y"):
        confirmed_urls.append(url)

print(f"Confirmed URLs: {len(confirmed_urls)}")


URL: https://en.wikipedia.org/wiki/Obstructive_lung_disease
URL: https://en.wikipedia.org/wiki/Asthma_trigger
URL: https://en.wikipedia.org/wiki/Scuba_diving_fatalities
URL: https://en.wikipedia.org/wiki/Occupational_asthma
URL: https://en.wikipedia.org/wiki/Montelukast
URL: https://en.wikipedia.org/wiki/Leukotriene_C4
URL: https://en.wikipedia.org/wiki/Health
URL: https://en.wikipedia.org/wiki/Osteopathy
URL: https://en.wikipedia.org/wiki/Cell_extrusion
URL: https://en.wikipedia.org/wiki/Epidemiology_of_asthma
URL: https://en.wikipedia.org/wiki/Corticosteroid
URL: https://en.wikipedia.org/wiki/Euphorbia_hirta
URL: https://en.wikipedia.org/wiki/Asthma
URL: https://en.wikipedia.org/wiki/Diseases_of_poverty
URL: https://en.wikipedia.org/wiki/Oral_allergy_syndrome
URL: https://en.wikipedia.org/wiki/Salbutamol
URL: https://en.wikipedia.org/wiki/Brittle_asthma
URL: https://en.wikipedia.org/wiki/Inhaler_spacer
URL: https://en.wikipedia.org/wiki/Joyce_Vincent
URL: https://en.wikipedia.org/wik

In [74]:
# get page content
import requests
from bs4 import BeautifulSoup
import re
import os
from dotenv import load_dotenv

# init env variables
load_dotenv()

# specify location of output (taken from the OUTPUT_FILE_DIR environment variable)
output_dir = os.environ['OUTPUT_FILE_DIR']
# create the folder up front so a missing directory does not fail every write
os.makedirs(output_dir, exist_ok=True)

# download each confirmed page and save its paragraph text as a .txt file
for url in confirmed_urls:
    try:
        # a timeout keeps one stalled download from hanging the whole loop
        response = requests.get(url=url, timeout=30)
    except requests.RequestException as exc:
        # network problems for one page should not abort the remaining pages
        print(f"Request failed for {url}: {exc}")
        continue

    # print response code - should be 200 if its working correctly
    response_code = response.status_code
    print(f"Response code: {response_code}")

    # access content
    if response_code == 200:
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            # get the title; get_text() also copes with headings that contain
            # several child nodes, where .string would be None
            title = soup.find(id='firstHeading').get_text().strip()
            print(f"Currently processing: {title}")

            # find body content
            body_content = soup.find(id='bodyContent').find_all('p')
            # remove empty sections and tags
            body_content = [p.text.strip() for p in body_content if p.text.strip() != '']
            # remove references to other sources, e.g. "[12]" or "[a]"
            body_content = [re.sub(r'\[\w+\]','', p) for p in body_content]
            # join the text
            body_content_joined = '\n\n'.join(body_content)
            # a path separator inside a title would make open() target a sub-directory
            file_stem = re.sub(r'[\\/]', '_', title)
            # write the text to a plain-text file named after the page title
            with open(os.path.join(output_dir, f'{file_stem}.txt'), 'w', encoding='utf-8') as f:
                f.write(body_content_joined)
        except Exception as exc:
            # report what failed instead of hiding it; Exception (not a bare
            # except) lets KeyboardInterrupt still stop the loop
            print(f'Something has gone wrong: {exc!r}')

Response code: 200
Currently processing: Obstructive lung disease
Response code: 200
Currently processing: Asthma trigger
Response code: 200
Currently processing: Occupational asthma
Response code: 200
Currently processing: Montelukast
Response code: 200
Currently processing: Epidemiology of asthma
Response code: 200
Currently processing: Corticosteroid
Response code: 200
Currently processing: Asthma
Response code: 200
Currently processing: Diseases of poverty
Response code: 200
Currently processing: Oral allergy syndrome
Response code: 200
Currently processing: Brittle asthma
Response code: 200
Currently processing: Inhaler spacer
Response code: 200
Currently processing: Bronchodilator
Response code: 200
Currently processing: Thunderstorm asthma
Response code: 200
Currently processing: Rhinitis
Response code: 200
Currently processing: Anaphylaxis
Response code: 200
Currently processing: Chronic obstructive pulmonary disease
Response code: 200
Currently processing: Azithromycin
Respons