# Imports and other declarations

In [16]:
import json # used to store link - text content mapping into a .json file
import re # used regex to filter strings
import requests # used to access web page urls
from bs4 import BeautifulSoup # used to scrape web pages
from datetime import datetime # used to calculate processing time for some tasks

In [11]:
# constants and paths list

articles_names_url = 'https://www.news-medical.net/medical-a-z.aspx'

# only with x, y categories
test_links_articles_txt = "test_links_in_db.txt"
test_link_content_dictionary_txt = "test_link_content_dictionary.txt"
test_link_content_dictionary_json = "test_link_content_dictionary.json"
test_link_wordcount_dictionary_json = "test_link_wordcount_dictionary.json"

# with ALL categories
final_links_articles_txt = "final_links_in_db.txt"
final_link_content_dictionary_txt = "final_link_content_dictionary.txt"
final_link_content_without_bl_dictionary_json = "final_link_content_without_bl_dictionary.json"
final_link_title_without_bl_dictionary_json = "final_link_title_without_bl_dictionary.json"
final_link_content_dictionary_json = "final_link_content_dictionary.json"
final_link_wordcount_dictionary_json = "final_link_wordcount_dictionary.json"

# Getting the data

In [25]:
# Helper function to extract the url articles from a Category page and their subcategory (if needed)

def extract_urls_from_soup(soup):

    article_subcategs = [] # array of subcategories in Category A page
    for toggle in soup.body.findAll(class_='toggle-expand'):
        toggle_str = toggle.get_text()
        article_subcategs.append(toggle_str)

    # get the links for each subcateg
    article_json = {} # ex of json entry: link: subcategory
                    # array of links in Category A page that contain /health/ and ends with .aspx (see regex)
    index = 0
    for expand in soup.body.findAll(class_='expand-item'):
        for tag in expand:
            aTags = tag.find_all('a', href=True)
            for a in aTags:
                if a.text:
                    if '/health/' in a['href']:
                        link_in_href = a['href']
                        link_in_href = re.sub('[.]+aspx.*',  '.aspx', link_in_href)
                        link_in_href = re.sub('.*/health/', 'https://www.news-medical.net/health/', link_in_href)
                        article_json[link_in_href] = article_subcategs[index]
        index = index + 1 # increment subcategory

    return article_json # json: key = link of the article (unique), value = subcategory of the article

# getting text content of a Category page with BeautifulSoup
response = requests.get(articles_names_url) # url page just for Category A of medical articles
soup = BeautifulSoup(response.text, 'html.parser')

# call fcn
article_json = extract_urls_from_soup(soup)

In [26]:
# json: key = link of the article (unique), value = subcategory of the article
# article_json

In [27]:
# getting text content of an article page with BeautifulSoup
article_url_example = 'https://www.news-medical.net/health/Treatment-of-anemia.aspx'
response = requests.get(article_url_example) # url page for a specific medical article
soup = BeautifulSoup(response.text, 'html.parser')

# Helper function for text content extraction task

def extract_text_from_soup(soup):
    text_content = ''
    for divContent in soup.body.findAll(class_='content'):
        div_str = re.sub('<.*?>', ' ', str(divContent))
        div_str = re.sub('\n', ' ', div_str)
        div_str = re.sub('\r', ' ', div_str)
        div_str = re.sub(' +', ' ', div_str)
        text_content = text_content + div_str
    
    return text_content

# call fcn
text_content = extract_text_from_soup(soup)

In [28]:
# text content of the entire article page (with links)
# text_content

In [6]:
# getting text content of an article page with BeautifulSoup
article_url_example = 'https://www.news-medical.net//health/Role-of-Glycoscience-in-Viral-Surveillance.aspx'
response = requests.get(article_url_example)
soup = BeautifulSoup(response.text, 'html.parser')

# Helper function for title (headline) extraction task

def extract_title_from_soup(soup):
    # for h1 in soup.body.find_all('h1'): # same output as search by id
    for h1 in soup.body.find_all(id='ctl00_cphBody_h1Title'):
        title = h1.text

    return title

# call fcn
title = extract_title_from_soup(soup)

In [7]:
# title of the entire article page
title

'Role of Glycoscience in Viral Surveillance'

In [31]:
before_processing_datetime = datetime.now()
print(before_processing_datetime)

title = extract_title_from_soup(soup)
print(title)

after_processing_datetime = datetime.now()
print(after_processing_datetime)

diff_datetime = after_processing_datetime - before_processing_datetime
print(diff_datetime)

2022-06-07 22:32:19.234438
Role of Glycoscience in Viral Surveillance
2022-06-07 22:32:19.237926
0:00:00.003488


In [46]:
# create the url_list
url_list = []
categories = ['', '?l=b', '?l=c', '?l=d', '?l=e', '?l=f', '?l=g', '?l=h', '?l=i', '?l=j', '?l=k', '?l=l', '?l=m', '?l=n', '?l=o', '?l=p', '?l=q', '?l=r', '?l=s', '?l=t', '?l=u', '?l=v', '?l=w', '?l=x', '?l=y', '?l=z']
# categories = ['?l=x', '?l=y']

for category in categories:
    url = articles_names_url + category
    url_list.append(url)

In [47]:
# list of Category pages to be scraped for title and text content
url_list

['https://www.news-medical.net/medical-a-z.aspx',
 'https://www.news-medical.net/medical-a-z.aspx?l=b',
 'https://www.news-medical.net/medical-a-z.aspx?l=c',
 'https://www.news-medical.net/medical-a-z.aspx?l=d',
 'https://www.news-medical.net/medical-a-z.aspx?l=e',
 'https://www.news-medical.net/medical-a-z.aspx?l=f',
 'https://www.news-medical.net/medical-a-z.aspx?l=g',
 'https://www.news-medical.net/medical-a-z.aspx?l=h',
 'https://www.news-medical.net/medical-a-z.aspx?l=i',
 'https://www.news-medical.net/medical-a-z.aspx?l=j',
 'https://www.news-medical.net/medical-a-z.aspx?l=k',
 'https://www.news-medical.net/medical-a-z.aspx?l=l',
 'https://www.news-medical.net/medical-a-z.aspx?l=m',
 'https://www.news-medical.net/medical-a-z.aspx?l=n',
 'https://www.news-medical.net/medical-a-z.aspx?l=o',
 'https://www.news-medical.net/medical-a-z.aspx?l=p',
 'https://www.news-medical.net/medical-a-z.aspx?l=q',
 'https://www.news-medical.net/medical-a-z.aspx?l=r',
 'https://www.news-medical.net/m

In [48]:
# Helper function for crawling and writing the data into json, 
# also for obtaining other infos as processing time, broken links, title and wordcount mapping to link of the articles
# data = text content from each article link found in all article (sub)categories

def extract_raw_data(url_list):
    """
    Crawler function that extracts 
    ALL the text data from the page 
    and ALL it's links 
    """
    data_bag = {} # json, key = link of article, value = text content article
    category_link_mapping = {} # json, key = category of articles, value = array of article links for that category
    category_processingtime_mapping = {} # json, key = category of articles, value = processing time for url extraction task
    link_processingtime_mapping = {} # json, key = link of article, value = processing time for text extraction task
    link_wordcount_mapping = {} # json, key = link of article, value = total number of words for that article
    link_title_mapping = {} # json, key = link of article, value = title of article
    broken_links = [] # array of broken links = links that cannot be accessed
    
    starttime = datetime.now() # START processing time for crawling ALL web pages
    print('Script #1 starts at: ', starttime)
    
    for this_url in url_list:
        print("working on Category page:", this_url)
        try:
            # getting text content of a Category page with BeautifulSoup
            response = requests.get(this_url)
            this_soup = BeautifulSoup(response.text, 'html.parser')
        except:
            print("outdated link")
            broken_links.append(this_url) # review broken links (if needed)
            continue
        
        # crawling links found on Category page
        before_processing_datetime = datetime.now() # START processing time for a category of articles
        sublinks = set(extract_urls_from_soup(this_soup)) # store into a set only the keys of the article_json (link of articles)
        # article_json = extract_urls_from_soup(this_soup) # if subcategs are needed, try this way
        # sublinks = article_json.keys()
        # subcategs = article_json.values()
        print("sublinks found:", sublinks)
        category_link_mapping[this_url] = sublinks
        
        after_processing_datetime = datetime.now() # STOP processing time for a category of articles
        diff_datetime = after_processing_datetime - before_processing_datetime
        category_processingtime_mapping[this_url] = diff_datetime

        # crawling title and text content found on page
        for s_link in sublinks:
            print("working on article sublink:", s_link)
            try:
                before_processing_datetime = datetime.now() # START processing time for an article
                
                # getting text content of an article page with BeautifulSoup
                response = requests.get(s_link)
                this_bfs = BeautifulSoup(response.text, 'html.parser')
                
                # getting raw data as a json, where key = link of the article and value = text content of the article
                sentences = extract_text_from_soup(this_bfs)
                data_bag[s_link] = sentences
                
                # getting title (headline) of the article
                title = extract_title_from_soup(this_bfs)
                link_title_mapping[s_link] = title
                
                after_processing_datetime = datetime.now() # STOP processing time for an article
                diff_datetime = after_processing_datetime - before_processing_datetime
                link_processingtime_mapping[s_link] = diff_datetime
                
                # getting total number of words for the article
                word_count = len(sentences.split())
                link_wordcount_mapping[s_link] = word_count
            except:
                # message from web administrator stopping get requests on page
                print("link not working: ", s_link)
                broken_links.append(s_link) # review broken links (if needed)
                continue
                
    endtime = datetime.now() # STOP processing time for crawling ALL web pages
    print('Script #1 ends at: ', endtime)

    return (data_bag, category_link_mapping, category_processingtime_mapping, 
            link_processingtime_mapping, link_wordcount_mapping, 
            link_title_mapping, broken_links)

# call fcn
db, clm, cpm, lpm, lwm, ltm, bl = extract_raw_data(url_list)

Script #1 starts at:  2022-06-07 22:38:42.680904
working on Category page: https://www.news-medical.net/medical-a-z.aspx
sublinks found: {'https://www.news-medical.net/health/What-is-Aagenaes-Syndrome.aspx', 'https://www.news-medical.net/health/Milk-Allergy.aspx', 'https://www.news-medical.net/health/Asthma-Symptoms.aspx', 'https://www.news-medical.net/health/Alcohol-Withdrawal.aspx', 'https://www.news-medical.net/health/LDL-Cholesterol-and-Heart-Disease.aspx', 'https://www.news-medical.net/health/Antibiotic-Resistance-Influence-on-Wound-Care.aspx', 'https://www.news-medical.net/health/What-are-the-Complications-of-Acromegaly.aspx', 'https://www.news-medical.net/health/Alport-Syndrome-Signs-and-Symptoms.aspx', 'https://www.news-medical.net/health/Pathogenesis-of-Autoimmunity.aspx', 'https://www.news-medical.net/health/Acupuncture-Theories.aspx', 'https://www.news-medical.net/health/Acute-Lymphoblastic-Leukemia-Symptoms.aspx', 'https://www.news-medical.net/health/Anal-Fistula-An-Overvie

In [63]:
# data bag = text content of each article url found on each Category page accessed
# db

In [50]:
for category, links in clm.items():
    print('Found', len(links), 'articles for category: ', category)
#     for link in links:
#         print(link)

Found 737 articles for category:  https://www.news-medical.net/medical-a-z.aspx
Found 473 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=b
Found 902 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=c
Found 477 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=d
Found 344 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=e
Found 252 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=f
Found 278 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=g
Found 719 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=h
Found 255 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=i
Found 29 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=j
Found 107 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l=k
Found 284 articles for category:  https://www.news-medical.net/medical-a-z.aspx?l

In [51]:
for category, time in cpm.items():
    print(time, 'time spent to extract all articles from category: ', category)

this_max = max(cpm.values())
print(this_max)

0:00:00.021788 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx
0:00:00.014303 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=b
0:00:00.024623 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=c
0:00:00.016100 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=d
0:00:00.014909 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=e
0:00:00.010516 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=f
0:00:00.011156 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=g
0:00:00.022670 time spent to extract all articles from category:  https://www.news-medical.net/medical-a-z.aspx?l=h
0:00:00.009984 time spent to extract all articles from category:  https://ww

In [52]:
for link, time in lpm.items():
    print(time, 'time spent to extract title and content from link: ', link)

this_max = max(lpm.values())
print(this_max)

0:00:00.572009 time spent to extract title and content from link:  https://www.news-medical.net/health/What-is-Aagenaes-Syndrome.aspx
0:00:00.426413 time spent to extract title and content from link:  https://www.news-medical.net/health/Milk-Allergy.aspx
0:00:00.519664 time spent to extract title and content from link:  https://www.news-medical.net/health/Asthma-Symptoms.aspx
0:00:00.489498 time spent to extract title and content from link:  https://www.news-medical.net/health/Alcohol-Withdrawal.aspx
0:00:00.471794 time spent to extract title and content from link:  https://www.news-medical.net/health/LDL-Cholesterol-and-Heart-Disease.aspx
0:00:00.414025 time spent to extract title and content from link:  https://www.news-medical.net/health/Antibiotic-Resistance-Influence-on-Wound-Care.aspx
0:00:00.466893 time spent to extract title and content from link:  https://www.news-medical.net/health/What-are-the-Complications-of-Acromegaly.aspx
0:00:00.419521 time spent to extract title and co

In [53]:
for link, word_count in lwm.items():
    print(word_count, 'words found in link: ', link)

this_min = min(lwm.values())
print(this_min)

this_max = max(lwm.values())
print(this_max)

this_avg = 0 if len(lwm.values()) == 0 else sum(lwm.values())/len(lwm.values())
print(round(this_avg))

539 words found in link:  https://www.news-medical.net/health/What-is-Aagenaes-Syndrome.aspx
892 words found in link:  https://www.news-medical.net/health/Milk-Allergy.aspx
661 words found in link:  https://www.news-medical.net/health/Asthma-Symptoms.aspx
882 words found in link:  https://www.news-medical.net/health/Alcohol-Withdrawal.aspx
1349 words found in link:  https://www.news-medical.net/health/LDL-Cholesterol-and-Heart-Disease.aspx
850 words found in link:  https://www.news-medical.net/health/Antibiotic-Resistance-Influence-on-Wound-Care.aspx
459 words found in link:  https://www.news-medical.net/health/What-are-the-Complications-of-Acromegaly.aspx
688 words found in link:  https://www.news-medical.net/health/Alport-Syndrome-Signs-and-Symptoms.aspx
341 words found in link:  https://www.news-medical.net/health/Pathogenesis-of-Autoimmunity.aspx
695 words found in link:  https://www.news-medical.net/health/Acupuncture-Theories.aspx
446 words found in link:  https://www.news-medica

In [54]:
final_link_title_mapping = {}
for link, title in ltm.items():
    print(title, ':', link)
    final_link_title_mapping[link] = title

What is Aagenaes Syndrome? : https://www.news-medical.net/health/What-is-Aagenaes-Syndrome.aspx
Milk Allergy : https://www.news-medical.net/health/Milk-Allergy.aspx
Asthma Symptoms : https://www.news-medical.net/health/Asthma-Symptoms.aspx
Alcohol Withdrawal : https://www.news-medical.net/health/Alcohol-Withdrawal.aspx
LDL Cholesterol and Heart Disease : https://www.news-medical.net/health/LDL-Cholesterol-and-Heart-Disease.aspx
Antibiotic Resistance Influence on Wound Care : https://www.news-medical.net/health/Antibiotic-Resistance-Influence-on-Wound-Care.aspx
What are the Complications of Acromegaly? : https://www.news-medical.net/health/What-are-the-Complications-of-Acromegaly.aspx
Alport Syndrome Signs and Symptoms : https://www.news-medical.net/health/Alport-Syndrome-Signs-and-Symptoms.aspx
Pathogenesis of Autoimmunity : https://www.news-medical.net/health/Pathogenesis-of-Autoimmunity.aspx
Acupuncture Theories : https://www.news-medical.net/health/Acupuncture-Theories.aspx
Acute Ly

In [55]:
# number of articles extracted from all subcategories/categories
len(db)

7626

In [63]:
# broken links review (if needed)
# WHY BROKEN LINKS ARE IN DB ??
bl

['https://www.news-medical.net/health/The-Effect-of-Antibiotics-on-the-Gut-Microbiome.aspx',
 'https://www.news-medical.net/health/Autoantibodies-and-Autism.aspx',
 'https://www.news-medical.net/health/Taking-Anabolic-Steroids-After-a-Sport-Injury.asp',
 'https://www.news-medical.net/health/Blood-Type-and-Giving-Blood.aspx',
 'https://www.news-medical.net/health/What-is-Mechanobiology.aspx',
 'https://www.news-medical.net/health//life-sciences/Extracellular-Vesicles-as-Floating-Cancer-Biomarkers.aspx',
 'https://www.news-medical.net/health/SCHIP-Administration.aspx',
 'https://www.news-medical.net/health/COVID-19-A-Timeline-From-1st-Case-to-Vaccination.aspx',
 'https://www.news-medical.net/health/life-sciences/Chromatography-and-Nanotechnology.aspx',
 'https://www.news-medical.net/health/SCHIP-Debate.aspx',
 'https://www.news-medical.net/health/What-is-a-Semi-synthetic-Organism.aspx',
 'https://www.news-medical.net/health/Dysbiosis-and-Ageing.aspx',
 'https://www.news-medical.net/healt

In [68]:
# how many broken links
print(len(bl), 'out of', len(db), 'meaning', len(bl) / len(db) * 100, '%')

40 out of 7626 meaning 0.5245213742460005 %


In [4]:
# remove broken links from
brokens = ['https://www.news-medical.net/health/The-Effect-of-Antibiotics-on-the-Gut-Microbiome.aspx',
 'https://www.news-medical.net/health/Autoantibodies-and-Autism.aspx',
 'https://www.news-medical.net/health/Taking-Anabolic-Steroids-After-a-Sport-Injury.asp',
 'https://www.news-medical.net/health/Blood-Type-and-Giving-Blood.aspx',
 'https://www.news-medical.net/health/What-is-Mechanobiology.aspx',
 'https://www.news-medical.net/health//life-sciences/Extracellular-Vesicles-as-Floating-Cancer-Biomarkers.aspx',
 'https://www.news-medical.net/health/SCHIP-Administration.aspx',
 'https://www.news-medical.net/health/COVID-19-A-Timeline-From-1st-Case-to-Vaccination.aspx',
 'https://www.news-medical.net/health/life-sciences/Chromatography-and-Nanotechnology.aspx',
 'https://www.news-medical.net/health/SCHIP-Debate.aspx',
 'https://www.news-medical.net/health/What-is-a-Semi-synthetic-Organism.aspx',
 'https://www.news-medical.net/health/Dysbiosis-and-Ageing.aspx',
 'https://www.news-medical.net/health/Compassion-Fatigue-in-Healthcare-Professionals.aspx',
 'https://www.news-medical.net/health/Urbanization-and-Human-Health.aspx',
 'https://www.news-medical.net/health/Virtualization-within-Healthcare3b-Why-is-it-Important.aspx',
 'https://www.news-medical.net/health/Haemophilia-Differential-Diagnosis.aspx',
 'https://www.news-medical.net/health/Health-and-Wealth.aspx',
 'https://www.news-medical.net/health/Health-Technology-Around-the-World.aspx',
 'https://www.news-medical.net/health/An-Overview-Of-Sex-Hormones.aspx',
 'https://www.news-medical.net/health/How to Prevent Lung Cancer.aspx',
 'https://www.news-medical.net/health/The-Genetics-of-Schizophrenia.aspx',
 'https://www.news-medical.net/health/What-Does-Blood-in-Semen-Mean.aspx',
 'https://www.news-medical.net/health/An-Overview-of-Sperm-Sorting-Technologies.aspx',
 'https://www.news-medical.net/health/The-Importance-of-Regulating-Medical-Devices.aspx',
 'https://www.news-medical.net/health/Future-of-Sustainable-Pharmaceuticals.aspx',
 'https://www.news-medical.net/health/Dysbiosis-and-Ageing.aspx',
 'https://www.news-medical.net/health/Sensory-neuronopathy-and-Sjogrens-syndrome.aspx',
 'https://www.news-medical.net/health/Does-Drinking-10025-Fruit-Put-on-Weight.aspx',
 'https://www.news-medical.net/health/life-sciences/Chromatography-and-Nanotechnology.aspx',
 'https://www.news-medical.net/health/life-sciences/What-are-Astrocytes.aspx',
 'https://www.news-medical.net/health/Can-Men-Improve-the-Chances-of-a-Successful-Pregnancy.aspx',
 'https://www.news-medical.net/health/What-is-Bjornstad-Syndrome.aspx',
 'https://www.news-medical.net/health/Relenza-(Zanamivir)-Commercial-Issues.aspx',
 'https://www.news-medical.net/health/What-is-Carnosinemia.aspx',
 'https://www.news-medical.net/health/Relenza-Dosing-and-Side-Effects.aspx',
 'https://www.news-medical.net/health/What-is-Synaptic-Plasticity.aspx',
 'https://www.news-medical.net/health/The-Importance-of-Scientific-Communication.aspx',
 'https://www.news-medical.net/health/Using-Virtual-Reality-in-Medicine.aspx',
 'https://www.news-medical.net/health/What-are-Vocal-Cords.aspx',
 'https://www.news-medical.net/health/Warfarin-Pharmacology.aspx']
print(brokens)

['https://www.news-medical.net/health/The-Effect-of-Antibiotics-on-the-Gut-Microbiome.aspx',
 'https://www.news-medical.net/health/Autoantibodies-and-Autism.aspx',
 'https://www.news-medical.net/health/Taking-Anabolic-Steroids-After-a-Sport-Injury.asp',
 'https://www.news-medical.net/health/Blood-Type-and-Giving-Blood.aspx',
 'https://www.news-medical.net/health/What-is-Mechanobiology.aspx',
 'https://www.news-medical.net/health//life-sciences/Extracellular-Vesicles-as-Floating-Cancer-Biomarkers.aspx',
 'https://www.news-medical.net/health/SCHIP-Administration.aspx',
 'https://www.news-medical.net/health/COVID-19-A-Timeline-From-1st-Case-to-Vaccination.aspx',
 'https://www.news-medical.net/health/life-sciences/Chromatography-and-Nanotechnology.aspx',
 'https://www.news-medical.net/health/SCHIP-Debate.aspx',
 'https://www.news-medical.net/health/What-is-a-Semi-synthetic-Organism.aspx',
 'https://www.news-medical.net/health/Dysbiosis-and-Ageing.aspx',
 'https://www.news-medical.net/healt

In [35]:
len(brokens)

40

# Writting the data on disk

In [57]:
# write ALL article links into a single txt file
with open(final_links_articles_txt, 'w') as f:
    for key in db.keys():
        f.write('%s\n' % key)

In [59]:
# write ALL article contents into a single txt file
with open(final_link_content_dictionary_txt, 'w') as f: # with statements closes the file automatically
    for key, value in db.items(): 
        f.write('At link %s you can find the following content:\n%s\n\n' % (key, value))

In [60]:
# write JSON with ALL article links mapped to their text content into a single .json file
with open(final_link_content_dictionary_json, 'w') as f:
    jsonString = json.dumps(db)
    f.write(jsonString)

In [61]:
# write JSON with ALL article links mapped to their word count into a single .json file
with open(final_link_wordcount_dictionary_json, 'w') as f:
    jsonString = json.dumps(lwm)
    f.write(jsonString)

In [15]:
# write JSON with ALL article links (without the broken links) mapped to their title into a single .json file
with open(final_link_title_without_bl_dictionary_json, 'w') as f:
    jsonString = json.dumps(final_link_title_mapping)
    f.write(jsonString)

# Some url improvements

In [122]:
bl # before changes

['https://www.news-medical.net//health/Taking-Anabolic-Steroids-After-a-Sport-Injury.asp',
 'https://www.news-medical.net//health/Autoantibodies-and-Autism.aspx',
 'https://www.news-medical.net//health/Blood-Type-and-Giving-Blood.aspx',
 'https://www.news-medical.net//health/life-sciences/Chromatography-and-Nanotechnology.aspx',
 'https://www.news-medical.net//health//life-sciences/Extracellular-Vesicles-as-Floating-Cancer-Biomarkers.aspx.aspx',
 'https://www.news-medical.net//health//life-sciences/Extracellular-Vesicles-as-Floating-Cancer-Biomarkers.aspx',
 'https://www.news-medical.net//health/SCHIP-Debate.aspx',
 'https://www.news-medical.net//health/COVID-19-A-Timeline-From-1st-Case-to-Vaccination.aspx',
 'https://www.news-medical.net//health/SCHIP-Administration.aspx',
 'https://www.news-medical.net//health/What-is-Mechanobiology.aspx',
 'https://www.news-medical.net//health/What-is-a-Semi-synthetic-Organism.aspx',
 'https://www.news-medical.net//health/Dysbiosis-and-Ageing.aspx',

In [178]:
bl # after changes

['https://www.news-medical.net//health/Taking-Anabolic-Steroids-After-a-Sport-Injury.asp',
 'https://www.news-medical.net//health/Autoantibodies-and-Autism.aspx',
 'https://www.news-medical.net//health/Blood-Type-and-Giving-Blood.aspx',
 'https://www.news-medical.net//health/life-sciences/Chromatography-and-Nanotechnology.aspx',
 'https://www.news-medical.net//health//life-sciences/Extracellular-Vesicles-as-Floating-Cancer-Biomarkers.aspx',
 'https://www.news-medical.net//health/SCHIP-Debate.aspx',
 'https://www.news-medical.net//health/COVID-19-A-Timeline-From-1st-Case-to-Vaccination.aspx',
 'https://www.news-medical.net//health/SCHIP-Administration.aspx',
 'https://www.news-medical.net//health/What-is-Mechanobiology.aspx',
 'https://www.news-medical.net//health/What-is-a-Semi-synthetic-Organism.aspx',
 'https://www.news-medical.net//health/Dysbiosis-and-Ageing.aspx',
 'https://www.news-medical.net//health/Haemophilia-Differential-Diagnosis.aspx',
 'https://www.news-medical.net//healt

In [182]:
# uniform links - before changes
for link in db.keys():
    if 'https://www.news-medical.net//health/' not in link:
        print(link)

https://www.news-medical.net///health/What-is-Patau-Syndrome.aspx
https://www.news-medical.net/health/The-Application-of-Photonics-in-Healthcare.aspx
https://www.news-medical.net///health/Cervical-ectropion.aspx
https://www.news-medical.net/health/The-Role-of-Alternative-Polyadenylation-in-Disease.aspx
https://www.news-medical.net/health/The-Immunological-Function-of-Red-Blood-Cells.aspx
https://www.news-medical.net/health/The-Role-of-Physics-in-Medicine.aspx
https://www.news-medical.net/health/What-is-Social-Prescribing.aspx
https://www.news-medical.net/health/Proving-Painsomnia-is-Real-A-Case-Study.aspx
https://www.news-medical.net/health/What-is-Psychotherapy.aspx
https://www.news-medical.net/health/How-Soon-Can-You-Have-Sex-After-Having-a-Baby.aspx
https://www.news-medical.net/health/Condoms-for-Oral-and-Anal-Sex.aspx
https://www.news-medical.net/health/Autoimmunity-Sex.aspx
https://www.news-medical.net/health/Sex-Hormones-in-Meat-and-Dairy-Products.aspx
https://www.news-medical.ne

In [188]:
# uniform links - after changes
for link in db.keys():
    if 'https://www.news-medical.net//health/' not in link:
        print(link)