# How to extract internal and external links using Python
In this tutorial, you will learn how to indentify and extract internal and external links from websites through web scraping in python and get help identify internal linking opportunities and improve SEO.

In [36]:
# import libraries
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re

In [64]:
# url 
url = "https://www.coursera.org/"


In [65]:
# send an https request and get http reponse
response = requests.get(url)

In [30]:
response

<Response [200]>

In [66]:
html_page = bs(response.content, 'html.parser')


In [67]:
all_links = html_page.find_all('a', href=re.compile(r"^['http', 'https']"))

In [68]:
links = {'link':[], 'category':[]}
for link in all_links:
    href = link['href']
    if href:
        if r'coursera.com' in href:
            links['link'].append(href)
            links['category'].append('internal')
        if href[0] == "#":
            links['link'].append(f"{url}{href}")
            links['category'].append('internal')
            
        if href.split(":")[0] in ['https', 'http'] and not r"coursera.com" in href:
            links['link'].append(href)
            links['category'].append('external')

In [112]:
# get all links from the domain
def get_all_link(url):
    user_agent = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
    # make http request
    response = requests.get(url)
    # parse html content
    html_page = bs(response.content, 'html.parser')
    # get all links
    all_links = html_page.find_all('a')
    
    return all_links

In [113]:
def extract_link(all_links, url, name):
    
    links = {'link':[], 'category':[]}
    
    for link in all_links:
        href = link['href']
        if href:
            if name in href:
                links['link'].append(href)
                links['category'].append('internal')
            if href[0] == "#":
                links['link'].append(f"{url}{href}")
                links['category'].append('internal')

            if href.split(":")[0] in ['https', 'http'] and not name in href:
                links['link'].append(href)
                links['category'].append('external')
    return links

In [114]:
domain_name = r"alibaba.com"
url = "https://"+domain_name
url

'https://alibaba.com'

In [115]:
domain_name = r"alibaba.com"
url = "https://"+domain_name
all_links = get_all_link(url)

In [116]:
links = extract_link(all_links, url, domain_name)

In [117]:
df = pd.DataFrame(links)

In [118]:
df

Unnamed: 0,link,category
0,//www.alibaba.com/,internal
1,//www.alibaba.com/,internal
2,//passport.alibaba.com/icbu_login.htm?tracelog...,internal
3,//accounts.alibaba.com/register/register.htm?t...,internal
4,//message.alibaba.com?tracelog=2020NewHeader_h...,internal
...,...,...
111,http://rule.alibaba.com/rule/detail/2041.htm,internal
112,http://idinfo.zjamr.zj.gov.cn//bscx.do?method=...,external
113,//www.alibaba.com/trade/servlet/page/static/co...,internal
114,http://www.beian.gov.cn/portal/registerSystemI...,external


In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def get_broken_links(url):

	# Set root domain.
	root_domain = domain.com
	
	# Internal function for validating HTTP status code.
	def _validate_url(url):
		r = requests.head(url)
		if r.status_code == 404:
			broken_links.append(url)
			
	# Make request to URL.		
	data = requests.get(url).text
	
	# Parse HTML from request.
	soup = BeautifulSoup(data, features="html.parser")
	
	# Create a list containing all links with the root domain.
	links = [link.get("href") for link in soup.find_all("a") if f"//{root_domain}" in link.get("href")]
	
	# Initialize list for broken links.
	broken_links = []
	
	# Loop through links checking for 404 responses, and append to list.
	with ThreadPoolExecutor(max_workers=8) as executor:
		executor.map(_validate_url, links)
		
	return broken_links

In [96]:
df = pd.DataFrame(links)

In [98]:
df.tail()

Unnamed: 0,link,category
344,https://pages.ebay.com/help/policies/privacy-p...,internal
345,https://pages.ebay.com/help/account/cookies-we...,internal
346,https://www.ebay.com/adchoice/ccpa,internal
347,https://www.ebay.com/adchoice,internal
348,https://seal.digicert.com/seals/popup/?tag=BIE...,internal
