# How to Detect Backlinks using BeautifulSoup in Python

In [3]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd

### What are backlinks?

> **Backlinks** (also known as “inbound links”, “incoming links” or “one way links”) are links from one website to a page on another website. Google and other major search engines consider backlinks “votes” for a specific page. Pages with a high number of backlinks tend to have high organic search engine rankings.

In [4]:
# find the backlinks
def detect_backlink(url, name, timeout=10):
    """Return the absolute links on a page whose href does not contain ``name``.

    The page at ``url`` is downloaded and parsed with the ``html.parser``
    backend. Every ``<a>`` tag whose ``href`` starts with ``http`` is
    collected, and tags whose ``href`` contains ``name`` are dropped.

    Parameters
    ----------
    url : str
        Address of the page to scan.
    name : str
        Text used to recognise the site's own links (e.g. ``'alibaba'``).
        The check is a case-sensitive substring test against the whole
        ``href``, not only its host part.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Defaults to 10 so that an unresponsive
        server cannot block the notebook indefinitely.

    Returns
    -------
    list
        The remaining ``<a>`` tag objects, in the order ``find_all``
        produced them.
    """
    # send an http request; a timeout keeps a stalled server from hanging the cell
    response = requests.get(url, timeout=timeout)

    # get the html content using bs
    html_content = bs(response.content, 'html.parser')

    # keep only absolute links (href beginning with "http")
    http_links = html_content.find_all('a', href=re.compile(r"^http"))

    # drop every link that mentions the site's own name
    return [link for link in http_links if name not in link['href']]


**Note:** A **regular expression** is a sequence of characters that specifies a search pattern in text. Such patterns are typically used by string-searching algorithms for **"find" or "find and replace"** operations on strings, or for input validation. In the function above, the pattern `^http` matches only `href` values that start with `http`, i.e. absolute links.

In [17]:
# Scan the Alibaba home page and keep the links whose href does not contain 'alibaba'.
url = "https://alibaba.com"
backlinks = detect_backlink(url, 'alibaba')

In [19]:
# parse the backlinks
def parse_backlinks(backlinks=None):
    """Split link tags into parallel lists of titles and URLs.

    Parameters
    ----------
    backlinks : iterable, optional
        Objects that support ``link['href']`` and ``link.text``. When
        omitted, the notebook-level ``backlinks`` variable is looked up at
        call time, so the most recently assigned list is used.

    Returns
    -------
    dict
        ``{'title': [...], 'link': [...]}``. Both lists hold
        whitespace-stripped strings and have one entry per input link.

    Raises
    ------
    NameError
        If no argument is given and no global ``backlinks`` exists.
    """
    if backlinks is None:
        # Resolve the fallback when called, not when defined: a default of
        # ``backlinks=backlinks`` freezes whatever list existed when the
        # ``def`` ran and silently ignores later reassignments.
        try:
            backlinks = globals()['backlinks']
        except KeyError:
            raise NameError("name 'backlinks' is not defined") from None

    # materialise once so the input can safely be walked twice
    links = list(backlinks)

    return {
        'title': [link.text.strip() for link in links],
        'link': [link['href'].strip() for link in links],
    }

In [20]:
# Pass the list explicitly so the result does not depend on which value a
# default argument captured when parse_backlinks was defined.
data = parse_backlinks(backlinks)

In [21]:
data

{'title': ['Get Paid for Your Feedback',
  'Taobao Marketplace',
  'Tmall.com',
  'Juhuasuan',
  'AliExpress',
  '1688.com',
  'Alimama',
  'Fliggy',
  'Tmall Taobao World',
  'AliOS',
  'AliTelecom',
  'Autonavi',
  'UCWeb',
  'Umeng',
  'Xiami',
  'DingTalk',
  'Alipay',
  'Lazada',
  '',
  '浙公网安备 33010002000092号',
  '浙B2-20120091-4'],
 'link': ['https://www.surveymonkey.com/s/Alibaba_test_participants?tracelog=footer_feedback',
  'http://www.taobao.com',
  'http://www.tmall.com/',
  'http://ju.taobao.com/',
  'http://www.aliexpress.com/',
  'http://www.1688.com',
  'http://www.alimama.com/index.htm',
  'https://www.fliggy.com/',
  'https://g-sellercenter.taobao.com/mail',
  'http://www.alios.cn/',
  'http://www.aliqin.cn/',
  'http://www.autonavi.com/',
  'http://www.ucweb.com/',
  'http://www.umeng.com/',
  'http://www.xiami.com/',
  'http://www.dingtalk.com/en',
  'https://global.alipay.com/',
  'http://taobao.lazada.sg/',
  'http://idinfo.zjamr.zj.gov.cn//bscx.do?method=lzxx&id=3

In [22]:
# Turn the parsed dictionary into a table with 'title' and 'link' columns and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,title,link
0,Get Paid for Your Feedback,https://www.surveymonkey.com/s/Alibaba_test_pa...
1,Taobao Marketplace,http://www.taobao.com
2,Tmall.com,http://www.tmall.com/
3,Juhuasuan,http://ju.taobao.com/
4,AliExpress,http://www.aliexpress.com/
5,1688.com,http://www.1688.com
6,Alimama,http://www.alimama.com/index.htm
7,Fliggy,https://www.fliggy.com/
8,Tmall Taobao World,https://g-sellercenter.taobao.com/mail
9,AliOS,http://www.alios.cn/


In [9]:
# url to scrape (an eBay search-results page)
# NOTE(review): `url` is reassigned in a later cell before any visible code
# reads this value - confirm whether this cell is still needed.
url = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=laptop&_sacat=175672&_ipg=60"

In [6]:
# get url data
def get_url_data(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Defaults to 10 so that an unresponsive
        server cannot block the notebook indefinitely.

    Returns
    -------
    str
        The ``text`` attribute of the response.
    """
    # send a request and get the response; never wait forever on a stalled server
    response = requests.get(url, timeout=timeout)

    return response.text
        

In [10]:
# Fetch the Google home page HTML to try out get_url_data.
# NOTE: this rebinds `data`, which earlier cells used for the parsed-links dictionary.
if __name__=="__main__":
    # assign url to variable
    url = "https://www.google.com"
    data = get_url_data(url)
    # uncomment to inspect the raw HTML:
    #print(data)

In [12]:
# check backlink
def check_backlink(url, backlink, timeout=10):
    """Look for the text ``backlink`` in the page served at ``url``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    backlink : str
        Text to search for in the response body (for example a full URL).
        The search is a plain, case-sensitive substring match.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Defaults to 10 so that an unresponsive
        server cannot block the notebook indefinitely.

    Returns
    -------
    int
        Index of the first occurrence of ``backlink`` in the response text,
        or ``-1`` when it does not occur.
    """
    response = requests.get(url, timeout=timeout)
    page_text = response.text
    # str.find signals "absent" with -1; callers compare against that value
    return page_text.find(backlink)


In [14]:
# Report whether the Google home page contains the Amazon URL.
if __name__=="__main__":
    url, backlink = "https://google.com", "https://amazon.com"
    res = check_backlink(url, backlink)
    # check_backlink yields -1 when the text is absent from the page
    print("Backlink not found" if res == -1 else "Backlink found")

Backlink not found


In [54]:
# `find_backlink` is not defined in this notebook; the function defined above
# is detect_backlink(url, name), where `name` filters out the site's own links.
backlinks = detect_backlink("https://ebay.com", "ebay")

In [62]:
backlinks

[<a _exsp="m571.l2943" class="thrd gf-i" href="https://twitter.com/eBay"><i class="gspr ictwg"></i>Twitter</a>,
 <a aria-label="eBay Korea" class="gf-if-a" href="http://global.gmarket.co.kr/Home/Main"><b class="flkr gf-if gspr"></b><p>Korea</p></a>,
 <a aria-label="eBay Turkey" class="gf-if-a" href="https://www.gittigidiyor.com"><b class="fltr gf-if gspr"></b><p>Turkey</p></a>]

In [64]:
# Show only the visible text of each detected link.
for link in backlinks:
    print(link.text)

Twitter
Korea
Turkey


In [69]:
# Pass the current list explicitly: a default argument bound when
# parse_backlinks was defined could still point at an earlier result.
data = parse_backlinks(backlinks)

In [70]:
data

{'title': ['Twitter', 'Korea', 'Turkey'],
 'link': ['https://twitter.com/eBay',
  'http://global.gmarket.co.kr/Home/Main',
  'https://www.gittigidiyor.com']}

In [71]:
# Rebuild the table from the most recently parsed dictionary (rebinds `df`).
df = pd.DataFrame(data)

In [72]:
df

Unnamed: 0,title,link
0,Twitter,https://twitter.com/eBay
1,Korea,http://global.gmarket.co.kr/Home/Main
2,Turkey,https://www.gittigidiyor.com
