# How to Detect Backlinks using BeautifulSoup in Python

In [3]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd

### What are backlinks?

> **Backlinks** (also known as “inbound links”, “incoming links” or “one way links”) are links from one website to a page on another website. Google and other major search engines consider backlinks “votes” for a specific page. Pages with a high number of backlinks tend to have high organic search engine rankings.

In [4]:
# find the backlinks
def detect_backlink(url, name):
    
    # send an http request
    response = requests.get(url)
    # get the html content using bs
    html_content = bs(response.content, 'html.parser')
    
    # find links
    http_links = html_content.find_all('a', href=re.compile(r"^http"))
    
    back_links = [link for link in http_links if link['href'].find(name)==-1]
    
    
    return back_links


**Note** Regular expression: **A regular expression** is a sequence of characters that specifies a search pattern in text. Usually such patterns are used by string-searching algorithms for **"find" or "find and replace"** operations on strings, or for input validation.

In [5]:
# call the function and get list of detected backlinks
url = "https://alibaba.com"
backlinks = detect_backlink(url, 'alibaba')

In [6]:
# parse the backlinks
def parse_backlinks(backlinks=backlinks):
    
    # define an empty dctionary
    back_links = {'title':[], 'link':[]}
    
    for link in backlinks:
        url = link['href']
        title = link.text
        
        back_links['title'].append(title.strip())
        back_links['link'].append(url.strip())
        
    return back_links

In [7]:
data = parse_backlinks()

In [8]:
df = pd.DataFrame(data)


In [9]:
df

Unnamed: 0,title,link
0,Get Paid for Your Feedback,https://www.surveymonkey.com/s/Alibaba_test_pa...
1,Taobao Marketplace,http://www.taobao.com
2,Tmall.com,http://www.tmall.com/
3,Juhuasuan,http://ju.taobao.com/
4,AliExpress,http://www.aliexpress.com/
5,1688.com,http://www.1688.com
6,Alimama,http://www.alimama.com/index.htm
7,Fliggy,https://www.fliggy.com/
8,Tmall Taobao World,https://g-sellercenter.taobao.com/mail
9,AliOS,http://www.alios.cn/


In [1]:
import json
import pandas as pd

In [2]:
path = "/home/noh/Downloads/Telegram Desktop/Economic_Twitter_Data.json"

In [4]:
df = pd.read_json("Economic_Twitter_Data.json", lines=True)

In [5]:
df.head()

Unnamed: 0,created_at,id,id_str,text,truncated,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,...,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status,withheld_in_countries
0,2022-04-22 22:20:18+00:00,1517629396575784961,1517629396575784960,RT @nikitheblogger: Irre: Annalena Baerbock sa...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/android"" ...",,,,...,0,False,False,de,,,,,,
1,2022-04-22 22:19:16+00:00,1517629134012399616,1517629134012399616,RT @sagt_mit: Merkel schaffte es in 1 Jahr 1 M...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/android"" ...",,,,...,0,False,False,de,,,,,,
2,2022-04-22 22:17:28+00:00,1517628682659106822,1517628682659106816,RT @Kryptonoun: @WRi007 Pharma in Lebensmittel...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/android"" ...",,,,...,0,False,False,de,,,,,,
3,2022-04-22 22:17:20+00:00,1517628647892561924,1517628647892561920,RT @WRi007: Die #Deutschen sind ein braves Vol...,False,"{'hashtags': [{'text': 'Deutschen', 'indices':...","<a href=""http://twitter.com/download/android"" ...",,,,...,0,False,False,de,,,,,,
4,2022-04-22 22:13:15+00:00,1517627621135237127,1517627621135237120,RT @RolandTichy: Baerbock verkündet mal so neb...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/android"" ...",,,,...,0,False,False,de,,,,,,
