In [195]:
import pandas as pd

In [196]:
# Location of the exported referrer CSV (hosted on GitHub).
csv = "https://raw.githubusercontent.com/erickbytes/lofipy/main/pythonmarketer.wordpress.com_referrers_day_08_18_2023_08_18_2023.csv"

# Supplying `names` makes pandas treat the first line of the file as data
# rather than a header. The third column is read under a placeholder name
# and discarded immediately, leaving only referrer and visitors.
referrers = pd.read_csv(
    csv,
    names=["referrer", "visitors", "temp"],
).drop(columns="temp")

# Preview the first five rows.
referrers.head(5)

Unnamed: 0,referrer,visitors
0,Search Engines,36537
1,Search Engines > Google Search,13687
2,Search Engines > Google Search > google.com,12959
3,Search Engines > Google Search > google.co.in,73
4,Search Engines > Google Search > google.de,70


In [197]:
# Sanity-check the load: print row count, per-column non-null counts, dtypes and memory use.
referrers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   referrer  507 non-null    object
 1   visitors  507 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.0+ KB


In [198]:
# Keep the 50 rows with the highest visitor counts and wrap them in a Styler
# with the index hidden, so only the referrer and visitors columns render.
# Note: top_50 is a Styler object, not a DataFrame.
top_50 = (
    referrers
    .nlargest(n=50, columns="visitors")
    .style
    .hide(axis="index")
)
top_50

referrer,visitors
Search Engines,36537
Search Engines > Google Search,13687
Search Engines > Bing,13154
Search Engines > Google Search > google.com,12959
Search Engines > Bing > bing.com,12129
Search Engines > duckduckgo.com,7955
Search Engines > duckduckgo.com > /,7945
Search Engines > Yahoo Search,1250
ecosia.org,906
Search Engines > Bing > bing.com,541


In [199]:
# Keywords that mark a referrer as search traffic. The generic word "search"
# is included on purpose: it catches the broad search-engine category rows
# as well as smaller engines that are not listed by name.
engine_keywords = [
    "google", "duckduckgo", "baidu", "yahoo", "bing", "qwant",
    "yandex", "ecosia", "oceanhero", "dogpile", "swisscows", "lilo",
    "startpage", "metager", "neeva", "presearch", "gutefrage", "search",
]
print(f"These search engines have recorded at least 1 person sent in traffic to the blog.\n {engine_keywords}")

# Regex alternation of all keywords: a referrer matches when it contains any
# one of them. The match is a case-insensitive substring test, so a keyword
# appearing anywhere in the referrer string counts.
search_engines = "|".join(engine_keywords)
search_referrals = (
    referrers
    .loc[referrers["referrer"].str.contains(search_engines, case=False, regex=True)]
    .reset_index(drop=True)
)
search_referrals

These search engines have recorded at least 1 person sent in traffic to the blog.
 ['google', 'duckduckgo', 'baidu', 'yahoo', 'bing', 'qwant', 'yandex', 'ecosia', 'oceanhero', 'dogpile', 'swisscows', 'lilo', 'startpage', 'metager', 'neeva', 'presearch', 'gutefrage', 'search']


Unnamed: 0,referrer,visitors
0,Search Engines,36537
1,Search Engines > Google Search,13687
2,Search Engines > Google Search > google.com,12959
3,Search Engines > Google Search > google.co.in,73
4,Search Engines > Google Search > google.de,70
...,...,...
383,metager.org,1
384,googleweblight.com,1
385,keep.google.com,1
386,google-admin.corp.google.com,1


In [200]:
# Fraction of referrer rows that matched a search keyword. This is a ratio of
# row counts (distinct referrer entries), not of visitor totals.
search_pct = len(search_referrals) / len(referrers)
print(
    f"{search_pct:.0%} of blog referral traffic sources were from search engines.",
    "This data was exported from Wordpress.",
    "The blog this data is based on is viewable at lofipython.com.",
    sep="\n",
)

77% of blog referral traffic sources were from search engines.
This data was exported from Wordpress.
The blog this data is based on is viewable at lofipython.com.


In [201]:
# Complement of the search filter: keep only the referrers whose text does
# not contain any search keyword (case-insensitive), then renumber the rows.
nonsearch_referrals = (
    referrers
    .loc[lambda frame: ~frame["referrer"].str.contains(search_engines, case=False, regex=True)]
    .reset_index(drop=True)
)
print(f"Non-search referrals do not contain: {search_engines}")

# Display the 50 non-search referrers with the most visitors.
nonsearch_referrals.nlargest(n=50, columns="visitors").reset_index(drop=True)

Non-search referrals do not contain: google|duckduckgo|baidu|yahoo|bing|qwant|yandex|ecosia|oceanhero|dogpile|swisscows|lilo|startpage|metager|neeva|presearch|gutefrage|search


Unnamed: 0,referrer,visitors
0,Instagram,506
1,Instagram > l.instagram.com,495
2,github.com,239
3,github.com > /tableau/server-client-python/iss...,225
4,WordPress.com Reader,177
5,Twitter,143
6,WordPress Android App,132
7,Facebook,76
8,joecodeswell.wordpress.com,31
9,joecodeswell.wordpress.com > /2011/05/25/web2p...,29


In [204]:
print("Grab 25 non-search referrers at random. Type ctrl + enter in this cell to randomize the view of sources.")

# No seed is set, so every run of this cell draws a different 25 rows.
# The sample is ordered by visitors (highest first) and renumbered from 0.
(
    nonsearch_referrals
    .sample(n=25)
    .sort_values("visitors", ascending=False, ignore_index=True)
)

Grab 25 non-search referrers at random. Type ctrl + enter in this cell to randomize the view of sources.


Unnamed: 0,referrer,visitors
0,github.com,239
1,WordPress Android App,132
2,you.com,21
3,iframe-toloka.com,17
4,github.com > /,9
5,127.0.0.1:8000,4
6,myiotdashboard.nl,3
7,login.microsoftonline.com,2
8,adfs.contiwan.com,2
9,web.skype.com,2
