<a href="https://colab.research.google.com/github/gabordun/web_scraping/blob/master/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Web scraping project to create a COVID-19 heat map**

The aim is to count the uniqe mentions of Coronavirus on the world's most popular websites.

The list of the most popular websites comes from:
http://www.ebizmba.com/articles/news-websites

The script **scrapes this site for the URLs** of the mostly visited news websites worldwide.

After getting the URLs, **the script creates a parsed object for every individual websites' startsite content**.

Then, according to a pre-defined '**keywords**' list, the script counts the number of mentioning of all the keywords and put the numbers in an **output table**.

The output table contains the name & and the exact URL of the corresponding website, the number of the keywords total appearances and a proxy for the size of the site (the last in order to get some comparable measure).

Finally, in the last section the output table is saved on Google Drive. After that, the saved file is intended to import into Tableau to create a visualization of the results.

In [40]:
#get the URLs of the news sites

#import packages

import numpy as np
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
import time

#import requests

from urllib import request
import urllib
import requests

#finding sources


url = 'http://www.ebizmba.com/articles/news-websites'

def getAllUrl(url):
    try:
        page = request.urlopen(url).read()
    except:
        return []
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin(url, anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin(url, anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        length = len(urlList)

        return urlList
    except request.HTTPError as e:
        print(e)

print(getAllUrl(url))

result_url_list= getAllUrl(url)
not_needed = ['ebizmba','alexa','quantcast','siteanalytics']

#filter unnecessary items

for item in getAllUrl(url):
 if 'ebizmba' in (item):
  result_url_list.remove(item)

for item in getAllUrl(url):
 if 'quantcast' in (item):
  result_url_list.remove(item)

for item in getAllUrl(url):
 if 'alexa' in (item):
  result_url_list.remove(item)

for item in getAllUrl(url):
 if 'siteanalytics' in (item):
  result_url_list.remove(item)
 
print(result_url_list)
len(result_url_list)



['http://www.ebizmba.com/', 'http://www.ebizmba.com/directory#admin', 'http://www.ebizmba.com/directory#design', 'http://www.ebizmba.com/directory#marketing', 'http://www.ebizmba.com/directory#ecommerce', 'http://www.ebizmba.com/directory', 'http://www.ebizmba.com/directory#media', 'http://www.ebizmba.com/directory#research', "javascript:bookmarksite('eBizMBA | The eBusiness Knowledgebase', 'http://www.ebizmba.com')", 'http://www.ebizmba.com/terms', 'http://www.ebizmba.com/privacy', 'http://news.yahoo.com', 'http://siteanalytics.compete.com/news.yahoo.com', 'http://quantcast.com/news.yahoo.com', 'http://www.alexa.com/siteinfo/news.yahoo.com/', 'http://news.google.com', 'http://siteanalytics.compete.com/news.google.com', 'http://quantcast.com/news.google.com', 'http://www.alexa.com/siteinfo/news.google.com/', 'http://www.huffingtonpost.com', 'http://siteanalytics.compete.com/huffingtonpost.com', 'http://quantcast.com/huffingtonpost.com', 'http://www.alexa.com/siteinfo/huffingtonpost.com

15

In [0]:
# run keyword(s) counter, create output table

table = {'name': [], 'number of mentioning':[] , 'website size':[],'url':[] }

for url in getAllUrl(url):
  try: 
   basic=requests.get(url).text
   soup=BeautifulSoup(basic, 'html.parser')
  except:
   pass

  
  keywords={'corona','Corona','coronavirus','corona virus','Corona Virus', 'Coronavirus', 'COVID'}

  for x in keywords:
   x = soup(text=re.compile('x'))
   counter = 0
   for item in x:
    counter += 1
   redditAll = soup.find_all("a")
  
  print(url)
  print(len(redditAll))
  print(counter)
  print('--')

  table['name'].append(urllib.parse.urlsplit(url)[1].replace('www.',''))
  table['number of mentioning'].append(counter)
  table['website size'].append(len(redditAll))
  table['url'].append(url)

output_df = pd.DataFrame(data = table)
print(output_df)
  

http://www.ebizmba.com/
70
18
--
http://www.ebizmba.com/directory#admin
143
42
--
http://www.ebizmba.com/directory#design
143
42
--
http://www.ebizmba.com/directory#marketing
143
42
--
http://www.ebizmba.com/directory#ecommerce
143
42
--
http://www.ebizmba.com/directory
143
42
--
http://www.ebizmba.com/directory#media
143
42
--
http://www.ebizmba.com/directory#research
143
42
--
javascript:bookmarksite('eBizMBA | The eBusiness Knowledgebase', 'http://www.ebizmba.com')
143
42
--
http://www.ebizmba.com/terms
61
7
--
http://www.ebizmba.com/privacy
61
10
--
http://news.yahoo.com
124
126
--
http://siteanalytics.compete.com/news.yahoo.com
0
0
--
http://quantcast.com/news.yahoo.com
0
1
--
http://www.alexa.com/siteinfo/news.yahoo.com/
173
71
--
http://news.google.com
718
74
--
http://siteanalytics.compete.com/news.google.com
0
0
--
http://quantcast.com/news.google.com
0
1
--
http://www.alexa.com/siteinfo/news.google.com/
172
71
--
http://www.huffingtonpost.com
0
2
--
http://siteanalytics.compe

In [0]:
#mounting google drive - need to be run only for the first time!

from google.colab import drive
drive.mount('/content/drive')

In [0]:
#save the output table

with open('heatmap.csv', 'w', newline='') as csvfile:
    heatmap = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for name in output_df:
        heatmap.writerow(output_df[name])


In [0]:
#investigate the input website

url = 'http://www.ebizmba.com/articles/news-websites'
site = request.urlopen(url)
soupfile = BeautifulSoup(site)

soupfile.prettify()

'<!DOCTYPE html>\n<html class="no-js" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <script async="" src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js">\n  </script>\n  <script>\n   (adsbygoogle = window.adsbygoogle || []).push({\n          google_ad_client: "ca-pub-1997683441273857",\n          enable_page_level_ads: true\n     });\n  </script>\n  <title>\n   Top 15 Most Popular News Websites | February 2020\n  </title>\n  <meta content="Here are the top 15 Most Popular News Sites ranked by a combination of continually updated traffic statistics." name="description"/>\n  <link href="http://ebizmba.com/css/main.css" media="all" rel="stylesheet"/>\n  <link href="http://ebizmba.com/favicon.ico" rel="SHORTCUT ICON"/>\n  <link href="http://ebizmba.com/images/apple-touch.jpg" rel="apple-touch-icon"/>\n  <script src="http://www.ebizmba.com/j