<a href="https://colab.research.google.com/github/gabordun/web_scraping/blob/master/web_scraping2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Web scraping project to create a COVID-19 heat map**

The aim is to count the uniqe mentions of Coronavirus on the world's most popular websites.

The list of the most popular websites comes from:
http://www.ebizmba.com/articles/news-websites

The script **scrapes this site for the URLs** of the mostly visited news websites worldwide.

After getting the URLs, **the script creates a parsed object for every individual websites' startsite content**.

Then, according to a pre-defined '**keywords**' list, the script counts the number of mentioning of all the keywords and put the numbers in an **output table**.

The output table contains the name & and the exact URL of the corresponding website, the number of the keywords total appearances and a proxy for the size of the site (the last in order to get some comparable measure).

Finally, in the last section the output table is saved on Google Drive. After that, the saved file is intended to import into Tableau to create a visualization of the results.

In [15]:
#get the URLs of the news sites

#import packages

import numpy as np
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
import time
import datetime

#import requests

from urllib import request
import urllib
import requests

#finding sources


url = 'http://www.ebizmba.com/articles/news-websites'

def getAllUrl(url):
    try:
        page = request.urlopen(url).read()
    except:
        return []
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin(url, anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin(url, anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        length = len(urlList)

        return urlList
    except request.HTTPError as e:
        print(e)

print(getAllUrl(url))

result_url_list= getAllUrl(url)
not_needed = ['ebizmba','alexa','quantcast','siteanalytics']

#filter unnecessary items

for item in getAllUrl(url):
 for i in not_needed:
  if i in (item):
   result_url_list.remove(item)

print(result_url_list)
print(len(getAllUrl(url)))
len(result_url_list)



['http://www.ebizmba.com/', 'http://www.ebizmba.com/directory#admin', 'http://www.ebizmba.com/directory#design', 'http://www.ebizmba.com/directory#marketing', 'http://www.ebizmba.com/directory#ecommerce', 'http://www.ebizmba.com/directory', 'http://www.ebizmba.com/directory#media', 'http://www.ebizmba.com/directory#research', "javascript:bookmarksite('eBizMBA | The eBusiness Knowledgebase', 'http://www.ebizmba.com')", 'http://www.ebizmba.com/terms', 'http://www.ebizmba.com/privacy', 'http://news.yahoo.com', 'http://siteanalytics.compete.com/news.yahoo.com', 'http://quantcast.com/news.yahoo.com', 'http://www.alexa.com/siteinfo/news.yahoo.com/', 'http://news.google.com', 'http://siteanalytics.compete.com/news.google.com', 'http://quantcast.com/news.google.com', 'http://www.alexa.com/siteinfo/news.google.com/', 'http://www.huffingtonpost.com', 'http://siteanalytics.compete.com/huffingtonpost.com', 'http://quantcast.com/huffingtonpost.com', 'http://www.alexa.com/siteinfo/huffingtonpost.com

15

In [0]:
# get the texts from each URLs

corpus = []

for url in result_url_list:
  try: 
   basic=requests.get(url).text
   soup=BeautifulSoup(basic, 'html.parser')
  except:
   pass
  
  stripped_text=soup.get_text()
  corpus.append(stripped_text)  

   

In [25]:
#clear the text, remove special charachters

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

for item in corpus:
  item=remove_special_characters(item)

print(len(corpus))
corpus

15


 '\ntr.td\ntr.td\ntr.td\ntr.td403 - Forbiddentr.td\ntr.td\ntr.td\ntr.td403 - Forbiddentr.td\ntr.td\ntr.td\n',
 '\ntr.td\ntr.td\ntry {\n  Object.defineProperty(window, \'adverts\', {configurable: false, value:{}});\n}\ncatch(error) {\n  console.error(error);\n}\ntr.td\ntr.td\ntr.td\ntr.td\ntr.tdHome | Daily Mail Onlinetr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\n  var disableAds = false;\n  PageCriteria = window.PageCriteria || {};\n  PageCriteria.clientIP = \'35.229.134.192\';\n  PageCriteria.nonAdservable = \'\' === \'true\';\n  PageCriteria.device = \'other\';\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\ntr.td\

In [31]:
# run keyword(s) counter, create output table

table = {'name': [], 'number of mentioning':[] , 'url':[] }

keywords=['corona','Corona','coronavirus','corona virus','Corona Virus', 'Coronavirus', 'COVID', 'COVID-19']

for item in corpus:
  out=[]
  for i in keywords:
    x = soup.find_all(string=re.compile('i'))
    out.append(x)

  counter = 0
  for item in out:
   counter += 1
  
  #print(result_url_list[item])
  print(counter)
  print('--')

  table['name'].append(urllib.parse.urlsplit(url)[1].replace('www.','').replace('.com','').replace('.co.uk',''))
  table['number of mentioning'].append(counter)
  #table['website size'].append(len(redditAll))
  table['url'].append(url)

output_df = pd.DataFrame(data = table)
print(output_df)
  

8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
8
--
       name  number of mentioning                     url
0   latimes                     8  http://www.latimes.com
1   latimes                     8  http://www.latimes.com
2   latimes                     8  http://www.latimes.com
3   latimes                     8  http://www.latimes.com
4   latimes                     8  http://www.latimes.com
5   latimes                     8  http://www.latimes.com
6   latimes                     8  http://www.latimes.com
7   latimes                     8  http://www.latimes.com
8   latimes                     8  http://www.latimes.com
9   latimes                     8  http://www.latimes.com
10  latimes                     8  http://www.latimes.com
11  latimes                     8  http://www.latimes.com
12  latimes                     8  http://www.latimes.com
13  latimes                     8  http://www.latimes.com
14  latimes                     8  http://www.latimes.c

In [0]:
#mounting google drive - need to be run only for the first time!

from google.colab import drive
drive.mount('/content/drive')

In [0]:
#save the output table

timestr = time.strftime("%Y%m%d")

with open('heatmap' + timestr + '.csv', 'w', newline='') as csvfile:
    heatmap = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for name in output_df:
        heatmap.writerow(output_df[name])

print(timestr)
print(filename)


20200428
heatmap20200428


In [0]:
#investigate the input website

url = 'http://www.ebizmba.com/articles/news-websites'
site = request.urlopen(url)
soupfile = BeautifulSoup(site)

soupfile.prettify()