<a href="https://colab.research.google.com/github/gabordun/web_scraping/blob/master/web_scraping_ok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Web scraping project to create a COVID-19 heat map**

The aim is to count the unique mentions of Coronavirus on the world's most popular websites.

The list of the most popular websites comes from:
http://www.ebizmba.com/articles/news-websites

The script **scrapes this site for the URLs** of the most visited news websites worldwide.

After getting the URLs, **the script creates a parsed object from the home page content of each individual website**.

Then, using a pre-defined '**keywords**' list, the script counts how many times the keywords are mentioned and puts the numbers in an **output table**.

The output table contains the name and the exact URL of the corresponding website, the total number of keyword appearances, and a proxy for the size of the site (the latter to provide a comparable measure).

Finally, in the last section the output table is saved on Google Drive. After that, the saved file is intended to be imported into Tableau to create a visualization of the results.

In [0]:
#get the URLs of the news sites

#import packages

import numpy as np
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
import time
import datetime

#import requests

from urllib import request
import urllib
import requests

#finding sources


# Address of the page listing the most popular news websites; it is passed to
# getAllUrl() below to collect the links found on that page.
url = 'http://www.ebizmba.com/articles/news-websites'

def getAllUrl(url):
    """Return the de-duplicated link targets found on the page at ``url``.

    Every ``<a href=...>`` element of the page is visited in document order.
    Targets that already start with ``http://`` or ``https://`` are kept
    verbatim; every other target is resolved against ``url``.

    Parameters
    ----------
    url : str
        Address of the page to scan.

    Returns
    -------
    list of str
        Unique link targets in order of first appearance. An empty list is
        returned (and a message printed) when the page cannot be downloaded,
        so callers can always iterate over the result.
    """
    # Imported locally so the function does not rely on a module-level name.
    # The previous body referenced the Python 2 ``urlparse`` module, which was
    # never imported and raised NameError on the first relative link.
    from urllib.parse import urljoin

    try:
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(url) as response:
            page = response.read()
    except Exception as err:
        # Best effort, as before, but ``Exception`` (unlike a bare ``except``)
        # still lets KeyboardInterrupt stop the notebook.
        print('Could not fetch {}: {}'.format(url, err))
        return []

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')

    urlList = []
    seen = set()  # constant-time duplicate check; urlList keeps the order
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith(('http://', 'https://')):
            href = urljoin(url, href)
        if href not in seen:
            seen.add(href)
            urlList.append(href)

    return urlList

# Download the listing page once and reuse the result: every additional
# getAllUrl(url) call is another HTTP request and may see a different page.
all_urls = getAllUrl(url)
print(all_urls)

# Substrings that mark a link to be dropped from the result.
not_needed = ['ebizmba','alexa','quantcast','siteanalytics']

#filter unnecessary items
# A new list is built instead of calling .remove() inside nested loops: a
# link containing two of the substrings used to be removed twice, and the
# second removal raised ValueError.
result_url_list = [
    item for item in all_urls
    if not any(marker in item for marker in not_needed)
]

print(result_url_list)
print(len(all_urls))
len(result_url_list)



In [0]:
# get parsed objects from each URLs - optional
# Builds ``corpus``: one parsed BeautifulSoup document per site that could be
# downloaded.

corpus = []

# The loop variable is deliberately not called ``url`` so the listing-page
# address stored in ``url`` is not overwritten.
for site_url in result_url_list:
    try:
        # Without a timeout a single unresponsive site blocks the cell forever.
        response = requests.get(site_url, timeout=30)
    except requests.RequestException as err:
        # Skip the site. The old ``except: pass`` fell through and appended
        # the previous site's document again (or hit NameError on the first).
        print('Skipping {}: {}'.format(site_url, err))
        continue

    #stripped_text=soup.get_text()
    corpus.append(BeautifulSoup(response.text, 'html.parser'))

len(corpus)

   

In [0]:
# run keyword(s) counter, create output table
# For every site: count the text nodes that mention a keyword, count the <a>
# elements as a size proxy, and collect both in ``output_df``.

table = {'name': [], 'number of mentioning':[] , 'site size': [],'url':[] }

keywords=['corona','Corona','coronavirus','corona virus','Corona Virus', 'Coronavirus', 'COVID', 'COVID-19']

# One alternation over all keywords. The keywords overlap ('corona' is inside
# 'coronavirus', 'COVID' inside 'COVID-19'), so searching for them one by one
# counted the same text node several times; with a single pattern each
# matching text node is counted exactly once.
keyword_pattern = re.compile('|'.join(re.escape(word) for word in keywords))

# The loop variable is deliberately not called ``url`` so the listing-page
# address stored in ``url`` is not overwritten.
for site_url in result_url_list:
    try:
        # Without a timeout a single unresponsive site blocks the cell forever.
        basic = requests.get(site_url, timeout=30).text
    except requests.RequestException as err:
        # Skip the site. The old ``except: pass`` fell through and recorded
        # the previous site's numbers under this site's URL.
        print('Skipping {}: {}'.format(site_url, err))
        continue

    soup = BeautifulSoup(basic, 'html.parser')

    # ``string=`` is the current name of the deprecated ``text=`` argument.
    counter = len(soup.find_all(string=keyword_pattern))

    # Number of links on the page, used as a rough measure of its size.
    size = len(soup.find_all('a'))

    print(site_url)
    print(counter)
    print(size)
    print('--')

    # Short display name: host name without 'www.' and the '.com'/'.co.uk' suffix.
    host = urllib.parse.urlsplit(site_url).netloc
    table['name'].append(host.replace('www.','').replace('.com','').replace('.co.uk',''))
    table['number of mentioning'].append(counter)
    table['site size'].append(size)
    table['url'].append(site_url)

output_df = pd.DataFrame(data = table)
output_df

  

In [0]:
#mounting google drive - need to be run only for the first time!
# Mounts Google Drive at /content/drive.
# NOTE(review): google.colab is presumably importable only inside a Colab
# runtime, so this cell is expected to fail elsewhere - confirm before
# running the notebook locally.

from google.colab import drive
drive.mount('/content/drive')

In [0]:
#save the output table
# Writes ``output_df`` to 'heatmap<YYYYMMDD>.csv' as a semicolon-separated file
# with a header line followed by one line per website.
# NOTE(review): the file name is relative, so the file is created in the
# current working directory, not under the mounted /content/drive - confirm
# where the file is expected to end up.

timestr = time.strftime("%Y%m%d")

# The encoding is pinned so the file does not depend on the platform default.
with open('heatmap' + timestr + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
    heatmap = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Iterating a DataFrame yields its column names, so the previous loop
    # wrote one line per *column* and no header. Write the header first and
    # then the actual rows.
    heatmap.writerow(output_df.columns)
    heatmap.writerows(output_df.itertuples(index=False, name=None))

print(timestr)



In [0]:
#investigate the input website
# Downloads the listing page again and shows its indented HTML for manual
# inspection.

url = 'http://www.ebizmba.com/articles/news-websites'

# The context manager closes the HTTP response after parsing; the parser is
# named explicitly so the output does not depend on which optional parsers
# are installed.
with request.urlopen(url) as site:
    soupfile = BeautifulSoup(site, 'html.parser')

soupfile.prettify()