Web Crawling & Scraping (Google Search)

This script performs a Google search and returns the URLs that match the search term provided.

After collecting the URLs, it visits each URL and extracts the text content (both HTML and PDF) from the pages visited.

Author       : Eugene Choy (eugene.choy.wj@hotmail.com)
Last Updated : 8 Feb 2020

If you haven't already, you will need to install the following Python packages (the `google` package provides the `googlesearch` module):
1. pip install google
2. pip install textract

In [2]:
from googlesearch import search
import time

In [3]:
# Search term sent to Google.
query = "wuhan"

# Every URL yielded by the search is stored here, in the order received.
my_results_list = []

for result_url in search(
    query,
    tld='com',    # top-level domain to search on
    lang='en',    # result language
    num=10,       # results per page
    start=0,      # first result to retrieve
    stop=20,      # last result to retrieve
    pause=2.0,    # lapse between HTTP requests
):
    my_results_list.append(result_url)
    print(result_url)

https://en.wikipedia.org/wiki/Wuhan
https://en.wikipedia.org/wiki/Wuhan#History
https://en.wikipedia.org/wiki/Wuhan#Geography
https://en.wikipedia.org/wiki/Wuhan#Transportation
https://en.wikipedia.org/wiki/Wuhan#Culture
https://www.globaltimes.cn/page/202202/1252865.shtml
https://www.globaltimes.cn/page/202202/1252719.shtml
https://www.scmp.com/news/china/science/article/3167675/coronavirus-beijing-expert-who-beat-wuhan-outbreak-now-hong-kong
https://www.britannica.com/place/Wuhan
https://www.eyeonasia.gov.sg/china/know/china-municipalities-provinces/wuhan-profile/
https://www.channelnewsasia.com/asia/china-covid-19-doctor-li-wenliang-whistleblower-wuhan-2407381
https://en.unesco.org/creative-cities/wuhan
https://www.bloomberg.com/news/features/2021-06-27/did-covid-come-from-a-lab-scientist-at-wuhan-institute-speaks-out
https://www.reuters.com/world/china/china-reports-90-new-coronavirus-cases-aug-2-vs-98-day-earlier-2021-08-03/
https://www.france24.com/en/tv-shows/focus/20220125-in-w

In [4]:
# Save the collected URLs, one per line, to a text file under ./out/

import os


# Create the output directory if it does not exist yet
path = "./out/"
os.makedirs(path, exist_ok=True)

# File name: the query with spaces replaced by underscores, followed by the
# current Unix time in whole seconds
time_string = str(int(time.time()))
out_filename = query.replace(" ", "_") + time_string + ".txt"

# The with-block closes the file even if one of the writes raises
with open(path + out_filename, "w") as f:
    for result in my_results_list:
        f.write(result + "\n")

print(out_filename + " written to out folder")

wuhan1645589920.txt written to out folder


In [5]:
import os
import requests
import time
from bs4 import BeautifulSoup
import textract

# Methods to extract content from webpage
def get_urls_from_path(path, url_list):
    """Append every line of every file in ``path`` to ``url_list``.

    Each entry returned by ``os.listdir(path)`` is opened as a text file and
    each of its lines is appended with newline characters removed. Files are
    visited in whatever order ``os.listdir`` reports them.

    Args:
        path: Directory containing the text files to read.
        url_list: List to extend; it is modified in place.

    Returns:
        The same ``url_list`` object that was passed in.
    """
    for filename in os.listdir(path):
        # The with-block closes each file as soon as it has been read
        with open(os.path.join(path, filename), 'r') as f:
            for url in f:
                url_list.append(url.replace("\n", ""))
    return url_list


def content_from_result(results_list):
    """Return the cleaned text of all elements in ``results_list``, joined.

    Each element must provide a ``get_text()`` method. The text of every
    element is cleaned independently and the pieces are concatenated with
    no separator. An empty ``results_list`` yields an empty string.
    """
    # Applied strictly in this order; each substitution sees the output of
    # the previous one. Commas become semicolons, presumably so the text can
    # sit in a single comma-separated field -- confirm against the CSV writer.
    substitutions = (
        ("\n", ""),
        (",", ";"),
        ("  ", ""),
        ("\r", " "),
        ("\xa0", " "),
        ("©", "copyrighted "),
    )

    pieces = []
    for element in results_list:
        cleaned = element.get_text()
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)
        pieces.append(cleaned)
    return "".join(pieces)

def get_pdf_content_from_url(url):
    """Download the PDF at ``url`` and return its text as a cleaned string.

    The response body is written to ``./temp/temp.pdf``, passed to
    ``textract.process`` and the temporary file is deleted afterwards,
    whether or not extraction succeeded.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The extracted text with newlines, form feeds and double spaces
        removed, commas replaced by semicolons and the copyright sign
        spelled out.
    """
    myfile = requests.get(url)

    path = "./temp/"
    os.makedirs(path, exist_ok=True)
    pdf_path = path + "temp.pdf"

    try:
        # Close the handle before textract reads the file so the data is
        # fully flushed to disk
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(myfile.content)
        text = textract.process(pdf_path)
    finally:
        # Remove the temporary file even when writing or extraction fails
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    # textract returns bytes; decode before the string clean-up
    text = text.decode()
    text = text.replace("\n", "")
    text = text.replace("\x0c", "")
    text = text.replace(",", ";")
    text = text.replace("  ", "")
    text = text.replace("©", "copyrighted ")

    return text

def get_content_from_url(url):
    """Return the text content found at ``url``.

    URLs containing ".pdf" are delegated to ``get_pdf_content_from_url``.
    Any other URL is fetched and parsed as HTML; the result is the cleaned
    text of all ``<h2>`` elements, a single space, then the cleaned text of
    all ``<p>`` elements.
    """
    # The check is a plain substring match, so ".pdf" anywhere in the URL
    # selects the PDF path.
    if ".pdf" in url:
        return get_pdf_content_from_url(url)

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    heading_text = content_from_result(soup.find_all('h2'))
    paragraph_text = content_from_result(soup.find_all('p'))
    return heading_text + " " + paragraph_text

In [6]:
# Read every line of every file under ./out into a fresh list, then leave the
# list as the cell's final expression so the notebook displays it.
url_list = get_urls_from_path("./out", [])
url_list

['https://en.wikipedia.org/wiki/Wuhan',
 'https://en.wikipedia.org/wiki/Wuhan#History',
 'https://en.wikipedia.org/wiki/Wuhan#Geography',
 'https://en.wikipedia.org/wiki/Wuhan#Transportation',
 'https://en.wikipedia.org/wiki/Wuhan#Culture',
 'https://www.globaltimes.cn/page/202202/1252865.shtml',
 'https://www.globaltimes.cn/page/202202/1252719.shtml',
 'https://www.scmp.com/news/china/science/article/3167675/coronavirus-beijing-expert-who-beat-wuhan-outbreak-now-hong-kong',
 'https://www.britannica.com/place/Wuhan',
 'https://www.eyeonasia.gov.sg/china/know/china-municipalities-provinces/wuhan-profile/',
 'https://www.channelnewsasia.com/asia/china-covid-19-doctor-li-wenliang-whistleblower-wuhan-2407381',
 'https://en.unesco.org/creative-cities/wuhan',
 'https://www.bloomberg.com/news/features/2021-06-27/did-covid-come-from-a-lab-scientist-at-wuhan-institute-speaks-out',
 'https://www.reuters.com/world/china/china-reports-90-new-coronavirus-cases-aug-2-vs-98-day-earlier-2021-08-03/',


In [7]:
# Write one "url,content" row per URL into ./csv/out.csv
path = "./csv/"
os.makedirs(path, exist_ok=True)

# Scraped pages routinely contain non-ASCII text, so the encoding is fixed to
# UTF-8 instead of relying on the platform default. The with-block closes the
# file even if the loop is interrupted.
# NOTE(review): rows are assembled by hand; the extractors replace commas in
# the content, but a URL that itself contains a comma would shift the columns.
with open(path + "out.csv", "w", encoding="utf-8") as csv:
    csv.write("url,content\n")

    for url in url_list:
        try:
            csv.write(url + "," + get_content_from_url(url) + "\n")
            print(url + " converted to text.")
            # Short pause between requests
            time.sleep(1)
        except Exception as exc:
            # Best effort: report the failing URL with its reason and carry
            # on. Unlike a bare except, Ctrl-C still stops the run.
            print(url + " failed to convert to text. (" + repr(exc) + ")")

https://en.wikipedia.org/wiki/Wuhan converted to text.
https://en.wikipedia.org/wiki/Wuhan#History converted to text.
https://en.wikipedia.org/wiki/Wuhan#Geography converted to text.
https://en.wikipedia.org/wiki/Wuhan#Transportation converted to text.
https://en.wikipedia.org/wiki/Wuhan#Culture converted to text.
https://www.globaltimes.cn/page/202202/1252865.shtml converted to text.
https://www.globaltimes.cn/page/202202/1252719.shtml converted to text.
https://www.scmp.com/news/china/science/article/3167675/coronavirus-beijing-expert-who-beat-wuhan-outbreak-now-hong-kong converted to text.
https://www.britannica.com/place/Wuhan converted to text.
https://www.eyeonasia.gov.sg/china/know/china-municipalities-provinces/wuhan-profile/ converted to text.
https://www.channelnewsasia.com/asia/china-covid-19-doctor-li-wenliang-whistleblower-wuhan-2407381 converted to text.
https://en.unesco.org/creative-cities/wuhan converted to text.
https://www.bloomberg.com/news/features/2021-06-27/did-c