## CORE

In [None]:
import json
import os
import time
from collections import deque
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [None]:
def get_tor_session():
    """Create a ``requests`` session whose HTTP and HTTPS traffic goes through Tor.

    Returns:
        requests.Session: a new session with its proxies pointed at the local
        Tor SOCKS port (127.0.0.1:9050).
    """
    session = requests.Session()
    # Tor uses the 9050 port as the default socks port.
    # The "socks5h" scheme makes the proxy resolve hostnames; with plain
    # "socks5" DNS lookups happen locally and leak outside of Tor.
    session.proxies = {'http':  'socks5h://127.0.0.1:9050',
                       'https': 'socks5h://127.0.0.1:9050'}
    return session

In [None]:
from stem import Signal
from stem.control import Controller

# signal TOR for a new connection
def renew_connection():
    """Ask the local Tor controller (port 9051) for a new circuit via NEWNYM.

    The control password is read from the ``TOR_CONTROL_PASSWORD`` environment
    variable so that it does not have to be stored in the notebook; when the
    variable is not set, the previous hard-coded value is used as a fallback.
    """
    control_password = os.environ.get("TOR_CONTROL_PASSWORD", "password")
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password=control_password)
        controller.signal(Signal.NEWNYM)

In [None]:
def normalize_url(url):
    """Strip everything from the first "?", "#" or "&" onwards.

    Args:
        url: the URL string to clean.

    Returns:
        The part of ``url`` that precedes the earliest of those separators
        (the whole string when none of them is present).
    """
    for separator in ("?", "#", "&"):
        # partition()[0] is the text before the first occurrence, or the
        # unchanged string when the separator is absent.
        url = url.partition(separator)[0]
    return url

In [None]:
# Number of requests issued since the last Tor circuit renewal.
ip_counter = 0
def make_get(url, print_ip=False, timeout=30):
    """GET ``url`` through Tor, renewing the Tor circuit periodically.

    The URL is passed through ``normalize_url`` first. Once the module-level
    ``ip_counter`` exceeds 10, ``renew_connection()`` is called and the counter
    is reset before the request is made.

    Args:
        url: the address to fetch.
        print_ip: accepted for backward compatibility; currently ignored.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        requests.Response: the response to the GET request.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    global ip_counter
    url = normalize_url(url)
    if ip_counter > 10:
        renew_connection()
        ip_counter = 0
    # The session is closed on exit so its connections are not leaked; the
    # response body has already been downloaded by then (no streaming).
    with get_tor_session() as session:
        ip_counter += 1
        return session.get(url, timeout=timeout)

## CONFIG

In [None]:
# Per-website crawl settings, keyed by a short site name. The site name is
# also used by the crawler to decide whether a link belongs to the site, so
# it has to appear in every one of the site's seed URLs.
configs = {}

# Keys of each per-site dictionary:
first_url_key = "first_url"                # accepted spellings of the homepage; the first one seeds the crawl
content_selector_key = "content_selector"  # CSS selectors for the article body, tried in order
title_selector_key = "title_selector"      # CSS selectors for the article title, tried in order

configs["medium"] = {
    first_url_key: ["https://medium.com", "https://medium.com/"],
    content_selector_key: [".sectionLayout--insetColumn"],
    title_selector_key: ["h1.graf--title"]
}

configs["tutorialspoint"] = {
    first_url_key: ["https://www.tutorialspoint.com", "https://www.tutorialspoint.com/", "https://www.tutorialspoint.com/index.htm"],
    content_selector_key: [".content > div", ".tutorial-content > div", ".content > div > div"],
    title_selector_key: [".content > div > h1:first-of-type", ".tutorial-content > div > h1:first-of-type", ".content > div > div > h1:first-of-type"]
}

configs["kdnuggets"] = {
    first_url_key: ["https://www.kdnuggets.com", "https://www.kdnuggets.com/"],
    # NOTE(review): "#post- " (with a trailing space) looks truncated - confirm the intended selector.
    content_selector_key: ["#post- "],
    title_selector_key: ["#title"]
}

configs["datasciencecentral"] = {
    first_url_key: ["https://www.datasciencecentral.com"],
    content_selector_key: ["article .entry-content"],
    title_selector_key: ["article .entry-title"]
}

configs["smartdatacollective"] = {
    first_url_key: ["https://www.smartdatacollective.com", "https://www.smartdatacollective.com/"],
    content_selector_key: [".single-content"],
    title_selector_key: [".single-title"]
}

configs["machinelearningmastery"] = {
    # Previously pointed at kdnuggets (copy-paste slip), which made a crawl of
    # this site start on the wrong domain.
    first_url_key: ["https://machinelearningmastery.com", "https://machinelearningmastery.com/"],
    # TODO: no selectors defined yet, so no article is saved for this site.
    content_selector_key: [],
    title_selector_key: []
}

configs["wikihow"] = {
    first_url_key: ["https://www.wikihow.com", "https://www.wikihow.com/"],
    content_selector_key: ["#bodycontents .steps .step"],
    title_selector_key: ["#bodycontents > #intro > h1:first-of-type"]
}

configs["splinters"] = {
    first_url_key: ["https://schwitzsplinters.blogspot.com", "https://schwitzsplinters.blogspot.com/"],
    content_selector_key: [".post-body"],
    title_selector_key: [".post-title"]
}

configs["thehistoryblog"] = {
    first_url_key: ["http://www.thehistoryblog.com", "http://www.thehistoryblog.com/"],
    content_selector_key: [".post > .entry"],
    title_selector_key: [".post > h3:first-of-type", ".post > h2:first-of-type"]
}

configs["chemistry-blog"] = {
    first_url_key: ["http://www.chemistry-blog.com", "http://www.chemistry-blog.com/"],
    content_selector_key: [".post > .entry"],
    title_selector_key: [".title"]
}

## LOOP

In [None]:
def check_first_urls(website, links):
    """Tell whether ``links`` contains one of the homepage URLs of ``website``.

    Args:
        website: key of the site in ``configs``.
        links: collection of URL strings found on a page.

    Returns:
        True when at least one of the site's configured first URLs is in
        ``links``, False otherwise.
    """
    # The lookup previously used the undefined name `config` and raised a
    # NameError as soon as the function was called.
    return any(url in links for url in configs[website][first_url_key])

from bs4.element import Tag
def get_with_selector(selectors, soup, mode):
    """Extract HTML and text from the first selector that matches anything.

    Args:
        selectors: CSS selectors, tried in the given order.
        soup: object exposing ``select(selector)``, e.g. a BeautifulSoup document.
        mode: "title" returns the text of the first match; any other value
            joins the text of the first match's direct children with spaces.

    Returns:
        A ``(html, text)`` tuple built from the first element matched by the
        first successful selector, or ``(None, "")`` when nothing matches.
    """
    for css in selectors:
        matches = soup.select(css)
        if not matches:
            continue

        first = matches[0]
        if mode == "title":
            return str(first), first.text

        # Tag children contribute their text, any other node its string form.
        pieces = []
        for child in first.children:
            pieces.append(child.text if type(child) is Tag else str(child))
        return str(first), " ".join(pieces)

    return None, ""

# Links ending with one of these extensions are not crawled.
extensions_banned = [".jpg", ".png", ".zip"]
def check_link_extension(link):
    """Tell whether ``link`` may be crawled, judging by its extension.

    The comparison ignores case, so ".JPG" is rejected just like ".jpg".

    Args:
        link: URL string to check.

    Returns:
        False when ``link`` ends with one of ``extensions_banned``, True otherwise.
    """
    banned = tuple(extension.lower() for extension in extensions_banned)
    return not link.lower().endswith(banned)

In [None]:
website = "chemistry-blog"

# Everything extracted for a website is stored under a directory named after it.
directory = website
os.makedirs(directory, exist_ok=True)

first_url = configs[website][first_url_key][0]
prefix_url = "/".join(first_url.split("/")[:3])  # scheme://host of the seed URL
queue = deque([first_url])  # deque: popping from the left is O(1), list.pop(0) is O(n)
already_considered = {first_url}

counter = 0


def _extract_links(soup, page_url, website):
    """Return the sorted, de-duplicated links of a page that belong to ``website``.

    Each href is resolved against ``page_url``, stripped of its query/fragment
    with ``normalize_url`` and kept only when it is an http(s) URL whose host
    contains ``website`` and whose extension is not banned.
    """
    found = set()
    for tag in soup.find_all("a"):
        if not tag.has_attr("href"):
            continue
        try:
            # urljoin handles "/index.html", "index.html", "../x" and "//host/x"
            link = urljoin(page_url, tag["href"].strip())
        except ValueError:
            continue  # malformed href
        # Normalize BEFORE filtering: otherwise "pic.jpg?size=1" passes the
        # extension check, and an off-site share link that only mentions the
        # website in its query string passes the website check.
        link = normalize_url(link)
        if not link.startswith(("http://", "https://")):
            continue  # mailto:, javascript:, tel:, ...
        if website not in urlsplit(link).netloc:
            continue  # the host must contain the name of the website
        if check_link_extension(link):
            found.add(link)
    return sorted(found)


while queue:
    # get url to visit
    url = queue.popleft()
    print("Visiting " + url)
    print("URLs in queue: {0}".format(len(queue)))

    # visit url
    try:
        response = make_get(url, print_ip=True)
        soup = BeautifulSoup(response.text, "html.parser")
        data = {"url": url, "html": str(soup)}

        # get all same-site links from the page; relative ones are resolved
        # against the final URL of the response (after redirects)
        links = _extract_links(soup, response.url, website)

        # check_first_urls(website, links) could be used here to skip pages
        # that do not link back to the homepage; that check is disabled.

        # add links to queue if not already considered
        for link in links:
            if link not in already_considered:
                already_considered.add(link)
                queue.append(link)

        # get titles
        data["title_html"], data["title"] = get_with_selector(configs[website][title_selector_key], soup, "title")

        # get texts
        data["content_html"], data["content"] = get_with_selector(configs[website][content_selector_key], soup, "content")

        # save article
        if data["title"] != "" and data["content"] != "": # we save only if we got the necessary info
            print("Extracted tutorial: " + data["title"])
            counter += 1
            # one sub-directory per host, one numbered JSON file per article
            sub_directory = directory + "/" + url.split("/")[2]
            os.makedirs(sub_directory, exist_ok=True)
            with open(sub_directory + "/" + str(counter) + '.json', 'w') as outfile:
                json.dump(data, outfile)

        # sleep...
        time.sleep(0.2)
    except Exception as e:
        # Best effort: a page that fails is reported and skipped.
        print(e)
        print("------------")
        time.sleep(1) # leaves time to stop the crawl with a KeyboardInterrupt
        continue

    print("------------")

## EXPERIMENT

In [None]:
# Scratch cell: fetch one page through Tor and parse it, so that the
# selectors configured for `website` can be tried out in the cells below.
# Note that this rebinds the notebook-level `website`, `url`, `response` and `soup`.
website = "tutorialspoint"

url = "https://www.tutorialspoint.com/blockchain_online_training/index.asp"
response = make_get(url, print_ip=True)
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Run the second configured title selector (index 1) on the fetched page
# and display whatever it matches.
res = soup.select(configs[website][title_selector_key][1])
res

In [None]:
# get titles: `t` is the HTML of the matched element, `title` its text
# (None and "" when no configured selector matches)
t, title = get_with_selector(configs[website][title_selector_key], soup, "title")
print(title)

# get texts: `c` is the HTML of the matched element, `content` the
# space-joined text of its direct children
c, content = get_with_selector(configs[website][content_selector_key], soup, "content")
print(content)

In [None]:
# Display the whole parsed document (the output can be very long).
soup

In [None]:
# Display every element of the fetched page matching the ".content" selector.
soup.select(".content")

In [None]:
# Display the "content" entry of `data`.
# NOTE(review): `data` is not assigned in the EXPERIMENT cells; it is whatever
# another cell (crawl loop or title listing) left behind - confirm this is intended.
data["content"]

In [None]:
# List the href of every anchor of the fetched page. The anchors are taken
# from `soup` directly: the `links` variable left by the crawl loop holds plain
# strings (indexing them with "href" raises TypeError), and anchors without an
# href attribute have to be skipped instead of raising KeyError.
[tag["href"] for tag in soup.find_all("a") if tag.has_attr("href")]

In [None]:
# Rebinds `content` to the list of elements matched by ".content > div".
content = soup.select(".content > div")

In [None]:
# get titles: rebinds `title` to the list of elements matched by the selector
title = soup.select(".content > div > h1:first-of-type")

In [None]:
# Display the current value of `title`.
title

## SEE ALL TITLES

In [None]:
from os import listdir
from os.path import isfile, join

# Print the title stored in every JSON article saved under `mypath`.
mypath = "wikihow/www.wikihow.com"
# keep regular files only (sub-directories are skipped)
onlyfiles = list(filter(lambda name: isfile(join(mypath, name)), listdir(mypath)))
for file in onlyfiles:
    json_file = "{0}/{1}".format(mypath, file)
    with open(json_file, 'r') as infile:
        data = json.load(infile)
    print(data["title"])