In [None]:
#Base code taken from https://github.com/HelloChatterbox/PyWikiHow

In [1]:
import os
import re
import sys
from urllib.parse import quote, quote_plus, unquote

import bs4
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from pywikihow.exceptions import ParseError

In [2]:
def get_html(url, timeout=30):
    """Download ``url`` and return the response body as UTF-8 encoded bytes.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        bytes: The response text re-encoded as UTF-8.
    """
    # The request is sent with a desktop Firefox User-Agent string.
    headers = {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0"}
    r = requests.get(url, headers=headers, timeout=timeout)
    return r.text.encode("utf8")

In [3]:
class HowToStep:
    """A single entry of a how-to: number, summary, description and picture."""

    def __init__(self, number, summary=None, description=None, picture=None):
        """Store the step data.

        Args:
            number: Position of the step.
            summary: Short text of the step, or ``None``. Every '/' is
                replaced by '_' (presumably because summaries are later
                used to build file names -- confirm against callers).
            description: Long text of the step, or ``None``.
            picture: Picture reference for the step, or ``None``.
        """
        self._number = number
        # The default summary is None, so only sanitise real strings.
        self._summary = summary.replace('/', '_') if summary is not None else None
        self._description = description
        self._picture = picture

    @property
    def number(self):
        """Position of the step."""
        return self._number

    @property
    def summary(self):
        """Short text of the step with '/' replaced by '_'."""
        return self._summary

    @property
    def description(self):
        """Long text of the step."""
        return self._description

    @property
    def picture(self):
        """Picture reference of the step."""
        return self._picture

    def as_dict(self):
        """Return the four step fields as a plain dictionary."""
        return {"number": self.number,
                "summary": self.summary,
                "description": self.description,
                "picture": self.picture}

    def print(self, extended=False, file=sys.stdout):
        """Write ``"<number> - <summary>"`` to ``file``.

        When ``extended`` is true the description is written on the
        following line as well.
        """
        print(self.number, "-", self.summary, file=file)
        if extended:
            print(self.description, file=file)

In [4]:
#### Parse a wikiHow article; each helper method is named after the part of the page it handles
class HowTo:
    """A wikiHow article whose title and steps are fetched and parsed lazily.

    The page at ``url`` is downloaded the first time ``url``, ``title`` or
    ``steps`` is read (or immediately when ``lazy`` is false). Steps are
    stored as ``HowToStep`` objects; entries numbered ``0`` are headings
    that open a new group of steps.
    """

    def __init__(self, url="http://www.wikihow.com/Special:Randomizer", lazy=True):
        """Remember ``url``; download and parse right away unless ``lazy``."""
        self._url = url
        self._title = None
        self._steps = []
        self.raw_html = ""
        self._parsed = False
        if not lazy:
            self._parse()

    def __repr__(self):
        return "HowTo:" + self.title

    @property
    def url(self):
        """URL taken from the page heading (parses the page on first use)."""
        if not self._parsed:
            self._parse()
        return self._url

    @property
    def title(self):
        """Article title derived from the URL (parses the page on first use)."""
        if not self._parsed:
            self._parse()
        return self._title

    @property
    def steps(self):
        """List of ``HowToStep`` objects (parses the page on first use)."""
        if not self._parsed:
            self._parse()
        return self._steps

    @property
    def summary(self):
        """Title followed by one ``"<number> - <summary>"`` line per step."""
        summary = self.title + "\n"
        for step in self.steps:
            summary += "{n} - ".format(n=step.number) + step.summary + "\n"
        return summary

    @property
    def n_steps(self):
        """Number of stored steps; unlike ``steps`` this never triggers parsing."""
        return len(self._steps)

    def print(self, extended=False, file=sys.stdout):
        """Write the summary, or title plus extended steps, to ``file``."""
        if not extended:
            print(self.summary, file=file)
        else:
            print(self.title, file=file)
            for s in self.steps:
                s.print(extended, file=file)

    def _block_basename(self, block):
        """Return ``<title>___<first summary>`` with spaces and '/' as '_'."""
        title_part = self.title.replace(' ', '_').strip().replace('/', '_')
        heading_part = block[0].summary.replace(' ', '_').replace('/', '_')
        return title_part + '___' + heading_part

    def _write_block(self, path, block):
        """Write title, URL and the extended steps of ``block`` to ``path``."""
        # The context manager closes the file even if a step fails to print.
        with open(path, 'w', encoding="utf-8") as file:
            print(self.title + '\n' + self.url + '\n', file=file)
            for step in block:
                step.print(True, file=file)

    def print_procedure(self, num=1, subfolder=''):
        """Write every sufficiently long group of steps to its own text file.

        Steps are grouped at each entry numbered ``0``. A group is written
        only when it holds more than five entries (its first entry
        included). Files are created under ``./Files/<subfolder>``, which
        must already exist.

        Args:
            num: Article number used as the first part of each file name.
            subfolder: Path fragment appended to ``./Files/``.

        Returns:
            int: Number of files written.
        """
        num_blocks = 0
        block = []
        for s in self.steps:
            if s.number == 0:
                if len(block) > 5:
                    num_blocks += 1
                    prefix = './Files/' + subfolder + str(num) + '_' + str(num_blocks) + '___'
                    # Raw string: '\W' in a plain literal is an invalid escape.
                    filename = re.sub(r'\W+', '', self._block_basename(block))
                    self._write_block(prefix + filename + '.txt', block)
                block = []
            block.append(s)
        if len(block) > 5:
            num_blocks += 1
            # NOTE(review): the trailing group has always used a two-space
            # separator and dropped underscores (isalnum filter), unlike the
            # groups written inside the loop. Both schemes are preserved so
            # existing file names stay stable -- confirm which one is intended.
            prefix = './Files/' + subfolder + str(num) + '_' + str(num_blocks) + '  '
            filename = ''.join(e for e in self._block_basename(block) if e.isalnum())
            self._write_block(prefix + filename + '.txt', block)
        return num_blocks

    def _parse_title(self, soup):
        """Set ``_url`` and ``_title`` from the link inside the first ``<h1>``.

        The title is the last path segment of the link with '-' turned into
        spaces and percent-escapes decoded.

        Raises:
            ParseError: If the heading contains no link.
        """
        html = soup.findAll("h1")[0]
        a = html.find("a")
        if not a:
            raise ParseError
        self._url = a.get("href")
        if not self._url.startswith("http"):
            self._url = "http://" + self._url
        self._title = unquote(self._url.split("/")[-1].replace("-", " "))

    @staticmethod
    def _clean_step_text(text, script_source=None):
        """Strip script residue and a video URL from a step's text.

        Args:
            text: Raw text of the step element.
            script_source: Source of the step's ``<script>`` (already
                stripped of its tags), or ``None`` when there is none.

        Returns:
            str: Cleaned text; blank lines are collapsed, every continuation
            line is prefixed with ``"- "`` and a newline is appended.
        """
        if script_source:
            text = text.replace(script_source, "")
        # Each span is removed only when both of its ends are present;
        # slicing with the -1 of a failed find() would delete unrelated text.
        start = text.find("//<![CDATA[")
        if start != -1:
            end = text.find(">", start)
            if end != -1:
                text = text.replace(text[start:end + 1], "")
        start = text.find("http://")
        if start != -1:
            end = text.find(".mp4", start)
            if end != -1:
                text = text.replace(text[start:end + 4], "")
        text = text.replace("WH.performance.mark('step1_rendered');", "")
        text = text.strip().replace("\n\n", "\n")
        return text.replace("\n", "\n- ") + "\n"

    def _parse_steps(self, soup):
        """Fill ``_steps`` from the step and heading divs of ``soup``.

        ``altblock`` divs restart the numbering and add a heading entry
        (number ``0``) named after the following ``<span>``; a heading whose
        text contains "Community Q&A" is not stored. If the first div is not
        a heading, a "Steps" heading is inserted. Every other div becomes a
        numbered step whose summary is its first ``<b>`` element.
        """
        self._steps = []
        # Drop hidden blocks before collecting the steps, so no destroyed
        # element is visited later and hidden text never reaches a step.
        for body in soup.findAll("div", {"style": ['display:none;']}):
            body.decompose()
        step_html = soup.findAll("div", {"class": ['step', 'altblock', 'section-heading']})
        count = 0
        for html in step_html:
            if "altblock" in html.get("class"):
                count = 0
                heading = html.find_next_sibling("span")
                step = HowToStep(count, heading.text if heading is not None else "")
                step._description = ""
                if "Community Q&A" not in step.summary:
                    self._steps.append(step)
                continue
            if count == 0 and len(self._steps) == 0:
                step = HowToStep(count, "Steps")
                step._description = ""
                self._steps.append(step)
            count += 1
            # Always create a fresh step. Reusing the previous object when
            # there is no <b> overwrote that step's description and stored it
            # twice.
            # NOTE(review): such divs now become a step with an empty
            # summary -- confirm this is the desired handling.
            bold = html.find("b")
            step = HowToStep(count, bold.text if bold is not None else "")
            script = html.find("script")
            script_source = None
            if script is not None:
                script_source = str(script).replace("<script>", "").replace("</script>", "").replace(";", "")
            step._description = self._clean_step_text(html.text, script_source)
            self._steps.append(step)

    def _parse_pictures(self, soup):
        """Attach the ``data-src`` of each lightbox image to the steps in order.

        Not called by ``_parse``.
        NOTE(review): images are matched to ``_steps`` purely by position and
        heading entries are part of that list -- verify before re-enabling.
        """
        # get step pic
        count = 0
        for html in soup.findAll("a", {"class": "image lightbox"}):
            # one more ugly blob, nice :D
            html = html.find("img")
            i = str(html).find("data-src=")
            pic = str(html)[i:].replace('data-src="', "")
            pic = pic[:pic.find('"')]

            # save in step
            self._steps[count]._picture = pic
            count += 1

    def _parse(self):
        """Download the page and populate title, URL, steps and ``raw_html``.

        Raises:
            ParseError: On any failure; the original exception is chained
                as ``__cause__``.
        """
        try:
            html = get_html(self._url)
            soup = bs4.BeautifulSoup(html, 'html.parser')
            self._parse_title(soup)
            self._parse_steps(soup)
            self.raw_html = soup.prettify()
            # Picture parsing is disabled: self._parse_pictures(soup)
            self._parsed = True
        except Exception as e:
            raise ParseError from e

    def as_dict(self):
        """Return title, URL, step count and step dictionaries."""
        return {
            "title": self.title,
            "url": self._url,
            "n_steps": len(self.steps),
            "steps": [s.as_dict() for s in self.steps]
        }

In [5]:
def RandomHowTo():
    """Return an unparsed ``HowTo`` built with the class's default URL."""
    random_article = HowTo()
    return random_article

In [6]:
class WikiHow:
    """Namespace for searching wikiHow."""

    search_url = "http://www.wikihow.com/wikiHowTo?search="

    @staticmethod
    def _build_search_url(search_term):
        """Return the search URL with ``search_term`` safely URL-encoded.

        Spaces still become '+'; characters such as '&' or '#' are
        percent-encoded so they cannot alter the query string.
        """
        return WikiHow.search_url + quote_plus(search_term)

    @staticmethod
    def search(search_term, max_results=-1):
        """Yield parsed ``HowTo`` objects for the results of a search.

        Args:
            search_term: Free-text query.
            max_results: Stop after this many articles; values that are not
                positive mean "no limit".

        Yields:
            HowTo: Each result that could be parsed. Results that raise
            ``ParseError`` or have no link target are skipped.
        """
        html = get_html(WikiHow._build_search_url(search_term))
        result_links = bs4.BeautifulSoup(html, 'html.parser').findAll('a', attrs={'class': "result_link"})
        count = 1
        for link in result_links:
            url = link.get('href')
            if not url:
                # An anchor without href cannot be followed.
                continue
            if not url.startswith("http"):
                url = "http://" + url
            how_to = HowTo(url)
            try:
                how_to._parse()
            except ParseError:
                continue
            yield how_to
            count += 1
            if 0 < max_results < count:
                return

In [7]:
def search_wikihow(query, max_results=10):
    """Run ``WikiHow.search`` and return everything it yields as a list."""
    found = []
    for how_to in WikiHow.search(query, max_results):
        found.append(how_to)
    return found

# if __name__ == "__main__":
#     for how_to in WikiHow.search("buy bitcoin"):
#         how_to.print()

In [8]:
# how_to = HowTo("https://www.wikihow.com/Set-up-Your-Laptop-to-Print-Wirelessly")

In [9]:
# how_to.print(extended=True)

In [10]:
# def link_to_name(link):
    #"https://www.wikihow.com/Set-up-Your-Laptop-to-Print-Wirelessly"

In [11]:
######## Given a link to WikiHow article, parse it and save the description to file
def parse_html(link):
    """Parse the article at ``link`` and save its extended text to a file.

    The file is ``./Files/<name>.txt`` where ``<name>`` is the fourth
    '/'-separated piece of ``link`` (the path segment after the host in an
    ``https://host/Article`` URL).
    """
    name = link.split('/')[3]
    # Parse before opening the file so a failed download does not leave an
    # empty (or truncated) output file behind.
    how_to = HowTo(link, lazy=False)
    # The context manager closes the file even if printing fails.
    with open('./Files/' + name + '.txt', 'w', encoding="utf-8") as f:
        how_to.print(extended=True, file=f)

In [13]:
def parse_procedure(link, num, subfolder=''):
    """Build a ``HowTo`` for ``link`` and return ``print_procedure``'s result.

    ``num`` and ``subfolder`` are passed straight through to
    ``HowTo.print_procedure``.
    """
    return HowTo(link).print_procedure(num, subfolder)

In [15]:
########### Get all the articles from a given WikiHow category, given its link
def get_links(url):
    """Collect the article links found on a wikiHow category page.

    Args:
        url: Address of the category page.

    Returns:
        list: ``href`` values of all anchors inside the ``cat_container``
        div that start with ``https://www.wikihow.com/``. Empty when the
        page has no such container.
    """
    # Reuse the shared downloader instead of repeating the request code.
    soup = BeautifulSoup(get_html(url), "html.parser")
    container = soup.find("div", {"id": 'cat_container'})
    if container is None:
        # No category listing on this page: nothing to collect.
        return []
    links = []
    for anchor in container.find_all("a"):
        href = anchor.get("href")
        if href is not None and href.startswith('https://www.wikihow.com/'):
            links.append(href)
    return links

In [16]:
######## Given a link for a WikiHow category, parse all the articles from this category and store the files in given folder
######## Some categories may span multiple pages, indicated by suffix ?pg=2 and so on
def process_category(url, folder):
    """Parse every article of a category page into ``./Files/<folder>``.

    Uses and updates the module-level ``count``, which must be defined
    before the first call: it numbers the output files and is incremented
    for each article that produced at least one file.

    Args:
        url: Address of the category page (one page of the category).
        folder: Sub-folder of ``./Files/`` that receives the files.
    """
    global count
    # exist_ok avoids the separate existence check.
    os.makedirs('./Files/' + folder, exist_ok=True)
    links = get_links(url)
    for link in links:
        print(link)
        try:
            num_blocks = parse_procedure(link, count, folder)
        except ParseError:
            # One unparsable article must not abort the whole category.
            print('Skipping (parse error):', link)
            continue
        if num_blocks > 0:
            count += 1
    print(url, count)

In [17]:
########### Process each category one by one
# count=1
# process_category('https://www.wikihow.com/Category:Computer-Monitors','Monitors/')
# process_category('https://www.wikihow.com/Category:Printers','Printers/')
# process_category('https://www.wikihow.com/Category:Printers?pg=2','Printers/')
# process_category('https://www.wikihow.com/Category:Computer-Keyboards','Keyboards/')
# process_category('https://www.wikihow.com/Category:Computer-Microphones','Microphones/')
# process_category('https://www.wikihow.com/Category:Headphones','Headphones/')
# process_category('https://www.wikihow.com/Category:Image-Scanners','Scanners/')
# process_category('https://www.wikihow.com/Category:Joysticks-and-Video-Game-Controllers','Controllers/')
# process_category('https://www.wikihow.com/Category:Webcams','Webcams/')


In [18]:
# count=223
# process_category('https://www.wikihow.com/Category:Operating-Systems','OS/')
# process_category('https://www.wikihow.com/Category:Windows','Windows/')
# process_category('https://www.wikihow.com/Category:Windows?pg=2','Windows/')
# process_category('https://www.wikihow.com/Category:Windows?pg=3','Windows/')
# process_category('https://www.wikihow.com/Category:Windows?pg=4','Windows/')
# process_category('https://www.wikihow.com/Category:Linux','Linux/')
# process_category('https://www.wikihow.com/Category:Linux?pg=2','Linux/')
# process_category('https://www.wikihow.com/Category:Ubuntu','Ubuntu/')
# process_category('https://www.wikihow.com/Category:Windows-10','Windows10/')
# process_category('https://www.wikihow.com/Category:Windows-10?pg=2','Windows10/')
# process_category('https://www.wikihow.com/Category:Mac','Mac/')
# process_category('https://www.wikihow.com/Category:Mac?pg=2','Mac/')
# process_category('https://www.wikihow.com/Category:Mac?pg=3','Mac/')
# process_category('https://www.wikihow.com/Category:Mac?pg=4','Mac/')
# process_category('https://www.wikihow.com/Category:Mac-OS-X','OSX/')
# process_category('https://www.wikihow.com/Category:Mac-OS-X?pg=2','OSX/')
# process_category('https://www.wikihow.com/Category:Mac-OS-X','OSX/')



https://www.wikihow.com/Find-the-SSID-on-a-Computer
https://www.wikihow.com/Open-EXE-Files
https://www.wikihow.com/Format-a-PC
https://www.wikihow.com/Install-an-Operating-System-on-a-Brand-New-Computer
https://www.wikihow.com/Install-a-New-Operating-System-on-Your-Computer
https://www.wikihow.com/Make-a-Computer-Operating-System
https://www.wikihow.com/Find-System-Specs
https://www.wikihow.com/Use-an-Operating-System-from-a-USB-Stick
https://www.wikihow.com/Check-Path-in-Unix
https://www.wikihow.com/Install-Chromium-OS
https://www.wikihow.com/Lock-Desktop-Icons-in-Place
https://www.wikihow.com/Set-Administrator-Password
https://www.wikihow.com/Install-Two-Operating-Systems-on-One-Computer
https://www.wikihow.com/Exit-out-of-a-Frozen-Computer-Program
https://www.wikihow.com/Enter-Data-in-SPSS
https://www.wikihow.com/Format-a-Hard-Drive
https://www.wikihow.com/Change-Startup-Programs-on-Your-Computer
https://www.wikihow.com/Use-Keyboard-Shortcuts
https://www.wikihow.com/Print-a-List-of-

https://www.wikihow.com/Fix-Windows-Shutdown-Problems
https://www.wikihow.com/Category:Windows 350
https://www.wikihow.com/Change-the-Default-Font-on-Windows-Notepad
https://www.wikihow.com/Check-Printer-Ink-Levels-in-Windows
https://www.wikihow.com/Clone-(Copy)--a-Hard-Drive-in-Windows-XP
https://www.wikihow.com/Clear-the-Thumbnail-Cache-in-Windows
https://www.wikihow.com/Register-OCX-Files
https://www.wikihow.com/Force-a-Program-to-Close-(Windows)
https://www.wikihow.com/Turn-Off-a-Personal-Computer
https://www.wikihow.com/Make-a-Shutdown-Shortcut-in-Windows
https://www.wikihow.com/Use-the-Steps-Recorder-in-Windows
https://www.wikihow.com/Move-Windows-to-Another-Drive
https://www.wikihow.com/Check-Direct-X-Version
https://www.wikihow.com/Delete-Run-History-in-Windows
https://www.wikihow.com/Check-an-IIS-Event-Log-on-Windows
https://www.wikihow.com/Change-a-Guest-Account-to-an-Administrator-in-Windows
https://www.wikihow.com/Move-the-Task-Bar-to-the-Top-of-the-Screen-in-Windows
https:

https://www.wikihow.com/Make-Windows-Unhackable
https://www.wikihow.com/Configure-Automatic-Updates-in-Windows
https://www.wikihow.com/Install-Windows-NT-3.51
https://www.wikihow.com/Make-an-Email-Photo-Your-Desktop-Background
https://www.wikihow.com/Change-Windows-Default-Telnet-Program
https://www.wikihow.com/Format-and-Reinstall-Windows
https://www.wikihow.com/Disable-Windows-Login-Screensaver
https://www.wikihow.com/Ensure-All-Users-Have-Local-PC-Administrator-Rights
https://www.wikihow.com/Find-All-Windows-Symbols-and-Fonts
https://www.wikihow.com/Download-and-Install-the-iPod-and-iPhone-Drivers-in-Windows-XP,-Vista,-7-and-8
https://www.wikihow.com/Create-a-New-User-Account-in-Windows-Vista-and-7
https://www.wikihow.com/Remove-Crapware-Bloatware-from-Your-Windows-Computer
https://www.wikihow.com/Upgrade-Ubuntu-to-Windows-Operating-System
https://www.wikihow.com/Set-Up-Windows-Hello
https://www.wikihow.com/Explore-the-Easter-Eggs-in-Windows-95
https://www.wikihow.com/Use-Video-as-D

https://www.wikihow.com/Convert-Text-to-Speech-on-Linux
https://www.wikihow.com/Install-Microsoft-Office-2007-on-Linux
https://www.wikihow.com/Recover-Deleted-Files-from-Pen-Drive-in-Linux
https://www.wikihow.com/Revive-an-Old-Computer-with-Linux
https://www.wikihow.com/Assign-an-IP-Address-on-a-Linux-Computer
https://www.wikihow.com/Format-a-Linux-Hard-Disk-to-Windows
https://www.wikihow.com/Attach-a-Swap-Partition-to-Linux
https://www.wikihow.com/Learn-Linux
https://www.wikihow.com/Install-Knoppix-Linux
https://www.wikihow.com/Check-Linux-Distribution
https://www.wikihow.com/Compile-a-Program-in-Linux
https://www.wikihow.com/Transfer-Files-from-One-Linux-Server-to-Another
https://www.wikihow.com/Check-Swap-Space-in-Linux
https://www.wikihow.com/Configure-Fluxbox
https://www.wikihow.com/Unzip-Files-in-Linux
https://www.wikihow.com/Install-Solus
https://www.wikihow.com/Bypass-Grub-Rescue-on-a-Linux-Computer
https://www.wikihow.com/Set-up-a-Crontab-File-on-Linux
https://www.wikihow.com/

https://www.wikihow.com/Set-up-a-DHCP-Server-on-Ubuntu
https://www.wikihow.com/Install-Blender-3D-on-Ubuntu
https://www.wikihow.com/Convert-Package-Files-in-Ubuntu-Using-Alien
https://www.wikihow.com/Capture-Adobe-Flash-Videos-on-Ubuntu-Linux
https://www.wikihow.com/Install-Kubernetes-on-Ubuntu
https://www.wikihow.com/Run-Ubuntu-from-Windows-as-an-Executable
https://www.wikihow.com/Get-a-Copy-of-Ubuntu-Linux-Shipped-to-You-for-Free
https://www.wikihow.com/Install-VMware-Workstation-on-Ubuntu
https://www.wikihow.com/Set-Up-an-Ubuntu-Based-HTPC-with-XBMC
https://www.wikihow.com/Clear-Ubuntu-Software-Center-History
https://www.wikihow.com/Install-My-Unity-Configuration-Tool-on-Ubuntu
https://www.wikihow.com/Automatically-Have-Wallpaper-Change-on-Ubuntu
https://www.wikihow.com/Create-an-Ubuntu-Virtual-Machine-with-VirtualBox-(Mac)
https://www.wikihow.com/Get-Help-for-Ubuntu
https://www.wikihow.com/Install-Postman-in-Ubuntu
https://www.wikihow.com/Compress-a-File-to-.Zip-in-Ubuntu
https://w

https://www.wikihow.com/Protect-Your-Privacy-in-Windows-10
https://www.wikihow.com/Make-Your-PC-a-Kiosk
https://www.wikihow.com/Disable-Driver-Signature-Enforcement-in-Windows-10
https://www.wikihow.com/Choose-Between-Windows-10-Home-and-Pro
https://www.wikihow.com/Upgrade-from-Windows-10-Home-to-Windows-10-Professional
https://www.wikihow.com/Enable-Night-Light-in-Windows-10
https://www.wikihow.com/Log-Into-the-Microsoft-Store
https://www.wikihow.com/Category:Windows-10?pg=2 839
https://www.wikihow.com/Zoom-Out-on-a-Mac
https://www.wikihow.com/Open-Exe-Files-on-Mac
https://www.wikihow.com/Uninstall-Programs-on-Mac-Computers
https://www.wikihow.com/Automatically-Shut-Down-Your-Computer-at-a-Specified-Time
https://www.wikihow.com/Copy-and-Paste-on-a-Mac
https://www.wikihow.com/Find-Your-IP-Address-on-a-Mac
https://www.wikihow.com/Block-and-Unblock-Internet-Sites-(On-a-Mac)
https://www.wikihow.com/Traceroute
https://www.wikihow.com/Force-Shut-Down-a-Mac
https://www.wikihow.com/Open-Appli

https://www.wikihow.com/Renew-a-DHCP-Lease-on-a-Mac
https://www.wikihow.com/Enable-Flash-Player-on-Mac
https://www.wikihow.com/Leave-a-Group-Chat-on-Facebook-Messenger-on-a-PC-or-Mac
https://www.wikihow.com/Change-Trackpad-Sensitivity-on-a-Mac
https://www.wikihow.com/Remove-an-Item-from-System-Preferences-on-a-Mac
https://www.wikihow.com/Play-Sounds-for-Calendar-Notifications-on-a-Mac
https://www.wikihow.com/Transfer-Files-from-Android-to-Mac
https://www.wikihow.com/Set-an-Alarm-on-Your-Mac
https://www.wikihow.com/Check-the-Quality-of-a-Video(Mac)
https://www.wikihow.com/Start-Screensaver-with-a-Keyboard-Shortcut-on-Mac
https://www.wikihow.com/Connect-a-Yamaha-PSR-E413-to-Garageband
https://www.wikihow.com/Add-Email-Accounts-to-a-Mac
https://www.wikihow.com/Run-Windows-On-a-Mac
https://www.wikihow.com/Share-a-Mac-Screen
https://www.wikihow.com/Open-the-Applications-Folder-on-Mac
https://www.wikihow.com/Convince-Your-Parents-to-Buy-You-a-Mac
https://www.wikihow.com/Use-an-iMac
https://w

https://www.wikihow.com/Remove-a-Preferred-Language-on-a-Mac
https://www.wikihow.com/Change-How-Applications-Minimize-on-a-Mac
https://www.wikihow.com/Stop-a-Mac%27s-Brightness-from-Auto-Adjusting
https://www.wikihow.com/Add-a-Preferred-Language-on-a-Mac
https://www.wikihow.com/Quickly-Hide-the-Dock-on-a-Mac
https://www.wikihow.com/Category:Mac?pg=3 1098
https://www.wikihow.com/Enable-Tap-to-Click-on-a-Mac
https://www.wikihow.com/Quickly-Open-the-Notification-Center-on-a-Mac
https://www.wikihow.com/Turn-on-Do-Not-Disturb-when-a-Mac-Is-Sleeping
https://www.wikihow.com/Protect-a-Zip-File-with-a-Password-on-PC-or-Mac
https://www.wikihow.com/Count-Characters-in-Excel-on-PC-or-Mac
https://www.wikihow.com/Update-Printer-Drivers-on-a-Mac
https://www.wikihow.com/Upgrade-Your-Power-Mac-G3-Blue-and-White
https://www.wikihow.com/Mute-Alert-Volume-on-a-Mac
https://www.wikihow.com/Install-macOS
https://www.wikihow.com/Download-Lightroom-on-Mac
https://www.wikihow.com/Set-Up-a-Bluetooth-Trackpad-on-

https://www.wikihow.com/Determine-If-You-Are-Running-the-32-Bit-or-64-Bit-Kernel-in-Mac-OS-X
https://www.wikihow.com/Install-Snow-Leopard-on-an-Intel-PC
https://www.wikihow.com/Enable-or-Disable-iCloud-Applications-on-a-Mac
https://www.wikihow.com/Search-in-Mail-on-a-Mac
https://www.wikihow.com/Defragment-Files-on-a-Mac-Computer
https://www.wikihow.com/Turn-Off-Inertia-Scrolling-on-a-Mac
https://www.wikihow.com/Create-an-OS-X-El-Capitan-Install-Disk-on-a-Flash-Drive
https://www.wikihow.com/Set-Your-Desktop-Color-on-a-Mac
https://www.wikihow.com/Embed-YouTube-Video-to-Keynote-on-Mac-OS-X
https://www.wikihow.com/Allow-Apps-to-Be-Downloaded-on-a-Mac
https://www.wikihow.com/Remotely-Access-a-Mac
https://www.wikihow.com/Clean-Install-Mac-OS-X
https://www.wikihow.com/Use-Parallels-Desktop
https://www.wikihow.com/Install-OS-X-Mountain-Lion
https://www.wikihow.com/Set-Up-Apple-Mail
https://www.wikihow.com/Use-Mission-Control-on-a-Mac
https://www.wikihow.com/FaceTime-on-Mac-OS-X
https://www.wik