### List all the files in the wiki folder, find the number of files in the folder, and display the HTML from a single file:

In [2]:
import os

# Print the name of every entry in the "wiki" folder, one per line.
for entry_name in os.listdir("wiki"):
    print(entry_name)

Furubira_District,_Hokkaido.html
Valentin_Yanin.html
Kings_XI_Punjab_in_2014.html
William_Harvey_Lillard.html
Radial_Road_3.html
George_Weldrick.html
Zgornji_Otok.html
Blue_Heelers_(season_8).html
Taggen_Nunatak.html
1951_National_League_tie-breaker_series.html
List_of_number-one_singles_of_1993_(Finland).html
Vrila.html
William_Henry_Porter.html
Clive_Brown_(footballer).html
2010_Karshi_Challenger_%E2%80%93_Singles.html
Blick_nach_Rechts.html
Central_District_(Rezvanshahr_County).html
Gal%C3%A1pagos,_Guadalajara.html
Campus_of_Texas_A%26M_University.html
Alexios_Aspietes.html
Mei_Lanfang.html
Thalkirchen-Obersendling-Forstenried-F%C3%BCrstenried-Solln.html
Coalville_Town_railway_station.html
Gennady_Lesun.html
Bartrum_Glacier.html
Victor_S._Mamatey.html
Gottfried_Keller.html
Table_Point_Formation.html
Nobuhiko_Ushiba.html
Master_of_Space_and_Time.html
Early_medieval_states_in_Kazakhstan.html
Eressa_aperiens.html
Companys,_proc%C3%A9s_a_Catalunya.html
Myrtle_(sternwheeler).html
Abanych

In [3]:
# os.listdir already returns a list of entry names, so its length is the
# number of entries in the folder.
filenames = os.listdir("wiki")
print(len(filenames))

999


In [4]:
# Read one saved article and print its raw HTML.
# The page declares <meta charset="UTF-8"/>, so decode it as UTF-8 explicitly
# rather than relying on the platform's default text encoding.
with open("wiki/Urban_chicken.html", encoding="utf-8") as f:
    data = f.read()
print(data)

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Urban chicken - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Urban_chicken","wgTitle":"Urban chicken","wgCurRevisionId":766175951,"wgRevisionId":766175951,"wgArticleId":25069082,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: external links","Chicken","Urban agriculture"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","Se

### Read all of the file contents into one list and all of the article names into another.

### Multiprocessing
Using ProcessPoolExecutor, the run time is ~1.75 seconds. This holds for max_workers up to about 6; beyond that, the time starts increasing because of the added overhead of creating new processes to run our tasks.
Since this code is mostly I/O-bound, we would expect better performance from multithreading than from multiprocessing.



In [19]:
import concurrent.futures
import time

def get_content(file):
    """Return the entire text content of the file at path ``file``.

    The file is decoded as UTF-8 explicitly. Without an ``encoding``
    argument ``open`` falls back to the platform's default encoding, which
    can mis-decode (or fail on) non-ASCII characters on systems whose
    default is not UTF-8.

    Raises whatever ``open``/``read`` raise, e.g. ``FileNotFoundError`` or
    ``UnicodeDecodeError``.
    """
    with open(file, encoding="utf-8") as f:
        return f.read()
    
def get_article_names(file):
    """Return the article name for a path of the form ``wiki/<name>.html``.

    Only a leading ``"wiki/"`` and a trailing ``".html"`` are removed, so a
    name that happens to contain ``".html"`` or ``"wiki/"`` in the middle is
    left intact. A path without the prefix or the suffix is returned with
    whichever of the two is present stripped.
    """
    prefix = "wiki/"
    suffix = ".html"
    name = file
    if name.startswith(prefix):
        name = name[len(prefix):]
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name

# Relative path of every entry in the wiki folder.
filenames = ["wiki/%s" % f for f in os.listdir("wiki")]

start = time.time()

# Use the executor as a context manager so its worker processes are shut down
# when the block exits instead of lingering after each re-run of this cell.
with concurrent.futures.ProcessPoolExecutor(max_workers=8) as pool:
    # map() returns results in input order; list() waits for all of them.
    content_list = list(pool.map(get_content, filenames))
    name_list = list(pool.map(get_article_names, filenames))
    # Stop the clock before the pool is torn down so the measurement covers
    # the same work as before (pool creation plus both map calls).
    end = time.time()

print(end - start)


    

2.244086742401123


### Multithreading
We see the run time reduced to around 0.5 seconds using threads instead of processes. I see the biggest performance boost with a small number of workers (1 to 2).

In [71]:
import concurrent.futures
import time

def get_content(file):
    """Return the entire text content of the file at path ``file``.

    The file is decoded as UTF-8 explicitly. Without an ``encoding``
    argument ``open`` falls back to the platform's default encoding, which
    can mis-decode (or fail on) non-ASCII characters on systems whose
    default is not UTF-8.

    Raises whatever ``open``/``read`` raise, e.g. ``FileNotFoundError`` or
    ``UnicodeDecodeError``.
    """
    with open(file, encoding="utf-8") as f:
        return f.read()
    
def get_article_names(file):
    """Return the article name for a path of the form ``wiki/<name>.html``.

    Only a leading ``"wiki/"`` and a trailing ``".html"`` are removed, so a
    name that happens to contain ``".html"`` or ``"wiki/"`` in the middle is
    left intact. A path without the prefix or the suffix is returned with
    whichever of the two is present stripped.
    """
    prefix = "wiki/"
    suffix = ".html"
    name = file
    if name.startswith(prefix):
        name = name[len(prefix):]
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name

# Relative path of every entry in the wiki folder.
filenames = ["wiki/%s" % f for f in os.listdir("wiki")]

start = time.time()

# Use the executor as a context manager so its worker threads are released
# when the block exits instead of accumulating on each re-run of this cell.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
    # map() returns results in input order; list() waits for all of them.
    content_list = list(pool.map(get_content, filenames))
    name_list = list(pool.map(get_article_names, filenames))
    # Stop the clock before the pool is torn down so the measurement covers
    # the same work as before (pool creation plus both map calls).
    end = time.time()

print(end - start)


0.40656328201293945


### No parallel processing
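Without any parallel processing at all, the program runs the fastest. This is most likely because we are not doing much processing in the code thus far, so the overhead of managing a pool of workers outweighs any gain from running tasks concurrently.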

In [102]:
import concurrent.futures
import time

def get_content(file):
    """Return the entire text content of the file at path ``file``.

    The file is decoded as UTF-8 explicitly. Without an ``encoding``
    argument ``open`` falls back to the platform's default encoding, which
    can mis-decode (or fail on) non-ASCII characters on systems whose
    default is not UTF-8.

    Raises whatever ``open``/``read`` raise, e.g. ``FileNotFoundError`` or
    ``UnicodeDecodeError``.
    """
    with open(file, encoding="utf-8") as f:
        return f.read()
    
def get_article_names(file):
    """Return the article name for a path of the form ``wiki/<name>.html``.

    Only a leading ``"wiki/"`` and a trailing ``".html"`` are removed, so a
    name that happens to contain ``".html"`` or ``"wiki/"`` in the middle is
    left intact. A path without the prefix or the suffix is returned with
    whichever of the two is present stripped.
    """
    prefix = "wiki/"
    suffix = ".html"
    name = file
    if name.startswith(prefix):
        name = name[len(prefix):]
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name

# Relative path of every entry in the wiki folder.
filenames = ["wiki/%s" % entry for entry in os.listdir("wiki")]

start = time.time()

# Plain sequential pass: no executor, just read each file and derive its
# article name in the order the paths were listed.
content_list = [get_content(path) for path in filenames]
name_list = [get_article_names(path) for path in filenames]

end = time.time()
print(end - start)

0.13643598556518555


### Extract the content div from the HTML of each page and store it in a list:

In [141]:
from bs4 import BeautifulSoup

def convert_content_to_str(html):
    """Return the markup of the first ``<div id="content">`` in ``html`` as a string.

    Args:
        html: HTML source of one page.

    Returns:
        ``str()`` of the first matching ``div`` element.

    Raises:
        IndexError: if the document has no ``<div id="content">``. The
            exception type matches what indexing an empty result list
            raised before; the message now says what was missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find() stops at the first match rather than collecting every matching
    # div in the tree when only the first one is needed.
    content_div = soup.find("div", id="content")
    if content_div is None:
        raise IndexError('no <div id="content"> element found in the document')
    return str(content_div)

start = time.time()
# Use the executor as a context manager so its worker processes are shut down
# when the block exits instead of lingering after each re-run of this cell.
# max_workers=None leaves the number of workers to the executor's default.
with concurrent.futures.ProcessPoolExecutor(max_workers=None) as pool:
    # map() returns results in input order; list() waits for all of them.
    parsed = list(pool.map(convert_content_to_str, content_list))
    # Measure before the pool is torn down, matching the original timing scope.
    elapsed = time.time() - start
print(elapsed)

83.43215227127075


In [142]:
# Show the first entry of `parsed`: the content-div markup string produced
# for the first item of `content_list`.
parsed[0]

'<div class="mw-body" id="content" role="main">\n<a id="top"></a>\n<div id="siteNotice"><!-- CentralNotice --></div>\n<div class="mw-indicators">\n</div>\n<h1 class="firstHeading" id="firstHeading" lang="en">Furubira District, Hokkaido</h1>\n<div class="mw-body-content" id="bodyContent">\n<div id="siteSub">From Wikipedia, the free encyclopedia</div>\n<div id="contentSub"></div>\n<div class="mw-jump" id="jump-to-nav">\n\t\t\t\t\tJump to:\t\t\t\t\t<a href="#mw-head">navigation</a>, \t\t\t\t\t<a href="#p-search">search</a>\n</div>\n<div class="mw-content-ltr" dir="ltr" id="mw-content-text" lang="en"><table class="plainlinks metadata ambox ambox-content ambox-Unreferenced" role="presentation">\n<tr>\n<td class="mbox-image">\n<div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upl

### Count the number of each tag in each document and then combine them to find the most common tags:

In [148]:
def count_tags(html):
    """Count how many times each tag name occurs in ``html``.

    Returns a plain dict mapping tag name to its number of occurrences in
    the parsed document.
    """
    document = BeautifulSoup(html, "html.parser")
    tally = {}
    # find_all() with no filter yields every tag in the document.
    for element in document.find_all():
        tally[element.name] = tally.get(element.name, 0) + 1
    return tally

start = time.time()
# Use the executor as a context manager so its worker processes are shut down
# when the block exits instead of lingering after each re-run of this cell.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    # One {tag name: count} dict per document, in the order of content_list.
    tags_list = list(pool.map(count_tags, content_list))

# Merge the per-document counts into one overall total per tag name.
tags_total = {}
for tl in tags_list:
    for k, v in tl.items():
        tags_total[k] = tags_total.get(k, 0) + v
end = time.time()
print(end - start)
tags_total
        

52.65461754798889


{'a': 214557,
 'abbr': 3665,
 'annotation': 2,
 'area': 39,
 'audio': 2,
 'b': 14455,
 'bdi': 4,
 'big': 75,
 'blockquote': 58,
 'body': 999,
 'br': 4986,
 'caption': 200,
 'center': 64,
 'cite': 3563,
 'code': 108,
 'dd': 1376,
 'del': 2,
 'div': 58927,
 'dl': 457,
 'dt': 334,
 'font': 40,
 'form': 999,
 'h1': 999,
 'h2': 5044,
 'h3': 11954,
 'h4': 117,
 'h5': 4,
 'h6': 1,
 'head': 999,
 'hr': 51,
 'html': 999,
 'i': 18246,
 'img': 8699,
 'input': 3996,
 'label': 999,
 'li': 133277,
 'link': 12985,
 'map': 2,
 'math': 2,
 'meta': 4499,
 'mo': 2,
 'mrow': 2,
 'mstyle': 2,
 'noscript': 999,
 'ol': 858,
 'p': 7998,
 'pre': 1,
 'q': 76,
 'rb': 16,
 'rp': 32,
 'rt': 16,
 'ruby': 16,
 's': 10,
 'samp': 2,
 'script': 4995,
 'semantics': 2,
 'small': 3272,
 'source': 2,
 'span': 75342,
 'strong': 599,
 'sub': 151,
 'sup': 11157,
 'table': 4010,
 'td': 57673,
 'th': 14472,
 'title': 999,
 'tr': 27300,
 'u': 51,
 'ul': 24147,
 'wbr': 85}

### Find common words:

In [151]:
from bs4 import BeautifulSoup
from collections import Counter
import re

def count_words(html):
    """Return the ten most common words of five or more characters in ``html``.

    The markup is parsed, its text is lower-cased, and every run of
    non-word characters is collapsed to a single space before splitting
    into words.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(10)``; fewer than ten if the text has fewer
        distinct qualifying words.
    """
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    # Raw string: "\W" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions. The pattern
    # seen by the regex engine is unchanged.
    text = re.sub(r"\W+", " ", text.lower())
    # Splitting on a single space can yield empty strings at the ends; the
    # length filter discards them along with all short words.
    words = [w for w in text.split(" ") if len(w) >= 5]
    return Counter(words).most_common(10)

start = time.time()
# Use the executor as a context manager so its worker processes are shut down
# when the block exits instead of lingering after each re-run of this cell.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    # One top-ten list of (word, count) tuples per document.
    words = list(pool.map(count_words, parsed))

# For each word, tally the number of documents whose top-ten list contains it.
# NOTE(review): the per-document `count` is deliberately left unused here to
# keep the existing results; if total occurrences were intended, this should
# add `count` instead of 1 - confirm.
word_counts = {}
for wc in words:
    for word, count in wc:
        word_counts[word] = word_counts.get(word, 0) + 1
end = time.time()

print(end - start)
word_counts

37.1418571472168


{'ixtlán': 1,
 'france': 9,
 'admiral': 1,
 'poland': 8,
 'chiddingstone': 1,
 'brahma': 2,
 'saratov': 1,
 'clash': 1,
 'desai': 1,
 'global': 3,
 'sofia': 1,
 'armançon': 1,
 'banovina': 1,
 'audacity': 1,
 'forrest': 1,
 'photography': 1,
 'angiosperm': 1,
 'series': 18,
 'siromahov': 1,
 'undrafted': 1,
 'swedish': 4,
 'devarampally': 1,
 'rolli': 1,
 'community': 13,
 'deformity': 1,
 'apertura': 1,
 'phorids': 1,
 'wright': 1,
 '59861': 1,
 'touyour': 1,
 'square': 6,
 'niigata': 2,
 'pokal': 1,
 'kansas': 4,
 'mcdonald': 2,
 'immune': 1,
 'defense': 5,
 'between': 4,
 'malchin': 1,
 'stang': 1,
 'maccabees': 1,
 'clade': 6,
 'lötschberg': 1,
 'watsonville': 1,
 'plains': 2,
 'touring': 1,
 'homeobox': 1,
 'sciences': 1,
 'kunqu': 1,
 'economics': 1,
 'columns': 1,
 'mughals': 1,
 'casino': 1,
 'bohra': 1,
 'revolt': 1,
 'impressionist': 1,
 'swathi': 1,
 'coordinates': 29,
 'amphibian': 1,
 'compound': 1,
 'record': 5,
 'trusts': 1,
 'shooting': 1,
 'aliabad': 2,
 'quarter': 2,
