# Analyzing Wikipedia Pages

In this guided project, you'll work with data scraped from Wikipedia.The data pulls all html lines of code within a file. The goal here will be to use map reduce functions to run word count within each line

In [22]:
# listing all files in wiki folder
import pandas as pd
import os
file_names = os.listdir("wiki")
for file in file_names:
    print(file)

Bay_of_ConcepciC3B3n.html
Bye_My_Boy.html
Valentin_Yanin.html
Kings_XI_Punjab_in_2014.html
William_Harvey_Lillard.html
Radial_Road_3.html
George_Weldrick.html
Zgornji_Otok.html
Blue_Heelers_(season_8).html
Taggen_Nunatak.html
Henri_BraqueniC3A9.html
Vrila.html
William_Henry_Porter.html
Clive_Brown_(footballer).html
Blick_nach_Rechts.html
Central_District_(Rezvanshahr_County).html
Alexios_Aspietes.html
Mei_Lanfang.html
Wangeroogeclass_tug.html
Dowell_Philip_O27Reilly.html
Coalville_Town_railway_station.html
Gennady_Lesun.html
Bartrum_Glacier.html
Victor_S._Mamatey.html
Gottfried_Keller.html
Table_Point_Formation.html
Nobuhiko_Ushiba.html
Master_of_Space_and_Time.html
Early_medieval_states_in_Kazakhstan.html
Eressa_aperiens.html
Myrtle_(sternwheeler).html
Abanycha_bicolor.html
JeecyVea.html
Aubrey_Fair.html
Ingrid_GuimarC3A3es.html
Urban_chicken.html
Elgin_National_Watch_Company.html
AlMidan.html
Antae_temple.html
Metis_Institute_of_Polytechnic.html
Sverre_Solberg.html
John_Reid_(British

In [2]:
# total amount of wiki files
import os
file_names = os.listdir("wiki")
print(len(file_names))

999


In [3]:
# creating a list of lines for a file
folder_name = "wiki"
file_name = "Bay_of_ConcepciC3B3n.html"
with open(os.path.join(folder_name, file_name)) as f:
    lines = [line for line in f.readlines()]
lines

['<!DOCTYPE html>\n',
 '<html class="client-nojs" lang="en" dir="ltr">\n',
 '<head>\n',
 '<meta charset="UTF-8"/>\n',
 '<title>Bay of Concepción - Wikipedia</title>\n',
 '<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n',
 '<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""

### Creating chunk and map reduce functions

In [4]:
# defining the map_chunks and map_reduce functions
import math
import functools
from multiprocessing import Pool

def make_chunks(data, num_chunks):
    chunk_size = math.ceil(len(data) / num_chunks)
    return [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    chunks = make_chunks(data, num_processes)
    pool = Pool(num_processes)
    chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)

### Using map reduce to count all lines

In [5]:
# defining a function to count all lines in each file and a reduce function
def count_lines(file_names):
    total_lines = 0
    for file in file_names:
        with open(os.path.join(folder_name, file)) as f:
            lines = [line for line in f.readlines()]
            total_lines += len(lines)
    return total_lines

def lines_reduce(count1,count2):
    total = count1+count2
    return total

In [6]:
count_lines(file_names) # checking total lines w/ the function

499797

In [7]:
# running total lines with map reduce and confirming it equals the same as the count_lines function
line_count_reduce = map_reduce(file_names, 4, count_lines, lines_reduce)
line_count_reduce

499797

### Using map reduce to find word counts in all lines

In [8]:
# defining a function to find a specific word

def find_word(file_names):
    file_counts = {}
    for file in file_names:
        with open(os.path.join(folder_name, file)) as f:
            lines = [line for line in f.readlines()]
            for index, line in enumerate(lines):
                if "data" in line:
                    if file not in file_counts:
                        file_counts[file] = [index]
                    file_counts[file].append(index)
    return file_counts

In [9]:
print(find_word(file_names)) # checking to make sure the function works

{'Bay_of_ConcepciC3B3n.html': [6, 6, 45, 58, 60, 62, 105, 188, 205], 'Bye_My_Boy.html': [276, 276, 359, 376], 'Valentin_Yanin.html': [101, 101, 144, 227, 244], 'Kings_XI_Punjab_in_2014.html': [221, 221, 229, 237, 245, 253, 269, 277, 293, 301, 317, 325, 341, 374, 376, 381, 383, 388, 390, 395, 397, 402, 564, 647, 664], 'William_Harvey_Lillard.html': [45, 45, 65, 81, 129, 212, 229], 'Radial_Road_3.html': [52, 52, 103, 301, 505, 588, 605], 'George_Weldrick.html': [194, 194, 277, 294], 'Zgornji_Otok.html': [6, 6, 53, 55, 65, 69, 211, 260, 262, 311, 394, 411], 'Blue_Heelers_(season_8).html': [49, 49, 79, 82, 105, 107, 125, 127, 133, 135, 141, 143, 660, 695, 730, 739, 886, 969, 986], 'Taggen_Nunatak.html': [6, 6, 44, 46, 48, 93, 176, 193], 'Henri_BraqueniC3A9.html': [43, 43, 46, 92, 175, 192], 'Vrila.html': [6, 6, 57, 59, 69, 73, 99, 100, 102, 151, 234, 251], 'William_Henry_Porter.html': [48, 48, 88, 171, 188], 'Clive_Brown_(footballer).html': [146, 146, 229, 246], 'Blick_nach_Rechts.html': [

In [10]:
# creating a reduce function

def reduce_counts(lines1, lines2):
    lines1.update(lines2)
    return lines1

case_sens_counts = map_reduce(file_names, 4, find_word, reduce_counts)


In [11]:
# confirming the result matches running all data at once
map_reduce(file_names, 4, find_word, reduce_counts) == find_word(file_names)

True

### Updating word find to input words and ignore case

In [12]:
# defining a function to find a specific word

word = "data"

def find_word_lower(file_names):
    file_counts = {}
    for file in file_names:
        with open(os.path.join(folder_name, file)) as f:
            lines = [line for line in f.readlines()]
            for index, line in enumerate(lines):
                line_lower = line.lower()
                if word in line_lower:
                    if file not in file_counts:
                        file_counts[file] = [index]
                    file_counts[file].append(index)
    return file_counts

In [13]:
ignore_case_counts = map_reduce(file_names, 4, find_word_lower, reduce_counts)

In [14]:
# confirming the total counts without case were more (one for each line)
total_no_case = sum(len(values) for values in map_reduce(file_names, 4, find_word, reduce_counts).values())
total_lower = sum(len(values) for values in map_reduce(file_names, 4, find_word_lower, reduce_counts).values())
print(f"Total with case {total_no_case:,}")
print(f"Total ignoring case {total_lower:,}")

Total with case 11,338
Total ignoring case 11,503


### Printing the file name and increase in matches ignoring case

In [15]:
# note len is used bc the dictionaries are indice values
for file in ignore_case_counts:
    if len(ignore_case_counts[file]) > len(case_sens_counts[file]):
        print(f" {len(ignore_case_counts[file]) - len(case_sens_counts[file])} more word count in {file}")
     

 1 more word count in Table_Point_Formation.html
 1 more word count in Ingrid_GuimarC3A3es.html
 2 more word count in Jules_Verne_ATV.html
 1 more word count in Pictogram.html
 2 more word count in Claire_Danes.html
 1 more word count in PTPRS.html
 1 more word count in A_Beautiful_Valley.html
 1 more word count in Mudramothiram.html
 2 more word count in Gordon_Bau.html
 1 more word count in Embraer_Unidade_GaviC3A3o_Peixoto_Airport.html
 3 more word count in Code_page_1023.html
 1 more word count in Cryptographic_primitive.html
 1 more word count in Alex_Kurtzman.html
 1 more word count in Filip_Pyrochta.html
 1 more word count in Morgana_King.html
 1 more word count in Don_Parsons_(ice_hockey).html
 1 more word count in Bias.html
 2 more word count in Tomohiko_ItC58D_(director).html
 1 more word count in Imperial_Venus_(film).html
 1 more word count in Camp_Nelson_Confederate_Cemetery.html
 1 more word count in Benny_Lee.html
 1 more word count in Kul_Gul.html
 1 more word count in 

### Updating mapper to find indice every every occurance of a word

In [16]:
# creating function to return indices each time a word occurs in a string
def find_indices(string, word):
    start = 0
    indices = []
    for i in range(len(string)):
        index = string.find(word, start)
        if index == -1:
            break
        else:
            indices.append(index)
            start = index + 1
    return indices

In [17]:
find_indices("i like tacos they're good taco taco", "taco")

[7, 33, 38]

In [50]:
# updating function to include indices of occurances for each line in each file

word = "science"

def find_word_lower(file_names):
    file_counts = {}
    for file in file_names:
        with open(os.path.join(folder_name, file)) as f:
            lines = [line for line in f.readlines()]
            file_counts[file] = []
            for index, line in enumerate(lines):
                line_lower = line.lower()
                matches = find_indices(line_lower, word)
                for match in matches:
                    file_counts[file].append((index,match))
    return file_counts

In [59]:
occurances = map_reduce(file_names, 4, find_word_lower, reduce_counts)

In [60]:
# looping through dictionary to create a readable pandas table
rows = []
for file, tuples in occurances.items():
    folder_name = "wiki"
    file_name = file
    with open(os.path.join(folder_name, file_name)) as f:
        lines = [line for line in f.readlines()]
    for tup in tuples:
        row = {
            "file" : file,
            "line" : tup[0],
            "index" : tup[1],
            "context": lines[tup[0]][tup[1]:]
        }
        rows.append(row)

In [64]:
# displaying results
df = pd.DataFrame(rows)
df

Unnamed: 0,file,line,index,context
0,Valentin_Yanin.html,6,840,"Sciences"",""Full Members of the Russian Academy..."
1,Valentin_Yanin.html,6,890,"Sciences"",""Demidov Prize laureates"",""Solzhenit..."
2,Valentin_Yanin.html,66,90,"Sciences"" class=""mw-redirect"" title=""Soviet Ac..."
3,Valentin_Yanin.html,66,145,"Sciences"">Soviet Academy of Sciences</a>; he b..."
4,Valentin_Yanin.html,66,173,Sciences</a>; he became a full academician in ...
5,Valentin_Yanin.html,144,1440,"Sciences"" title=""Category:Full Members of the ..."
6,Valentin_Yanin.html,144,1502,"Sciences"">Full Members of the USSR Academy of ..."
7,Valentin_Yanin.html,144,1548,"Sciences</a></li><li><a href=""/wiki/Category:F..."
8,Valentin_Yanin.html,144,1632,"Sciences"" title=""Category:Full Members of the ..."
9,Valentin_Yanin.html,144,1697,"Sciences"">Full Members of the Russian Academy ..."
