In [149]:
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/

PATH_WIKI_XML = 'D:\\things'
FILENAME_WIKI = 'simplewiki-20191101-pages-meta-current.xml'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
FILENAME_FULL_ARTICLES = 'full_articles.csv'
ENCODING = "utf-8"


In [359]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
from bs4 import BeautifulSoup

In [3]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [4]:
def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

In [35]:


pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = os.path.join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
title = None
start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'text'])
    redirectWriter.writerow(['id', 'title', 'text'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                redirect = ''
                text = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.attrib['title']
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'text' and elem.text:
                text += elem.text
            elif tname == 'page':
                totalCount += 1

                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([id, title])
                elif len(redirect) > 0:
                    articleCount += 1
                    articlesWriter.writerow([id, title, text.replace(',', ',')])
                    
                else:
                    redirectCount += 1
                    redirectWriter.writerow([id, title, text.replace(',', ',')])
#                     if title == "Australia":
#                         OzText = text
                    

#                 if totalCount > 100:
#                  break

                if totalCount > 1 and (totalCount % 100000) == 0:
                    print("{:,}".format(totalCount))

            elem.clear()

elapsed_time = time.time() - start_time

print("Total pages: {:,}".format(totalCount))
print("Template pages: {:,}".format(templateCount))
print("Article pages: {:,}".format(articleCount))
print("Redirect pages: {:,}".format(redirectCount))
print("Elapsed time: {}".format(hms_string(elapsed_time)))

100,000
200,000
300,000
400,000
500,000
Total pages: 521,853
Template pages: 23,708
Article pages: 65,201
Redirect pages: 432,944
Elapsed time: 0:01:29.27


In [13]:
import pandas as pd

In [36]:
dfArticlesRedirect = pd.read_csv(pathArticlesRedirect)

In [37]:
dfArticlesRedirect.shape

(432944, 3)

In [57]:
dfArticlesRedirect.head()

Unnamed: 0,id,title,text
0,1,April,{{monththisyear|4}}\n'''April''' is the 4th [[...
1,2,August,{{monththisyear|8}}\n'''August''' (Aug.) is th...
2,3,User:Angela,"<div style=""background-color: #F8F2F2; margin:..."
3,4,User talk:Angela,Older talk has been [[User talk:Angela/Archive...
4,5,User:Anthere,"Hello :-)\n\nIf you want to contact me, please..."


In [67]:
dfArticlesRedirect['text'] = [str(t) for t in dfArticlesRedirect['text']]

In [52]:
sum([len(str(t)) for t in dfArticlesRedirect['text']])/dfArticlesRedirect.shape[0]

2014.2192662330463

In [69]:
sum([l < 100 for l in [len(t) for t in dfArticlesRedirect['text']]])

57384

In [81]:
sum(['User' in title[:5] for title in dfArticlesRedirect['title']])

203924

In [68]:
sum(['{{Infobox' in t for t in dfArticlesRedirect['text']])

52099

In [83]:
dfArticlesRedirect.loc[[t[:5] != 'User:' for t in dfArticlesRedirect['title']]].shape

(410158, 3)

In [137]:
dfArticlesRedirect.loc[[t[:7] != 'User ta' for t in dfArticlesRedirect['title']]].shape

(251811, 3)

In [143]:
dfOnlyArticles = dfArticlesRedirect.loc[[t[:7] != 'User ta' for t in dfArticlesRedirect['title']]]

In [145]:
dfOnlyArticles.shape

(251811, 3)

In [146]:
dfOnlyArticles = dfOnlyArticles.loc[[t[:5] != 'User:' for t in dfOnlyArticles['title']]]

In [400]:
dfOnlyArticles.shape

(229025, 3)

In [401]:
dfOnlyArticles = dfOnlyArticles.loc[[t[:5] != 'Talk:' for t in dfOnlyArticles['title']]]

In [402]:
dfOnlyArticles.shape

(202607, 3)

In [405]:
dfOnlyArticles = dfOnlyArticles.loc[[t[:10] != 'Wikipedia:' for t in dfOnlyArticles['title']]]

In [406]:
dfOnlyArticles.shape

(197377, 3)

In [148]:
dfOnlyArticles.head()

Unnamed: 0,id,title,text
0,1,April,{{monththisyear|4}}\n'''April''' is the 4th [[...
1,2,August,{{monththisyear|8}}\n'''August''' (Aug.) is th...
5,6,Art,[[Category:Art| ]]\n[[Category:Non-verbal comm...
6,8,A,{{more sources|date=February 2012}}\n{{about| ...
7,9,Air,{{dablink|Air is one of the four [[classical e...


In [150]:
pathFullArticles = os.path.join(PATH_WIKI_XML, FILENAME_FULL_ARTICLES)
dfOnlyArticles.to_csv(pathFullArticles)

In [128]:
infobox_start = OzText.find("nfobox") - 1
if infobox_start is -1:
    print("No infobox")
bracket_count = 0
end_index = infobox_start
for i in range(infobox_start, len(OzText)):
    char = OzText[i]
    if char is "}":
        bracket_count -= 1
    elif char is "{":
        bracket_count += 1
    if bracket_count is -2:
        # reached end of info box
        end_index = i - 1
        break

In [151]:
len(OzText[infobox_start:end_index])

7845

In [168]:
OzInfobox = OzText[infobox_start:end_index]

In [152]:
len(OzText[end_index+2:])

32002

In [169]:
OzArticle = OzText[end_index+2:]

In [153]:
infobox_content_list = OzText[infobox_start:end_index].splitlines()

In [154]:
infobox_content_list

['Infobox country',
 '|conventional_long_name = Commonwealth of Australia',
 '|common_name        = Australia',
 '|image_flag         = Flag of Australia (converted).svg',
 '|alt_flag           = A blue field with the onion Flag in the upper hoist quarter, a large white seven-pointed star in the lower hoist quarter, and constellation of five white stars in the fly – one small five-pointed star and four, larger, seven-pointed stars.',
 '|image_coat         = Coat of Arms of Australia.svg',
 '|alt_coat           = <!--alt text for coat of arms-->',
 '|national_anthem    = "[[Advance Australia Fair]]"{{lower|0.2em|{{refn|Australia\'s [[royal anthem]] is "[[God Save the Queen]]", played in the presence of a member of the [[House of Windsor|Royal family]] when they are in Australia. In other contexts, the [[national anthem]] of Australia, "[[Advance Australia Fair]]", is played.<ref>[http://www.itsanhonour.gov.au/symbols/anthem.cfm It\'s an Honour – Symbols – Australian National Anthem]  {{

In [158]:
def unmatched_bracket(text):
    """
    Returns true if there is an unmatched bracket
        this is a sentence {with a bracket } - false
        this is a sentence {with a bracket } and {this - true
    """
    for c in reversed(text):
        if c is "}":
            return False
        elif c is "{":
            return True

In [159]:
page_title = 'Australia'
page_text = OzText


# need to handle the first line special
# this could be of the forms:
"""
{{infobox| above       = Arizona v. California
{{Infobox SCOTUS case
or many other variations
"""
if len(infobox_content_list[0].split("|", 1)) == 2:
    # need to add pipe back in after the split
    # pipe is needed later on
    infobox_content_list[0] = "|" + infobox_content_list[0].split("|")[1]
else:
    infobox_content_list = infobox_content_list[1:]

infobox_merged_content = []
for line in infobox_content_list:
    line = line.strip()
    if len(line) is 0:
        continue
    elif line[0] is "|":
        infobox_merged_content.append(line)
    else:
        infobox_merged_content[-1] += " " + line

infobox = {"title": page_title}
for entry in infobox_merged_content:
    key_data = entry.strip().split("=", 1)  # only split on first "="

    # multiple strips because it might look like
    # "    | Key = Text"
    # Need to remove up to |, remove bar, and then strip again
    key = key_data[0].lstrip()[1:].strip()  # removes "|"

    if len(key) > 0 and key[0] is "|":
        # sometimes have duplicate |'s because why would this be easy
        # eg: ||NotParticipating=Stewart and Fortas
        key = key[1:]

    data = ""

    if len(infobox) > 0 and unmatched_bracket(infobox[list(infobox.keys())[-1]]):
        infobox[list(infobox.keys())[-1]] += " " + key
        continue
        # there is no "="
        # this could be due to being part of a list
        """
            |Holding={{Ordered list|style=text-align: left
              | The forced extraction and analysis of a blood sample is not compelled testimony and therefore does not violate the Fifth Amendment Right against self-incrimination
              | Intrusions into the human body require a warrant
              | Here, the warrantless blood test was permissible under the exigent circumstances exception to prevent the destruction of alcohol in the blood stream through the body's natural metabolic processes
          }}

          or

            | Holding           = {{ordered list |style=text-align: left;
                |1=States may not prohibit citizens from contracting insurance out of state for acts performed outside the state.
                |2=States may not prohibit citizens from contracting insurance out of state by written communication, even if the property to be insured is within the state.
                }}

          Note the above has "="
        """

    if len(key_data) == 2:
        data = key_data[1].strip()

    infobox[key] = data

In [167]:
for k, v in infobox.items():
    print ("\t\t***", k)
    print ("\t", v)

		*** population_estimate_year
	 {{CURRENTYEAR}}
		*** Gini_change
	 <!--increase/decrease/steady-->
		*** leader_name3
	 [[Scott Morrison]]
		*** leader_title3
	 [[Prime Minister of Australia|Prime Minister]]
		*** population_density_rank
	 236th
		*** sovereignty_note
	 from the [[United Kingdom of Great Britain and Ireland|United Kingdom]]
		*** Gini_rank
	 19th
		*** drives_on
	 [[Right- and left-hand traffic#Australia|left]]
		*** established_event2
	 [[Statute of Westminster Adoption Act 1942|Statute of Westminster Adoption Act]]
		*** demonym
	 {{hlist|[[Australians|Australian]] <br />[[Aussie]] (colloquial)<ref>See entry in the [[Macquarie Dictionary]].</ref><ref>{{cite book |title=[[Collins English Dictionary]] |year=2009 |publisher=[[HarperCollins]] |location=Bishopbriggs, Glasgow |isbn=978-0-00-786171-2 |page=18 }}</ref><!--end hlist:-->}}
		*** alt_map
	 <!--alt text for map-->
		*** currency_code
	 AUD
		*** GDP_nominal_year
	 2017
		*** calling_code
	 [[+61]]
		*** Gini_y

In [170]:
import re

In [257]:
re.findall("\[\[[\w]*\]\]", OzArticle)[:10]

['[[country]]',
 '[[Oceania]]',
 '[[Canberra]]',
 '[[Sydney]]',
 '[[Australasia]]',
 '[[Australasia]]',
 '[[ecozone]]',
 '[[Oceania]]',
 '[[population]]',
 '[[Sydney]]']

In [256]:
[item.strip("[]") for item in re.findall("\[\[[(\w\|)*?\w\s]*\]\]", OzArticle)][:10]

['country',
 'sovereign state',
 'southern hemisphere',
 'Oceania',
 'capital city',
 'Canberra',
 'Sydney',
 'List of countries by area|sixth biggest country in the world',
 'Oceania|Oceanic',
 'Australasia']

In [254]:
entities = []
for entity in [item.strip("[]") for item in re.findall("\[\[[(\w\|)*?\w]*\]\]", OzArticle)]:
    if "|" in entity:
        for sub in entity.split("|"):
            entities.append(sub)
    else:
        entities.append(entity)

In [224]:
article_titles = [title.lower() for title in list(dfOnlyArticles.title)]

In [255]:
for entity in entities:
    if entity.lower() not in article_titles:
        print(entity)

Oceanic
animals
plants
Britain
cities
Batavia
gaol
jail
colonies
multicultural
immigrant
Colonisation
colonised
Britain
multicultural
digeridoo
cycling
WNBL
sports
swimmer


In [218]:
s = "multiculturalism|multicultural"

In [220]:
s.split("|")

['multiculturalism', 'multicultural']

In [231]:
def remove_text_inside_brackets(text, brackets="{}"):
    count = [0] * (len(brackets) // 2) # count open/close brackets
    saved_chars = []
    for character in text:
        for i, b in enumerate(brackets):
            if character == b: # found bracket
                kind, is_close = divmod(i, 2)
                count[kind] += (-1)**is_close # `+1`: open, `-1`: close
                if count[kind] < 0: # unbalanced bracket
                    count[kind] = 0  # keep it
                else:  # found bracket to remove
                    break
        else: # character is not a [balanced] bracket
            if not any(count): # outside brackets
                saved_chars.append(character)
    return ''.join(saved_chars)

In [232]:
remove_text_inside_brackets(OzArticle)

'\n\'\'\'Australia\'\'\', formally the \'\'\'Commonwealth of Australia\'\'\', is a [[country]] and [[sovereign state]] in the [[southern hemisphere]], located in [[Oceania]]. Its [[capital city]] is [[Canberra]], and its largest city is [[Sydney]].\n\nAustralia is the [[List of countries by area|sixth biggest country in the world]] by land area, and is part of the [[Oceania|Oceanic]] and [[Australasia]]n regions. Australia, [[New Zealand]], [[New Guinea]] and other islands on the Australian [[tectonic plate]] are together called [[Australasia]], which is one of the world\'s great [[ecozone]]s. When other Pacific islands are included with Australasia, it is called [[Oceania]].\n\n25 million<ref></ref> people live in Australia, and about 80% of them live on the east coast. The country is divided up into six [[States of Australia|states]] and two territories, and more than half of Australia\'s [[population]] lives in and around the cities of [[Sydney]], [[Melbourne]], [[Brisbane]], [[Pert

In [233]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [234]:
strip_tags(remove_text_inside_brackets(OzArticle))

'\n\'\'\'Australia\'\'\', formally the \'\'\'Commonwealth of Australia\'\'\', is a [[country]] and [[sovereign state]] in the [[southern hemisphere]], located in [[Oceania]]. Its [[capital city]] is [[Canberra]], and its largest city is [[Sydney]].\n\nAustralia is the [[List of countries by area|sixth biggest country in the world]] by land area, and is part of the [[Oceania|Oceanic]] and [[Australasia]]n regions. Australia, [[New Zealand]], [[New Guinea]] and other islands on the Australian [[tectonic plate]] are together called [[Australasia]], which is one of the world\'s great [[ecozone]]s. When other Pacific islands are included with Australasia, it is called [[Oceania]].\n\n25 million people live in Australia, and about 80% of them live on the east coast. The country is divided up into six [[States of Australia|states]] and two territories, and more than half of Australia\'s [[population]] lives in and around the cities of [[Sydney]], [[Melbourne]], [[Brisbane]], [[Perth, Western 

In [347]:
print(OzArticle)


'''Australia''', formally the '''Commonwealth of Australia''', is a [[country]] and [[sovereign state]] in the [[southern hemisphere]], located in [[Oceania]]. Its [[capital city]] is [[Canberra]], and its largest city is [[Sydney]].

Australia is the [[List of countries by area|sixth biggest country in the world]] by land area, and is part of the [[Oceania|Oceanic]] and [[Australasia]]n regions. Australia, [[New Zealand]], [[New Guinea]] and other islands on the Australian [[tectonic plate]] are together called [[Australasia]], which is one of the world's great [[ecozone]]s. When other Pacific islands are included with Australasia, it is called [[Oceania]].

25 million<ref>{{Cite web|url=http://www.abs.gov.au/websitedbs/D3310114.nsf/home/25+Million+Population+Milestone|title=Australian Bureau of Statistics web site|last=Statistics|first=c=AU; o=Commonwealth of Australia; ou=Australian Bureau of|website=www.abs.gov.au|language=en|access-date=2018-09-25}}</ref> people live in Australia

In [263]:
[item.strip("[]") for item in re.findall("\[\[[(\w\|)*?\w\s]*\]\]", strip_tags(remove_text_inside_brackets(OzArticle)))]

['country',
 'sovereign state',
 'southern hemisphere',
 'Oceania',
 'capital city',
 'Canberra',
 'Sydney',
 'List of countries by area|sixth biggest country in the world',
 'Oceania|Oceanic',
 'Australasia',
 'New Zealand',
 'New Guinea',
 'tectonic plate',
 'Australasia',
 'ecozone',
 'Oceania',
 'States of Australia|states',
 'population',
 'Sydney',
 'Melbourne',
 'Brisbane',
 'Adelaide',
 'mining',
 'wool',
 'bauxite',
 'emblem',
 'animals',
 'plants',
 'kangaroo',
 'koala',
 'emu',
 'kookaburra',
 'platypus',
 'Australian Aborigine',
 'History of Australia',
 'Britain',
 'Prime Minister of Australia',
 'Edmund Barton',
 'United Nations',
 'Commonwealth of Nations',
 'Parliamentary system|parliamentary democracy',
 'constitutional monarchy',
 'Queen Elizabeth II|Elizabeth II',
 'New South Wales',
 'Queensland',
 'South Australia',
 'Tasmania',
 'Victoria (Australia)|Victoria',
 'Western Australia',
 'Northern Territory',
 'Australian Capital Territory',
 'Sydney',
 'Melbourne',
 

In [252]:
print(len([i for i in [item.strip("[]") for item in re.findall("\[\[[(\w\|)*?\w\s]*\]\]", strip_tags(remove_text_inside_brackets(OzArticle)))]]),
len([i for i in [item.strip("[]") for item in re.findall("\[\[[(\w\|)*?\w\s]*\]\]", strip_tags(remove_text_inside_brackets(OzArticle)))] if "|" not in i]),
len([i for i in [item.strip("[]") for item in re.findall("\[\[[(\w\|)*?\w\s]*\]\]", strip_tags(remove_text_inside_brackets(OzArticle)))] if "|" in i]))

267 227 40


In [253]:
len(re.findall("\[", strip_tags(remove_text_inside_brackets(OzArticle))))/2

312.0

In [259]:
remove_text_inside_brackets(strip_tags(remove_text_inside_brackets(OzArticle)),brackets="[]")

'\n\'\'\'Australia\'\'\', formally the \'\'\'Commonwealth of Australia\'\'\', is a  and  in the , located in . Its  is , and its largest city is .\n\nAustralia is the  by land area, and is part of the  and n regions. Australia, ,  and other islands on the Australian  are together called , which is one of the world\'s great s. When other Pacific islands are included with Australasia, it is called .\n\n25 million people live in Australia, and about 80% of them live on the east coast. The country is divided up into six  and two territories, and more than half of Australia\'s  lives in and around the cities of , , ,  and .\n\nAustralia is known for its (coal, iron, gold, diamonds and crystals), its production of , and as the world\'s largest producer of . Its  is a flower called the Golden Wattle.\n\n== Geography ==\n\n\n\nAustralia\'s landmass of 7,617,930 square kilometers is on the .\nThe continent of Australia, including the island of Tasmania, was separated from the other continents o

In [269]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(OzArticle)

blob = remove_text_inside_brackets(soup.get_text())


In [271]:
regex = re.compile("\[\[[\w\s]*\]\]")
for line in blob:
    print(regex.sub(,,line))

'''Australia''', formally the '''Commonwealth of Australia''', is a [[country]] and [[sovereign state]] in the [[southern hemisphere]], located in [[Oceania]]. Its [[capital city]] is [[Canberra]], and its largest city is [[Sydney]].

Australia is the [[List of countries by area|sixth biggest country in the world]] by land area, and is part of the [[Oceania|Oceanic]] and [[Australasia]]n regions. Australia, [[New Zealand]], [[New Guinea]] and other islands on the Australian [[tectonic plate]] are together called [[Australasia]], which is one of the world's great [[ecozone]]s. When other Pacific islands are included with Australasia, it is called [[Oceania]].

25 million people live in Australia, and about 80% of them live on the east coast. The country is divided up into six [[States of Australia|states]] and two territories, and more than half of Australia's [[population]] lives in and around the cities of [[Sydney]], [[Melbourne]], [[Brisbane]], [[Perth, Western Australia|Perth]] and

In [279]:
regex = re.compile("\[\[[\w\s]*\]\]")
match = regex.search(OzArticle)

In [364]:
def clean_article(article):
    soup = BeautifulSoup(article)
    blob = remove_text_inside_brackets(soup.get_text())
    for result in re.findall("\[\[[\w\s',-.\?]*\]\]", blob):
        blob = blob.replace(result, result[2:-2])
    # TODO: handle the split("|")[1] as alternate names for NER, see if in list of articles, add to list of entities, associate with
    # main name via is-a dictionary?
    for result in re.findall("\[\[[(\w\|,'-.\?)*?\w\s]*\]\]", blob):
        blob = blob.replace(result, result[2:-2].split("|")[0])
    for result in re.findall("\[?\[\w*:.*\]", blob):
        blob = blob.replace(result, '')
    if re.search("=*?=\s?(R|r)eferences\s?=*?=", blob):
        blob = blob[:re.search("=*?=\s?(R|r)eferences\s?=*?=", blob).start()]
    return blob

In [361]:
import copy
dfCleanArticles = copy.copy(dfOnlyArticles)

In [365]:
dfCleanArticles['text'] = dfCleanArticles['text'].apply(lambda x: clean_article(x))

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [420]:
# clean up id numbers because we deleted a lot of stuff
dfCleanArticles['id'] = [t + 1 for t in range(len(dfCleanArticles['id']))]

In [421]:
dfCleanArticles.set_index('id')

Unnamed: 0_level_0,title,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,April,"\n'''April''' is the 4th month of the year, an..."
2,August,\n'''August''' (Aug.) is the 8th month of the ...
3,Art,\n\n\n\nArt and crafts is a creative activity ...
4,A,\n\n\n\n\n'''A''' is the first letter of the E...
5,Air,\n\n\n\n'''Air''' is the Earth's atmosphere. A...
6,Autonomous communities of Spain,Spain is divided in 17 parts called '''autonom...
7,Alan Turing,\n\n\n'''Alan Mathison Turing''' Order of the ...
8,Alanis Morissette,\n'''Alanis Nadine Morissette''' (born 1 June ...
9,Adobe Illustrator,\n\n'''''Adobe Illustrator''''' is a computer ...
10,Andouille,\n\n\n'''Andouille''' is a type of pork sausag...


In [425]:
dfCleanArticles.to_csv('CleanFullArticles.csv')

In [404]:
sum(["Wikipedia:" in title for title in dfCleanArticles.title])

5230

In [399]:
for title in dfCleanArticles.title:
    if "Talk:" not in title[:7] and "Talk:" in title:
        print(title)

Wikipedia:Requests for deletion/Requests/2014/Talk:Battle of Saratoga
Wikipedia:Requests for deletion/Requests/2014/Talk:Combat sport
Wikipedia:Requests for deletion/Requests/2015/Talk:Hilbert's paradox of the Grand Hotel
Wikipedia:Requests for deletion/Requests/2016/Talk:Melody Thomas Scott


In [403]:
for title in dfCleanArticles.title:
    if "Wikipedia:" in title:
        print(title)

Wikipedia:Administrators
Wikipedia:Basic English alphabetical wordlist
Wikipedia:Basic English international wordlist
Wikipedia:Wikipedia down
Wikipedia:Copyrights
Wikipedia:List of 1000 basic words
Wikipedia:Neutral point of view
Wikipedia:Standard messages
Wikipedia:Simple English GFDL
Wikipedia:Simple English Wikipedia
Wikipedia:Spelling
Wikipedia:Text of the GNU Free Documentation License
Wikipedia:Useful
Wikipedia:Username
Wikipedia:Wikipedians
Wikipedia:What Wikipedia is not
Wikipedia:Mailing lists
Wikipedia:MediaWiki namespace
Wikipedia:Special pages
Wikipedia:Deletion log
Wikipedia:Basic English ordered wordlist
Wikipedia:Protection log
Wikipedia:Statistics
Wikipedia:List of common misspellings
Wikipedia:Template messages
Wikipedia:RecentChanges
Wikipedia:Vandalism in progress
Wikipedia:Announcements
Wikipedia:Rules
Wikipedia:About
Wikipedia:Bug reports
Wikipedia:Contact us
Wikipedia:Deletion review
Wikipedia:Disambiguation
Wikipedia:Blocks and bans
Wikipedia:General disclaimer

In [407]:
dfCleanArticles = dfCleanArticles.loc[[t[:5] != 'Talk:' for t in dfCleanArticles['title']]]
dfCleanArticles = dfCleanArticles.loc[[t[:10] != 'Wikipedia:' for t in dfCleanArticles['title']]]

In [345]:
re.findall("\[?\[\w*:.*\]", blob)

['[[File:As-map.png|thumbnail|left|Map of Australia]]',
 '[[File:Corroboree.jpg|thumb|Photograph of Arrernte men of Central Australia in a Corroboree in 1900.]]',
 '[[File:The Founding of Australia. By Capt. Arthur Phillip R.N. Sydney Cove, Jan. 26th 1788.jpg|thumb|Captain Arthur Phillip raises the British flag at Sydney in 1788.]]',
 '[[File:Ln-Governor-Lachlan macquarie.jpg|thumb|Governor Lachlan Macquarie was the 5th governor of New South Wales and one who though that Australia could be a rich and free place.]]',
 '[http://foundingdocs.gov.au/item.asp?dID=8 Documenting Democracy]',
 '[[File:Opening of the first parliament.jpg|thumb|300px|A painting of the opening of the first Parliament of Australia, 9 May 1901, painted by Tom Roberts. Australia has had democracy since the 1850s.]]',
 '[[File:Australian PR COB 2006.PNG|right|300px|thumb|Countries of birth of Australian estimated resident population, 2006.Source:Australian Bureau of Statistics[http://www.ausstats.abs.gov.au/ausstats/

In [309]:
for result in re.findall("\[\[[(\w\|)*?\w\s]*\]\]", blob):
    print(result[2:-2].split("|")[0])

List of countries by area
Oceania
States of Australia
Parliamentary system
Queen Elizabeth II
Victoria (Australia)
Australian Aborigine
Dutch people
Venus (planet)
George III of the United Kingdom
Blue Mountains (New South Wales)
William Lawson (explorer)
Henty Brothers
Parliamentary democracy
colony
World War I
Gallipoli Campaign
Japanese Occupation of Singapore
Liberal Party (Australia)
Australian Labor Party
Australian Senate
Liberal Party of Australia
Australian cinema
Australian music
2000 Summer Olympics
Federation
Colonisation
multiculturalism
English language
Australian literature
Peter Carey (novelist)
Paul Kelly (musician)
Australian cinema
Charles Chauvel (filmmaker)
The Man from Snowy River (1982 movie)
Australia (2008 movie)
1956 Summer Olympics
Grand Slam (tennis)
National Basketball League (Australasia)
swimming
Sportsperson
