In [1]:
from py2neo import Graph
from bs4 import BeautifulSoup
import pywikibot
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import wikipedia
import spacy
import os
nlp = spacy.load("en_core_web_sm")
import gensim
from collections import defaultdict
from time import sleep
from gensim.parsing.preprocessing import strip_multiple_whitespaces, preprocess_string, remove_stopwords, strip_tags, strip_punctuation 

In [2]:
def categoryExist(name):
    """Check whether a category page on the online English Wikipedia has a category-links box.

    Downloads ``https://en.wikipedia.org/wiki/Category:<name>`` and searches the
    HTML for ``<div id="mw-normal-catlinks">``; finding that element is treated
    as proof that the category actually exists.

    Args:
        name: Category title without the ``Category:`` prefix.

    Returns:
        True when the div is present in the page, False otherwise.
    """
    response = requests.get("https://en.wikipedia.org/wiki/Category:" + name)
    catlinks = BeautifulSoup(response.content, "html.parser").find(
        "div", {"id": "mw-normal-catlinks"}
    )
    return bool(catlinks)

def getBadCategories(parent):
    """Scrape the categories listed in the category-links box of a parent category page.

    The page ``https://en.wikipedia.org/wiki/Category:<parent>`` is fetched and
    every ``<li>`` inside ``<div id="mw-normal-catlinks">`` is collected. These
    are the categories shown at the bottom of the page, which the crawl does
    not want.

    Args:
        parent: Category title without the ``Category:`` prefix.

    Returns:
        The text of each list item with spaces replaced by underscores, or an
        empty list when the div is missing (the parent does not exist).
    """
    response = requests.get("https://en.wikipedia.org/wiki/Category:" + parent)
    catlinks = BeautifulSoup(response.content, "html.parser").find(
        "div", {"id": "mw-normal-catlinks"}
    )
    if not catlinks:
        # parent does not exist
        return []
    return [entry.get_text().replace(" ", "_") for entry in catlinks.findAll("li")]

def getSubCategories(name):
    """Return the direct subcategories of a Wikipedia category.

    Uses pywikibot's default site and asks the category for its immediate
    children only (``recurse=False``).

    Args:
        name: Category title without the ``Category:`` prefix.

    Returns:
        A list of subcategory titles with the namespace prefix removed and
        spaces replaced by underscores.
    """
    site = pywikibot.Site()
    children = pywikibot.Category(site, 'Category:' + name).subcategories(recurse=False)
    subcats = []
    for item in children:
        # aslink() is expected to look like "[[Category:Some title]]".
        # Split on the FIRST colon only: titles may themselves contain colons
        # (e.g. "Star Trek: The Next Generation"), and splitting on every colon
        # would keep just the fragment before the second one, minus two chars.
        title = item.aslink().split(":", 1)[1][:-2]  # [:-2] drops the trailing "]]"
        subcats.append(title.replace(" ", "_"))
    return subcats

In [8]:
# Module-level crawl state; buildGraph reads and mutates all of these.
blacklist = ["July_events"]  # category names the crawl must never expand
all_nodes = list()           # every category accepted by the crawl
all_relations = list()       # one "child parent" string per edge
visited = list()             # categories that have already been expanded
def buildGraph(root, path):
    """Recursively crawl the Wikipedia category graph starting at ``root``.

    Each accepted category is appended to the module-level ``all_nodes`` and
    ``visited`` lists, and every (subcategory, category) edge is appended to
    ``all_relations`` as the string ``"<child> <parent>"``. The function then
    recurses into every subcategory.

    Args:
        root: Category title (underscored, without the ``Category:`` prefix).
        path: Slash-separated trail of ancestors, used only for progress output.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    path = path + "/" + root
    print("Current Path: " + path)
    print()

    # Do the cheap in-memory checks first: categoryExist() performs an HTTP
    # request, which is wasted on names we are going to reject anyway.
    if root in blacklist:
        print("Category DNE or is BLACKLISTED")
        print()
        return
    if root in visited:
        print("VISITED ALREADY")
        print()
        return
    if not categoryExist(root):
        print("Category DNE or is BLACKLISTED")
        print()
        return

    visited.append(root)

    #add the node
    all_nodes.append(root)

    #get all subcategories
    sub_cat = getSubCategories(root)
    if not sub_cat: #base case
        return

    #add all the relationships
    for item in sub_cat:
        relation = item + " " + root
        if relation in all_relations:
            # Duplicate edge (e.g. the same subcategory listed twice). Skip just
            # this edge; returning here would silently drop the remaining
            # subcategories and skip recursion into all of them.
            print("Relation already exists: ", relation)
            print()
            continue
        all_relations.append(relation)

    #recurse on all sub nodes
    for item in sub_cat:
        buildGraph(item, path)


In [9]:
# Start the crawl at the "Roman_Republic" category with an empty ancestor path.
# NOTE(review): the crawl has no depth limit; the recorded output below ends in
# a KeyboardInterrupt.
buildGraph("Roman_Republic", "")

Current Path: /Roman_Republic

Current Path: /Roman_Republic/Ancient_Roman_Republican_art

Current Path: /Roman_Republic/Government_of_the_Roman_Republic

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Lists_of_Roman_governors

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Achaea

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Africa

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Arabia_Petraea

Current Path: /Roman_Repub

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Syria/2nd-century_Roman_governors_of_Syria

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Syria/3rd-century_Roman_governors_of_Syria

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Thracia

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Ancient_Roman_governors_by_province/Roman_governors_of_Tuscia_et_Umbria

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Byzantine_governors

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancient_Roman_governors/Byzantine_governors/Governors_of_the_Anatolic_Theme

Current Path: /Roman_Republic/Government_of_the_Roman_Republic/Ancien

KeyboardInterrupt: 

In [None]:
# Scratch cell: try out page retrieval with both libraries.
site = pywikibot.Site('en', 'wikipedia')  # The site we want to run our bot on
# Alternative lookup through pywikibot, currently disabled:
#page = pywikibot.Page(site, 'Support_vector_machines')
#page.text

# Fetch one article through the `wikipedia` package and display its text as
# the cell output.
page = wikipedia.page("Markov_networks")
page.content

In [2]:
# Common path prefix for every artifact this notebook reads or writes:
# <fileName>_nodes.txt, <fileName>_relations.txt, the <fileName>_pages
# directory and <fileName>_embeddings.h5.
fileName = "data/RR"

In [None]:
# Persist the crawl results: one category name per line, then one
# "child parent" relation per line.
with open(fileName + '_nodes.txt', 'w') as nodes_out:
    nodes_out.writelines("%s\n" % node for node in all_nodes)

with open(fileName + '_relations.txt', 'w') as relations_out:
    relations_out.writelines("%s\n" % relation for relation in all_relations)

In [72]:
def fetch_wiki_pages(fileName, pagesToSearch = None):
    """Download a Wikipedia article for each node name into ``<fileName>_pages``.

    For every name, the first hit of ``wikipedia.search`` is fetched and its
    text is written to ``<fileName>_pages/<name>.txt``. Names whose file is
    already present are skipped, so the function can be re-run to resume.

    Args:
        fileName: Path prefix; node names are read from ``<fileName>_nodes.txt``
            and pages are stored under ``<fileName>_pages``.
        pagesToSearch: Optional iterable of names to fetch instead of the
            contents of the nodes file (e.g. the failures of a previous call).

    Returns:
        The list of names that could not be fetched.
    """
    wiki_pages_dir = fileName + "_pages"
    os.makedirs(wiki_pages_dir, exist_ok=True)

    if pagesToSearch is None:
        # Read the names up front so the nodes file is closed again before any
        # (slow, failure-prone) network work starts.
        with open(fileName + '_nodes.txt', "r") as nodes_file:
            pagesToSearch = nodes_file.readlines()

    failures = []
    for line in pagesToSearch:
        line = line.strip()
        if not line:
            # A blank line would otherwise search for "" and write ".txt".
            continue
        wiki_page_path = wiki_pages_dir + "/" + line + ".txt"
        print(wiki_page_path)
        if os.path.isfile(wiki_page_path):
            print(wiki_page_path + " already exists")
            continue
        print("fetching ==" + line + "==")
        try:
            try:
                search = wikipedia.search(line)[0]
                page = wikipedia.page(search)
            except wikipedia.DisambiguationError as e:
                # Fall back to the first suggested option. This lookup can fail
                # too, so it stays inside the outer try instead of escaping and
                # aborting the whole run.
                page = wikipedia.page(e.options[0])
            # Read the text inside the try as well, so a failure here is also
            # recorded rather than raised.
            content = page.content
        except wikipedia.WikipediaException: # search is too busy
            print("ERROR: search too busy")
            failures.append(line)
            continue
        except Exception:
            # e.g. IndexError when the search returned no results
            print("ERROR: random error")
            failures.append(line)
            continue

        with open(wiki_page_path, "w") as text_file:
            text_file.write(content)
        print("finished")
    return failures

def preprocess(x):
    """Normalise raw article text with gensim.

    The text is first run through ``gensim.utils.simple_preprocess`` and the
    resulting tokens are re-joined with single spaces. That string is then
    passed to ``preprocess_string`` with a filter chain that lower-cases,
    strips tags, strips punctuation and removes stopwords.

    Args:
        x: Raw text of one article.

    Returns:
        Whatever ``preprocess_string`` returns for the filtered text (the
        caller joins it with spaces, so a sequence of token strings).
    """
    normalized = " ".join(gensim.utils.simple_preprocess(x))
    filter_chain = [str.lower, strip_tags, strip_punctuation, remove_stopwords]
    return preprocess_string(normalized, filter_chain)

def clean_wiki_pages(fileName, freq_threshold=3):
    """Clean every downloaded article and keep only its frequent lemmas.

    For each name in ``<fileName>_nodes.txt`` the file
    ``<fileName>_pages/<name>.txt`` is read, passed through ``preprocess``,
    lemmatised with the module-level spaCy pipeline ``nlp``, and reduced to the
    lemmas that occur more than ``freq_threshold`` times in that article.

    Args:
        fileName: Path prefix shared by the nodes file and the pages directory.
        freq_threshold: A lemma is kept only if its count within the article is
            strictly greater than this value. Defaults to 3.

    Returns:
        A list with one space-joined string of kept lemmas per line of the
        nodes file, in file order. Articles whose page file is missing
        contribute an empty string so the positions stay aligned with the
        nodes file.
    """
    wiki_pages_dir = fileName + "_pages"
    articles_cleaned = []
    with open(fileName + '_nodes.txt', "r") as nodes_file:
        for line in nodes_file:
            line = line.strip()
            wiki_page_path = wiki_pages_dir + "/" + line + ".txt"
            if not os.path.isfile(wiki_page_path):
                print("ERROR: " + wiki_page_path + " does not exist")
                articles_cleaned.append("")
                continue

            print("cleaning: " + wiki_page_path)
            with open(wiki_page_path, "r") as page_file:
                content = page_file.read()

            # strip punctuation, stopwords and odd characters, then lemmatise
            cleaned = " ".join(preprocess(content))
            lemmas = [token.lemma_ for token in nlp(cleaned)]

            counts = defaultdict(int)
            for lemma in lemmas:
                counts[lemma] += 1

            # A set keeps the membership test below O(1) per lemma.
            frequent = {lemma for lemma, count in counts.items() if count > freq_threshold}
            articles_cleaned.append(" ".join(lemma for lemma in lemmas if lemma in frequent))
    return articles_cleaned

In [67]:
# Fetch every page, then retry the ones that failed. The retries are bounded:
# names that can never be fetched (no search hit, missing page) would otherwise
# keep the loop spinning forever.
MAX_FETCH_RETRIES = 5
RETRY_DELAY_SECONDS = 5  # short pause so a busy search endpoint can recover

failures = fetch_wiki_pages(fileName)
print(len(failures))
retries_left = MAX_FETCH_RETRIES
while failures and retries_left > 0:
    retries_left -= 1
    sleep(RETRY_DELAY_SECONDS)
    failures = fetch_wiki_pages(fileName, failures)
if failures:
    print("giving up on %d page(s): %s" % (len(failures), failures))

data/RR_pages/Roman_Republic.txt already exists
data/RR_pages/Ancient_Roman_Republican_art.txt already exists
data/RR_pages/Government_of_the_Roman_Republic.txt already exists
data/RR_pages/Ancient_Roman_governors.txt already exists
data/RR_pages/Ancient_Roman_governors_by_province.txt already exists
data/RR_pages/Lists_of_Roman_governors.txt already exists
data/RR_pages/Roman_governors_of_Achaea.txt already exists
data/RR_pages/Roman_governors_of_Africa.txt already exists
data/RR_pages/Roman_governors_of_Arabia_Petraea.txt already exists
data/RR_pages/Roman_governors_of_Asia.txt already exists
data/RR_pages/Roman_governors_of_Bithynia_and_Pontus.txt already exists
data/RR_pages/Roman_governors_of_Britain.txt already exists
data/RR_pages/Roman_governors_of_Campania.txt already exists
data/RR_pages/Roman_governors_of_Cappadocia.txt already exists
data/RR_pages/Roman_governors_of_Cilicia.txt already exists
data/RR_pages/Roman_governors_of_Crete_and_Cyrenaica.txt already exists
data/RR_pa

ERROR: search too busy
data/RR_pages/Burial_sites_of_the_Herbertien_dynasty.txt already exists
data/RR_pages/Lethings.txt already exists
data/RR_pages/Kingdom_of_the_Lombards.txt already exists
data/RR_pages/Lombard_kings.txt already exists
data/RR_pages/Lombard_people.txt already exists
data/RR_pages/6th-century_Lombard_people.txt already exists
data/RR_pages/7th-century_Lombard_people.txt already exists
data/RR_pages/8th-century_Lombard_people.txt already exists
data/RR_pages/8th-century_dukes_of_Spoleto.txt already exists
data/RR_pages/9th-century_Lombard_people.txt already exists
data/RR_pages/9th-century_Germanic_people.txt already exists
data/RR_pages/9th-century_Danish_people.txt already exists
data/RR_pages/9th-century_English_people.txt already exists
data/RR_pages/9th-century_English_clergy.txt already exists
data/RR_pages/9th-century_English_monarchs.txt already exists
data/RR_pages/Alfred_the_Great.txt already exists
data/RR_pages/Cultural_depictions_of_Alfred_the_Great.txt

In [21]:
# Clean every downloaded article; the result has one entry per line of the
# nodes file. Print the count as a quick sanity check.
articles_cleaned = clean_wiki_pages(fileName)
print(len(articles_cleaned))

cleaning: data/RR_pages/Roman_Republic.txt


NameError: name 'preprocess' is not defined

In [None]:
# Turn the cleaned articles into a TF-IDF document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(articles_cleaned)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(vectors.shape)

In [None]:
# Row labels: one article name per line of the nodes file, in the same order
# as the rows of `vectors`.
with open(fileName + '_nodes.txt', "r") as nodes_file:
    article_names = [line.rstrip("\n") for line in nodes_file]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    column_names = vectorizer.get_feature_names_out().tolist()
else:
    column_names = vectorizer.get_feature_names()

# NOTE: toarray() materialises the sparse matrix as a dense array.
df = pd.DataFrame(vectors.toarray(), columns=column_names)
df["article_name"] = article_names
#save to a file
df.to_hdf(fileName + "_embeddings.h5", key='df')

# Read the table back and display the row for one article as the cell output.
data_table = pd.read_hdf(fileName + "_embeddings.h5", 'df')
data_table.loc[data_table["article_name"] == "Charlemagne"]