In [None]:
%load_ext watermark
%load_ext jupyternotify
%watermark  -a Filippo_Valle -v -m -g -r -v -p pandas,numpy,matplotlib,regex,wikipediaapi,sklearn

In [None]:
import requests
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import multiprocessing as mp
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
%load_ext autoreload
%autoreload 2
from nlp import process_phrase

In [None]:
import wikipediaapi
# Shared English-Wikipedia client used by every cell below; page extracts use the WIKI format.
# NOTE(review): newer wikipedia-api releases appear to expect a user agent as the first
# positional argument — confirm this call against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI)

In [None]:
def zeromean(arr):
    """Return the mean of the strictly positive entries of ``arr``.

    ``arr`` is compared element-wise with 0 and indexed with the resulting
    boolean mask, so it must support both operations (e.g. a NumPy array).
    """
    positive_mask = arr > 0
    positive_values = arr[positive_mask]
    return np.mean(positive_values)

In [None]:
def print_categories(page):
    """Print each category of ``page`` as ``title: value``, ordered by title.

    ``page.categories`` is expected to be a mapping from category title to
    the corresponding category object.
    """
    page_categories = page.categories
    ordered_titles = sorted(page_categories.keys())
    for category_title in ordered_titles:
        category = page_categories[category_title]
        print("%s: %s" % (category_title, category))


# Smoke test on a single article: list its categories, then run its text through
# process_phrase (the value of the last expression is displayed as the cell output).
print("Categories")
page_py = wiki_wiki.page('Ansatz')
print_categories(page_py)
process_phrase(str(page_py.text))

In [None]:
def print_categorymembers(categorymembers, level=0, max_level=1):
    """Print a category's members as a bulleted tree.

    Every member is printed with ``level + 1`` asterisks, its title and its
    namespace number (14 -> category, 0 -> article). Members that are
    categories are expanded recursively while ``level < max_level``.
    """
    bullet = "*" * (level + 1)
    may_descend = level < max_level
    for member in categorymembers.values():
        print("%s: %s (ns: %d)" % (bullet, member.title, member.ns))
        is_category = member.ns == wikipediaapi.Namespace.CATEGORY
        if is_category and may_descend:
            print_categorymembers(member.categorymembers, level=level + 1, max_level=max_level)


# Print the members of Category:Physics, expanding subcategories one level deep.
cat = wiki_wiki.page("Category:Physics")
print("Category members: Category:Physics")
print_categorymembers(cat.categorymembers, max_level=1)

In [None]:
def get_titles(cat, max_level=1):
    """Collect the titles of the articles in ``cat`` and in its subcategories.

    Parameters
    ----------
    cat
        Category page object exposing ``categorymembers``, a mapping whose
        values have ``title``, ``ns`` and ``categorymembers`` attributes.
    max_level : int, optional
        How many levels of subcategories to descend into. The default of 1
        visits the category itself and its direct subcategories.

    Returns
    -------
    list of str
        Article titles without duplicates, in first-seen order.
    """
    titles = []
    seen = set()  # O(1) membership test; `titles` keeps the first-seen order

    def append_titles(cat, level=0):
        for c in cat.categorymembers.values():
            ##c.ns == 14 -> category
            ##c.ns == 0 -> article
            if c.ns == wikipediaapi.Namespace.MAIN:
                if c.title not in seen:  # avoid duplicates
                    seen.add(c.title)
                    titles.append(c.title)
            elif c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
                append_titles(c, level + 1)

    append_titles(cat)
    return titles

# `cat` (Category:Physics) is also read by the get_subcats cell further down.
cat = wiki_wiki.page("Category:Physics")

# Physics and Biology article titles combined; np.unique sorts them and removes
# titles that were collected from both categories.
titles = np.unique(get_titles(cat) + get_titles(wiki_wiki.page("Category:Biology")))
gc.collect()
# Preview: the first five titles and the total number of titles.
titles[:5], len(titles)

In [None]:
def get_subcats(cat, max_level=1):
    """Collect the names of the subcategories of ``cat``.

    Parameters
    ----------
    cat
        Category page object exposing ``categorymembers``, a mapping whose
        values have ``title``, ``ns`` and ``categorymembers`` attributes.
    max_level : int, optional
        Number of subcategory levels to collect. The default of 1 returns
        only the direct subcategories.

    Returns
    -------
    list of str
        Subcategory names with the namespace prefix (the text up to the first
        ``:``) removed, without duplicates, in first-seen order.
    """
    cats = []
    seen = set()  # names already stored, so the duplicate check compares like with like

    def append_cats(cat, level=0):
        # Stop before touching `categorymembers` once the depth budget is used
        # up: nothing from that level would be kept anyway.
        if level >= max_level:
            return
        for c in cat.categorymembers.values():
            ##c.ns == 14 -> category
            ##c.ns == 0 -> article
            if c.ns == wikipediaapi.Namespace.CATEGORY:
                # Split on the first colon only: category names may contain colons.
                name = c.title.split(":", 1)[-1]
                if name not in seen:  # avoid duplicates
                    seen.add(name)
                    cats.append(name)
                append_cats(c, level + 1)

    append_cats(cat)
    return cats

# Subcategory names of Physics (`cat` from the titles cell) followed by those of Biology.
# The two lists are simply concatenated, so a name returned by both calls appears twice.
categories = get_subcats(cat) + get_subcats(wiki_wiki.page("Category:Biology"))
gc.collect()
# Preview: the first five names and the total count.
categories[:5], len(categories)

In [None]:
def get_text(title):
    """Fetch the article ``title`` and return its text after ``process_phrase``.

    Best effort: if fetching or processing the page raises an ordinary
    exception, an empty string is returned so one bad article does not abort
    a whole batch. KeyboardInterrupt and SystemExit are not intercepted.
    """
    try:
        page_py = wiki_wiki.page(title)
        return process_phrase(str(page_py.text))
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit and
        # make the download impossible to interrupt.
        return ""
    
# Download and preprocess every article in parallel with 12 worker processes.
# NOTE(review): get_text is defined inside the notebook; with the "spawn" start method
# workers may be unable to import it — confirm on the target platform.
pool = mp.Pool(12)
w = pool.map_async(get_text, titles)
pool.close()  # no more tasks will be submitted
pool.join()   # wait for all workers to finish

# One processed text per entry of `titles`, in the same order.
corpus = w.get()

In [None]:
# Word-count matrix: after the transpose, rows are vocabulary terms and columns are
# article titles. `.toarray()` turns the vectorizer output into a dense array.
vectorizer = CountVectorizer()
df = pd.DataFrame(data = vectorizer.fit_transform(corpus).toarray().T,
                  index = vectorizer.get_feature_names_out(),
                  columns = titles,
                  dtype=int)
df.head()

In [None]:
# Drop short documents and rare words. Both masks are computed on the unfiltered
# frame before `reindex` applies them together.
df = df.reindex(columns=df.columns[df.apply(lambda x: (x>0).sum()>20, 0)], # keep documents containing more than 20 distinct words
           index=df.index[df.apply(lambda x: (x>0).sum()/len(x)>0.01, 1)]) # keep words present in more than 1% of the documents
df.shape

In [None]:
url = "https://en.wikipedia.org/w/api.php"

def get_categories(title, max_categories=10, timeout=1):
    """Return the category names of the article ``title`` from the query API at ``url``.

    Parameters
    ----------
    title : str
        Article title, sent as the ``titles`` parameter of the query.
    max_categories : int, optional
        Keep at most this many categories per returned page (default 10).
    timeout : float, optional
        Timeout in seconds handed to ``requests.get`` (default 1).

    Returns
    -------
    list of str or None
        Category names with the text up to the first ``:`` removed. ``None``
        is returned when the request times out, fails for any other reason,
        or is answered with a status other than 200; the reason is printed.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        "titles": title
    }
    to_ret = []

    try:
        with requests.get(url=url, params=params, timeout=timeout) as req:
            if req.status_code != 200:
                print(req.status_code)
                return None
            pages = req.json()["query"]["pages"]
            for page in pages.values():
                # Pages without a "categories" entry contribute nothing.
                # Some articles have an enormous number of categories, hence the cap.
                for cat in page.get("categories", [])[:max_categories]:
                    # Split on the first colon only: names may contain colons themselves.
                    to_ret.append(cat["title"].split(":", 1)[1])
    except requests.exceptions.Timeout:
        print("Timed out")
        return None
    except Exception as err:
        # Still best effort (a raising worker would fail the whole pool.map),
        # but report what actually went wrong instead of calling it a timeout.
        print(f"Failed to fetch categories for {title!r}: {err!r}")
        return None
    gc.collect()
    return to_ret

In [None]:
# Spot-check the category lookup on a single title before running it on all of them.
get_categories(titles[40])

In [None]:
# Look up the categories of every title in parallel with 12 worker processes.
pool = mp.Pool(12)
w = pool.map_async(get_categories, titles, error_callback=lambda err: print(err))
pool.close()  # no more tasks will be submitted
pool.join()   # wait for all workers to finish
gc.collect()

# One entry per title, in the same order; None marks a failed lookup.
corpus_categories = w.get()
# Largest number of categories returned for a single title (failed lookups ignored).
max(map(len, filter(lambda x: x is not None, corpus_categories)))

In [None]:
# Build the category-by-article membership table: rows are category names, columns
# are article titles, a cell is 1 where the article belongs to the category and NaN
# elsewhere. One Series per article is collected and the frame is assembled with a
# single concat; joining a growing DataFrame once per article re-aligns the whole
# frame each time and is quadratic in the number of articles.
category_columns = {}
count_skipped = 0
old_shape = 0  # not read by any visible cell; kept so the cell still defines it
for i, (title, categories) in enumerate(zip(titles, corpus_categories)):
    if (i % 250) == 0:
        # Progress: current position and number of article columns collected so far.
        print(i)
        print(len(category_columns))
        gc.collect()
    if categories is None:
        # The lookup for this title failed (get_categories returned None).
        count_skipped += 1
        print(f"skipping {title}")
        continue
    if title not in category_columns: #avoid duplicates on this cell
        if len(categories) > 10:
            # More categories than get_categories is expected to return: stop early.
            print("*************")
            break
        # Repeated labels within one article would make the column alignment ambiguous.
        category_columns[title] = pd.Series(1, index=pd.Index(categories).unique(), dtype=int)
if category_columns:
    # sort=True orders the category index lexicographically, like an outer join does.
    df_files = pd.concat(category_columns, axis=1, sort=True)
else:
    df_files = pd.DataFrame()
print(f"{count_skipped} were skipped due to time out req")
print(df_files.shape)

In [None]:
# Missing memberships become 0 and the values are cast back to int.
# NOTE(review): drop_duplicates() compares row *values*, so categories (rows) with an
# identical membership vector are collapsed into the first one — confirm this is
# intended rather than de-duplicating the index labels.
df_files = df_files.fillna(0).astype(int).drop_duplicates()

In [None]:
# Keep the categories (rows) whose membership sum exceeds 1, drop repeated category
# labels, then transpose so that rows are articles and columns are categories.
df_files = df_files.reindex(index=df_files.index[(df_files.sum(1)>1)].drop_duplicates()).transpose()

In [None]:
# Categories (columns) whose membership sum over the articles exceeds 2.
df_files.columns[df_files.sum(0) > 2]

In [None]:
# Preview of the article-by-category membership table.
df_files.head()

In [None]:
# Divide every column (document) by its total so that counts become within-document
# frequencies summing to 1.
df = df.divide(df.sum(0),1)

In [None]:
# Rank plot on log-log axes: mean frequency of each word across documents, sorted in
# decreasing order.
plt.plot(df.mean(1).sort_values(ascending=False).values.ravel())
# Reference line drawn from (1, 0.1) to (n_words, 0.1 / n_words).
plt.plot([1, df.shape[0]], [0.1, 0.1/df.shape[0]])
plt.xlim(1, df.shape[0])
plt.yscale("log")
plt.xscale("log")

In [None]:
# Words ranked 11th to 20th by mean frequency across documents.
df.mean(1).sort_values(ascending=False)[10:20]

In [None]:
# Compare how the per-document frequencies of two words are distributed.
fig, ax = plt.subplots()

# Common bin edges on [0, 0.2] (np.linspace defaults to 50 points).
bins = np.linspace(0, 0.2)

df.loc["the", :].hist(ax=ax, histtype="step", bins=bins, density=True, label = "the")
df.loc["atom", :].hist(ax=ax, histtype="step", bins=bins, density=True, label="atom")
plt.xlabel("f")
plt.ylabel("pdf")
plt.yscale("log")
plt.legend()

In [None]:
# Persist both tables, with row labels and header, in the current working directory.
df.to_csv("df.csv", index=True, header=True)
df_files.to_csv("df_files.csv", index=True, header=True)

In [None]:
# Subcategories of "Physics stubs"; note that this rebinds `categories`.
categories = get_subcats(wiki_wiki.page("Category:Physics stubs"))
print(categories)
# Those subcategories that also appear as columns of df_files.
list(filter(lambda x: x in df_files.columns, categories))

In [None]:
# Membership totals of the categories whose total exceeds 50.
df_files[df_files.columns[df_files.sum(0)>50]].sum(0)

In [None]:
# Category names (columns of df_files) containing the substring "theo" (case-sensitive).
[file for file in df_files.columns if "theo" in file]

In [None]:
# Pairs of categories to compare; the final plotting cell labels them, in this order,
# "coarse", "fine" and "ultra-fine".
latent_categories = [['Physics stubs', "Biology"], ["Theoretical physics", "Statistical mechanics"], ["String theory", "Quantum field theory"]]

In [None]:
def get_files(category):
    """Return the index labels of ``df_files`` rows that belong to ``category``.

    A row belongs to the category when the column(s) of ``df_files`` named
    ``category`` sum to at least 1 on that row. An empty index is returned
    when no column carries that name.
    """
    matching_columns = df_files.columns[df_files.columns == category]
    membership = df_files[matching_columns]
    is_member = membership.sum(1) >= 1
    return membership[is_member].index

In [None]:
# Top 15 words by mean frequency among the articles of the first ultra-fine category.
subfiles = get_files(latent_categories[2][0])
df[df.columns[df.columns.isin(subfiles)]].mean(1).sort_values(ascending=False)[:15]

In [None]:
# Four panels: the whole corpus first, then three category comparisons of
# increasing granularity (filled in further down in this cell).
fig, axs = plt.subplots(1, 4, figsize=(21, 7))

word = "field"

# Title the new figure with the analysed word. At this point `ax` still refers to
# the axes of an earlier cell's figure, so calling `ax.set_title` here would not
# touch this figure at all.
fig.suptitle(word)

# Bin edges shared by every panel so the histograms are directly comparable.
bins = np.linspace(0, df.loc[word, :].max(), 15)
##all
# Frequencies of `word` over all documents, restricted to documents that contain it.
subdf = df.loc[word, :]
subdf = subdf[subdf>0]
subdf.hist(ax=axs[0], histtype="step", bins=bins, lw=5, density=True, label=word)

axs[0].set_title("Wikipedia", fontsize=35)

def append_plot(ax, cat):
    """Draw on ``ax`` the histogram of ``word`` frequencies within category ``cat``.

    Only the documents returned by ``get_files(cat)`` are considered, and
    among them only those where the frequency of the global ``word`` is
    positive. The global ``bins`` are used as bin edges and ``cat`` as the
    legend label.
    """
    in_category = df.columns.isin(get_files(cat))
    category_docs = df[df.columns[in_category]]
    word_freqs = category_docs.loc[word, :]
    positive_freqs = word_freqs[word_freqs > 0]
    positive_freqs.hist(ax=ax, histtype="step", bins=bins, lw=5, density=True, label=cat)

# Panels 1-3 overlay the two categories of each pair, from the coarsest split to the
# finest one.
panel_titles = ("coarse", "fine", "ultra-fine")
for panel, panel_title, category_pair in zip(axs[1:], panel_titles, latent_categories):
    for category in category_pair:
        append_plot(panel, category)
    panel.set_title(panel_title, fontsize=35)


# Cosmetics shared by all four panels.
for ax in axs:
    ax.set_xlabel("mean word f", fontsize=25)
    ax.tick_params(labelsize=12, length=8, width=5)
    ax.tick_params(which="minor", length=5, width=3)
    ax.set_yscale("log")
    ax.legend(fontsize=15)

plt.show()