In [1]:
!pip install wikipedia

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import numpy as np
import re
import pandas as pd

from pathlib import Path
import wikipedia

from urllib.request import urlopen
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

### Read data

In [3]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()

In [4]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')

In [5]:
train_url_df = train_df[train_df['url_legal'].notnull()]

In [6]:
all_wiki_pages = set(train_url_df[train_url_df['url_legal'].str.contains('wiki')]['url_legal'].values)

In [7]:
list(all_wiki_pages)[:100]

['https://en.wikipedia.org/wiki/Boiling_point',
 'https://en.wikipedia.org/wiki/Temperature-programmed_reduction',
 'https://en.wikipedia.org/wiki/Galaxy',
 'https://en.wikipedia.org/wiki/Artificial_intelligence',
 'https://simple.wikipedia.org/wiki/Geothermal_energy',
 'https://simple.wikipedia.org/wiki/Constellation',
 'https://simple.wikipedia.org/wiki/Steam_engine',
 'https://simple.wikipedia.org/wiki/Andromeda_galaxy',
 'https://simple.wikipedia.org/wiki/Cuneiform',
 'https://en.wikipedia.org/wiki/Domain_name',
 'https://simple.wikipedia.org/wiki/Antenna',
 'https://en.wikipedia.org/wiki/Jet_pack',
 'https://simple.wikipedia.org/wiki/Cold_War',
 'https://simple.wikipedia.org/wiki/Virus',
 'https://simple.wikipedia.org/wiki/Paleontology',
 'https://simple.wikipedia.org/wiki/Radar',
 'https://simple.wikipedia.org/wiki/Gene_therapy',
 'https://en.wikipedia.org/wiki/Pluton',
 'https://simple.wikipedia.org/wiki/Ancient_Egypt',
 'https://simple.wikipedia.org/wiki/Nanotechnology',
 'http

In [8]:
def extract_and_clean_wiki(page_name):
    page = wikipedia.page(page_name)
    text = page.content
    text = text.replace('==', '')
    # Drop footnote superscripts in brackets
    text = re.sub(r"\[.*?\]+", '', text)
    text = re.sub(r"\{.*?\}+", '', text)
    return text

In [9]:
text = extract_and_clean_wiki('Geometry_for_Elementary_School')

In [10]:
def extract_text_bits(text):
    text_bits = []
    text_bit = ""
    for i, s in enumerate(text.split('. ')):
        text_bit += s + "."
        if text_bit.find('See also') > -1:
            break
        if (i + 1) % 5 == 0:
            text_bit = re.sub(r'^[^A-Za-z0-9]+', '', text_bit)
            text_bit = text_bit.strip()
            if text_bit.count('\n') < 10:
                text_bits.append(text_bit)
            text_bit = ""
    return text_bits

In [11]:
text_bits = extract_text_bits(text)

In [12]:
text_bits

["Euclidean geometry is a mathematical system attributed to Alexandrian Greek mathematician Euclid, which he described in his textbook on geometry: the Elements.Euclid's method consists in assuming a small set of intuitively appealing axioms, and deducing many other propositions (theorems) from these.Although many of Euclid's results had been stated by earlier mathematicians, Euclid was the first to show how these propositions could fit into a comprehensive deductive and logical system.The Elements begins with plane geometry, still taught in secondary school (high school) as the first axiomatic system and the first examples of mathematical proofs.It goes on to the solid geometry of three dimensions.",
 'Much of the Elements states results of what are now called algebra and number theory, explained in geometrical language.For more than two thousand years, the adjective "Euclidean" was unnecessary because no other sort of geometry had been conceived.Euclid\'s axioms seemed so intuitively

In [13]:
LM_FOLDER = DATA_PATH/'commonlit_lm'
if not LM_FOLDER.exists():
    LM_FOLDER.mkdir()
    
wiki_examples = LM_FOLDER/'wiki_examples.txt'
            

In [14]:
failed_examples = []
with open(wiki_examples, 'w') as file:
    for page in tqdm(all_wiki_pages, total=len(all_wiki_pages)):
        topic = re.sub(r'.+/wiki/', '', page)
        if topic.find('/') > -1:
            topic = re.sub(r'(.+?)/.*', r'\1', topic)
        try:
            text = extract_and_clean_wiki(topic)
            print(topic)
            text_bits = extract_text_bits(text)
            for text_bit in text_bits:
                file.write(f'{text_bit}\n')
        except: 
            print(f'* Failed {topic} *')
            failed_examples.append(topic)

HBox(children=(FloatProgress(value=0.0, max=379.0), HTML(value='')))

Boiling_point
* Failed Temperature-programmed_reduction *
Galaxy
Artificial_intelligence
Geothermal_energy
Constellation
Steam_engine
Andromeda_galaxy
Cuneiform
Domain_name




  lis = BeautifulSoup(html).find_all('li')


* Failed Antenna *
Jet_pack
Cold_War
Virus
Paleontology
Radar
Gene_therapy
Pluton
Ancient_Egypt
Nanotechnology
Plague_of_Athens
Carbon_dioxide
Cave_painting
Ultrasound
Nebula
Enzyme
Lithosphere
Extinction
Databending
Implantable_cardioverter-defibrillator
Interplanetary_dust_cloud
Augmented_reality
* Failed Mitosis *
Kingdom_of_Prussia
Exoplanet
Hydroelectricity
Protective_tariff
Mars
Cathode_ray_tube
Seven_Years%27_War
Artificial_muscle
Unstructured_data
Homo_sapiens
Asteroid
Absolute_monarchy
Glucose
Defibrillator
Motion_(physics)
Buoyancy
* Failed Compass *
* Failed Blu-ray_Disc *
Protein
Creationism
Tyrant
Absolute_zero
Ottoman_Empire
Dictatorship
Synthesizer
Orbit
Wikijunior:The_Elements
Radiosurgery
Gene
Ostracism
Great_Awakening
Napoleon
Fossil_fuel
Jacobitism
Crystal
* Failed Silk_Road *
Hard_disk
Abyssal_plain
Constitution
Astronomy
Printing
Biotechnology
Colosseum
Wind_turbine
Cabinet_(government)
* Failed Ozone_layer *
Mobile_phone
Bronze_Age
Podcasting
Nutrition
Multimedia


In [15]:
wiki_examples_final = LM_FOLDER/'wiki_examples_final.txt'

In [51]:
from collections import Counter

max_token_len = 0
counter = 0
with open(wiki_examples, 'r') as read_file:
    content = read_file.readlines()
    with open(wiki_examples_final, 'w') as write_file:
        for c in content:
            token_len = len(re.split(r'\W', c))
            if len(c) > 256 and token_len < 300:
                c = re.sub(r'\.(?!\s|\d)', r'. ', c)
                write_file.write(c.strip())
                write_file.write('\n')
                counter += 1
            max_token_len = max(max_token_len, token_len)

In [52]:
max_token_len, counter

(480, 14417)

In [41]:
'Failed examples', len(failed_examples)

('Failed examples', 50)