In [185]:
import requests, wikipedia, spacy
from bs4 import BeautifulSoup

import ast # for converting string representation of list to a list

In [85]:
# Download the article, parse it, show its heading, and load the spaCy model used later on.
URL = "https://en.wikipedia.org/wiki/Earth"

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
page = requests.get(URL, timeout=10)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(id="firstHeading").text
print(title)

nlp = spacy.load("en_core_web_sm")

Earth


## First thing

Use just the first paragraph: choose a sentence number when presented, then choose to edit or use links.

In [86]:
# every <p> element of the parsed page, in document order
all_paragraphs = soup.find_all(name="p")

In [87]:
# getting first paragraph: take the first one whose text is not just "\n"
# (a "\n" paragraph comes first on the page)
p = next((paragraph for paragraph in all_paragraphs if paragraph.text != "\n"), None) # only one for now

if p is None:
    # without this check `p` would be undefined (no paragraphs at all) or an empty placeholder
    raise ValueError("No non-empty paragraph was found on the page.")

In [203]:
def _ask_line_number(number_of_sents):
    """Prompt until the user picks a valid sentence number.

    Returns the 1-based sentence number as an int, or None when the user
    enters "b" to go back. Non-numeric and out-of-range input is reported
    and asked for again.
    """
    while True:
        choice = input("What line do you want to learn? ").strip()
        if choice == "b":
            return None

        try:
            line_number = int(choice)
        except ValueError:
            line_number = None # non-numeric input is handled like an out-of-range number

        if line_number is not None and 1 <= line_number <= number_of_sents:
            return line_number

        print("Please try again and choose a valid line, or go back with b.")


def _parse_selected_spans(selected_spans_str, number_of_tokens):
    """Turn the user's selection string into a list of spans.

    Accepts a single span such as "[1,2,3]" or "[5]", or a list of spans such
    as "[[1,2], [5,6]]". Every index is 1-based, must lie between 1 and
    number_of_tokens, and may be used only once.

    Returns a list of non-empty lists of ints.
    Raises ValueError when the string does not describe such a selection.
    """
    try:
        selected_spans = ast.literal_eval(selected_spans_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as error:
        raise ValueError("the selection is not a valid literal") from error

    if not isinstance(selected_spans, list) or not selected_spans:
        raise ValueError("the selection must be a non-empty list")

    if all(isinstance(idx, int) for idx in selected_spans):
        # if only have one span, e.g. "[1,2,3]" or "[5]"
        selected_spans = [selected_spans] # can trivially treat as a nested span

    # now have a list of spans "[[1,2], [5,6]]", or simply "[[1,2,3]]"
    seen = set()
    for sub_span in selected_spans:
        if not isinstance(sub_span, list) or not sub_span:
            raise ValueError("every span must be a non-empty list")

        for idx in sub_span:
            # bool is a subclass of int, so True/False have to be rejected explicitly
            if isinstance(idx, bool) or not isinstance(idx, int):
                raise ValueError("every index must be an integer")
            # 0 and negative numbers would otherwise silently wrap around to the end of the line
            if not 1 <= idx <= number_of_tokens:
                raise ValueError("index out of range")
            # an index used twice would appear in two answers but get only one placeholder
            if idx in seen:
                raise ValueError("index used more than once")
            seen.add(idx)

    return selected_spans


def _build_flashcard(line_to_learn, selected_spans):
    """Build the question text and the answers for the chosen spans.

    line_to_learn is the chosen sentence (indexed token by token) and
    selected_spans is a validated list of lists of 1-based token indices.

    Returns (question, answers, span_numbers): the sentence with every
    selected token replaced by "[n]", the hidden text of each span in order,
    and a dict mapping each selected index to its span number n.
    """
    span_numbers = {} # to track, for a given index, which span group (i.e. the answer) it corresponds to
    answers = [] # to collect the answers for the given sub_spans

    for span_number, sub_span in enumerate(selected_spans, start=1):
        span_tokens = [line_to_learn[idx - 1] for idx in sub_span]
        for idx in sub_span:
            span_numbers[idx] = span_number # assigns individual index to a specific span (answer)

        # don't add whitespace after the last token (don't want an answer with a space at the end)
        leading_text = "".join(token.text_with_ws for token in span_tokens[:-1])
        answers.append(leading_text + span_tokens[-1].text)

    question_parts = []
    for i, token in enumerate(line_to_learn, start=1):
        if i in span_numbers:
            question_parts.append(f"[{span_numbers[i]}]{token.whitespace_}")
        else:
            question_parts.append(token.text_with_ws)

    return "".join(question_parts), answers, span_numbers


p_text = p.text.strip()
doc = nlp(p_text)

sents = list(doc.sents) # all sentences are type spacy.tokens.span.Span
number_of_sents = len(sents)
if number_of_sents == 0:
    # with nothing to choose from, no line number could ever be valid
    raise ValueError("The paragraph contains no sentences to learn from.")

print("Choose which line to learn from, or go back with b.")

for i, sent in enumerate(sents, start=1):
    print(f"{i}) {sent}")

# ask user, line 6 is interesting enough to create a flashcard for
line_number_to_learn = _ask_line_number(number_of_sents)
if line_number_to_learn is None:
    # stops the cell like the former bare `raise` did, but with an explicit message
    raise RuntimeError("Going back: no line was chosen.") # go back here!!

print("\nWhich words would you like to hide?")

line_to_learn = sents[line_number_to_learn - 1]
for i, token in enumerate(line_to_learn, start=1):
    print(f"{i} {token}")

# [[14], [18,19]] for line 6 generates two interesting answers
while True:
    selected_spans_str = input(f"Select a span (e.g. \"[1,2,3]\"), a single word (e.g. \"[5]\"), or even a multiple spans (e.g. \"[[1,2], [5,6]]\"\n")
    try:
        selected_spans = _parse_selected_spans(selected_spans_str, len(line_to_learn))
        break
    except ValueError:
        # ask again instead of building a flashcard from a half-parsed selection
        print("Please try again with a valid span.")

all_spans = [idx for sub_span in selected_spans for idx in sub_span] # simply extracting every word index from our list of lists
question, answers, span_numbers = _build_flashcard(line_to_learn, selected_spans)

print("\nYour new flashcard will look like this:")
print(question)
print("\nAnd the answers are:")
for i, answer in enumerate(answers, start=1):
    print(f"{i} {answer}")

Choose which line to learn from, or go back with b.
1) Earth is the third planet from the Sun and the only astronomical object known to harbor life.
2) While large volumes of water can be found throughout the Solar System, only Earth sustains liquid surface water.
3) About 71% of Earth's surface is made up of the ocean, dwarfing Earth's polar ice, lakes, and rivers.
4) The remaining 29% of Earth's surface is land, consisting of continents and islands.
5) Earth's surface layer is formed of several slowly moving tectonic plates, interacting to produce mountain ranges, volcanoes, and earthquakes.
6) Earth's liquid outer core generates the magnetic field that shapes Earth's magnetosphere, deflecting destructive solar winds.
What line do you want to learn? 6

Which words would you like to hide?
1 Earth
2 's
3 liquid
4 outer
5 core
6 generates
7 the
8 magnetic
9 field
10 that
11 shapes
12 Earth
13 's
14 magnetosphere
15 ,
16 deflecting
17 destructive
18 solar
19 winds
20 .
Select a span (e.g

In [183]:
# Prototype with one hard-coded selection: token 14 is replaced by the placeholder "[1]".
all_spans = [14] # all spans the user has chosen
span_number = 1 # reverse dict search here

pieces = []
for position, token in enumerate(line_to_learn, start=1):
    if position not in all_spans:
        pieces.append(token.text_with_ws)
        continue
    # keep the hidden token's trailing whitespace so the sentence spacing is unchanged
    pieces.append(f"[{span_number}]" + token.whitespace_)

question = "".join(pieces)
question

"Earth's liquid outer core generates the magnetic field that shapes Earth's [1], deflecting destructive solar winds."

In [10]:
## need javascript to edit stuff!! ##

# for i in range(len(all_paragraphs)):
#     p = all_paragraphs[i]
#     if p.text == "\n": # avoids \n first line
#         continue
    
#     tag = soup.new_tag("b") # making a bold element
#     tag.string = f"PARAGRAPH {i}:"
#     p.insert_before(tag)
    
# #     print(p)
# #     break

In [11]:
# with open(f"../{title} - Modified.html", "wb") as f_output:
#     f_output.write(soup.prettify("utf-8"))

## First things after first useful thing

In [None]:
# press 0.1, 0.2, 0.3 for summary paragraphs and 1.1, 2.1 etc for section/subsection


# otherwise want summaries of different links, want search function through them
# or else to move to a different site

# Retrieving all useful elements

In [None]:
# title to be placed at top of every flashcard...
# infobox is <table class="infobox">
# key_paragraphs are <p>s before table of contents
# tocs is <div id="toc" class="toc">
# h2s after that are different section names (can add to flashcards too)
# <p>s between are the text
# h3s are subsubsections etc -> recursively get these after 

# UI Building

https://en.wikipedia.org/wiki/Earth

In [83]:
# Ask for a page and accept it only when the given address is already the
# canonical address that the page itself declares.
# wikipedia_url = "https://en.wikipedia.org/wiki/Barack" # an example where the redirected address != the given address
wikipedia_url = input("What page do you want to learn from?")
try:
    # a timeout avoids hanging forever; raise_for_status() rejects HTTP error pages
    r = requests.get(wikipedia_url, allow_redirects=True, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    canonical_link = soup.find("link", rel="canonical")
    redirected_address = canonical_link.get("href") if canonical_link is not None else None

    # explicit check instead of `assert`, which is skipped when Python runs with -O
    if wikipedia_url != redirected_address:
        raise ValueError("the given address is not the page's canonical address")
except (requests.exceptions.RequestException, ValueError):
    # only network/URL problems and failed validation are reported; other errors still surface
    print("Please try again with a valid Wikipedia page.\nThis includes the correct redirected address.")

What page do you want to learn from?https://en.wikipedia.org/wiki/Earth


In [None]:
# Commands planned for the UI, kept next to a short note on what each one is for.
accepted_inputs = [
    command
    for command, _note in (
        ("h", "home"),
        ("c", "contents"),
        ("b", "back"),
        ("1.1.1", "??"),
        ("i", "infobox"),
        ("pp", "page previews and select by typing it, then select how many sentences, "
               "give number or list of sentences then edit text"),
    )
]

### Useful extra code

In [13]:
# from bs4 import BeautifulSoup
# import os

# base = os.path.dirname(os.path.abspath(__file__))
# html = open(os.path.join(base, 'example.html'))
# soup = BeautifulSoup(html, 'html.parser')

# for i in soup.find('div', {"id":None}).findChildren():
#     i.replace_with('##')

# with open("example_modified.html", "wb") as f_output:
#     f_output.write(soup.prettify("utf-8"))

In [12]:
# cleaned_text = re.sub('(\[[0-9]+\])', '', unicodedata.normalize('NFKD', p.text)).strip()
#         if cleaned_text:
#             yield cleaned_text