In [22]:
# Task 1.1
from bs4 import BeautifulSoup
import requests
import unicodedata

def get_page_content(url):
    result = ""
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "lxml")
    content = soup.find(id="content")
    pars = content.find_all("p")
    for p in pars:
        result += p.text
    return unicodedata.normalize("NFKD", result)

URL = "https://en.wikipedia.org/wiki/George_Washington"
content = get_page_content(URL)
content



In [23]:
# Task 1.2.1
# en_core_web_md was not working on my machine. I tried finding workarounds but only one I could find was using "en_core_web_sm"
import en_core_web_sm
from collections import Counter

nlp = en_core_web_sm.load()

doc = nlp(content)
labels = [ent.label_ for ent in doc.ents]

Counter(labels)

Counter({'GPE': 607,
         'PERSON': 340,
         'DATE': 304,
         'ORG': 262,
         'NORP': 238,
         'CARDINAL': 154,
         'LOC': 64,
         'ORDINAL': 38,
         'EVENT': 22,
         'FAC': 21,
         'QUANTITY': 21,
         'WORK_OF_ART': 19,
         'LAW': 18,
         'MONEY': 13,
         'PRODUCT': 12,
         'TIME': 7,
         'LANGUAGE': 1,
         'PERCENT': 1})

In [24]:
# Task 1.2.2
import re

num_reg = re.compile('^-?\\d*(\\.\\d+)?$')

tokens = [token.text for token in doc if
          (not nlp.vocab[token.text].is_stop) and 
          (num_reg.match(token.text)is None) and 
          (not nlp.vocab[token.text].is_punct)]

ct = Counter(tokens)
ct.most_common(20)

[('Washington', 392),
 ('British', 81),
 ('French', 48),
 ('American', 47),
 ('Congress', 44),
 ('General', 41),
 ('New', 41),
 ('slaves', 38),
 ('Virginia', 37),
 ('Hamilton', 36),
 ('Mount', 34),
 ('army', 33),
 ('Vernon', 27),
 ('York', 25),
 ('troops', 25),
 ('led', 23),
 ('forces', 22),
 ('Jefferson', 21),
 ('including', 20),
 ('John', 20)]

In [25]:
# Task 1.2.3

import random

sents = list(doc.sents)

k = random.randint(0, len(sents)-2)
print("random selected k = {}\n".format(k))

selected_sents=[]
for i in range(k, k+3):
    selected_sents.append(sents[i])
    print("sent #{} : {}".format(i, sents[i]))

random selected k = 72

sent #72 : Crawford allotted 23,200 acres (9,400 ha) to Washington; Washington told the veterans that their land was hilly and unsuitable for farming, and he agreed to purchase 20,147 acres (8,153 ha), leaving some feeling they had been duped.[54] He also doubled the size of Mount Vernon to 6,500 acres (2,600 ha) and, by 1775, had increased its slave population by more than a hundred.[55]
As a respected military hero and large landowner, Washington held local offices and was elected to the Virginia provincial legislature, representing Frederick County in the House of Burgesses for seven years beginning in 1758.[55]
sent #73 : He first ran for the seat in 1755 but was soundly beaten by Hugh West.[56][57] When he ran in 1758, Washington plied voters with beer, brandy, and other beverages.
sent #74 : Despite being away serving on the Forbes Expedition, he won the election with roughly 40 percent of the vote, defeating three opponents with the help of local supporte

In [26]:
# Task 1.2.4
pos_lemma = []
for sent in selected_sents:
    for token in sent:
        if not token.is_stop and token.pos_!='PUNCT':
            pos_lemma.append((token.orth_, token.pos, token.lemma))
            
print(pos_lemma)            

[('Crawford', 96, 192638075852601008), ('allotted', 100, 5126340158369994666), ('23,200', 93, 4445622139694000437), ('acres', 92, 17674329845845936116), ('9,400', 93, 3083487823071013439), ('ha', 92, 8521863001484601166), ('Washington', 96, 8812112232444892211), ('Washington', 96, 8812112232444892211), ('told', 100, 63172552626595070), ('veterans', 92, 15684110667284750590), ('land', 92, 9223110792950820932), ('hilly', 84, 17012569883275227654), ('unsuitable', 84, 2993394340582898698), ('farming', 92, 12624199841640396543), ('agreed', 100, 3895058773709925002), ('purchase', 100, 5286280168732561260), ('20,147', 93, 14869262480423655030), ('acres', 92, 17674329845845936116), ('8,153', 93, 18401747165490669014), ('ha', 92, 8521863001484601166), ('leaving', 100, 9707179535890930240), ('feeling', 92, 15873186554868950905), ('duped.[54', 85, 13423045531151013782), (']', 101, 3806482680584466996), ('doubled', 100, 11754589693395242851), ('size', 92, 18236024651932686414), ('Mount', 96, 51017

In [27]:
# Task 1.2.5
ents = []
for sent in selected_sents:
    for ent in sent.ents:
        ents.append((ent.text, ent.label_))
        
print(ents)

[('Crawford', 'ORG'), ('23,200 acres', 'QUANTITY'), ('9,400', 'CARDINAL'), ('Washington', 'GPE'), ('Washington', 'GPE'), ('20,147 acres', 'QUANTITY'), ('8,153', 'CARDINAL'), ('Mount Vernon', 'GPE'), ('6,500 acres', 'QUANTITY'), ('2,600', 'CARDINAL'), ('1775', 'DATE'), ('Washington', 'GPE'), ('Virginia', 'GPE'), ('Frederick County', 'GPE'), ('the House of Burgesses', 'ORG'), ('seven years', 'DATE'), ('first', 'ORDINAL'), ('1755', 'DATE'), ('Hugh', 'PERSON'), ('1758', 'DATE'), ('Washington', 'GPE'), ('the Forbes Expedition', 'ORG'), ('roughly 40 percent', 'PERCENT'), ('three', 'CARDINAL'), ('Washington', 'GPE')]


In [28]:
# Task 1.2.6
from spacy import displacy

displacy.render(selected_sents[0], style="ent")

In [29]:
displacy.render(selected_sents[0], style="dep")

In [30]:
# Task 1.2.7

displacy.render(doc, style="ent")

In [31]:
# Task 2.1
def make_redacted(txt):
    doc = nlp(txt)
    result = []
    for token in doc:
        if token.ent_type_ == 'PERSON':
            result.append("[REDACTED]")
        else:
            result.append(token.text)  # Use token.text instead of token.string
    return " ".join(result)

redacted_text = make_redacted(content)
print(redacted_text)


 [REDACTED] [REDACTED] ( February 22 , 1732 – December 14 , 1799 ) was an American military officer , statesman , and Founding Father who served as the first president of the United States from 1789 to 1797 . Appointed by the Second Continental Congress as commander of the Continental Army in June 1775 , Washington led Patriot forces to victory in the American Revolutionary War and then served as president of the Constitutional Convention in 1787 , which drafted and ratified the Constitution of the United States and established the American federal government . Washington has thus been called the " Father of his Country " . 
 Washington 's first public office , from 1749 to 1750 , was as surveyor of Culpeper County in the Colony of Virginia . He subsequently received military training and was assigned command of the Virginia Regiment during the French and Indian War . He was later elected to the Virginia House of Burgesses and was named a delegate to the Continental Congress in Philad

In [32]:
# Task 2.2
displacy.render(nlp(redacted_text), style="ent")