# Extracting Facts

In [9]:
# Loading libraries

import spacy
import textacy.extract

from pathlib import Path

# Python library textacy implements several common data extraction algorithms on top of spaCy, 
# including semi-structured statement extraction

In [4]:
# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

In [10]:
# Sample text that we want to examine
text = Path("london.txt").read_text()

In [11]:
# Parse the document with spaCy
doc = nlp(text)

In [27]:
# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

In [28]:
# Print the results
print("Here are the things I know about London:\n")

for statement in statements:
    subject, verb, fact = statement
    print(f"- {fact}")

Here are the things I know about London:

- the capital and most populous city of England and the United Kingdom.

- a major settlement for two millennia
- the world's most populous city from around 1831 to 1925
- beyond all comparison the largest
town in England
- still very compact
- the world's largest city from about 1831 to 1925
- the seat of the
Government of the United Kingdom
- vulnerable to flooding.

- "one of the World's
Greenest Cities" with more than 40 percent green space or open water
- the most
populous city and metropolitan area of the European Union and the second most
populous in Europe
- the 19th largest city and the 18th largest
metropolitan region in the world
- Christian, and has a large number of churches, particularly
in the City of London
- also home to sizeable Muslim, Hindu, Sikh, and Jewish
communities
- also home to 42
Hindu temples
- one of the pre-eminent financial centres of the world as the most
important location for international finance
- the world 

# Extracting Noun Chunks

In [None]:
# One of the algorithms included in textacy is called noun chunk extraction. It looks for chunks of words that 
# seem to belong together and seem to refer to a single idea. These kinds of words are often the kinds of 
# keywords that a user will type into a search box. We can use them to populate our autocomplete system

In [38]:
# Extract semi-structured statements
noun_chunks = textacy.extract.noun_chunks(doc, min_freq=3)

# The min_freq=3 parameter tells textacy to ignore any noun chunks that don’t appear at least three times in the 
# document. We are collecting common terms, so we don’t need every possible noun in the document.

In [39]:
# Convert noun chunks to lowercase strings
noun_chunks = map(str, noun_chunks)
noun_chunks = map(str.lower, noun_chunks)

# The list of noun chunks we get back will be spaCy tokens and the words will be a mix of uppercase and lowercase.
# So we need to convert them to lowercase strings 

In [40]:
# Print out any nouns that are at least 2 words long
for noun_chunk in set(noun_chunks):
    if len(noun_chunk.split(" ")) > 1:
        print(noun_chunk)


city centre
eight royal parks
european union
inner london
new york city
westminster abbey

london school

national gallery
large number
south london

eight royal parks
london's population
london eye
london school
national statistics
outer london
west london

major centre
regent's park
second world war
greater london's population
royal opera house
london underground
greater london authority
central london
major centre
greater london
2011 census
trafalgar square

population density
other city

other city
river thames
great fire
canary wharf
royal albert hall
city centre
hampstead heath

london underground
national gallery
tate modern
office space
united kingdom
british museum
east end
population density
epping forest
