<a href="https://colab.research.google.com/github/guilhermelaviola/NaturalLanguageProcessing/blob/main/Class13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Natural Language Pipelines and Batch Processing**

In [3]:
# Importing all the necessary resources:
! pip3 install spacy
! python3 -m spacy download en_core_web_sm # Download the English model 'en_core_web_sm'
! pip3 install wikipedia

import spacy
import wikipedia
import json
import os
import uuid
import datetime
import tqdm

wikipedia.set_lang('en')
nlp = spacy.load('en_core_web_sm') # Load the downloaded English model

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
# Loading the text:
text = wikipedia.page('NLP')
text
text.content



  lis = BeautifulSoup(html).find_all('li')


DisambiguationError: "LP" may refer to: 
Limited partnership
Labour Party (disambiguation)
Liberal Party
Libertarian Party (United States)
Liberty Party (disambiguation)
Lycée professionnel
primary schools
Lorne Park Secondary School
LAN Perú
Louisiana-Pacific
lowercase people
Ladakh Police
Lonely Planet
Liberapay
lp (Unix)
Lp space
ℓp space
LPMud
printer
line printer
Larch Prover
Linear programming
LivePerson
Logic programming
Psychologist
Lumbar puncture
Liquefied petroleum gas
Liquid propane
Low power electronics
Low-power broadcasting
Low precipitation supercell
Sound pressure level
Star catalogue#Proper motion catalogues
Logic of Paradox
LP record
LP (singer)
El-P
Latin Percussion
Laxmikant–Pyarelal
Gibson Les Paul
Linkin Park
Liam Payne
Lil Peep
LP (Ambulance LTD album)
LP (Discovery album)
LP (Holy Fuck album)
LP (Insomniac Folklore album)
LP!
LP1 (Liam Payne album)
LP (Landon Pigg album)
The LP
L.P. (The Rembrandts album)
LP (Soviettes album)
Lateral pass
Lesson plan
Let's Play
Liquidity provider
Listening post
dwarfism
Lower Peninsula of Michigan
United Nations laissez-passer

In [6]:
# Processing text with spaCy and exploring functions:
doc = nlp(text.content)
type(doc)

doc[0]
len(doc)

print(doc.to_json())
print(json.dumps(doc.to_json(), indent=4, ensure_ascii=False))

doc[0].pos_
doc[0].dep_
doc[0].head

(doc[0].text, doc[0].pos_ + "/" + doc[0].dep_, doc[0].head)

NameError: name 'text' is not defined

In [None]:
# Defining functions:
def extract_syntax(doc):
  output = []
  for token in doc:
    output.append((token.text, token.pos_ + '/' + token.dep_, token.head.text))
    return output

extract_syntax(doc)

def extract_subjects(doc):
  output = []
  for token in doc:
    if token.dep_ == 'nsubj':
      output.append((token.text, token.head.text))
      return output

extract_subjects(doc)

In [None]:
# Defining the pipeline:
def pipeline(text, nlp):
  doc = nlp(text)
  output = doc.to_json()
  output['syntax'] = extract_syntax(doc)
  output['subjects'] = extract_subjects(doc)
  return output

pipeline('Natural Language Processing is a sub-area of ​​artificial intelligence.', nlp)

In [None]:
# Creating folders and log file:
if not os.path.isdir('raw_texts'):
  os.mkdir('raw_texts')

if not os.path.isdir('processed_texts'):
  os.mkdir('processed_texts')

log_file = open('log.txt', 'a')

In [None]:
# Extracting documents:
pages = []
for letra in 'abcdefghijklmnopqrstuvwxyz':
  pages.extend(wikipedia.search(letter, results=5))

len(pages)
pages
uuid.uuid4()

for page in tqdm.tqdm(pages):
  text = wikipedia.page(page).content
  except wikipedia.DisambiguationError:
    log_file.write(f'[{str(datetime.datetime.now())}] DisambiguationError: {page}\n')
    except wikipedia.PageError:
      log_file.write(f'[{str(datetime.datetime.now())}] PageError: {page}\n')
      else:
        with open(f'raw_texts/{uuid.uuid4()}.txt', 'w') as f:
          f.write(text)
          log_file.write(f'[{str(datetime.datetime.now())}] OK: {page}\n')

len(os.listdir('raw_texts'))

In [None]:
# Transforming and storing documents:
nlp = spacy.load('en_core_web_sm')

for file in tqdm.tqdm(os.listdir('raw_texts')):
  with open('raw_texts/' + file) as f:
    text= f.read()

    processed = pipeline(text, nlp)
    timestamp = str(datetime.datetime.now())
    processed['timestamp'] = timestamp
    log_file.write(f"[{str(datetime.datetime.now())}] Pipeline: {file}\n")

    with open('processed_texts/' + file + '.json', 'w') as f:
      json.dump(processed, f)

    os.remove('raw_texts/' + file)

log_file.close()