<a href="https://colab.research.google.com/github/dhsgisc/cp4ai/blob/main/NLP_with_Python_First_Principles_Edition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

- Tokenizing
- Filtering Stop Words
- Stemming
- Tagging Parts of Speech
- Lemmatizing
- Chunking
- Chinking
- Using Named Entity Recognition (NER)
- Getting Text to Analyze
- Using a Concordance
- Making a Dispersion Plot
- Making a Frequency Distribution
- Finding Collocations

Reference: https://realpython.com/nltk-nlp-python/

In [1]:
# Tokenizing
# break up text into sentences
# break up sentence into words

text = "Muad'Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn. It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult."

# a sentence ends with . ! ?
terminators = ['.', '!', '?']

# convert to lowercase
text = text.lower()

# replace the other terminators with the first one (the period),
# so that a single split below handles all three
for terminator in terminators[1:]:
  text = text.replace(terminator, terminators[0])

# split text into sentences by period
sentences = text.split(terminators[0])

# a text that ends with a terminator leaves one trailing empty string;
# drop only that, so a final sentence without a terminator is kept
if sentences[-1] == '':
  sentences.pop()

# split sentence into words by space
words_all = []
for sentence in sentences:
  words_all.extend(sentence.split(' '))
# words now contain duplicates including empty strings
# (the space after each period produces an empty string)
print(words_all)
# remove duplicates, keeping the order of first appearance
words = []
for word in words_all:
  if word not in words:
    words.append(word)
# remove empty string (after de-duplication there is at most one, possibly none)
if '' in words:
  words.remove('')
print(words)
print(len(words))

# alternatively, use set to remove all duplicates (word order is lost)
#print(len(words))
#words = set(words)
#print(len(words))
#print(words)

["muad'dib", 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', '', 'and', 'the', 'first', 'lesson', 'of', 'all', 'was', 'the', 'basic', 'trust', 'that', 'he', 'could', 'learn', '', "it's", 'shocking', 'to', 'find', 'how', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn,', 'and', 'how', 'many', 'more', 'believe', 'learning', 'to', 'be', 'difficult']
["muad'dib", 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', 'and', 'the', 'lesson', 'of', 'all', 'basic', 'trust', 'that', 'he', 'could', "it's", 'shocking', 'find', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn,', 'more', 'learning', 'be', 'difficult']
37


In [None]:
def tokenise(text):
  """Return the distinct lowercase words of text, in order of first appearance.

  The text is lowercased, '!' and '?' are treated like '.', the text is
  split into sentences on '.', and each sentence is split into words on
  whitespace. Punctuation other than the three sentence terminators stays
  attached to its word (for example 'learn,').

  text: the string to tokenise; it does not have to end with a terminator.
  returns: a list of unique words; an empty list when text has no words.
  """
  # convert to lowercase
  text = text.lower()

  # replace terminators in sentence with period
  for terminator in ('!', '?'):
    text = text.replace(terminator, '.')

  seen = set()  # constant-time "have we met this word?" check
  words = []    # keeps the words in the order they first appear
  # split text into sentences by period; the empty piece after a final
  # period simply yields no words, so nothing has to be popped
  for sentence in text.split('.'):
    # split() with no argument splits on any run of whitespace and never
    # produces empty strings
    for word in sentence.split():
      if word not in seen:
        seen.add(word)
        words.append(word)
  return words

# main: tokenise the sample passage and show the resulting word list
text = (
  "Muad'Dib learned rapidly because his first training was in how to learn. "
  "And the first lesson of all was the basic trust that he could learn. "
  "It's shocking to find how many people do not believe they can learn, "
  "and how many more believe learning to be difficult."
)
words = tokenise(text)
print(words)

["muad'dib", 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', 'and', 'the', 'lesson', 'of', 'all', 'basic', 'trust', 'that', 'he', 'could', "it's", 'shocking', 'find', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn,', 'more', 'learning', 'be', 'difficult']


In [None]:
# Filtering stop words
stop_words = ['a', 'an', 'the'] # build up iteratively

# Build a new list instead of calling words.remove() inside the loop:
# removing from a list while iterating over it shifts the remaining items
# left, so the item right after each removed word is never examined.
words = [word for word in words if word not in stop_words]
print(words)

["muad'dib", 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', 'and', 'lesson', 'of', 'all', 'basic', 'trust', 'that', 'he', 'could', "it's", 'shocking', 'find', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn,', 'more', 'learning', 'be', 'difficult']


In [None]:
def filter_stop_words(words, stop_words=None):
  """Return the words that are not stop words, keeping their order.

  A new list is returned and the list passed in is left unchanged, so use
  the return value: words = filter_stop_words(words).

  words: list of words to filter.
  stop_words: optional collection of words to drop; defaults to
    ['a', 'an', 'the'].
  returns: a new list holding every word that is not a stop word.
  """
  if stop_words is None:
    stop_words = ['a', 'an', 'the'] # build up iteratively

  # Filtering into a new list avoids removing items from the list being
  # iterated, which would skip the word following each removed one.
  return [word for word in words if word not in stop_words]

In [None]:
# Stemming
text_to_stem = "The crew of the USS Discovery discovered many discoveries. Discovering is what explorers do."

# convert to lowercase
text_to_stem = text_to_stem.lower()

words = tokenise(text_to_stem)
words = filter_stop_words(words)

# array
#stemmer = ['learn', 'learns', 'learned', 'learnt', 'learner', 'learning']
stemmer = ['discover', 'discovers', 'discovered', 'discovery', 'discoveries', 'discovering']
stem = stemmer[0] # first item
#for i in range(len(words)):
#  if words[i] in stemmer:
#    words[i] = stem
#print(words)

# dictionary: stem -> every word form that reduces to that stem
# learn - learn, learned, learnt, learner, learning
#stemmer_dict = {'learn': ['learn', 'learns', 'learned', 'learnt', 'learner', 'learning']}
stemmer_dict = {'discover': ['discover', 'discovers', 'discovered', 'discovery', 'discoveries', 'discovering']}

# Invert the dictionary into word form -> stem. Every stem in stemmer_dict
# is then applied (not just the first key), and each word needs one lookup.
stem_of = {}
for stem, forms in stemmer_dict.items():
  for form in forms:
    stem_of[form] = stem

# words with no known stem are kept unchanged
words = [stem_of.get(word, word) for word in words]
print(words)

['the', 'crew', 'of', 'the', 'uss', 'discovery', 'discovered', 'many', 'discoveries', '', 'discovering', 'is', 'what', 'explorers', 'do']
['crew', 'of', 'uss', 'discover', 'discover', 'many', 'discover', 'discover', 'is', 'what', 'explorers', 'do']


In [None]:
# main: tokenise the stemming sample, then drop its stop words
words = filter_stop_words(tokenise(text_to_stem))
words

In [None]:
# Tagging Parts of Speech
'''
JJ	Adjectives
NN	Nouns
RB	Adverbs
PRP	Pronouns
VB	Verbs
'''

text_to_tag_pos = "If you wish to make an apple pie from scratch, you must first invent the universe."

# tokenise the sentence, then drop the stop words
words = filter_stop_words(tokenise(text_to_tag_pos))

# array version, kept for comparison
#pos_tags = []
#pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'my', 'your', 'his', 'her', 'our', 'us', 'their']
#tag = 'PRP'
#for word in words:
#  if word in pronouns:
#    pos_tags.append([word, tag])
#print(pos_tags)

# dictionary version: tag -> the words that carry that tag
pronouns_dict = {'PRP': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'my', 'your', 'his', 'her', 'our', 'us', 'their']}
# iterating a dict yields its keys, so this picks the first (and only) tag
tag = next(iter(pronouns_dict))
tagged_words = pronouns_dict[tag]
# one [word, tag] pair for every word found under that tag
pos_tags = [[word, tag] for word in words if word in tagged_words]
print(pos_tags)

PRP
[['you', 'PRP']]


In [None]:
# Lemmatizing
# scarf - scarves
# knife - knives
# bad - worse, worst

# array: a base form followed by its plural
lemma = ['scarf', 'scarves']

# dictionary: base form -> plural
# (rebinding the name replaces the array version above)
lemma = {
  'scarf': 'scarves',
  'knife': 'knives',
}

In [None]:
# Chunking


In [None]:
# Chinking


In [None]:
# Using Named Entity Recognition (NER)
'''
NE	          Examples
ORGANIZATION	Georgia-Pacific Corp., WHO
PERSON	      Eddy Bonte, President Obama
LOCATION	    Murray River, Mount Everest
DATE	        June, 2008-06-29
TIME	        two fifty a m, 1:30 p.m.
MONEY	        175 million Canadian dollars, GBP 10.40
PERCENT	      twenty pct, 18.75 %
FACILITY	    Washington Monument, Stonehenge
GPE	          South East Asia, Midlothian
'''

In [None]:
# Getting Text to Analyze


In [None]:
# Using a Concordance


In [None]:
# Making a Dispersion Plot


In [None]:
# Making a Frequency Distribution
# store words with their frequencies

text = "the quick brown fox jumps over the lazy dog"
words = text.split(' ')

# array: freqs[i] is the number of times words[i] occurs in the text,
# so freqs runs parallel to words (repeated words repeat their count)
freqs = [words.count(word) for word in words]
print(freqs)

# dictionary: word -> number of occurrences, one entry per distinct word
word_freq = {}
for word in words:
  # get() supplies 0 the first time a word is seen
  word_freq[word] = word_freq.get(word, 0) + 1
print(word_freq)

# sort descending by frequency; sorted() is stable, so words with the
# same frequency keep the order in which they first appeared
word_freq_sorted = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)
print(word_freq_sorted)


In [None]:
# Finding Collocations
# common word combinations
# medium build; social drinker; non smoker; long term; would like; easy going; 
# financially secure; quiet night; well presented; never married; single mum; 
# permanent relationship; slim build; year old; similar interest; fun time; 

text = "the quick brown fox jumps over the lazy dog"
words = text.split(' ')
print(words)
# pair each word with its right-hand neighbour: n words give n-1 pairs,
# because zip stops at the end of the shorter (shifted) list
word_pairs = [[left, right] for left, right in zip(words, words[1:])]
print(word_pairs)

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
[['the', 'quick'], ['quick', 'brown'], ['brown', 'fox'], ['fox', 'jumps'], ['jumps', 'over'], ['over', 'the'], ['the', 'lazy'], ['lazy', 'dog']]


In [None]:
s = "s1. s2? s3!"
# turn every sentence terminator into a period
for mark in ('?', '!'):
  s = s.replace(mark, '.')
print(s)
# the text ends with a period, so the last piece of the split is an empty
# string; slice it off
lines = s.split('.')[:-1]
lines

s1. s2. s3.


['s1', ' s2', ' s3']

In [None]:
d = {'discover': ['discover', 'discovers', 'discovered', 'discovery', 'discoveries', 'discovering']}
# iterating a dict yields its keys in insertion order, so the first item
# produced is the first key
print(next(iter(d)))

discover
