In [1]:
# LOAD THE FILES FOR THIS NOTEBOOK
# Fetch the course archive and save it as 'Class 7.zip'.
# NOTE(review): --no-check-certificate disables TLS certificate verification for this download.
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1cuIbCCZZutqlO-d9s9KxI6GxfbHd8rnb' -O 'Class 7.zip'
from zipfile import ZipFile
# extractall() with no arguments unpacks into the current working directory.
# Presumably this archive supplies subtlex_words.txt, which a later cell opens -- confirm.
with ZipFile('Class 7.zip', 'r') as zipObj:
  zipObj.extractall()

# Fetch and unpack the Stanford CoreNLP 4.5.1 distribution; a later cell passes
# the path 'stanford-corenlp-4.5.1' to StanfordCoreNLP.
# NOTE(review): this wget is unconditional, so the archive is fetched again on every run.
!wget https://nlp.stanford.edu/software/stanford-corenlp-4.5.1.zip
with ZipFile('stanford-corenlp-4.5.1.zip', 'r') as zipObj:
  zipObj.extractall()

--2022-11-22 00:52:56--  https://nlp.stanford.edu/software/stanford-corenlp-4.5.1.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.5.1.zip [following]
--2022-11-22 00:52:56--  https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.5.1.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 505225173 (482M) [application/zip]
Saving to: ‘stanford-corenlp-4.5.1.zip’


2022-11-22 00:54:28 (5.28 MB/s) - ‘stanford-corenlp-4.5.1.zip’ saved [505225173/505225173]



<b>LING 193 - Lecture 20<br>
Basic syntax applications in Python</b><br>
Andrew McInnerney<br>
November 9, 2022

First, I'm loading in our autocorrect functions from earlier in the course.

In [2]:
def firstrow(m):
    """Return the top row of an edit-distance table: the integers 0 through m."""
    return list(range(m + 1))

def substitution_penalty(letter1, letter2):
    """Return the substitution cost: 0 when the letters are equal, 2 otherwise."""
    return 0 if letter1 == letter2 else 2

def nextrow(priorrow, word1, letter):
    """Compute the next row of the edit-distance table.

    priorrow -- the previously computed row (len(word1) + 1 entries)
    word1    -- the word laid out along the columns of the table
    letter   -- the letter of the second word that this row corresponds to

    Returns a new list of len(word1) + 1 costs.
    """
    # The leftmost cell is always one more than the cell directly above it.
    row = [priorrow[0] + 1]
    for col, column_letter in enumerate(word1, start=1):
        insertion = priorrow[col] + 1
        deletion = row[-1] + 1  # builds on the cell just computed in this row
        substitution = priorrow[col - 1] + substitution_penalty(column_letter, letter)
        row.append(min(insertion, deletion, substitution))
    return row

def minedit(word1, word2):
    """Build the full edit-distance table between word1 and word2.

    The table has len(word2) + 1 rows of len(word1) + 1 cells each;
    callers read the final distance from table[-1][-1].
    """
    table = [firstrow(len(word1))]
    for letter in word2:
        # Each row is derived from the row appended just before it.
        table.append(nextrow(table[-1], word1, letter))
    return table

def citformat(word):
  """Return word in citation form: surrounding punctuation removed, upper-cased.

  Only leading and trailing punctuation is stripped; punctuation inside the
  word (e.g. the apostrophe in "don't") is kept.

  An empty string, or a token made up entirely of punctuation, yields ""
  (the previous index-based loops raised IndexError on such input).
  """
  punctuation = "!?,.-~:;'\""
  return word.strip(punctuation).upper()

# Read the word list: each line is expected to hold a word followed by an
# integer frequency, separated by whitespace.
# This changes our dictionary to a shorter one of more frequent words
with open("subtlex_words.txt") as file:
    words = file.read().splitlines()

# Map each upper-cased word to its frequency; the first occurrence wins.
frequencies = {}
for entry in words:
    fields = entry.split()
    word = fields[0].upper()
    freq = int(fields[1])
    frequencies.setdefault(word, freq)

# Keep only the words whose frequency is above 12.
dictionary = {}
for known_word, count in frequencies.items():
    if count > 12:
        dictionary[known_word] = count

def correct(typo):
    """Return the dictionary word that best corrects typo.

    Every word in the module-level `dictionary` is scored with minedit();
    among the words tied for the smallest edit distance, the one with the
    highest frequency is returned.

    typo -- the (upper-cased) word to correct

    Raises ValueError (from max) only if `dictionary` is empty.
    """
    candidates = []
    # Start from infinity rather than a fixed cap: with a cap of 100, a typo
    # whose distance to every word exceeded 100 left no candidates at all.
    mindist = float("inf")
    for word in dictionary:
        distance = minedit(typo, word)[-1][-1]
        if distance < mindist:
            # Strictly better match: discard the previous candidates.
            candidates = [word]
            mindist = distance
        elif distance == mindist:
            candidates.append(word)
    # Break ties between equally close words by frequency.
    return max(candidates, key=dictionary.get)

Creating some simple sentences of the form:

Det N V Det N<br>
N Aux Adv V Det Adj N<br>
N Aux V P Det N

In [3]:
# Test sentences, each containing one deliberate typo (ths, anl, ix).
# The typo in s2 is "anl", matching the discussion and the later cell that
# rebuilds these same sentences.
s1 = "The manager hired ths employees".split()
s2 = "John will not buy anl tomatoes".split()
s3 = "We are going to take ix classes".split()

"Correcting" these sentences:

In [4]:
# Correct each sentence word by word, then print it with a leading capital.
for sentence in (s1, s2, s3):
  # correct() works on upper-case words; slice assignment updates the list in place.
  sentence[:] = [correct(token.upper()).lower() for token in sentence]
  print(" ".join(sentence).capitalize())

The manager hired this employees
John will not buy an tomatoes
We are going to take i classes


# 2 Part-of-Speech tagging

Notice that some of the words aren't corrected accurately. 
- The word *ths* should be corrected to *the*, not *this*.
- The word *anl* should be corrected to *any*, not *an*.
- The word *ix* should be corrected to *six*, not *i*.

We could take various approaches to solve each of these problems. For the word *an*, for instance, we could create a special rule that says *an* should only be considered if the next word starts with a vowel.

For the other examples, we may want to consider part of speech tagging. Take the first sentence. We could penalize *this* as a possible correction if the following noun is plural.

To do this, we would need part-of-speech information. We can get this from Stanford CoreNLP, using the `stanfordcorenlp` Python package. Below, we install that package and load the part-of-speech tagger we need.

In [None]:
# Installing stanford's Core NLP into this notebook
# This installs the `stanfordcorenlp` package, from which the next cell imports StanfordCoreNLP.
%pip install stanfordcorenlp

In [None]:
# Importing into Python
from stanfordcorenlp import StanfordCoreNLP
# The argument is a path; presumably it is the directory unpacked from
# stanford-corenlp-4.5.1.zip in the first cell -- confirm.
# NOTE(review): the name `nlp` is rebound to a stanza pipeline in the Parsing
# section; check whether this object needs to be closed before that happens.
nlp = StanfordCoreNLP('stanford-corenlp-4.5.1')

Now take a look at the information we get from our control sentences, using the `pos_tag()` method:

In [None]:
# Control sentences: the intended, correctly spelled versions.
s1 = "The manager hired the employees."
s2 = "John will not buy any tomatoes."
s3 = "We are going to take six classes."

# Print the (word, tag) pairs for each sentence.
for control_sentence in (s1, s2, s3):
  print(nlp.pos_tag(control_sentence))

[('The', 'DT'), ('manager', 'NN'), ('hired', 'VBD'), ('the', 'DT'), ('employees', 'NNS'), ('.', '.')]
[('John', 'NNP'), ('will', 'MD'), ('not', 'RB'), ('buy', 'VB'), ('any', 'DT'), ('tomatoes', 'NNS'), ('.', '.')]
[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('take', 'VB'), ('six', 'CD'), ('classes', 'NNS'), ('.', '.')]


And here's what we get for our test sentences:

In [None]:
test_sentences = (
  "The manager hired ths employees",
  "John will not buy anl tomatoes",
  "We are going to take ix classes",
)

# Correct each sentence word by word (upper-case in, lower-case out) and
# rejoin the words into a single string for the tagger.
s1, s2, s3 = (
  " ".join(correct(token.upper()).lower() for token in sentence.split())
  for sentence in test_sentences
)

for corrected in (s1, s2, s3):
  print(nlp.pos_tag(corrected))

[('the', 'DT'), ('manager', 'NN'), ('hired', 'VBD'), ('this', 'DT'), ('employees', 'NNS')]
[('john', 'NNP'), ('will', 'MD'), ('not', 'RB'), ('buy', 'VB'), ('an', 'DT'), ('tomatoes', 'NNS')]
[('we', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('take', 'VB'), ('i', 'PRP'), ('classes', 'NNS')]


Note: You can find a list of part-of-speech tags [here](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).

To correct *ths* to *the* instead of *this*, we need to penalize the word *this* in the context of a whole sentence, taking into account the part-of-speech of the following word. We would want to define a function `autocorrect()` that takes into account not only the minedit distance between two words, but also rules about which categories can occur where. If you choose the coding option for your final project, you might consider implementing some rules like the ones described above.

# 3 Bigrams

To solve our third error (correcting *ix* to *six* instead of *i*), we need to refer to *bigrams*. We can get the information we need from nltk's corpora. We will specifically use the Brown Corpus:

In [None]:
# Install NLTK into the kernel's environment; the next cell imports it and downloads the Brown Corpus.
%pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
nltk.download('brown')
from nltk.corpus import brown

# Measure the corpus once and reuse the counts in both summary lines.
n_texts = len(brown.fileids())
n_sents = len(brown.sents())
n_words = len(brown.words())
print("the brown corpus contains", n_texts, "texts with a total of", n_sents, "sentences and", n_words, "words")
print("there are an average of", n_words / n_sents, "words per sentence")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


the brown corpus contains 500 texts with a total of 57340 sentences and 1161192 words
there are an average of 20.250994070456922 words per sentence


Here I'm turning all the sentences in the Brown Corpus into tagged lists.

In [None]:
# Every sentence of the corpus as a sequence of (word, tag) pairs.
browntags = brown.tagged_sents()
print("The browntags object is a list of all the sentences of the brown corpus with part-of-speech tags.")
print("Example:")
# Show the first ten tagged sentences.
for tagged_sentence in browntags[:10]:
  print(tagged_sentence)

The browntags object is a list of all the sentences of the brown corpus with part-of-speech tags.
Example:
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
[('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('

Now we can calculate part-of-speech bigrams.

In [None]:
bigrams = {} # Maps "TAG_NEXTTAG" to the number of times that tag pair occurs
for sentence in browntags: # Look at each tagged sentence in browntags
  # Pair every (word, tag) entry with the entry that follows it
  for current, following in zip(sentence, sentence[1:]):
    bigram = current[1]+"_"+following[1] # Join the two part-of-speech labels
    bigrams[bigram] = bigrams.get(bigram, 0) + 1 # Start at 0 the first time a pair is seen

print("We have identified",len(bigrams),"bigrams")

We have identified 8053 bigrams


Let's take a look at the most and least common of these:

In [None]:
# Bigram labels ordered from most to least frequent.
sortedbigrams = sorted(bigrams, key=bigrams.get, reverse = True)
# The ten most common bigrams.
for bigram in sortedbigrams[:10]:
  print(bigram, bigrams[bigram])
print("...")
# The ten least common bigrams, rarest first. The stop index must be -11:
# a reversed slice ending at -10 excludes that position and yields only nine.
for bigram in sortedbigrams[:-11:-1]:
  print(bigram, bigrams[bigram])

AT_NN 48372
IN_AT 43271
NN_IN 42252
JJ_NN 28407
NN_. 19857
AT_JJ 19487
NN_, 18279
IN_NN 17225
NNS_IN 14504
TO_VB 12291
...
FW-NN_QL 1
BE_FW-RB 1
PPO_BEZ* 1
FW-PP$-TL_'' 1
FW-NN-TL_FW-PP$-TL 1
FW-UH-TL_FW-NN-TL 1
``_FW-UH-TL 1
NP$-TL_CC-TL 1
PPO_FW-NN 1


Let's keep only the most frequent bigrams:

In [None]:
# Keep only the 800 most frequent bigram labels.
sortedbigrams = sortedbigrams[:800]

# Show both ends of the shortened list.
most_frequent, least_frequent = sortedbigrams[:10], sortedbigrams[-10:]
for tag_pair in most_frequent:
  print(tag_pair, bigrams[tag_pair])
print("...")
for tag_pair in least_frequent:
  print(tag_pair, bigrams[tag_pair])

To get the right correction in `s3` (i.e. turning *ix* into *six* not *i*), we need to take bigram frequency into account in our `autocorrect()` function. We should have some logic that boosts the score of a particular word if it creates a more frequent bigram.

Another strategy is to use *trigrams*. There will be many more of these to calculate, but the principles are basically the same.

# 4 Parsing
Here we will look at Stanford's `stanza` package. First we need to bring some files into Google Colab, which we can do with this code:

In [None]:
# Install stanza into the kernel's environment, then import it.
%pip install stanza
import stanza
# Download stanza's English resources.
stanza.download('en')
# Build an English pipeline with the tokenize, pos and constituency processors.
# NOTE: this rebinds `nlp`, which earlier in the notebook referred to the StanfordCoreNLP object.
nlp = stanza.Pipeline('en',processors='tokenize,pos,constituency')

We can use stanza to generate constituent structures for example sentences. Here is some code to do that:

In [None]:
# Parse a two-sentence example text.
doc = nlp('I am watching my computer do something interesting. It is great.')
# Print the children of each sentence's constituency root.
for parsed_sentence in doc.sentences:
  print(parsed_sentence.constituency.children)

((S (NP (PRP I)) (VP (VBP am) (VP (VBG watching) (S (NP (PRP$ my) (NN computer)) (VP (VB do) (NP (NP (NN something)) (ADJP (JJ interesting))))))) (. .)),)
((S (NP (PRP It)) (VP (VBZ is) (ADJP (JJ great))) (. .)),)


Note: The constituent labels can be found [here](http://surdeanu.cs.arizona.edu/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html).

I'm now going to calculate the frequency of category bigrams. I.e., if a constituent of a given type immediately contains another of a given type, then that's a bigram.

For example, an NP like *the dog*, which consists of a determiner *the* and a noun *dog*, would have bigrams (NP, Det) and (NP, N).

To calculate category bigrams, I'm going to use a couple of functions:

In [None]:
def get_treelet_bigrams(treelet):
  """Add the (mother, daughter) category pairs of one tree node to CFG_bigrams.

  treelet -- a node exposing `.label` and `.children`

  Updates the module-level dict CFG_bigrams in place: CFG_bigrams[mother][daughter]
  is increased by one for every daughter of the node. A node without children
  still gets an (empty) entry under its own label.

  Every daughter is counted, including on the first sighting of a mother
  category; previously a label repeated among the daughters of a newly seen
  category was recorded as 1 instead of its true count.
  """
  # Fetch the counts for this mother category, creating them on first sight.
  daughter_counts = CFG_bigrams.setdefault(treelet.label, {})
  for daughter in treelet.children:
    subcat = daughter.label
    daughter_counts[subcat] = daughter_counts.get(subcat, 0) + 1

In [None]:
def get_CFG_bigrams(tree):
  """Record category bigrams for tree and all of its descendants.

  Walks the tree one level at a time and calls get_treelet_bigrams() on every
  node whose type is exactly stanza's constituency Tree; anything else is skipped.
  """
  level = [tree]
  while level:
    next_level = []
    for node in level:
      if type(node) == stanza.models.constituency.parse_tree.Tree:
        get_treelet_bigrams(node)
        # Queue this node's children for the next pass.
        next_level.extend(node.children)
    level = next_level

We can use these to create a dictionary keeping track of the category bigrams in a set of sentences, like this:

In [None]:
# Start from an empty table, then collect bigrams from every parsed sentence.
CFG_bigrams = {}
for parsed_sentence in doc.sentences:
  # Pass the first child of the constituency root to the collector.
  get_CFG_bigrams(parsed_sentence.constituency.children[0])
CFG_bigrams

This is a very simple example.

What's more useful is to do this with a large set of sentences. We can use the Brown Corpus, for instance. I'm just going to use the first 100 sentences in the corpus, since sentences take a while to process.

In [None]:
# Rebuild the first 100 Brown Corpus sentences as one space-separated string
# and run it through the pipeline.
brownstr = ' '.join(' '.join(tokens) for tokens in brown.sents()[:100])
doc = nlp(brownstr)

And now I'll get CFG bigrams.

In [None]:
# Reset the table, then collect bigrams from the parsed Brown Corpus sample.
CFG_bigrams = {}
for parsed_sentence in doc.sentences:
  # Pass the first child of the constituency root to the collector.
  get_CFG_bigrams(parsed_sentence.constituency.children[0])
CFG_bigrams

Most of these are lexical items. I'll drop those.

In [None]:
# Snapshot the table so entries can be removed from the original while scanning.
old = CFG_bigrams.copy()
# Categories that never had daughters carry an empty dict; remove them in place.
childless = [category for category, daughters in old.items() if daughters == {}]
for category in childless:
  CFG_bigrams.pop(category)
CFG_bigrams

The results are a list of observed frequencies for each rule. With a large enough corpus, we could get a good measure of the probability of any given tree branch. We can then calculate the probability of a full tree by factoring in the probabilities of all its branches.

In theory, we can use the probability of trees to weigh an autocorrection or speech recognition system.

