In [1]:
# LOAD THE FILES FOR THIS NOTEBOOK
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1sVJ9Tq6KYxf3_DzWiDJoxnMgghHo22Md' -O 'nltk_stuff.zip'
from zipfile import ZipFile
with ZipFile('nltk_stuff.zip', 'r') as zipObj:
  zipObj.extractall('../root/nltk_data/taggers')

!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1cuIbCCZZutqlO-d9s9KxI6GxfbHd8rnb' -O 'Class 7.zip'
from zipfile import ZipFile
with ZipFile('Class 7.zip', 'r') as zipObj:
  zipObj.extractall()



<b>LING 193 - Lecture 19<br>
Basic syntax applications in Python</b><br>
Andrew McInnerney<br>
November 7, 2022

First, I'm loading in our autocorrect functions from earlier in the course.

In [2]:
def firstrow(m):
    return [i for i in range(m+1)]

def substitution_penalty(letter1, letter2):
    if letter1 == letter2:
        return 0
    else:
        return 2

def nextrow(priorrow, word1, letter):
    row = [priorrow[0] + 1]
    priorcell = row[0]
    for i in range(len(word1)):
        insertion = priorrow[i+1] + 1
        deletion = priorcell + 1
        substitution = priorrow[i] + substitution_penalty(word1[i], letter)
        priorcell = min(insertion, deletion, substitution)
        row.append(priorcell)
    return row

def minedit(word1, word2):
    m = len(word1)
    n = len(word2)
    priorrow = firstrow(m)
    table = [priorrow]
    for i in range(n):
        row = nextrow(priorrow, word1, word2[i])
        table.append(row)
        priorrow = row
    return table

def citformat(word):
  punctuation = ["!","?",",",".","-","~",":",";", "'", '"']
  while word[-1] in punctuation:
    word = word[:-1]
  while word[0] in punctuation:
    word = word[1:]
  return word.upper()

with open("subtlex_words.txt") as file: # This changes our dictionary to a shorter one of more frequent words
    words = file.read().splitlines()
frequencies = {}
for entry in words:
    word = entry.split()[0].upper()
    freq = int(entry.split()[1])
    if word not in frequencies:
        frequencies[word] = freq
dictionary = {word:freq for word,freq in frequencies.items() if freq > 12}

def correct(typo):
    errors = {}
    mindist = 100
    for word in dictionary:
        table = minedit(typo,word)
        distance = table[-1][-1]
        if distance < mindist:
          errors = {}
          mindist = distance
        if distance <= mindist:
          errors[word] = distance
    return max(errors, key = dictionary.get)

Creating some simple sentences of the form:

Det N V Det N<br>
N Aux Adv V Det Adj N<br>
N Aux V P Det N

In [3]:
s1 = "The manager hired ths employees".split()
s2 = "John will not buy anl tomatoes".split()
s3 = "We are going to take ix classes".split()

"Correcting" these sentences:

In [4]:
for i in range(len(s1)):
  s1[i] = correct(s1[i].upper()).lower()
print(" ".join(s1).capitalize())

for i in range(len(s2)):
  s2[i] = correct(s2[i].upper()).lower()
print(" ".join(s2).capitalize())

for i in range(len(s3)):
  s3[i] = correct(s3[i].upper()).lower()
print(" ".join(s3).capitalize())

The manager hired this employees
John will not buy an tomatoes
We are going to take i classes


# 2 Part-of-Speech tagging

Notice that some of the words aren't corrected accurately. 
- The word *ths* should be corrected to *the*, not *ths*. 
- The word *anl* should be corrected to *any*, not *an*.
- The word *ix* should be corrected to *six*, not *i*.

We could take various approaches to solve each of these problems. For the word *an*, for instance, we could create a special rule that says *an* should only be considered if the next word starts with a vowel.

For the other examples, we may want to consider part of speech tagging. Take the first sentence. We could penalize *this* as a possible correction if the following noun is plural.

To do this, we would need part-of-speech information. We can get this from the Natural Language Toolkit (nltk). Below, we load in nltk, along with the part-of-speech tagger we need.

In [5]:
# Installing nltk into this notebook
%pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# Importing into Python
import nltk

Now take a look at the information we get from our control sentences, using the `pos_tag()` method:

In [7]:
s1 = "The manager hired the employees".split()
s2 = "John will not buy any tomatoes".split()
s3 = "We are going to take six classes".split()

print(nltk.pos_tag(s1))
print(nltk.pos_tag(s2))
print(nltk.pos_tag(s3))

[('The', 'DT'), ('manager', 'NN'), ('hired', 'VBD'), ('the', 'DT'), ('employees', 'NNS')]
[('John', 'NNP'), ('will', 'MD'), ('not', 'RB'), ('buy', 'VB'), ('any', 'DT'), ('tomatoes', 'NNS')]
[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('take', 'VB'), ('six', 'CD'), ('classes', 'NNS')]


And here's what we get for our test sentences:

In [8]:
s1 = "The manager hired ths employees".split()
s2 = "John will not buy anl tomatoes".split()
s3 = "We are going to take ix classes".split()

for i in range(len(s1)):
  s1[i] = correct(s1[i].upper()).lower()

for i in range(len(s2)):
  s2[i] = correct(s2[i].upper()).lower()

for i in range(len(s3)):
  s3[i] = correct(s3[i].upper()).lower()

print(nltk.pos_tag(s1))
print(nltk.pos_tag(s2))
print(nltk.pos_tag(s3))

[('the', 'DT'), ('manager', 'NN'), ('hired', 'VBD'), ('this', 'DT'), ('employees', 'NNS')]
[('john', 'NN'), ('will', 'MD'), ('not', 'RB'), ('buy', 'VB'), ('an', 'DT'), ('tomatoes', 'NN')]
[('we', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('take', 'VB'), ('i', 'NN'), ('classes', 'NNS')]


To correct *ths* to *the* instead of *this*, we need to penalize the word *this* in the context of a whole sentence, taking into account the part-of-speech of the following word. We would want to define a function `autocorrect()` that takes into account not only the minedit distance between two words, but also rules about which categories can occur where. 

# 3 Bigrams

To solve our third error (correcting *ix* to *six* instead of *i*), we need to refer to *bigrams*. We can get the information we need from nltk's corpora. We will specifically use teh Brown Corpus:

In [9]:
nltk.download('brown')
from nltk.corpus import brown
print("the brown corpus contains",len(brown.fileids()),"texts with a total of",len(brown.sents()),"sentences and",len(brown.words()),"words")
print("there are an average of",len(brown.words())/len(brown.sents()),"words per sentence")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


the brown corpus contains 500 texts with a total of 57340 sentences and 1161192 words
there are an average of 20.250994070456922 words per sentence


Here I'm turning all the sentences in the Brown Corpus into tagged lists.

In [10]:
browncorp = []
for text in brown.fileids():
  for sentence in brown.sents(text):
    browncorp.append(nltk.pos_tag(sentence))
print("The browncorp is a list of all the sentences of the brown corpus with part-of-speech tags.")
print("Example:")
print(browncorp[0])

The browncorp is a list of all the sentences of the brown corpus with part-of-speech tags.
Example:
[('The', 'DT'), ('Fulton', 'NNP'), ('County', 'NNP'), ('Grand', 'NNP'), ('Jury', 'NNP'), ('said', 'VBD'), ('Friday', 'NNP'), ('an', 'DT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NNP'), ('recent', 'JJ'), ('primary', 'JJ'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'DT'), ('evidence', 'NN'), ("''", "''"), ('that', 'IN'), ('any', 'DT'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


Now we can calculate part-of-speech bigrams.

In [23]:
bigrams = {} # Create an empty dictionary
for sentence in browncorp: # Look at each tagged sentence in browncorp
  for i in range(len(sentence)-1): # We iterate over each word except the last one
    currentword = sentence[i] # The current word is the one at position i
    nextword = sentence[i+1] # The next word is the one at position i+1
    bigram = currentword[1]+"_"+nextword[1] # The current bigram is the part of speech label from those two words
    if bigram in bigrams: # If the current bigram is already a key in the bigrams dictionary
      bigrams[bigram] += 1 # Then we increment the count for that bigram
    else:
      bigrams[bigram] = 1 # Otherwise we create an entry for that bigram

print("We have identified",len(bigrams),"bigrams")

# for bigram in bigrams:
#   print(f"{bigram}: {bigrams[bigram]}")


We have identified 1345 bigrams


Let's take a look at the most and least common of these:

In [22]:
sortedbigrams = sorted(bigrams, key=bigrams.get, reverse = True)
for bigram in sortedbigrams[0:10]:
  print(bigram, bigrams[bigram])
print("...")
for bigram in sortedbigrams[:-10:-1]:
  print(bigram, bigrams[bigram])

DT_NN 55590
IN_DT 52178
NN_IN 45328
JJ_NN 36223
DT_JJ 28839
NN_. 20018
NN_, 19476
NNP_NNP 18879
IN_NN 17756
JJ_NNS 16036
...
POS_VBG 1
FW_CD 1
UH_VB 1
NNPS_WP$ 1
MD_VBZ 1
PRP$_EX 1
UH_PRP$ 1
``_POS 1
JJ_UH 1


Let's keep only the most frequent bigrams:

In [24]:
sortedbigrams = sortedbigrams[:700]
for bigram in sortedbigrams[0:10]:
  print(bigram, bigrams[bigram])
print("...")
for bigram in sortedbigrams[:-10:-1]:
  print(bigram, bigrams[bigram])

DT_NN 55590
IN_DT 52178
NN_IN 45328
JJ_NN 36223
DT_JJ 28839
NN_. 20018
NN_, 19476
NNP_NNP 18879
IN_NN 17756
JJ_NNS 16036
...
CD_WDT 46
WRB_VBP 46
,_UH 47
NNS_WP$ 47
VBN_VBP 47
JJS_. 47
WRB_EX 47
:_WP 47
NNPS_'' 47


To get the right correction in `s3` (i.e. turning *ix* into *six* not *i*), we need to take bigram frequency into account in our `autocorrect()` function. We should have some logic that boosts the score of a particular word if it creates a more frequent bigram.

Another strategy is to use *trigrams*. There will be many more of these to calculate, but the principles are basically the same.